import torch
import math
from comfy.ldm.modules.attention import optimized_attention_for_device
import comfy.ops


class T5LayerNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6, dtype=None, device=None, operations=None):
        super().__init__()
        self.weight = torch.nn.Parameter(torch.empty(hidden_size, dtype=dtype, device=device))
        self.variance_epsilon = eps

    def forward(self, x):
        # T5 uses an RMSNorm-style layer norm: no mean subtraction and no bias,
        # only scaling by the reciprocal root mean square and a learned weight
        # cast to the input's dtype/device.
        variance = x.pow(2).mean(-1, keepdim=True)
        x = x * torch.rsqrt(variance + self.variance_epsilon)
        return comfy.ops.cast_to_input(self.weight, x) * x


activations = {
    "gelu_pytorch_tanh": lambda a: torch.nn.functional.gelu(a, approximate="tanh"),
    "relu": torch.nn.functional.relu,
}
zT5DenseActDense.__init__c                 C   s   |  | |}| |}|S N)r:   r7   r8   )r   r!   r   r   r   r#      s   
zT5DenseActDense.forwardr$   r   r   r   r   r2      s    r2   c                       r1   )T5DenseGatedActDensec                    sZ   t    |j||d||d| _|j||d||d| _|j||d||d| _t| | _d S r3   )r   r	   r6   wi_0wi_1r8   r9   r:   r;   r   r   r   r	   %   s
   
zT5DenseGatedActDense.__init__c                 C   s0   |  | |}| |}|| }| |}|S r?   )r:   rA   rB   r8   )r   r!   hidden_geluhidden_linearr   r   r   r#   -   s
   

zT5DenseGatedActDense.forwardr$   r   r   r   r   r@   $   s    r@   c                       r1   )	T5LayerFFc                    sN   t    |rt||||||| _n
t||||||| _t||||d| _d S Nr   r   r   )r   r	   r@   DenseReluDenser2   r   
layer_norm)r   r<   r=   r>   	gated_actr   r   r   r   r   r   r	   6   s
   
zT5LayerFF.__init__c                 C   s    |  |}| |}||7 }|S r?   )rI   rH   )r   r!   forwarded_statesr   r   r   r#   @   s   

zT5LayerFF.forwardr$   r   r   r   r   rE   5   s    
class T5Attention(torch.nn.Module):
    def __init__(self, model_dim, inner_dim, num_heads, relative_attention_bias, dtype, device, operations):
        super().__init__()
        self.q = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.k = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.v = operations.Linear(model_dim, inner_dim, bias=False, dtype=dtype, device=device)
        self.o = operations.Linear(inner_dim, model_dim, bias=False, dtype=dtype, device=device)
        self.num_heads = num_heads

        self.relative_attention_bias = None
        if relative_attention_bias:
            self.relative_attention_num_buckets = 32
            self.relative_attention_max_distance = 128
            self.relative_attention_bias = operations.Embedding(self.relative_attention_num_buckets, self.num_heads, device=device, dtype=dtype)

    @staticmethod
    def _relative_position_bucket(relative_position, bidirectional=True, num_buckets=32, max_distance=128):
        """
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on.

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        """
        relative_buckets = 0
        if bidirectional:
            num_buckets //= 2
            relative_buckets += (relative_position > 0).to(torch.long) * num_buckets
            relative_position = torch.abs(relative_position)
        else:
            relative_position = -torch.min(relative_position, torch.zeros_like(relative_position))
        # now relative_position is in the range [0, inf)

        # half of the buckets are for exact increments in positions
        max_exact = num_buckets // 2
        is_small = relative_position < max_exact

        # the other half of the buckets are for logarithmically bigger bins up to max_distance
        relative_position_if_large = max_exact + (
            torch.log(relative_position.float() / max_exact)
            / math.log(max_distance / max_exact)
            * (num_buckets - max_exact)
        ).to(torch.long)
        relative_position_if_large = torch.min(
            relative_position_if_large, torch.full_like(relative_position_if_large, num_buckets - 1)
        )

        relative_buckets += torch.where(is_small, relative_position, relative_position_if_large)
        return relative_buckets

    def compute_bias(self, query_length, key_length, device, dtype):
        """Compute binned relative position bias"""
        context_position = torch.arange(query_length, dtype=torch.long, device=device)[:, None]
        memory_position = torch.arange(key_length, dtype=torch.long, device=device)[None, :]
        relative_position = memory_position - context_position  # shape (query_length, key_length)
        relative_position_bucket = self._relative_position_bucket(
            relative_position,  # shape (query_length, key_length)
            bidirectional=True,
            num_buckets=self.relative_attention_num_buckets,
            max_distance=self.relative_attention_max_distance,
        )
        values = self.relative_attention_bias(relative_position_bucket, out_dtype=dtype)  # shape (query_length, key_length, num_heads)
        values = values.permute([2, 0, 1]).unsqueeze(0)  # shape (1, num_heads, query_length, key_length)
        return values
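    # Worked example (added for clarity, not in the original file): with the
    # defaults above (bidirectional=True, num_buckets=32, max_distance=128),
    # the 32 buckets split into 16 for negative offsets (key before query) and
    # 16 for positive ones. In each half, offsets 0-7 get exact buckets and
    # offsets 8-127 share 8 logarithmically sized buckets:
    #   relative_position = -3   -> bucket 3   (exact)
    #   relative_position = +3   -> bucket 19  (16 + 3)
    #   relative_position = -50  -> bucket 13  (8 + floor(ln(50/8) / ln(128/8) * 8))
    #   relative_position = -200 -> bucket 15  (clamped: beyond max_distance)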
    def forward(self, x, mask=None, past_bias=None, optimized_attention=None):
        q = self.q(x)
        k = self.k(x)
        v = self.v(x)
        if self.relative_attention_bias is not None:
            past_bias = self.compute_bias(x.shape[1], x.shape[1], x.device, x.dtype)

        if past_bias is not None:
            if mask is not None:
                mask = mask + past_bias
            else:
                mask = past_bias

        # T5 does not scale attention logits by 1/sqrt(head_dim); since
        # optimized_attention applies that scaling internally, k is
        # pre-multiplied by sqrt(head_dim) to cancel it out.
        out = optimized_attention(q, k * ((k.shape[-1] / self.num_heads) ** 0.5), v, self.num_heads, mask)
        return self.o(out), past_bias
zT5LayerSelfAttention.__init__Nc                 C   s,   | j | ||||d\}}||7 }||fS )N)r|   r}   r~   )r   rI   )r   r!   r|   r}   r~   outputr   r   r   r#      s   zT5LayerSelfAttention.forwardr   r$   r   r   r   r   r          r   c                       r   )T5Blockc                    sT   t    tj | _| jt|||||||	|
 | jt||||||	|
 d S r?   )	r   r	   r
   r   
ModuleListlayerappendr   rE   )r   r<   rY   r=   r>   rJ   rT   rU   r   r   r   r   r   r   r	      s   
 zT5Block.__init__Nc                 C   s.   | j d ||||\}}| j d |}||fS )Nr   r   )r   )r   r!   r|   r}   r~   r   r   r   r#      s   zT5Block.forwardr   r$   r   r   r   r   r      r   r   c                       s0   e Zd Z fddZddddg fddZ  ZS )T5Stackc                    sR   t    tj 	f
ddt|D | _t d| _d S )Nc                    s2   g | ]}t 	 p|d k d
qS )r   )rU   r   r   r   )r   ).0i
r   r   r>   r=   rJ   rY   r<   rT   r   relative_attentionr   r   
<listcomp>   s   2 z$T5Stack.__init__.<locals>.<listcomp>rG   )	r   r	   r
   r   r   rangeblockr   final_layer_norm)r   
num_layersr<   rY   r=   r>   rJ   rT   r   r   r   r   r   r   r   r	      s
   
&zT5Stack.__init__NTc                 C   s  d }|d ur:d| |j|jd dd|jd f|jd d|jd |jd  }|| tjt|jj	 }d }t
|j|d udd}	d }
|d urW|dk rWt| j| }t| jD ]\}}||||
|	\}}
||krq| }q\| |}|d ur|r| |}||fS )Ng      ?r   rZ   r   T)r|   small_input)r[   r   reshaper{   expandmasked_fillr
   boolfinfomaxr   r   lenr   	enumeratecloner   )r   r!   attention_maskintermediate_outputfinal_layer_norm_intermediater   embeds_infor|   intermediater~   r}   r   lr   r   r   r#      s&   H 

zT5Stack.forwardr$   r   r   r   r   r      s    	r   c                       s6   e Zd Z fddZdd Zdd Zd
dd	Z  ZS )T5c                    s   t    |d | _|d }|d |d  }t| j|||d |d |d |d |d d	k|||| _|| _|j|d
 |||d| _d S )Nr   d_modeld_kvrT   d_ffdense_act_fnis_gated_act
model_typeumt5
vocab_sizerO   )r   r	   r   r   encoderr   rX   shared)r   config_dictr   r   r   r<   rY   r   r   r   r	      s   

8zT5.__init__c                 C   s   | j S r?   r   )r   r   r   r   get_input_embeddings   s   zT5.get_input_embeddingsc                 C   s
   || _ d S r?   r   )r   
embeddingsr   r   r   set_input_embeddings   s   
zT5.set_input_embeddingsNc                 K   s\   |d u r|}n| j ||dtjd}| jtjtjtjfvr#t|}| j|fd|i|S )Nr   rn   r   )	r   getr
   float32r   float16bfloat16
nan_to_numr   )r   	input_idsr   embeds
num_tokenskwargsr!   r   r   r   r#      s   
z
T5.forward)NN)r%   r&   r'   r	   r   r   r#   r(   r   r   r   r   r      s
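
# Minimal smoke test, assuming comfy.ops.manual_cast as the operations provider;
# the config below is a deliberately tiny illustrative stand-in (t5xxl-style
# keys, not real checkpoint values).
if __name__ == "__main__":
    config = {
        "num_layers": 2,
        "d_model": 64,
        "d_kv": 16,
        "num_heads": 4,
        "d_ff": 128,
        "dense_act_fn": "gelu_pytorch_tanh",
        "is_gated_act": True,
        "model_type": "t5",  # anything other than "umt5" shares one relative bias
        "vocab_size": 32128,
    }
    model = T5(config, dtype=torch.float32, device="cpu", operations=comfy.ops.manual_cast)
    tokens = torch.randint(0, config["vocab_size"], (1, 77))
    attention_mask = torch.ones(1, 77, dtype=torch.long)
    out, intermediate = model(tokens, attention_mask)
    print(out.shape)  # expected: torch.Size([1, 77, 64])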