o
    i*                     @   s   d dl Z d dlmZ d dlmZ G dd dejZG dd dejZG dd dejZG d	d
 d
ejZ	G dd dejZ
G dd dejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )    N)optimized_attention_maskedc                       &   e Zd Zd fdd	Zdd Z  ZS )LayerNormConvFNc	           	   	      s<   t    |j|||||||d| _|j|d||d| _d S )Nkernel_sizestridebiasdevicedtypeT)elementwise_affiner	   r
   )super__init__Conv1dconv	LayerNorm
layer_norm	selfin_channelsout_channelsr   r   r   r
   r	   
operations	__class__ ;/mnt/c/Users/fbmor/ComfyUI/comfy/audio_encoders/wav2vec2.pyr      s   
zLayerNormConv.__init__c                 C   s.   |  |}tjj| |ddddS )N)r   torchnn
functionalgelur   	transposer   xr   r   r   forward   s   
$zLayerNormConv.forwardFNNN__name__
__module____qualname__r   r$   __classcell__r   r   r   r   r          r   c                       r   )LayerGroupNormConvFNc	           	   	      s>   t    |j|||||||d| _|j||d||d| _d S )Nr   T)
num_groupsnum_channelsaffiner	   r
   )r   r   r   r   	GroupNormr   r   r   r   r   r      s   
zLayerGroupNormConv.__init__c                 C   s   |  |}tjj| |S N)r   r   r   r   r    r   r"   r   r   r   r$      s   
zLayerGroupNormConv.forwardr%   r&   r   r   r   r   r,      r+   r,   c                       r   )
ConvNoNormFNc	           	   	      s(   t    |j|||||||d| _d S )Nr   )r   r   r   r   r   r   r   r   r      s   
zConvNoNorm.__init__c                 C   s   |  |}tjj|S r1   )r   r   r   r   r    r"   r   r   r   r$      s   
zConvNoNorm.forwardr%   r&   r   r   r   r   r2      s    r2   c                       &   e Zd Zd fdd	Zdd Z  ZS )	ConvFeatureEncoderFTNc                    sb  t    |r\ttd|ddd|||dt||dd||||dt||dd||||dt||dd||||dt||dd||||dt||dd||||dt||dd||||dg| _d S ttd|dd||||dt||dd||||dt||dd||||dt||dd||||dt||dd||||dt||dd||||dt||dd||||dg| _d S )N   
      T)r   r   r   r	   r
   r         )r   r   r   
ModuleListr   conv_layersr,   r2   )r   conv_dim	conv_bias	conv_normr
   r	   r   r   r   r   r   %   s(   

zConvFeatureEncoder.__init__c                 C   s*   | d}| jD ]}||}q|ddS )Nr5   r9   )	unsqueezer;   r!   )r   r#   r   r   r   r   r$   <   s   


zConvFeatureEncoder.forward)FTNNNr&   r   r   r   r   r4   $   s    r4   c                       &   e Zd Zd fdd	Zdd Z  ZS )FeatureProjectionNc                    s6   t    |j|d||d| _|j||||d| _d S )Nh㈵>epsr	   r
   r	   r
   )r   r   r   r   Linear
projection)r   r<   	embed_dimr
   r	   r   r   r   r   r   F   s   
zFeatureProjection.__init__c                 C   s   |  |}| |}|S r1   )r   rG   r"   r   r   r   r$   K   s   

zFeatureProjection.forwardNNNr&   r   r   r   r   rA   E   r+   rA   c                       r3   )	PositionalConvEmbedding         c                    sL   t    tj||||d |d| _tjjjj| jddd| _t	 | _
d S )Nr9   )r   paddinggroupsweight)namedim)r   r   r   r   r   r   utilsparametrizationsweight_normGELU
activation)r   rH   r   rO   r   r   r   r   R   s   
z PositionalConvEmbedding.__init__c                 C   sF   | dd}| |d d d d d df }| |}| dd}|S )Nr5   r9   r   )r!   r   rW   r"   r   r   r   r$   ^   s
    
zPositionalConvEmbedding.forward)rK   rL   rM   r&   r   r   r   r   rJ   Q   s    rJ   c                       s4   e Zd Z						d
 fdd	Zddd	Z  ZS )TransformerEncoderrK            @TNc	           	   	      s^   t    td| _t fddt|D | _jd d| _	| _
d S )N)rH   c                    s"   g | ]}t  d qS ))rH   	num_heads	mlp_ratiodo_stable_layer_normr	   r
   r   )TransformerEncoderLayer).0_r	   r]   r
   rH   r\   r[   r   r   r   
<listcomp>s   s    z/TransformerEncoder.__init__.<locals>.<listcomp>rB   rC   )r   r   rJ   pos_conv_embedr   r:   rangelayersr   r   r]   )	r   rH   r[   
num_layersr\   r]   r
   r	   r   r   ra   r   r   g   s   
	

zTransformerEncoder.__init__c                 C   sd   ||  | }d}| js| |}| jD ]}||f7 }|||}q| jr)| |}||f7 }||fS )Nr   )rc   r]   r   re   )r   r#   maskall_xlayerr   r   r   r$      s   




zTransformerEncoder.forward)rK   rY   rY   rZ   TNNNr1   r&   r   r   r   r   rX   f   s    rX   c                       s(   e Zd Zd fdd	ZdddZ  ZS )		AttentionTNc                    s|   t    || _|| _|| | _|j|||||d| _|j|||||d| _|j|||||d| _|j|||||d| _	d S )N)r   r	   r
   )
r   r   rH   r[   head_dimrF   k_projv_projq_projout_proj)r   rH   r[   r   r
   r	   r   r   r   r   r      s   

zAttention.__init__c                 C   sD   |d u sJ |  |}| |}| |}t|||| j}| |S r1   )rn   rl   rm   r   r[   ro   )r   r#   rg   qkvoutr   r   r   r$      s   



zAttention.forward)TNNNr1   r&   r   r   r   r   rj      s    rj   c                       r@   )FeedForwardNc                    sF   t    |j|t|| ||d| _|jt|| |||d| _d S )NrE   )r   r   rF   intintermediate_denseoutput_dense)r   rH   r\   r
   r	   r   r   r   r   r      s   
 zFeedForward.__init__c                 C   s&   |  |}tjj|}| |}|S r1   )rv   r   r   r   r    rw   r"   r   r   r   r$      s   

zFeedForward.forwardrI   r&   r   r   r   r   rt      r+   rt   c                       s2   e Zd Z					d
 fdd	Zddd	Z  ZS )r^   rK   rY   rZ   TNc                    s`   t    t|||||d| _|j|||d| _t|||||d| _|j|||d| _|| _	d S )Nr	   r
   r   rE   )
r   r   rj   	attentionr   r   rt   feed_forwardfinal_layer_normr]   )r   rH   r[   r\   r]   r
   r	   r   r   r   r   r      s   

z TransformerEncoderLayer.__init__c                 C   sb   |}| j r
| |}| j||d}|| }| j s'| |}| || | S || | | S )N)rg   )r]   r   ry   r{   rz   )r   r#   rg   residualr   r   r   r$      s   

zTransformerEncoderLayer.forward)rK   rY   rZ   TNNNr1   r&   r   r   r   r   r^      s    r^   c                       s>   e Zd ZdZ									d fdd		ZdddZ  ZS )Wav2Vec2ModelzComplete Wav2Vec 2.0 model.      rM      TNc              	      sr   t    d}t||||
|	|d| _t|||
|	|d| _ttj	||
|	d| _
|| _t|||||
|	|d| _d S )Ni   )r>   r=   r	   r
   r   rx   rE   )rH   r[   rf   r]   r	   r
   r   )r   r   r4   feature_extractorrA   feature_projectionr   	Parameterr   emptymasked_spec_embeddo_normalizerX   encoder)r   rH   	final_dimr[   rf   r>   r=   r   r]   r
   r	   r   r<   r   r   r   r      s   
zWav2Vec2Model.__init__Fc           	      C   sh   t j|dd}| jr||  t | d  }| |}| |}|j\}}}| |\}}||fS )Nr5   )rR   gHz>)	r   meanr   sqrtvarr   r   shaper   )	r   r#   mask_time_indicesreturn_dictfeatures
batch_sizeseq_lenr`   rh   r   r   r   r$      s   

zWav2Vec2Model.forward)r~   r   rM   r   TTTTNNN)NF)r'   r(   r)   __doc__r   r$   r*   r   r   r   r   r}      s    r}   )r   torch.nnr   comfy.ldm.modules.attentionr   Moduler   r,   r2   r4   rA   rJ   rX   rj   rt   r^   r}   r   r   r   r   <module>   s    


!)