o
    iD                     @   s   d dl mZmZmZ d dlZd dlmZ d dlZd dlZd dl	m
Z
mZ ddlmZmZ ddlmZ dd	 ZG d
d dejZG dd dejZG dd dejZG dd dejZdS )    )OptionalListUnionN)nn)TimestepEmbedding	Timesteps   )LinearTransformerBlockt2i_modulate)ConformerEncoderc                 C   sX   | j ddd| jddd}}|j ddd|jddd}}|| ||d   | }|S )N)r      T)dimkeepdimg-q=)meanstd)hidden_statescontrolnet_inputmean_hidden_statesstd_hidden_statesmean_controlnet_inputstd_controlnet_input r   1/mnt/c/Users/fbmor/ComfyUI/comfy/ldm/ace/model.py
cross_norm   s   r   c                       s0   e Zd Zd
 fdd	Zdd Zddd	Z  ZS )Qwen2RotaryEmbedding   '  Nc              	      sr   t    || _|| _|| _d| jtjd| jdtj|d | j   }| j	d|dd | j
|| jjtjd d S )	N      ?r   r   dtypedeviceinv_freqF
persistentseq_lenr    r   )super__init__r   max_position_embeddingsbasetorcharangeint64floatregister_buffer_set_cos_sin_cacher!   r    float32)selfr   r(   r)   r   r    r!   	__class__r   r   r'   '   s   
,
zQwen2RotaryEmbedding.__init__c                 C   sz   || _ tj| j |tjd| j}t|| j}tj||fdd}| jd|	 
|dd | jd| 
|dd d S )Nr    r   r   
cos_cachedFr"   
sin_cached)max_seq_len_cachedr*   r+   r,   type_asr!   outercatr.   costosin)r1   r%   r    r   tfreqsembr   r   r   r/   5   s   z'Qwen2RotaryEmbedding._set_cos_sin_cachec                 C   sN   || j kr| j||j|jd | jd | j|jd| jd | j|jdfS )Nr$   r   )r9   r/   r    r   r7   r>   r8   )r1   xr%   r   r   r   forward?   s
   
zQwen2RotaryEmbedding.forward)r   r   NN)N)__name__
__module____qualname__r'   r/   rE   __classcell__r   r   r2   r   r   &   s    
r   c                       sL   e Zd ZdZddgddddf fdd	Zdejd	efd
dZdd Z	  Z
S )T2IFinalLayerz"
    The final layer of Sana.
       r      Nc                    sp   t    |j|dd||d| _|j||d |d  | d||d| _ttj	d|||d	| _
|| _|| _d S )
NFư>)elementwise_affineepsr   r    r   r   Tbiasr   r    r   r   )r&   r'   RMSNorm
norm_finalLinearlinearr   	Parameterr*   emptyscale_shift_tableout_channels
patch_size)r1   hidden_sizerZ   rY   r   r    
operationsr2   r   r   r'   O   s   
&
zT2IFinalLayer.__init__r   widthc                 C   s   d| d}}|j|jd ||| jd | jd | jfd }td|}|j|jd | j|| jd  || jd  fd }||krVtjj	
|d|| ddfdd}|S ||k rj|d d d d d d d |f }|S )Nr   r   )shapeznhwpqc->nchpwqconstant)sizereshaper^   rZ   rY   
contiguousr*   einsumr   
functionalpad)r1   r   r]   
new_height	new_widthoutputr   r   r   	unpatchfyW   s$   "&  zT2IFinalLayer.unpatchfyc                 C   sf   t jj| jd  |j|jd|d d d f  jddd\}}t| |||}| 	|}| 
||}|S )Nr4   r   r   r6   )comfymodel_managementcast_torX   r    r   chunkr
   rS   rU   ri   )r1   rD   r@   output_lengthshiftscalerh   r   r   r   rE   k   s
   :
zT2IFinalLayer.forward)rF   rG   rH   __doc__r'   r*   Tensorintri   rE   rI   r   r   r2   r   rJ   J   s    
rJ   c                       s8   e Zd ZdZ							d fd	d
	Zdd Z  ZS )
PatchEmbedz2D Image to Patch EmbeddingrK      rK   r        TNc
                    s   t    |\}
}t|	j||d ||d|||d|	jd|d dd||d|	j|d |ddd|||d| _|| _||
 || | _| _	| j	| _
d S )	NrL   r   )kernel_sizestridepaddingrQ   r   r        rM   T)
num_groupsnum_channelsrO   affiner   r    r   )r&   r'   r   
SequentialConv2d	GroupNormearly_conv_layersrZ   heightr]   	base_size)r1   r   r]   rZ   in_channels	embed_dimrQ   r   r    r\   patch_size_hpatch_size_wr2   r   r   r'   w   s   

zPatchEmbed.__init__c                 C   s    |  |}|ddd}|S )Nr   r   )r   flatten	transpose)r1   latentr   r   r   rE      s   
zPatchEmbed.forward)	rK   ru   rv   rw   rx   TNNN)rF   rG   rH   rq   r'   rE   rI   r   r   r2   r   rt   t   s    rt   c                '       sz  e Zd Zdddddddddd	d
ddgddgdd
gddddgddddddfdee dededededededededededee d ee d!ee d"ed#ed$ee d%ed&ef& fd'd(Z			dCd)ee	j
 d*ee	j
 fd+d,Z						-dDd.ee	j d/ee	j
 d0ee	j d)ee	j
 d*ee	j
 f
d1d2Zd3dd-i fd4e	jd5e	jd6e	jd7e	jd8ee	j d9ed:eeee	j e	jf  d;eee	jf fd<d=Z								-	-dEd>ee	j d/ee	j
 d0ee	j d)ee	j
 d*ee	j
 d:eeee	j e	jf  d;eee	jf fd?d@Z								-	-dEd>ee	j d/ee	j
 d0ee	j d)ee	j
 d*ee	j
 d:eeee	j e	jf  d;eee	jf fdAdBZ  ZS )FACEStepTransformer2DModelrw      i   @      g      @i   g    .Ai   i   	   mertzm-huberti   i  rK   r   ru   Nr   
num_layers	inner_dimattention_head_dimnum_attention_heads	mlp_ratiorY   max_position
rope_thetaspeaker_embedding_dimtext_embedding_dimssl_encoder_depths	ssl_namesssl_latent_dimslyric_encoder_vocab_sizelyric_hidden_sizerZ   
max_height	max_widthc              
      s  t    _|_ _|  }|_|_|_|_|	_	t
jjj	d_|_|_t fddtjD _tdddd_tdjd_tt jjd	j dd
_j|
jd_j|jd_j||d_t|dd_j|jd_ dj tfdd|D _!t"|||jdd_#t$j||d_%d S )N)r   r(   r)   r   r    c                    s,   g | ]}t jj d jd	qS )T)	r   r   r   r   add_cross_attentionadd_cross_attention_dimr   r    r\   )r	   r   r   ).0i)r   r    r   r   r\   r1   r   r   
<listcomp>   s    z6ACEStepTransformer2DModel.__init__.<locals>.<listcomp>rL   Tr   )r~   flip_sin_to_cosdownscale_freq_shift)r   time_embed_dimr   r    r\      rP   r   )
input_sizestatic_chunk_sizer   r    r\   r   c                    sR   g | ]%}t jj d t  j d t  j| d qS )r   )r   r   rT   r   SiLU)r   ssl_dim)r    r   r\   projector_dimr1   r   r   r      s    )r   r]   rZ   r   rQ   r   r    r\   )rZ   rY   r   r    r\   )&r&   r'   r   r   r   r   rY   r   rZ   r   r   
rotary_embr   r   r   
ModuleListrangetransformer_blocksr   	time_projr   timestep_embedderr   r   rT   t_blockspeaker_embeddergenre_embedder	Embedding
lyric_embsLyricEncoderlyric_encoder
lyric_proj
projectorsrt   proj_inrJ   final_layer)r1   r   r   r   r   r   r   rY   r   r   r   r   r   r   r   r   r   rZ   r   r   audio_modelr   r    r\   r2   )r   r    r   r   r\   r   r1   r   r'      s^   
	*


z"ACEStepTransformer2DModel.__init__lyric_token_idx
lyric_maskc                 C   s2   | j ||d}| j||ddd\}}| |}|S )N)	out_dtyper   r5   )decoding_chunk_sizenum_decoding_left_chunks)r   r   r   )r1   r   r   r   r   prompt_prenet_out_maskr   r   r   forward_lyric_encoder  s   
z/ACEStepTransformer2DModel.forward_lyric_encoderr   encoder_text_hidden_statestext_attention_maskspeaker_embedsc                 C   s   |j d }|j}| |d}	| |}| j|||jd}
|
|9 }
tj|	||
gdd}d }|d urDtj	|d|d}tj|||gdd}||fS )Nr   r   )r   r   r   r6   )r    )
r^   r    r   	unsqueezer   r   r   r*   r<   ones)r1   r   r   r   r   r   lyrics_strengthbsr    encoder_spk_hidden_statesencoder_lyric_hidden_statesencoder_hidden_statesencoder_hidden_maskspeaker_maskr   r   r   encode  s    


z ACEStepTransformer2DModel.encoder   r   attention_maskr   r   timesteprn   block_controlnet_hidden_statescontrolnet_scalec
                 C   s   |  | |j|jd}
| |
}| |}|d ur&t||}|||  }| j||jd d}| j||jd d}t	| j
D ]\}}|||||||||	d}q?| ||
|}|S )NrC   r   )r%   )r   r   r   encoder_attention_maskrotary_freqs_cisrotary_freqs_cis_crosstembtransformer_options)r   r   r>   r   r   r   r   r   r^   	enumerater   r   )r1   r   r   r   r   r   rn   r   r   r   embedded_timestepr   control_condir   encoder_rotary_freqs_cisindex_blockblockrh   r   r   r   decode3  s*   


z ACEStepTransformer2DModel.decodecontextc                 K   sN   t jj| j| t jt jjj|di j	|||||||||	|
|fi |S )Nr   )
rj   patcher_extensionWrapperExecutornew_class_executor_forwardget_all_wrappers
WrappersMPDIFFUSION_MODELgetexecute)r1   rD   r   r   r   r   r   r   r   r   r   r   kwargsr   r   r   rE   ]  s   z!ACEStepTransformer2DModel.forwardc                 K   sX   |}|}| j ||||||d\}}|jd }|di }| j|||||||	|
|d	}|S )N)r   r   r   r   r   r   r5   r   )	r   r   r   r   r   rn   r   r   r   )r   r^   r   r   )r1   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rn   r   rh   r   r   r   r   r  s0   

	z"ACEStepTransformer2DModel._forward)NNN)NNNNNr   )	NNNNNNNr   r   )rF   rG   rH   r   rs   r-   r   strr'   r*   
LongTensorr   rr   FloatTensorr   r   r   rE   r   rI   r   r   r2   r   r      s6   	
n

,	
-	

	
r   )typingr   r   r   r*   r   comfy.model_managementrj   comfy.patcher_extensioncomfy.ldm.lightricks.modelr   r   	attentionr	   r
   r   r   r   r   Moduler   rJ   rt   r   r   r   r   r   <module>   s   	$*