o
    iY                     @   s   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dl	m
Z
 d dl mZmZ d dlmZmZmZmZmZmZ d dlZeG dd dZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZG dd dejZdS )    N)optimized_attention)	dataclass)repeat)Tensornn)DoubleStreamBlockEmbedND	LastLayerMLPEmbedderSingleStreamBlocktimestep_embeddingc                   @   s   e Zd ZU eed< eed< eed< eed< eed< eed< eed< eed< eed	< eed
< eed< eed< eed< eed< eed< eed< eed< eed< eed< dS )HunyuanVideoParamsin_channelsout_channels
vec_in_dimcontext_in_dimhidden_size	mlp_ratio	num_headsdepthdepth_single_blocksaxes_dimtheta
patch_sizeqkv_biasguidance_embedbyt5meanflowuse_cond_type_embeddingvision_in_dimmeanflow_sumN)__name__
__module____qualname__int__annotations__floatlistbool r)   r)   ;/mnt/c/Users/fbmor/ComfyUI/comfy/ldm/hunyuan_video/model.pyr      s(   
 r   c                       s(   e Zd Zddedef fddZ  ZS )SelfAttentionRefFNdimr   c                    s<   t    |j||d |||d| _|j||||d| _d S )N   biasdtypedevicer0   r1   )super__init__Linearqkvproj)selfr,   r   r0   r1   
operations	__class__r)   r*   r4   2   s   
zSelfAttentionRef.__init__FNNN)r!   r"   r#   r$   r(   r4   __classcell__r)   r)   r:   r*   r+   1   s     r+   c                       0   e Zd Z			d fdd	Zi fddZ  ZS )TokenRefinerBlockNc                    s   t    || _|d }tt |j|d| d||d| _|j|dd||d| _	t
|d|||d| _|j|dd||d| _t|j||d||dt |j||d||d| _d S )N      Tr.   gư>)elementwise_affineepsr0   r1   r0   r1   r9   )r3   r4   headsr   
SequentialSiLUr5   adaLN_modulation	LayerNormnorm1r+   	self_attnnorm2mlp)r8   r   rE   r0   r1   r9   mlp_hidden_dimr:   r)   r*   r4   9   s   

zTokenRefinerBlock.__init__c              	   C   s   |  |jddd\}}| |}| j|}||jd |jd d| jdddddd\}	}
}t	|	|
|| j|d|d	}|| j
||d  }|| | ||d  }|S )
NrA      r,   r   r-   r@   T)maskskip_reshapetransformer_options)rH   chunkrJ   rK   r6   reshapeshaperE   permuter   r7   	unsqueezerM   rL   )r8   xcrR   rT   mod1mod2norm_xr6   qkvattnr)   r)   r*   forwardU   s   
4zTokenRefinerBlock.forwardNNNr!   r"   r#   r4   rc   r=   r)   r)   r:   r*   r?   8   s    r?   c                       r>   )IndividualTokenRefinerNc                    s4   t    t fddt|D | _d S )Nc              	      s   g | ]}t  d qS ))r   rE   r0   r1   r9   )r?   .0_r1   r0   rE   r   r9   r)   r*   
<listcomp>n   s    z3IndividualTokenRefiner.__init__.<locals>.<listcomp>)r3   r4   r   
ModuleListrangeblocks)r8   r   rE   
num_blocksr0   r1   r9   r:   rj   r*   r4   c   s   
	
zIndividualTokenRefiner.__init__c                 C   sj   d }|d ur%| |jd dd|jd dd|jd d}||dd }| jD ]
}|||||d}q(|S )Nr   rO   rA   r-   rT   )viewrW   r   	transposern   )r8   rZ   r[   rR   rT   mblockr)   r)   r*   rc   z   s   .
zIndividualTokenRefiner.forwardrd   re   r)   r)   r:   r*   rf   b   s    rf   c                       r>   )TokenRefinerNc                    sb   t    |j||d||d| _td||||d| _t|||||d| _t||||||d| _d S )NTr.      rD   )	r3   r4   r5   input_embedderr
   
t_embedder
c_embedderrf   individual_token_refiner)r8   text_dimr   rE   ro   r0   r1   r9   r:   r)   r*   r4      s
   

zTokenRefiner.__init__c                 C   s   |  t|ddd|j}|jtjkr"| jdd|jd  }n|jdd|jd  }|| 	||j }| 
|}| j||||d}|S )Nrv         ?time_factorrO   rP   rp   )rx   r   tor0   torchfloat16r&   sumrW   ry   rw   rz   )r8   rZ   	timestepsrR   rT   tr[   r)   r)   r*   rc      s   
zTokenRefiner.forwardrd   re   r)   r)   r:   r*   ru      s    ru   c                       s&   e Zd Zd fdd	Zdd Z  ZS )
ByT5MapperFNc	           	         sl   t    |j|||d| _|j||||d| _|j||||d| _|j||||d| _|| _t	
 | _d S )Nr2   )r3   r4   rI   	layernormr5   fc1fc2fc3use_resr   GELUact_fn)	r8   in_dimout_dim
hidden_dimout_dim1r   r0   r1   r9   r:   r)   r*   r4      s   
zByT5Mapper.__init__c                 C   sX   | j r|}| |}| |}| |}| |}| |}| |}| j r*|| }|S N)r   r   r   r   r   r   )r8   rZ   resx2r)   r)   r*   rc      s   





zByT5Mapper.forwardr<   re   r)   r)   r:   r*   r      s    	r   c                       s   e Zd ZdZd fdd	Zddddddddi f	deded	ed
edededededefddZdd Zdd Zdddddddddi f
ddZ	dddddddddi f
ddZ
  ZS )HunyuanVideoz;
    Transformer model for flow matching on sequences.
    NTc           
   
      s  t    _ d}tdi |_j_j_j_j_j	_	j
j dkr@tdj
 dj j
j }tj|krXtdj d| j
_
j_t|jjd_tjjjjjd jjj
tjdk d	_td
j
 d_jd urtjj
 d_nd _jrtd
j
 dnt  _!t"j#j
jd d_$t% fddt&j'D _(t% fddt&j)D _*j+rt,dddj
d d_-nd _-j.rtd
j
 d_/nd _/|r(t0j
jd j d_1j	d ur?ddl2m3}	 |	j	j
|d_4nd _4jrPt5dj
_6d S d _6d S )N)r9   r1   r0   r   zHidden size z  must be divisible by num_heads zGot z but expected positional dim )r,   r   r   r-   )conv3dr0   r1   r9   rv   )r   r   r0   r1   r9   rD   rA   c                    s*   g | ]}t jjjj d qS ))r   r   r0   r1   r9   )r   r   r   r   r   rg   r1   r0   r9   paramsr8   r)   r*   rk      s    z)HunyuanVideo.__init__.<locals>.<listcomp>c              
      s&   g | ]}t jjj d qS ))r   r0   r1   r9   )r   r   r   r   rg   r   r)   r*   rk      s    i  i   F)r   r   r   r   r   r0   r1   r9   rQ   )MLPProj)r   r   operation_settingsr)   )7r3   r4   r0   r   r   r   r   r   r   r   r   r   
ValueErrorr   r   r   r   pe_embeddercomfyldmmodulesdiffusionmodulesmmdit
PatchEmbedlenimg_inr
   time_inr   	vector_inr   r   Identityguidance_inru   r   txt_inrl   rm   r   double_blocksr   single_blocksr   r   byt5_inr   	time_r_inr	   final_layercomfy.ldm.wan.modelr   	vision_in	Embeddingcond_type_embedding)
r8   image_modelr   r0   r1   r9   kwargsr   pe_dimr   r:   r   r*   r4      sx   
4
 
	 
zHunyuanVideo.__init__Fimgimg_idstxttxt_idstxt_maskr   yguidancereturnc           -         sJ  |  }|di }t|j}| |}| t|ddd|j}| j	d urp|spt
|d d |d kd }t|dkrp|d |d d  }|dj|j|jd	}| 	t|dd
d|j}| jjrj|| n|| d }|d ur| |}| |}t
j||gdd}d|d< |d  |d | jd  7  < t
j||gdd}|d ur	| t|ddd}| jd ur| |d d d | jjf }t
j|| d|| dgdd}nt
j|d|dgdd}|d | jd  |d | jd   }d|df|d dfg}dg}n| jd ur || |d d d | jjf  }d }d }| jjr<|
d ur<|| t|
d|j }|d urVt
|sV|d |jt
|jj }| j||||d}| jd ur| j|j | t
j|d d d d df |jt
jd	}|||j }| jd ur|d ur| |}| jd ur| t
j |d d d d df |jt
jd	}|||j }t
j||fdd}n	t
j||fdd}t
j!|jd |jd |jd f|j|jd	}t
j||fdd}|	d urH| "|	}| jd ur| dt
j |d d d d df t
j|jd }|| }t
j||j|fdd}t
j!|jd |jd |jd f|j|jd	}t
j||fdd}t
j||fdd} | #| }!|jd }"|d ur|"|jd  }#t
j!dd|#f|j|jd}$||$d d dd |jd f< nd }$|di }%t| j$|d< d|d< t%| j$D ]_\}& |&|d< d|&f|%v r͇ fdd}'|%d|&f ||||!|$|||dd|'i}(|(d }|(d }n ||||!|$|||d \}}|d ur|d!})|&t|)k r|)|& }*|*d ur||*7 }qt
||fd}t| j&|d< d"|d< |jd |jd g|d#< t%| j&D ]g\}& |&|d< d$|&f|%v rK fd%d}'|%d$|&f |||!|$||d&d|'i}(|(d }n
 |||!|$||d'}|d ur|d(}+|&t|+k r|+|& }*|*d ur|d d |jd |"|jd  f  |*7  < q|d d |jd |"|jd  f }|d ur|d d |jd d f }| j'|||d)}|t| j d  },t(t|,D ]}&|,|& | j|&  |,|&< q|)|jd g|, | j*g | j }|j+d*kr
|,dd+dd,dd-d.d/}|)|d | j*|d |d. |d+ }|S |,dd.dd+dd,}|)|d | j*|d |d. }|S )0Npatches_replacerv   r|   r}   sigmasr   sample_sigmasrO   r1   r0   g     @@rA   rP   rQ   ).r   ).rA   )r   NrO   rp   r2   dittotal_blocksdouble
block_typeblock_indexdouble_blockc              
      sL   i } | d | d | d | d | d | d | d | d d	\|d< |d< |S )
Nr   r   vecpeattention_maskmodulation_dims_imgmodulation_dims_txtrT   r   r   r   r   	attn_maskr   r   rT   r)   argsoutrt   r)   r*   
block_wrap  s   Dz-HunyuanVideo.forward_orig.<locals>.block_wrap)r   r   r   r   r   r   r   rT   original_blockr   r   r   inputsingle	img_slicesingle_blockc                    s8   i } | d | d | d | d | d | d d|d< |S )Nr   r   r   r   modulation_dimsrT   r   r   r   r   rT   r)   r   r   r)   r*   r     s   0)r   r   r   r   r   rT   r   output)r      r@         r-      )-copygetr'   rW   r   r   r   r   r0   r   r   wherer   rY   r1   r   r    r   catr   r   r   r   r   is_floating_pointfinfomaxr   r   
zeros_likelongr   	ones_likezerosr   r   r   	enumerater   r   rm   rV   r   ndimrX   )-r8   r   r   r   r   r   r   r   txt_byt5clip_fear   guiding_frame_index
ref_latentdisable_time_rcontrolrT   r   initial_shaper   wtimesteps_rvec_rref_latent_idstoken_replace_vecvec_frame_tokensr   r   cond_embtxt_byt5_idstxt_vision_statesextra_txt_idsidsr   img_lenattn_mask_lenr   blocks_replaceir   r   	control_iadd	control_orW   r)   r   r*   forward_orig!  s   





($"

,
,,

0,


(



$



,$
$$zHunyuanVideo.forward_origc                 C   s  |j \}}}}}| j}||d d  |d  }||d d  |d  }	||d d  |d  }
tj||	|
df|j|jd}|d d d d d d df tjd|d ||j|jdddd |d d d d d d df< |d d d d d d df tjd|	d |	|j|jdddd |d d d d d d df< |d d d d d d df tjd|
d |
|j|jdddd |d d d d d d df< t|d|d	S )
Nr   rA   rO   r-   r   stepsr1   r0   rQ   zt h w c -> b (t h w) cb)	rW   r   r   r   r1   r0   linspacerV   r   )r8   rZ   bsr[   r   hr   r   t_lenh_lenw_lenr   r)   r)   r*   r     s   ZZZzHunyuanVideo.img_idsc           
      C   s   |j \}}}}| j}||d d  |d  }||d d  |d  }tj||df|j|jd}	|	d d d d df tjd|d ||j|jdd |	d d d d df< |	d d d d df tjd|d ||j|jdd |	d d d d df< t|	d|dS )Nr   rA   rO   r   r  zh w c -> b (h w) cr
  )	rW   r   r   r   r1   r0   r  rY   r   )
r8   rZ   r  r[   r  r   r   r  r  r   r)   r)   r*   
img_ids_2d  s   JJzHunyuanVideo.img_ids_2dc                 K   sJ   t jj| j| t jt jjj|j|||||||||	|
|||fi |S r   )	r   patcher_extensionWrapperExecutornew_class_executor_forwardget_all_wrappers
WrappersMPDIFFUSION_MODELexecute)r8   rZ   timestepcontextr   r   r   r   r   r   r   r   r   rT   r   r)   r)   r*   rc     s   zHunyuanVideo.forwardc                 K   s   |j d }t| jdkr"| |}tj||j d df|j|jd}n| |}tj||j d df|j|jd}| j	|||||||||||	|
|||d}|S )Nr   r-   rO   r   rA   )r   r   rT   )
rW   r   r   r   r   r   r1   r0   r  r  )r8   rZ   r  r  r   r   r   r   r   r   r   r   r   rT   r   r  r   r   r   r)   r)   r*   r    s   

"
 (zHunyuanVideo._forward)NTNNN)r!   r"   r#   __doc__r4   r   r  r   r  rc   r  r=   r)   r)   r:   r*   r      sH    `
 +
&r   )r   comfy.patcher_extensionr   comfy.ldm.flux.layers(comfy.ldm.modules.diffusionmodules.mmditcomfy.ldm.modules.attentionr   dataclassesr   einopsr   r   r   r   r   r	   r
   r   r   comfy.ldm.common_ditr   Moduler+   r?   rf   ru   r   r   r)   r)   r)   r*   <module>   s$    	*$'