o
    iF                     @   s   d dl Z d dlmZ d dlm  mZ d dlmZ d dlm	Z	m
Z
 d dlZejjZdZG dd dejZG dd dejZG d	d
 d
ejZG dd dejZG dd dejZG dd dejZG dd dejZdd ZG dd dejZdS )    N)	rearrange)vae_attentiontorch_cat_if_needed   c                       s.   e Zd ZdZ fddZd fdd	Z  ZS )CausalConv3dz 
    Causal 3d convolusion.
    c                    s>   t  j|i | d| jd  | _d| jd | jd f| _d S )Nr   r      )super__init__padding_padding)selfargskwargs	__class__ //mnt/c/Users/fbmor/ComfyUI/comfy/ldm/wan/vae.pyr	      s   zCausalConv3d.__init__Nc                    s   |d ur|| }d ||< |d u r|j d dkrt j|ddS | jdkrX| j}|d ur;||j}td||j d  }t|j }||d< tj	||j|j
d}t|||gdd}~t |S )Nr   r   causal_zero)autopadr   )devicedtypedim)shaper   forwardr   tor   maxlisttorchzerosr   r   )r   xcache_x
cache_list	cache_idxpadding_neededpadding_shaper
   r   r   r   r      s    

zCausalConv3d.forward)NNN__name__
__module____qualname____doc__r	   r   __classcell__r   r   r   r   r      s    r   c                       s&   e Zd Zd fdd	Zdd Z  ZS )RMS_normTFc                    sr   t    |s	dnd}|r|g|R n|f}|| _|d | _tt|| _|r4tt	|| _
d S d | _
d S )N)r   r   r   )r   r   g      ?)r   r	   channel_firstscalenn	Parameterr   onesgammar   bias)r   r   r-   imagesr3   broadcastable_dimsr   r   r   r   r	   4   s   

$zRMS_norm.__init__c                 C   sF   t j|| jrdndd| j | j| | jd ur | j| S d S )Nr   r   r   )F	normalizer-   r.   r2   r   r3   )r   r    r   r   r   r   >   s   
zRMS_norm.forwardTTFr'   r(   r)   r	   r   r+   r   r   r   r   r,   2   s    
r,   c                       s.   e Zd Z fddZddgdfddZ  ZS )Resamplec              	      s"  |dv sJ t    || _|| _|dkr,ttjdddtj||d ddd	| _	d S |d
krRttjdddtj||d ddd	| _	t
||d ddd	| _d S |dkrittdtj||ddd| _	d S |dkrttdtj||ddd| _	t
||dddd| _d S t | _	d S )N)none
upsample2d
upsample3ddownsample2ddownsample3dr=   )       @rA   znearest-exact)scale_factormoder      r   r
   r>   )rD   r   r   )r   r   r   r?   )r   r   r   r   )r   r   )strider@   )r   r   r   )r   r   r   )rF   r
   )r   r	   r   rC   r/   
SequentialUpsampleopsConv2dresampler   	time_conv	ZeroPad2dIdentity)r   r   rC   r   r   r   r	   E   s>   



zResample.__init__Nr   Fc                 C   sr  |  \}}}}}	| jdkr|d ur|d }
||
 d u r)d||
< |d  d7  < nu|d d d d t d d d d d f }||
 dkrI| |}n| |||
 }|||
< |d  d7  < ||d||||	}t|d d dd d d d d d d d f |d d dd d d d d d d d f fd}||||d ||	}|jd }t|d}| 	|}t|d|d	}| jd
kr7|d ur7|d }
||
 d u r|||
< na|d d d d dd d d d d f }| t
||
 d d d d dd d d d d f |gd}|||
< ||
d  }|d urt
||gd}d ||
d < |jd dkr/|s/|||
d < d }|d  d7  < |S )Nr>   r   Repr   r   rD   b c t h w -> (b t) c h wz(b t) c h w -> b c t h wtr@   r6   )sizerC   CACHE_TrL   reshaper   stackr   r   rK   cat)r   r    
feat_cachefeat_idxfinalbcrR   hwidxr!   
deferred_xr   r   r   r   f   sR   
(R




&4
zResample.forwardr:   r   r   r   r   r;   C   s    !r;   c                       s0   e Zd Zd	 fdd	ZddgdfddZ  ZS )
ResidualBlock        c                    s   t    || _|| _tt|ddt t||dddt|ddt t	|t||ddd| _
||kr?t||d| _d S t | _d S )NFr4   rD   r   rE   )r   r	   in_dimout_dimr/   rG   r,   SiLUr   DropoutresidualrN   shortcut)r   rd   re   dropoutr   r   r   r	      s   
zResidualBlock.__init__Nr   Fc           	      C   s   |}| j D ];}t|tr<|d ur<|d }|d d d d t d d d d d f }||||d}|||< |d  d7  < q||}q|| | S )Nr   )r"   r#   r   )rh   
isinstancer   rT   ri   )	r   r    rX   rY   rZ   old_xlayerr_   r!   r   r   r   r      s   
(
zResidualBlock.forward)rb   r:   r   r   r   r   ra      s    ra   c                       s2   e Zd ZdZ fddZddgdfddZ  ZS )	AttentionBlockz3
    Causal self-attention with a single head.
    c                    sJ   t    || _t|| _t||d d| _t||d| _t	 | _
d S )NrD   r   )r   r	   r   r,   normrI   rJ   to_qkvprojr   optimized_attention)r   r   r   r   r   r	      s   

zAttentionBlock.__init__Nr   Fc                 C   sr   |}|  \}}}}	}
t|d}| |}| |jddd\}}}| |||}| |}t|d|d}|| S )NrP   rD   r   r   z(b t) c h w-> b c t h wrQ   )rS   r   ro   rp   chunkrr   rq   )r   r    rX   rY   rZ   identityr[   r\   rR   r]   r^   qkvr   r   r   r      s   


zAttentionBlock.forwardr&   r   r   r   r   rn      s    
rn   c                       sH   e Zd Zdddg ddg g ddf fdd		Zd
dgdfddZ  ZS )	Encoder3d      rD   r   r   rz   rz   r   r9   rb   c	              
      s`  t     | _|| _|| _|| _|| _|| _ fdddg| D }	d}
t||	d ddd| _	g }t
t|	d d |	dd  D ]@\}\}}t|D ]}|t||| |
|v ra|t| |}qK|t|d kr|| rrd	nd
}|t||d |
d }
qAtj| | _tt|||t|t|||| _tt|ddt t||ddd| _d S )Nc                       g | ]} | qS r   r   .0ur   r   r   
<listcomp>       z&Encoder3d.__init__.<locals>.<listcomp>r         ?r   rD   rE   r6   r@   r?   rC   rA   Frc   )r   r	   r   z_dimdim_multnum_res_blocksattn_scalestemperal_downsampler   conv1	enumerateziprangeappendra   rn   lenr;   r/   rG   downsamplesmiddler,   rf   head)r   r   r   input_channelsr   r   r   r   rj   dimsr.   r   ird   re   _rC   r   r   r   r	      sF   
	*

zEncoder3d.__init__Nr   Fc                 C   sX  |d ur1|d }|d d d d t  d d d d d f }| ||| }|||< |d  d7  < n| |}| jD ]}|d urO|||||d}|d u rN d S q9||}q9| jD ]}|d urf|||||d}qW||}qW| jD ];}t|tr|d ur|d }|d d d d t  d d d d d f }|||| }|||< |d  d7  < qn||}qn|S )Nr   r   )rZ   )rT   r   r   r   r   rk   r   )r   r    rX   rY   rZ   r_   r!   rm   r   r   r   r     s6   (





(
zEncoder3d.forwardr:   r   r   r   r   rx      s    4rx   c                       sN   e Zd Zdddg ddg g ddf fdd		Zd
d ZddgfddZ  ZS )	Decoder3dry   rz   rD   r{   r   )FTTrb   c	              
      s  t     | _|| _|| _|| _|| _|| _ fdd|d g|d d d  D }	ddt|d   }
t	||	d ddd	| _
tt|	d |	d |t|	d t|	d |	d || _g }tt|	d d |	dd  D ]R\}\}}|dks~|dks~|dkr|d }t|d D ]}|t||| |
|v r|t| |}q|t|d kr|| rd
nd}|t||d |
d9 }
qltj| | _tt|ddt t	||ddd	| _d S )Nc                    r|   r   r   r}   r   r   r   r   H  r   z&Decoder3d.__init__.<locals>.<listcomp>r6   r   r   r   rD   r   rE   r>   r=   r   rA   Frc   )r   r	   r   r   r   r   r   temperal_upsampler   r   r   r/   rG   ra   rn   r   r   r   r   r   r;   	upsamplesr,   rf   r   )r   r   r   output_channelsr   r   r   r   rj   r   r.   r   r   rd   re   r   rC   r   r   r   r	   6  sB   
	&*
zDecoder3d.__init__c                 C   sv  |d }d |d< |t | jkrU| jD ];}t|trI|d urI|d d d d t d d d d d f }||||d  }|||d < |d  d7  < q||}q|| d S | j| }|d ure||||}n||}t|tr|jdkr|j	d dkrt
d|j	d dD ]#}	| |d |d d d d |	|	d d d d d f g|| | q~d S |g}
~| |d |
||| d S )Nr   r   r>   r   )r   r   r   rk   r   rT   r   r;   rC   r   r   run_upcopy)r   	layer_idxx_refrX   rY   
out_chunksr    rm   r!   	frame_idx
next_x_refr   r   r   r   k  s<   
(


"*zDecoder3d.run_upNr   c                 C   s   |d ur1|d }|d d d d t  d d d d d f }| ||| }|||< |d  d7  < n| |}| jD ]}|d urF||||}q9||}q9g }| d|g||| |S )Nr   r   )rT   r   r   r   )r   r    rX   rY   r_   r!   rm   r   r   r   r   r     s   (


zDecoder3d.forward)r'   r(   r)   r	   r   r   r+   r   r   r   r   r   4  s    5%r   c                 C   s<   d}|   D ]}t|tst|tr|jdkr|d7 }q|S )Nr   r@   r   )modulesrk   r   r;   rC   )modelcountmr   r   r   count_cache_layers  s   r   c                	       sH   e Zd Zddg ddg g ddddf	 fdd		Zd
d Zdd Z  ZS )WanVAEry   rz   r{   r   r9   rD   rb   c
           
   	      s   t    || _|| _|| _|| _|| _|| _|d d d | _t	||d ||||| j|	| _
t|d |d d| _t||d| _t||||||| j|	| _d S )Nr6   r   r   )r   r	   r   r   r   r   r   r   r   rx   encoderr   r   conv2r   decoder)
r   r   r   r   r   r   r   image_channelsconv_out_channelsrj   r   r   r   r	     s    


zWanVAE.__init__c              	   C   s&  dg}|j d }d|d d d  }d|d d  }d }|dkr(d gt| j }t|D ]X}dg}|dkrO| j|d d d d d dd d d d f ||d}q,| j|d d d d dd|d   dd|  d d d d f ||||d kd}|d u r|q,t||gd}q,| |jddd\}	}
|	S )Nr   r   r   rz   rX   rY   )rX   rY   rZ   r   )r   r   r   r   r   rW   r   rs   )r   r    conv_idxrR   iter_feat_mapr   outout_mulog_varr   r   r   encode  s4   
$8
zWanVAE.encodec           	   	   C   s   d|j d d  }d }|dkrd gt| j }| |}t|D ]L}dg}|dkrE| j|d d d d ||d d d d d f ||d}q | j|d d d d dd|d   dd|  d d d d f ||d}||7 }q t|dS )Nr   r   r   r   )r   r   r   r   r   r   rW   )	r   zr   r   r    r   r   r   r   r   r   r   decode  s(   
(8
zWanVAE.decode)r'   r(   r)   r	   r   r   r+   r   r   r   r   r     s    r   )r   torch.nnr/   torch.nn.functional
functionalr7   einopsr   (comfy.ldm.modules.diffusionmodules.modelr   r   	comfy.opscomfyrI   disable_weight_initrT   Conv3dr   Moduler,   r;   ra   rn   rx   r   r   r   r   r   r   r   <module>   s"   "W]t