o
    ia)                     @   s^   d dl Z d dlmZ d dlmZmZ ddlmZ d dlZ	ddl
mZmZ G dd dejZdS )    N)TimestepEmbedder
PatchEmbed   )AttentionPool)HunYuanDiTBlock	calc_ropec                       s   e Zd ZdZdddddddd	d
dddddddeddddfdedededededededef fddZ							dddZ
  ZS )HunYuanControlNeta`  
    HunYuanDiT: Diffusion model with a Transformer backbone.

    Inherit ModelMixin and ConfigMixin to be compatible with the sampler StableDiffusionPipeline of diffusers.

    Inherit PeftAdapterMixin to be compatible with the PEFT training pipeline.

    Parameters
    ----------
    args: argparse.Namespace
        The arguments parsed by argparse.
    input_size: tuple
        The size of the input image.
    patch_size: int
        The size of the patch.
    in_channels: int
        The number of input channels.
    hidden_size: int
        The hidden size of the transformer backbone.
    depth: int
        The number of transformer blocks.
    num_heads: int
        The number of attention heads.
    mlp_ratio: float
        The ratio of the hidden size of the MLP in the transformer block.
    log_fn: callable
        The logging function.
             i  (      g8mt@   i   M      TFlayerN
input_size
patch_sizein_channelshidden_sizedepth	num_heads	mlp_ratiolog_fnc                    s$  t    |_|_|_|_|r|d n|_|___	|_
|	_|
_|_|_|_|__tjj_ttjjjd ddt tjjd j
dd_ttjjj j
d_d}tjjd|d_ |_!jr j!d	7  _!jrtj"d
d_# j!7  _!t$|||d_%t&d_'tjj!d dt jd dd_(t) f	ddt*dD _+jj	j	d_,t)fddt*t-j+D _.d S )Nr
   r   T)biasdtypedevicer   r   r      )r   
output_dimr   r   
operationsi   r   )r   r   r    c                    s0   g | ]}t jjd  dqS )F)r   
c_emb_sizer   r   text_states_dimqk_norm	norm_typeskipattn_precisionr   r   r    )r   r"   norm.0_	r&   r   r   r   r   r   r    r#   self 8/mnt/c/Users/fbmor/ComfyUI/comfy/ldm/hydit/controlnet.py
<listcomp>   s"    z.HunYuanControlNet.__init__.<locals>.<listcomp>   c                    s"   g | ]}j jj d qS )r   )Linearr   r(   )r   r   r    r,   r-   r.   r/      s    )/super__init__r   r   learn_sigmar   out_channelsr   r   r   r"   text_states_dim_t5text_lentext_len_t5	size_conduse_style_condr'   r   comfylatent_formatsSDXLlatent_formatnn
Sequentialr1   SiLUmlp_t5	Parametertorchrandntext_embedding_paddingr   poolerextra_in_dim	Embeddingstyle_embedderr   
x_embedderr   
t_embedderextra_embedder
ModuleListrangeblocksbefore_projlenafter_proj_list)r,   r   r   r   r   r   r   r   r"   r6   r7   r8   r#   r9   r:   r4   r'   r   r&   r   r   r    kwargspooler_out_dim	__class__r+   r.   r3   /   s   



	
zHunYuanControlNet.__init__c                 K   s  |}|j d dkrtj||j d dd}|}|}| }| }|j \}}}| |d|||d}tj| j	|}t
|dd| j df d|dd| j df |d| j |dd| j df< t
|dd| j df d|dd| j df || jd |dd| j df< tj||gdd}t|| j| j| j }| j|| jd}| |}| |}|	dur| |	}tj||gdd}|| | }| |}g }|| | }t| jD ]\}}|||||}|| j| | qd|iS )	a  
        Forward pass of the encoder.

        Parameters
        ----------
        x: torch.Tensor
            (B, D, H, W)
        t: torch.Tensor
            (B)
        encoder_hidden_states: torch.Tensor
            CLIP text embedding, (B, L_clip, D)
        text_embedding_mask: torch.Tensor
            CLIP text embedding mask, (B, L_clip)
        encoder_hidden_states_t5: torch.Tensor
            T5 text embedding, (B, L_t5, D)
        text_embedding_mask_t5: torch.Tensor
            T5 text embedding mask, (B, L_t5)
        image_meta_size: torch.Tensor
            (B, 6)
        style: torch.Tensor
            (B)
        cos_cis_img: torch.Tensor
        sin_cis_img: torch.Tensor
        return_dict: bool
            Whether to return a dictionary.
        r   r   )dimNr
   )r   output)shaperD   repeat_interleaveboolrB   viewr;   opscast_to_inputrF   wherer7   	unsqueezer8   catr   r   r   r   rL   r   rK   rG   rJ   rM   rQ   	enumeraterP   appendrS   )r,   xhint	timestepscontexttext_embedding_maskencoder_hidden_states_t5text_embedding_mask_t5image_meta_sizestylereturn_dictkwarg	conditiontext_statestext_states_t5text_states_masktext_states_t5_maskb_t5l_t5c_t5paddingfreqs_cis_imgt	extra_vecstyle_embeddingccontrolsr   blockr-   r-   r.   forward   sL   (



zHunYuanControlNet.forward)NNNNNF)__name__
__module____qualname____doc__printtupleintfloatcallabler3   r   __classcell__r-   r-   rV   r.   r      s`     #r   )rD   torch.nnr?   (comfy.ldm.modules.diffusionmodules.mmditr   r   poolersr   comfy.latent_formatsr;   modelsr   r   Moduler   r-   r-   r-   r.   <module>   s   