o
    i%                     @   s   d dl mZ d dlZ d dlZ ddlmZ d dlmZ d dlZd dl	Z	d dl
Z
d dlZ dddZG d	d
 d
ejZG dd dejZG dd dZG dd deZG dd dejjZdddZdS )    )sd1_clipN   )HunyuanImageTokenizer)LlamaTokenizerFast c                 C   s^   i }d |d |g}|D ]}|| v r| | j|d<  nqtj| |}|d ur-||d< |S )Nz{}model.norm.weightz'{}model.layers.0.input_layernorm.weightdtype_llamallama_quantization_metadata)formatdtypecomfyutilsdetect_layer_quantization)
state_dictprefixout	norm_keysnorm_keyquant r   ?/mnt/c/Users/fbmor/ComfyUI/comfy/text_encoders/hunyuan_video.pyllama_detect   s   r   c                       s&   e Zd Zdi ddf fdd	Z  ZS )LLAMA3TokenizerN    c                    sH   t jt jt jtd}t j||dddtdddd|||d d S )Nllama_tokenizerFi   llamaTi)embedding_directorypad_with_endembedding_sizeembedding_keytokenizer_classhas_start_tokenhas_end_tokenpad_to_max_length
max_length	pad_token
min_lengthtokenizer_data)	ospathjoindirnamerealpath__file__super__init__r   )selfr   r'   r&   r%   tokenizer_path	__class__r   r   r/      s   *zLLAMA3Tokenizer.__init____name__
__module____qualname__r/   __classcell__r   r   r2   r   r      s    r   c                	       s2   e Zd Zdddddi dddf fd	d
	Z  ZS )
LLAMAModelcpuhiddenNTi  r   )startpadc                    s   | dd }|d ur| }||d< i }	| dd }
|
d ur"|
|	d< i |ddi}t j||||	||dtjjj|||d d S )Nr   quantization_metadata
vocab_size
model_namer   F)devicelayer	layer_idxtextmodel_json_configr
   special_tokenslayer_norm_hidden_statemodel_classenable_attention_masksreturn_attention_masksmodel_options)getcopyr.   r/   r   text_encodersr   Llama2)r0   rB   rC   rD   r
   attention_maskrK   rF   r   rE   r@   r2   r   r   r/       s   ,zLLAMAModel.__init__r4   r   r   r2   r   r9      s    *r9   c                   @   s4   e Zd Zdi fddZdddZdd	 Zd
d ZdS )HunyuanVideoTokenizerNc                 C   s*   t j||d| _d| _t|d|d| _d S )Nr   r'   a   <|start_header_id|>system<|end_header_id|>

Describe the video by detailing the following aspects: 1. The main content and theme of the video.2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.4. background environment, light, style and atmosphere.5. camera angles, movements, and transitions used in the video:<|eot_id|><|start_header_id|>user<|end_header_id|>

{}<|eot_id|>r   )r   r&   r'   )r   SDTokenizerclip_lllama_templater   r   r0   r   r'   r   r   r   r/   0   s   zHunyuanVideoTokenizer.__init__Fr   c                 K   s   i }| j j||fi ||d< |d u r| j|}n||}| jj||fi |}	d}
|	D ]6}tt|D ]-}|| d dkrc|d urc|
|jd k rcd||
 d|df|| dd   ||< |
d7 }
q6q.|	|d< |S )	Nlr   i 	embeddingimage)typedataoriginal_typeimage_interleaver   r   )rT   tokenize_with_weightsrU   r	   r   rangelenshape)r0   textreturn_word_idsrU   image_embedsr]   kwargsr   
llama_textllama_text_tokensembed_countrir   r   r   r^   5   s"   
(z+HunyuanVideoTokenizer.tokenize_with_weightsc                 C   s   | j |S N)rT   
untokenize)r0   token_weight_pairr   r   r   rl   H   s   z HunyuanVideoTokenizer.untokenizec                 C   s   i S rk   r   r0   r   r   r   r   K   s   z HunyuanVideoTokenizer.state_dict)FNNr   )r5   r6   r7   r/   r^   rl   r   r   r   r   r   rQ   /   s
    
rQ   c                       s6   e Zd Zdi f fdd	Zddef fddZ  ZS )	HunyuanVideo15TokenizerNc                    s   t  j||d d| _d S )NrR   a  <|im_start|>system
You are a helpful assistant. Describe the video by detailing the following aspects:
1. The main content and theme of the video.
2. The color, shape, size, texture, quantity, text, and spatial relationships of the objects.
3. Actions, events, behaviors temporal relationships, physical movement changes of the objects.
4. background environment, light, style and atmosphere.
5. camera angles, movements, and transitions used in the video.<|im_end|>
<|im_start|>user
{}<|im_end|>
<|im_start|>assistant
)r.   r/   rU   rV   r2   r   r   r/   P   s   
z HunyuanVideo15Tokenizer.__init__Frb   c                    s   t  j||fddi|S )Nprevent_empty_textT)r.   r^   )r0   rb   rc   re   r2   r   r   r^   T   s   z-HunyuanVideo15Tokenizer.tokenize_with_weights)F)r5   r6   r7   r/   strr^   r8   r   r   r2   r   ro   O   s    ro   c                       sF   e Zd Zdddi f fdd	Zdd Zdd Zd	d
 Zdd Z  ZS )HunyuanVideoClipModelNr:   c                    sP   t    tj|||}tj||d|d| _t|||d| _	t
||g| _d S )NF)rB   r
   return_projected_pooledrK   )rB   r
   rK   )r.   r/   r   model_managementpick_weight_dtyper   SDClipModelrT   r9   r   setdtypes)r0   r   rB   r
   rK   r2   r   r   r/   X   s
   
zHunyuanVideoClipModel.__init__c                 C   s   | j | | j| d S rk   )rT   set_clip_optionsr   )r0   optionsr   r   r   ry   _   s   z&HunyuanVideoClipModel.set_clip_optionsc                 C   s   | j   | j  d S rk   )rT   reset_clip_optionsr   rn   r   r   r   r{   c   s   
z(HunyuanVideoClipModel.reset_clip_optionsc              	   C   sD  |d }|d }| j |\}}}d}d}d}	d}
g }|d }t|D ]x\}}|d }t|st|tjra|dkrT||d  d dkrT||d  d d	krT|d }d
}
|dkr`|
d
kr`|d }
q#|ddkr|dj	d }|dkr|
d
kr~||d 7 }q#||	 }|| |	 }|
|||ddf |	|d 7 }	q#|j	d |d kr||d  d dkr|d7 }|d d ||	 |
|	 | f }|d d d ||	 |
|	 | f |d< |d  t|d kr|d t|dkrg }|D ]}|
|d d |d |d |d f  qtj||g dd}| j|\}}|||fS )NrW   r   r   l   a$i r   ir     i i	 r\   rY   r[   r]   i  rP   )dim)r   encode_token_weights	enumeratetorch	is_tensor
isinstancenumbersIntegralrL   ra   appendsumnumelpopr`   catrT   )r0   token_weight_pairstoken_weight_pairs_ltoken_weight_pairs_llama	llama_outllama_pooledllama_extra_outtemplate_endextra_template_endextra_sizesuser_endimages	tok_pairsrj   velem	elem_sizeimage_start	image_endllama_outputr   l_outl_pooledr   r   r   r   g   s\   
 (
*
z*HunyuanVideoClipModel.encode_token_weightsc                 C   s    d|v r
| j |S | j|S )Nz*text_model.encoder.layers.1.mlp.fc1.weight)rT   load_sdr   )r0   sdr   r   r   r      s   zHunyuanVideoClipModel.load_sd)	r5   r6   r7   r/   ry   r{   r   r   r8   r   r   r2   r   rr   W   s    5rr   c                    s   G  fdddt }|S )Nc                       s(   e Zd Zddi f fdd	Z  ZS )z2hunyuan_video_clip.<locals>.HunyuanVideoClipModel_r:   Nc                    s0   d ur|  }|d< t j|||d d S )Nr   )r   rB   r
   rK   )rM   r.   r/   )r0   rB   r
   rK   )r3   r   r   r   r   r/      s   z;hunyuan_video_clip.<locals>.HunyuanVideoClipModel_.__init__r4   r   r   r   r2   r   HunyuanVideoClipModel_   s     r   )rr   )r   r   r   r   r   r   hunyuan_video_clip   s   r   )r   )NN)r   r   comfy.model_managementcomfy.text_encoders.llamahunyuan_imager   transformersr   r   r(   r   comfy.utilsr   rS   r   rv   r9   rQ   ro   nnModulerr   r   r   r   r   r   <module>   s     
 L