o
    i(                     @   sx   d dl mZ ddlmZmZ ddlmZ G dd deZG dd deZG d	d
 d
ej	Z
G dd deZdddZdS )    )sd1_clip   )QwenImageTokenizerQwenImageTEModel)Qwen25_7BVLIc                       s6   e Zd Zdi f fdd	Zddef fddZ  ZS )	Kandinsky5TokenizerNc                    s*   t  j||d d| _tj||d| _d S )Nembedding_directorytokenizer_dataa  <|im_start|>system
You are a prompt engineer. Describe the video in detail.
Describe how the camera moves or shakes, describe the zoom and view angle, whether it follows the objects.
Describe the location of the video, main characters or objects and their action.
Describe the dynamism of the video and presented actions.
Name the visual style of the video: whether it is a professional footage, user generated content, some kind of animation, video game or screen content.
Describe the visual effects, postprocessing and transitions if they are presented in the video.
Pay attention to the order of key actions shown in the scene.<|im_end|>
<|im_start|>user
{}<|im_end|>)super__init__llama_templater   SDTokenizerclip_lselfr	   r
   	__class__ </mnt/c/Users/fbmor/ComfyUI/comfy/text_encoders/kandinsky5.pyr      s   zKandinsky5Tokenizer.__init__Ftextc                    s4   t  j||fi |}| jj||fi ||d< |S )Nl)r   tokenize_with_weightsr   )r   r   return_word_idskwargsoutr   r   r   r      s   z)Kandinsky5Tokenizer.tokenize_with_weights)F)__name__
__module____qualname__r   strr   __classcell__r   r   r   r   r      s    r   c                       s"   e Zd Zdi f fdd	Z  ZS )Kandinsky5TokenizerImageNc                    s   t  j||d d| _d S )Nr   z<|im_start|>system
You are a promt engineer. Describe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>
<|im_start|>user
{}<|im_end|>)r   r   r   r   r   r   r   r      s   
z!Kandinsky5TokenizerImage.__init__r   r   r   r   r    r   r   r   r   r!      s    r!   c                       s*   e Zd Zdddddi f fdd	Z  ZS )Qwen25_7BVLIModelcpuhiddenNTc                    sN   | dd }|d ur| }||d< t j|||i |ddidt|||d d S )Nllama_quantization_metadataquantization_metadatapadi[P F)devicelayer	layer_idxtextmodel_json_configdtypespecial_tokenslayer_norm_hidden_statemodel_classenable_attention_masksreturn_attention_masksmodel_options)getcopyr   r   r   )r   r*   r+   r,   r.   attention_maskr4   r'   r   r   r   r      s
   *zQwen25_7BVLIModel.__init__r"   r   r   r   r   r#      s    "r#   c                       sT   e Zd Zddi f fdd	Z fddZ fddZ fd	d
Z fddZ  ZS )Kandinsky5TEModelr$   Nc                    s2   t t| j||dt|d tj||d|d| _d S )N	qwen25_7b)r*   r.   name
clip_modelr4   F)r*   r.   return_projected_pooledr4   )r   r   r   r#   r   SDClipModelr   r   r*   r.   r4   r   r   r   r   #   s   zKandinsky5TEModel.__init__c                    s4   t  j|dd\}}}| j|d \}}|||fS )Nr&   )template_endr   )r   encode_token_weightsr   )r   token_weight_pairscondpextral_outl_pooledr   r   r   r@   '   s   
z&Kandinsky5TEModel.encode_token_weightsc                    s   t  | | j| d S N)r   set_clip_optionsr   )r   optionsr   r   r   rH   -   s   z"Kandinsky5TEModel.set_clip_optionsc                    s   t    | j  d S rG   )r   reset_clip_optionsr   )r   r   r   r   rJ   1   s   
z$Kandinsky5TEModel.reset_clip_optionsc                    s    d|v r
| j |S t |S )Nz*text_model.encoder.layers.1.mlp.fc1.weight)r   load_sdr   )r   sdr   r   r   rK   5   s   zKandinsky5TEModel.load_sd)	r   r   r   r   r@   rH   rJ   rK   r    r   r   r   r   r8   "   s    r8   Nc                    s   G  fdddt }|S )Nc                       s(   e Zd Zddi f fdd	Z  ZS )zte.<locals>.Kandinsky5TEModel_r$   Nc                    s:   d ur|  }|d< d ur}t j|||d d S )Nr'   )r*   r.   r4   )r6   r   r   r>   )r   dtype_llamar'   r   r   r   =   s   z'te.<locals>.Kandinsky5TEModel_.__init__r"   r   rM   r'   r   r   Kandinsky5TEModel_<   s     rO   )r8   )rM   r'   rO   r   rN   r   te;   s   rP   )NN)comfyr   
qwen_imager   r   llamar   r   r!   r=   r#   r8   rP   r   r   r   r   <module>   s    	