o
    ¶Ïi  ã                   @   s¾   d dl mZmZmZ ddlZddlZddlZddlZddl	Zddl
ZddlZddlZddlZG dd„ dƒZejjZejjejjejjejjjdœZG dd„ dƒZd	d
„ Zddd„Zdd„ ZdS )é   )Úload_torch_fileÚtransformers_convertÚstate_dict_prefix_replaceé    Nc                   @   s   e Zd Zdd„ Zdd„ ZdS )ÚOutputc                 C   s
   t | |ƒS ©N)Úgetattr)ÚselfÚkey© r   ú//mnt/c/Users/fbmor/ComfyUI/comfy/clip_vision.pyÚ__getitem__   ó   
zOutput.__getitem__c                 C   s   t | ||ƒ d S r   )Úsetattr)r	   r
   Úitemr   r   r   Ú__setitem__   s   zOutput.__setitem__N)Ú__name__Ú
__module__Ú__qualname__r   r   r   r   r   r   r      s    r   )Úclip_vision_modelÚsiglip_vision_modelÚsiglip2_vision_modelÚdinov2c                   @   s.   e Zd Zdd„ Zdd„ Zdd„ Zddd	„Zd
S )ÚClipVisionModelc                 C   s  t |ƒ}t |¡}W d   ƒ n1 sw   Y  | dd¡| _| dg d¢¡| _| dg d¢¡| _| dd¡| _| ¡ | _	t
 | j¡}| jd	krMd
| _nd| _tj ¡ | _tj ¡ }tj | j¡| _||| j|tjjƒ| _| j ¡  tjj| j| j|d| _d S )NÚ
image_sizeéà   Ú
image_mean)g3<Í4'ÐÞ?gwgí¶MÝ?gy{Îå Ú?Ú	image_std)g‡Bô91Ñ?g•wÝt.¹Ð?gÝ	U¦Ñ?Ú
model_typer   r   TF)Úload_deviceÚoffload_device)ÚopenÚjsonÚloadÚgetr   r   r   r   ÚcopyÚconfigÚIMAGE_ENCODERSÚreturn_all_hidden_statesÚcomfyÚmodel_managementÚtext_encoder_devicer   Útext_encoder_offload_deviceÚtext_encoder_dtypeÚdtypeÚopsÚmanual_castÚmodelÚevalÚmodel_patcherÚCoreModelPatcherÚpatcher)r	   Újson_configÚfr&   Úmodel_classr    r   r   r   Ú__init__   s$   
ÿ



zClipVisionModel.__init__c                 C   s   | j j|d| j ¡ dS )NF)ÚstrictÚassign)r1   Úload_state_dictr5   Ú
is_dynamic)r	   Úsdr   r   r   Úload_sd4   s   zClipVisionModel.load_sdc                 C   s
   | j  ¡ S r   )r1   Ú
state_dict)r	   r   r   r   Úget_sd7   r   zClipVisionModel.get_sdTc              	   C   sN  t j | j¡ | jdkr-t jj| | j¡| j	| j
 dd¡| j
 dd¡| j| j|d ¡ }nt jj| | j¡| j	| j| j|d ¡ }| j|| jrIdnd	d
}tƒ }|d  t j ¡ ¡|d< |d  t j ¡ ¡|d< |jdd … g|jd  |d< | jr“|d  t j ¡ ¡}|d d …d	f |d< ||d< n|d  t j ¡ ¡|d< |d |d< |S )Nr   Ú
patch_sizeé   Únum_patchesé   )ÚsizerB   rD   ÚmeanÚstdÚcrop)rF   rG   rH   rI   Úalléþÿÿÿ)Úpixel_valuesÚintermediate_outputr   Úlast_hidden_stateé   Úimage_embedsr   Úimage_sizesÚpenultimate_hidden_statesÚall_hidden_statesé   Úmm_projected)r)   r*   Úload_model_gpur5   r   Ú
clip_modelÚsiglip2_preprocessÚtor   r   r&   r$   r   r   ÚfloatÚclip_preprocessr1   r(   r   Úintermediate_deviceÚshape)r	   ÚimagerI   rL   ÚoutÚoutputsÚall_hsr   r   r   Úencode_image:   s    
B(
zClipVisionModel.encode_imageN)T)r   r   r   r9   r?   rA   rb   r   r   r   r   r      s
    r   c                 C   sÖ   |   ¡ }d |¡|v r`d |¡dd |¡dd |¡dd |¡d	d
 |¡dd |¡dd |¡di}|D ]}||v rA|  |¡| || < q2d |¡|v rW|  d |¡¡ dd¡| d< t| |ddƒ} | S |di}t| |ƒ} | S )Nz-{}transformer.resblocks.0.attn.in_proj_weightz{}class_embeddingz'vision_model.embeddings.class_embeddingz{}conv1.weightú.vision_model.embeddings.patch_embedding.weightz{}positional_embeddingú1vision_model.embeddings.position_embedding.weightz{}ln_post.biasz vision_model.post_layernorm.biasz{}ln_post.weightz"vision_model.post_layernorm.weightz{}ln_pre.biaszvision_model.pre_layrnorm.biasz{}ln_pre.weightz vision_model.pre_layrnorm.weightz{}projr   r   zvisual_projection.weightzvision_model.é0   Ú )ÚkeysÚformatÚpopÚ	transposer   r   )r>   ÚprefixÚsd_kÚkeys_to_replaceÚxÚreplace_prefixr   r   r   Úconvert_to_transformersP   s*   






ù
€þ
rp   rf   Fc              	   C   sP  |rt | |ƒ} d| v rtj tj tj t¡¡d¡}n×d| v r/tj tj tj t¡¡d¡}nÃd| v r¾| d jd }| d jd d	kr†| d
 j}t|ƒdkr^tj tj tj t¡¡d¡}n”|dkrrtj tj tj t¡¡d¡}n€|dkr…tj tj tj t¡¡d¡}nl|dkr®d| v ržtj tj tj t¡¡d¡}nTtj tj tj t¡¡d¡}nDtj tj tj t¡¡d¡}n4d| v r×tj tj tj tj t¡¡d¡d¡}nd| v rðtj tj tj tj t¡¡d¡d¡}nd S t	|ƒ}| 
| ¡\}}t|ƒdkrt d |¡¡ t|ƒ}t|  ¡ ƒ}	|	D ]}
|
|vr$|  |
¡ q|S )Nz1vision_model.encoder.layers.47.layer_norm1.weightzclip_vision_config_g.jsonz1vision_model.encoder.layers.30.layer_norm1.weightzclip_vision_config_h.jsonz1vision_model.encoder.layers.22.layer_norm1.weightrd   r   z0vision_model.encoder.layers.0.layer_norm1.weighti€  rc   rO   z$clip_vision_siglip2_base_naflex.jsoniÙ  zclip_vision_siglip_384.jsoni   zclip_vision_siglip_512.jsoniA  z#multi_modal_projector.linear_1.biasz&clip_vision_config_vitl_336_llava.jsonz clip_vision_config_vitl_336.jsonzclip_vision_config_vitl.jsonz%encoder.layer.39.layer_scale2.lambda1Úimage_encoderszdino2_giant.jsonz%encoder.layer.23.layer_scale2.lambda1zdino2_large.jsonzmissing clip vision: {})rp   ÚosÚpathÚjoinÚdirnameÚrealpathÚ__file__r]   Úlenr   r?   ÚloggingÚwarningrh   ÚsetÚlistrg   ri   )r>   rk   Úconvert_keysr6   Úembed_shapeÚpatch_embedding_shapeÚclipÚmÚurg   Úkr   r   r   Úload_clipvision_from_sdj   sL   
  
  €   **

€r„   c                 C   s&   t | ƒ}d|v rt|dddS t|ƒS )Nz2visual.transformer.resblocks.0.attn.in_proj_weightzvisual.T)rk   r}   )r   r„   )Ú	ckpt_pathr>   r   r   r   r#   —   s   r#   )rf   F)Úutilsr   r   r   rr   r"   ry   Ú	comfy.opsr)   Úcomfy.model_patcherÚcomfy.model_managementÚcomfy.utilsÚcomfy.clip_modelÚcomfy.image_encoders.dino2r   rW   r[   ÚCLIPVisionModelProjectionrq   Údino2ÚDinov2Modelr'   r   rp   r„   r#   r   r   r   r   Ú<module>   s*    ü4
-