o
    i@                     @   sv  d dl Z d dlmZ d dlZd dlZdg dg ddfddZd-d
dZg dg ddfddZG dd de j	j
Zdd e j	jjdd dZG dd de j	j
ZG dd de j	j
ZG dd de j	j
ZG dd de j	j
ZG dd de j	j
ZG dd  d e j	j
Zd!d" ZG d#d$ d$e j	j
ZG d%d& d&e j	j
ZG d'd( d(e j	j
ZG d)d* d*e j	j
ZG d+d, d,e j	j
ZdS ).    N)optimized_attention_for_device   )g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?Tc           	      C   sj  | j d dkr| d d d d d d d df n| } tj|| j| jd}tj|| j| jd}| dd} | j d |krA| j d |ks|rc|t| j d | j d  }t|| j d  t|| j d  f}n||f}tjj	j
| |ddd} | j d | d }| j d | d }| d d d d ||| ||| f } td	|  d
d d	 } | |g d |g d S )N   devicedtype      bicubicTsizemode	antialias     o@r      r   r	   r	   )shapetorchtensorr   r   movedimminroundnn
functionalinterpolateclipview)	imager   meanstdcropscale
scale_sizehw r&   ./mnt/c/Users/fbmor/ComfyUI/comfy/clip_model.pyclip_preprocess   s   2&( r(   h㈵>c                    s    fdd}|d d}}|| |kr9|| d }|| ||||}	}
|	  |
   |kr1|}n|}|| |ks|| ||||fS )Nc                    s$   t | |     }t t|S N)mathceilmaxint)r   r"   scaled
patch_sizer&   r'   	scale_dim   s   z/siglip2_flex_calc_resolution.<locals>.scale_dim
   g      Y@r
   r&   )ohowr1   max_num_patchesepsr2   lohimidr$   r%   r&   r0   r'   siglip2_flex_calc_resolution   s   r;   )      ?r<   r<   c                 C   s   |dkrt | ||||dS | jd dkr$| d d d d d d d df n| } tj|| j| jd}tj|| j| jd}| dd} | j\}}}	}
t|	|
||\}	}
tjj	j
| |	|
fddd	} td
|  dd d
 } | |g d |g d S )Nr   )r   r   r    r!   r   r   r   r	   bilinearTr   r   r   r   )r(   r   r   r   r   r   r   r;   r   r   r   r   r   r   )r   r   r1   num_patchesr   r    r!   bcr$   r%   r&   r&   r'   siglip2_preprocess*   s   2 rA   c                       &   e Zd Z fddZdddZ  ZS )CLIPAttentionc                    sl   t    || _|j||d||d| _|j||d||d| _|j||d||d| _|j||d||d| _d S NTbiasr   r   )super__init__headsLinearq_projk_projv_projout_proj)self	embed_dimrI   r   r   
operations	__class__r&   r'   rH   ;   s   
zCLIPAttention.__init__Nc                 C   s:   |  |}| |}| |}||||| j|}| |S r*   )rK   rL   rM   rI   rN   )rO   xmaskoptimized_attentionqkvoutr&   r&   r'   forwardE   s
   



zCLIPAttention.forwardNN__name__
__module____qualname__rH   r[   __classcell__r&   r&   rR   r'   rC   :   s    
rC   c                 C   s   | t d|   S )NgZd;?)r   sigmoidar&   r&   r'   <lambda>M       re   c                 C   s   t jjj| ddS )Ntanh)approximate)r   r   r   gelurc   r&   r&   r'   re   O   rf   )
quick_geluri   gelu_pytorch_tanhc                       $   e Zd Z fddZdd Z  ZS )CLIPMLPc                    sD   t    |j||d||d| _t| | _|j||d||d| _d S rD   )rG   rH   rJ   fc1ACTIVATIONS
activationfc2)rO   rP   intermediate_sizerp   r   r   rQ   rR   r&   r'   rH   S   s   

zCLIPMLP.__init__c                 C   s"   |  |}| |}| |}|S r*   )rn   rp   rq   rO   rT   r&   r&   r'   r[   Y   s   


zCLIPMLP.forwardr]   r&   r&   rR   r'   rm   R   s    rm   c                       rB   )	CLIPLayerc                    sX   t    |j|||d| _t|||||| _|j|||d| _t||||||| _d S Nr   r   )	rG   rH   	LayerNormlayer_norm1rC   	self_attnlayer_norm2rm   mlp)rO   rP   rI   rr   intermediate_activationr   r   rQ   rR   r&   r'   rH   `   s
   
zCLIPLayer.__init__Nc                 C   s0   ||  | |||7 }|| | |7 }|S r*   )ry   rx   r{   rz   )rO   rT   rU   rV   r&   r&   r'   r[   g   s   zCLIPLayer.forwardr\   r]   r&   r&   rR   r'   rt   _   s    rt   c                       rB   )CLIPEncoderc	           	   	      s:   t    tj fddt|D | _d S )Nc              
      s    g | ]}t  qS r&   )rt   ).0ir   r   rP   rI   r|   rr   rQ   r&   r'   
<listcomp>p   s     z(CLIPEncoder.__init__.<locals>.<listcomp>)rG   rH   r   r   
ModuleListrangelayers)	rO   
num_layersrP   rI   rr   r|   r   r   rQ   rR   r   r'   rH   n   s   
0zCLIPEncoder.__init__Nc           	      C   s   t |j|d udd}d }|d ur$|dkrg }d }n|dk r$t| j| }d }t| jD ] \}}||||}||kr=| }|d urK||d  q+|d urWtj	|dd}||fS )NT)rU   small_inputallr   r	   dim)
r   r   lenr   	enumeratecloneappend	unsqueezer   cat)	rO   rT   rU   intermediate_outputrV   all_intermediateintermediater   lr&   r&   r'   r[   r   s&   zCLIPEncoder.forwardr\   r]   r&   r&   rR   r'   r}   m   s    r}   c                       s,   e Zd Zd fdd	ZejfddZ  ZS )	CLIPEmbeddings   M   Nc                    s6   t    |j||||d| _|j||||d| _d S ru   )rG   rH   	Embeddingtoken_embeddingposition_embedding)rO   rP   
vocab_sizenum_positionsr   r   rQ   rR   r&   r'   rH      s   
zCLIPEmbeddings.__init__c                 C   s&   | j ||dtjj| jj||jd S )N)	out_dtyperv   )r   comfyopscast_tor   weightr   )rO   input_tokensr   r&   r&   r'   r[      s   &zCLIPEmbeddings.forward)r   r   NNNr^   r_   r`   rH   r   float32r[   ra   r&   r&   rR   r'   r      s    r   c                       s8   e Zd Z fddZddddddejg fddZ  ZS )CLIPTextModel_c              	      s   |d }|d }|d }|d }|d }	|d }
|d | _ t   t||
|||d| _t|||||	|||| _|j|||d	| _d S )
Nnum_hidden_layershidden_sizenum_attention_headsrr   
hidden_actmax_position_embeddingseos_token_id)r   r   r   rQ   rv   )	r   rG   rH   r   
embeddingsr}   encoderrw   final_layer_norm)rO   config_dictr   r   rQ   r   rP   rI   rr   r|   r   rR   r&   r'   rH      s   

zCLIPTextModel_.__init__NTc	                 C   s  |d ur|t jj| jjj||jd }	n| j||d}	d }
|d urUd||	j	|j
d dd|j
d f|j
d d|j
d |j
d  }
|
|
tjt|	jj }
tj|	j
d |	j
d ft|	jj |	j|	jdd}|
d urz|
|7 }
n|}
| j|	|
|d\}	}| |	}	|d ur|r| |}|d ur|	tt|	j
d ttdd	 |f }n"|	tj|	j
d |	jd
t|jtj|	jd| jk jddf }|	||fS )Nrv   )r   g      ?r   r	   r   rU   r   c                 S   s   | d S Nr	   r&   rc   r&   r&   r'   re      s    z(CLIPTextModel_.forward.<locals>.<lambda>)r   r   )r   r   r   r   r   r   r   tor   reshaper   expandmasked_fillr   boolfinfor-   fulltriu_r   r   listr   maparanger   r.   r   argmax)rO   r   attention_maskembeds
num_tokensr   final_layer_norm_intermediater   embeds_inforT   rU   causal_maskr   pooled_outputr&   r&   r'   r[      s&    H 8


*D
zCLIPTextModel_.forwardr   r&   r&   rR   r'   r      s    $r   c                       s4   e Zd Z fddZdd Zdd Zdd Z  ZS )	CLIPTextModelc                    sL   t    |d | _t||||| _|d }|j||d||d| _|| _d S )Nr   r   FrE   )rG   rH   r   r   
text_modelrJ   text_projectionr   )rO   r   r   r   rQ   rP   rR   r&   r'   rH      s   


zCLIPTextModel.__init__c                 C   s
   | j jjS r*   r   r   r   )rO   r&   r&   r'   get_input_embeddings   s   
z"CLIPTextModel.get_input_embeddingsc                 C   s   || j j_d S r*   r   )rO   r   r&   r&   r'   set_input_embeddings   s   z"CLIPTextModel.set_input_embeddingsc                 O   s6   | j |i |}| |d }|d |d ||d fS )Nr
   r   r	   )r   r   )rO   argskwargsrT   rZ   r&   r&   r'   r[      s   zCLIPTextModel.forward)r^   r_   r`   rH   r   r   r[   ra   r&   r&   rR   r'   r      s
    r   c                 C   sx   t | jd d }tj| |dddd||} tjj	j
| |dddd} | d| jd	 | jd  dd} ||  S )
Nr   r<   r	   r   r=   FT)r   r   align_cornersr   )r   r   r   r   cast_to_inputr   r   r   r   r   r   )embed_weightr   
orig_shapeembed_weight_lenr&   r&   r'   siglip2_pos_embed   s
   "$r   c                       &   e Zd Zd
 fdd	Zdd	 Z  ZS )Siglip2Embeddingsr      r    Nc
           
         sD   t    |	j|| | |||d| _|	j||||d| _|| _d S ru   )rG   rH   rJ   patch_embeddingr   r   r1   )
rO   rP   num_channelsr1   
image_size
model_typer>   r   r   rQ   rR   r&   r'   rH      s   

zSiglip2Embeddings.__init__c                 C   s   |j \}}}}|dd||| j | j|| j | j|}|dddddd}|||j d |j d  d}| |}t| jj||| j || j fS )Nr	   r   r   r   r
         )	r   r   r   r1   permuter   r   r   r   )rO   pixel_valuesr?   r@   r$   r%   imgr&   r&   r'   r[      s   ,
 zSiglip2Embeddings.forward)r   r   r   r   NNNNr]   r&   r&   rR   r'   r      s    r   c                       r   )CLIPVisionEmbeddingsr   r   r   r   Nc	              	      s   t    || d }	|dkrd | _d}
n|	d }	tjtj|||d| _d}
|j|||||
||d| _|j	|	|||d| _
d S )Nr
   siglip_vision_modelTr	   rv   F)in_channelsout_channelskernel_sizestriderF   r   r   )rG   rH   class_embeddingr   r   	ParameteremptyConv2dr   r   r   )rO   rP   r   r1   r   r   r   r   rQ   r>   
patch_biasrR   r&   r'   rH      s$   

zCLIPVisionEmbeddings.__init__c                 C   sf   |  |ddd}| jd ur(tjtj| j|	|j
d dd|gdd}|tj| jj| S )Nr
   r	   r   r   r   )r   flatten	transposer   r   r   r   r   r   r   r   r   r   )rO   r   r   r&   r&   r'   r[     s   
.zCLIPVisionEmbeddings.forward)r   r   r   r   NNNr]   r&   r&   rR   r'   r      s    r   c                       rB   )
CLIPVisionc                    s   t    |d }|d }|d }|d }|d }	|d }
|
dv r:t||d |d	 |d
 |
|dd |||d	| _nt||d |d	 |d
 |
|||d| _|
dv rZdd | _d| _n	||| _d| _t	|||||	|||| _
||| _d S )Nr   r   r   rr   r   r   )siglip2_vision_modelr   r1   r   r>   )r   r>   r   r   rQ   )r   r   r   rQ   )r   r   c                 S      | S r*   r&   rc   r&   r&   r'   re         z%CLIPVision.__init__.<locals>.<lambda>TF)rG   rH   r   getr   r   pre_layrnormoutput_layernormrw   r}   r   post_layernorm)rO   r   r   r   rQ   r   rP   rI   rr   r|   r   rR   r&   r'   rH     s"   
2&
zCLIPVision.__init__Nc                 C   sd   |  |}| |}| j|d |d\}}| jr| |}|}n| |d d dd d f }|||fS )Nr   r   )r   r   r   r   r   )rO   r   r   r   rT   r   r   r&   r&   r'   r[   !  s   



zCLIPVision.forwardr\   r]   r&   r&   rR   r'   r   
  s    r   c                       rl   )LlavaProjectorc                    s:   t    |j||d||d| _|j||d||d| _d S )NT)rF   r   r   )rG   rH   rJ   linear_1linear_2)rO   in_dimout_dimr   r   rQ   rR   r&   r'   rH   .  s   
zLlavaProjector.__init__c              
   C   s*   |  tjj| |d d dd f S r   )r   r   r   r   ri   r   rs   r&   r&   r'   r[   3  s   *zLlavaProjector.forwardr]   r&   r&   rR   r'   r   -  s    r   c                       rl   )CLIPVisionModelProjectionc                    s|   t    t||||| _d|v r|j|d |d dd| _ndd | _d|dd kr9t|d d	|||| _d S d | _d S )
Nprojection_dimr   F)rF   c                 S   r   r*   r&   rc   r&   r&   r'   re   =  r   z4CLIPVisionModelProjection.__init__.<locals>.<lambda>llava3projector_typei   )	rG   rH   r   vision_modelrJ   visual_projectionr   r   multi_modal_projector)rO   r   r   r   rQ   rR   r&   r'   rH   7  s   


z"CLIPVisionModelProjection.__init__c                 O   sN   | j |i |}| |d }d }| jd ur| |d }|d |d ||fS )Nr
   r	   r   )r   r   r   )rO   r   r   rT   rZ   	projectedr&   r&   r'   r[   D  s   
z!CLIPVisionModelProjection.forwardr]   r&   r&   rR   r'   r   6  s    r   )r)   )r   comfy.ldm.modules.attentionr   	comfy.opsr   r+   r(   r;   rA   r   ModulerC   r   ri   ro   rm   rt   r}   r   r   r   r   r   r   r   r   r   r&   r&   r&   r'   <module>   s0    

, #	