o
    i@                     @   s  d dl Z d dlmZ d dlm  mZ d dlmZmZ d dl	Z	d dl
mZ 							d$de jd	ed
edededededefddZG dd dejZdd Zdd ZG dd dejZG dd dejZG dd dejZG dd dejZG d d! d!ejZG d"d# d#ejZdS )%    N)OptionalTuple)optimized_attention_for_device@           images
min_pixels
max_pixels
patch_sizetemporal_patch_size
merge_size
image_mean	image_stdc                 C   s6  |d u rg d}|d u rg d}| j \}}	}
}| j}| dddd} g }| d }|| }t|	| | }t|
| | }|| |krht|	|
 | }t|t|	| | | }t|t|
| | | }n%|| |k rt||	|
  }t|	| | | }t|
| | | }t	j
|d||fddd	d}| }tdD ]}|| ||  ||  ||< q|| }|| }tjd||g|tjd
}|}|| t|}d}|j d }|ddddd}|||||| |||| ||	}|ddddddddd	}||| | || | | }||fS )N)g3<4'?gwgM?gy{ ?)gB91?gwt.?g	U?r         r   bilinearF)sizemodealign_cornersdevicedtype               )shaper   permuteroundmathsqrtmaxfloorceilFinterpolate	unsqueezesqueezeclonerangetorchtensorlongappendstackrepeatreshape)r	   r
   r   r   r   r   r   r   
batch_sizeheightwidthchannelsr   grid_thw_listimgfactorh_barw_barbetaimg_resized
normalizedcgrid_hgrid_wgrid_thwpixel_valuesimage_grid_thwgrid_tchannelpatchesflatten_patches rJ   9/mnt/c/Users/fbmor/ComfyUI/comfy/text_encoders/qwen_vl.pyprocess_qwen2vl_images	   sr   




rL   c                	       sT   e Zd Z							ddededed	ef fd
dZdejdejfddZ  ZS )VisionPatchEmbedr   r   r      Nr   r   in_channels	embed_dimc           	   	      sJ   t    || _|| _|| _|| _|||g}|j||||d||d| _d S )NF)kernel_sizestridebiasr   r   )super__init__r   r   rO   rP   Conv3dproj)	selfr   r   rO   rP   r   r   opsrQ   	__class__rJ   rK   rU   \   s   


zVisionPatchEmbed.__init__hidden_statesreturnc                 C   s2   | d| j| j| j| j}| |}| d| jS N)viewrO   r   r   rW   rP   )rX   r\   rJ   rJ   rK   forwardw   s
   
zVisionPatchEmbed.forward)r   r   r   rN   NNN	__name__
__module____qualname__intrU   r-   Tensorra   __classcell__rJ   rJ   rZ   rK   rM   [   s$    rM   c                 C   sH   | dd | j d d f }| d| j d d d f }tj| |fddS )N.r_   r   dim)r   r-   cat)xx1x2rJ   rJ   rK   rotate_half   s   ro   c                 C   sN   | d | d }}| | t| |  }|| t||  }||fS )N)r)   floatro   )qkcossinq_embedk_embedrJ   rJ   rK   apply_rotary_pos_emb_vision   s   rx   c                       s<   e Zd Zd
dedef fddZdedejfdd	Z  Z	S )VisionRotaryEmbedding     @rj   thetac                    s   t    || _|| _d S N)rT   rU   rj   r{   )rX   rj   r{   rZ   rJ   rK   rU      s   

zVisionRotaryEmbedding.__init__seqlenr]   c              	   C   sL   d| j tjd| jdtj|d| j   }tj||j|jd}t||}|S )Ng      ?r   r   )r   r   r   )r{   r-   arangerj   rq   r   r   outer)rX   r}   r   inv_freqseqfreqsrJ   rJ   rK   ra      s   (zVisionRotaryEmbedding.forward)rz   )
rc   rd   re   rf   rq   rU   r-   rg   ra   rh   rJ   rJ   rZ   rK   ry      s    ry   c                       sB   e Zd Zddededef fddZdejd	ejfd
dZ  ZS )PatchMergerr   Nrj   context_dimspatial_merge_sizec              
      sf   t    ||d  | _|j|d||d| _t|j| j| j||dt |j| j|||d| _	d S )Nr   ư>epsr   r   r   )
rT   rU   hidden_sizeRMSNormln_qnn
SequentialLinearGELUmlp)rX   rj   r   r   r   r   rY   rZ   rJ   rK   rU      s   

zPatchMerger.__init__rl   r]   c                 C   s"   |  |d| j}| |}|S r^   )r   r3   r   r   )rX   rl   rJ   rJ   rK   ra      s   
zPatchMerger.forward)r   NNNrb   rJ   rJ   rZ   rK   r      s    
r   c                       sZ   e Zd Zddedef fddZ			ddejdeeejejf  dejfd	d
Z	  Z
S )VisionAttentionNr   	num_headsc                    s`   t    || _|| _|| | _| jd | _|j||d d||d| _|j||d||d| _d S )Ng      r   TrS   r   r   )	rT   rU   r   r   head_dimscalingr   qkvrW   )rX   r   r   r   r   rY   rZ   rJ   rK   rU      s   

zVisionAttention.__init__r\   position_embeddingsr]   c                    sD  |  dkr|j\}}d}|d}n|j\}}}|}|||djj}||djdddddd\}	}
}|d urO|\}}t	|	|
||\}	}
|	
ddd}	|

ddd}
|
ddd}|dd  |d d    fdd|	|
|fD }fddt| D }tj|dd	}||d}|}|S )
Nr   r   r   r   r_   c                    s    g | ]}t j|  d dqS )r   ri   )r-   splittolist).0r.   )lengthsrJ   rK   
<listcomp>   s    z+VisionAttention.forward.<locals>.<listcomp>c              	      s&   g | ]\}}} |||j d dqS )T)skip_reshape)r   )r   rr   rs   v)optimized_attentionrX   rJ   rK   r      s    ri   )rj   r   r)   r   r3   r   r   r    unbindrx   	transposezipr-   rk   rW   )rX   r\   r   
cu_seqlensr   
seq_length_r4   r   query_states
key_statesvalue_statesrt   ru   splitsattn_outputsattn_outputrJ   )r   r   rX   rK   ra      s2   

*

zVisionAttention.forwardNNNrc   rd   re   rf   rU   r-   rg   r   r   ra   rh   rJ   rJ   rZ   rK   r      s    r   c                       s0   e Zd Zddedef fddZdd Z  ZS )		VisionMLPNr   intermediate_sizec                    sZ   t    |j||d||d| _|j||d||d| _|j||d||d| _t | _d S )NTr   )	rT   rU   r   	gate_projup_proj	down_projr   SiLUact_fn)rX   r   r   r   r   rY   rZ   rJ   rK   rU      s
   
zVisionMLP.__init__c                 C   s    |  | | || | S r|   )r   r   r   r   )rX   hidden_staterJ   rJ   rK   ra      s    zVisionMLP.forwardr   )rc   rd   re   rf   rU   ra   rh   rJ   rJ   rZ   rK   r      s    r   c                       s^   e Zd Zddededef fddZ			ddejdeeejejf  d	ejfd
dZ	  Z
S )VisionBlockNr   r   r   c                    s^   t    |j|d||d| _|j|d||d| _t|||||d| _t|||||d| _d S )Nr   r   )r   r   rY   )	rT   rU   r   norm1norm2r   attnr   r   )rX   r   r   r   r   r   rY   rZ   rJ   rK   rU      s
   
zVisionBlock.__init__r\   r   r]   c                 C   sJ   |}|  |}| ||||}|| }|}| |}| |}|| }|S r|   )r   r   r   r   )rX   r\   r   r   r   residualrJ   rJ   rK   ra      s   


zVisionBlock.forwardr   r   rJ   rJ   rZ   rK   r      s    
r   c                       s   e Zd Z												dd	ed
edededededededef fddZdd Zdd Z	ddejde	ej dejfddZ
  ZS )Qwen2VLVisionTransformerrN   \         r   r   p   Nr   output_hidden_sizer   r   
num_layersr   r   r   window_sizec              	      s   t    | _|| _|| _|	| _g d| _t||d d| _ }t	|d | _
t fddt|D | _t|| d| _d S )N)r            r   )r   r   rO   rP   r   r   rY   r   c              	      s   g | ]}t  qS rJ   )r   )r   r   r   r   r   r   r   rY   rJ   rK   r   +  s    z5Qwen2VLVisionTransformer.__init__.<locals>.<listcomp>)rj   r   r   r   r   rY   )rT   rU   r   r   r   r   fullatt_block_indexesrM   patch_embedry   rotary_pos_embr   
ModuleListr,   blocksr   merger)rX   r   r   r   r   r   r   r   r   r   r   r   rY   r   rZ   r   rK   rU     s8   



z!Qwen2VLVisionTransformer.__init__c                 C   sn  g }dg}d}| j | j | j }|D ]\}}}|| j }	|| j }
t||	 |
 ||	|
}||	|  }||
|  }|	| | }|
| | }t|d|d|fdd}||||||}|ddddd||| ||}|dk	ddgd}|d}||dk }|
||  |d| j | j |d  }||  |||	 |
  7 }qtj|dd	}||fS )
Nr   constantir   r   r   r   r_   ri   )r   r   r   r-   r~   r3   r'   padr    sumr0   cumsumextendr   itemrk   )rX   rC   window_indexcu_window_seqlenswindow_index_idvit_merger_window_sizerF   rA   rB   
llm_grid_h
llm_grid_windexpad_hpad_wnum_windows_hnum_windows_windex_paddedseqlens	index_newcu_seqlens_tmprJ   rJ   rK   get_window_index9  sF   


z)Qwen2VLVisionTransformer.get_window_indexc                 C   s  g }|D ]e\}}}t j||ddd|}||| j | j|| j | j}|dddd }t j||dd|d}||| j | j|| j | j}|dddd }|t j	||gdd
|d qt j|dd}|d d dd f  }	| |	|}
|
| dS )Nr   r   r_   r   r   r   ri   )r-   r~   r)   expandr3   r   r    flattenr0   r1   r2   rk   r$   r   )rX   rC   r   pos_idsthwhpos_idswpos_idsmax_grid_sizerotary_pos_emb_fullrJ   rJ   rK   get_position_embeddingse  s.   "z0Qwen2VLVisionTransformer.get_position_embeddingsrD   rE   r]   c                 C   s  t |jddd}| |}| |\}}tj||jd}t|}| ||j}| \}}	| j	| j	 }
|
||
 |
d}||d d d d f }|
|d}|
||
 |
d}||d d d d f }|
|d}tj||fdd}| | f}t|d d df |d d df  |d d d	f jd	tjd
}tj|dd	d}t| jD ]\}}|| jv r|}n|}|||||d}q| |}t|}||d d f }|S )NFT)masksmall_inputr   r_   ri   r   r   r   )rj   r   )r   r   )value)r   )r   r   r   r   r-   r.   unique_consecutiver   r   r   r3   rk   rt   ru   repeat_interleaver   int32r'   r   	enumerater   r   r   argsort)rX   rD   rE   r   r\   r   r   r   seq_lenr   spatial_merge_unitr   iblockcu_seqlens_nowreverse_indicesrJ   rJ   rK   ra     s<   

4


z Qwen2VLVisionTransformer.forward)rN   rN   r   r   r   r   r   r   r   NNNr|   )rc   rd   re   rf   rU   r   r   r-   rg   r   ra   rh   rJ   rJ   rZ   rK   r     sT    	
1, r   )r   r   r   r   r   NN)r-   torch.nnr   torch.nn.functional
functionalr'   typingr   r   r"   comfy.ldm.modules.attentionr   rg   rf   listrL   ModulerM   ro   rx   ry   r   r   r   r   r   rJ   rJ   rJ   rK   <module>   sN    
R$5