o
    i%                  	   @   sj  d Z ddlZddlZddlZddlmZmZ ddlZddl	Z	ddl
ZddlZddlZddlZddlZdd Zdd Zz8ej rqejjrqddlmZmZ ddlZd	eejv rlejejejgZe dej! d
d Zne"d W n e#e$fy   e"d Y nw dZ%z(ej& rej'j() Z*e*dkre*dk rejj+dkrejj+dkrdZ%e,d W n   Y ejj-Z-d1ddZ.dd Z/d2ddZ0dd Z1G dd dZ2G dd dZ3G d d! d!e3Z4d"d# Z5G d$d% d%e4Z6dZ7zdd&l8m9Z9m:Z: dZ7W n
 e;y   Y nw e7rG d'd( d(e4Z8d)d*l<m=Z=m>Z>m?Z?m@Z@ G d+d, d,ejAjBZCi ejDdg fd-d.ZEd3d/d0ZFdS )4a  
    This file is part of ComfyUI.
    Copyright (C) 2024 Stability AI

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
    N)argsPerformanceFeaturec                   C   s   t j rd S tj  d S N)torchcompileris_compilingcomfymodel_management)throw_exception_if_processing_interrupted r   r   '/mnt/c/Users/fbmor/ComfyUI/comfy/ops.pyrun_every_op    s   
r   c                 O   s    t jjj| ||g|R i |S r   )r   nn
functionalscaled_dot_product_attentionqkvr   kwargsr   r   r   r   &   s    r   )
SDPBackendsdpa_kernelset_priorityc                 O   s|   |   dk rtjjj| ||g|R i |S ttdd tjjj| ||g|R i |W  d    S 1 s7w   Y  d S )Ni   T)r   )nelementr   r   r   r   r   SDPA_BACKEND_PRIORITYr   r   r   r   r   7   s
    $z3Torch version too old to set sdpa backend priority.z$Could not set sdpa backend priority.Fizc ile )   	   )r   
   Tz(working around nvidia conv3d memory bug.c                 C   s   t jj| |j|j||dS )N)non_blockingcopy)r   r	   cast_todtypedevice)weightinputr   r   r   r   r   cast_to_inputN   s   r%   c                    s2  t j|r+| jj|dd}t|tr| }d }| jd ur&| jj|dd}||dfS d }	d }
t	j
| j}t	j
|| j}|d urS|rK| j}| j}nt	j| j|}
|sdt j| j| jg}d }| j| jg}t j| }|d uru|g}t| j| jg|D ]'\}}|d u rq~|j|jkr|
}|d u rtjt j|ftj|d}d }
 nq~t j|}t j|}	|
d u r|	d urt j|	||| }
|
d u rt j|}	t j|	||| }
|
d u rtj|ftj|d}
d }	|d u r|d u rt j|  t j| }nd }|d urt j|| |g}t jj||
||	d t j ||	 |d urGtt j!| j| jg|
t j!||D ]\}}|d urC|"| q5|}
t j!||
}|d }|d }|d ura|| _|| _|| _ fdd	}|d u}|| d
||||}| jd ur|| d||||}|||	|d ur|d ffS d d ffS )NT)r!   r   )NNN)r!   r"   )r   streamr      c                    s  t | |d d }t | |d g }|}dd }	|j|ks!t|dkr&|	||}|sw|d urw|	| d u r4|n }||}rCt|dksE|rftj| j}
t|tr\tj	|| j
d|
d}n
tjj||j|
d}rpt|dkrp|}|rw|| |D ]}||}qy|S )	N_lowvram_function	_functionc                 S   s"   | j |d} t| tr|  } | S )Nr!   )to
isinstanceQuantizedTensor
dequantize)tensorr!   r   r   r   
to_dequant   s   
zAcast_bias_weight_with_vbar.<locals>.post_cast.<locals>.to_dequantr   recalculate)scalestochastic_rounding)seed)getattrr!   lenr   utilsstring_to_seedseed_keyr,   r-   
from_floatlayout_typefloatr3   copy_)s	param_keyxr!   residentupdate_weight
lowvram_fnfnsorigr0   r4   yfcompute_dtypewant_requantr   r   	post_cast   s*   



z-cast_bias_weight_with_vbar.<locals>.post_castr#   bias)#r   r	   is_device_cpur#   r+   r,   r-   r.   rL   comfy_aimdo
model_vbar
vbar_fault_vvbar_signature_compare_v_signature	_v_weight_v_biasr   aimdo_to_tensormemory_managementtensors_to_geometriespinned_memoryget_pinzipr!   emptyvram_aligned_sizeuint8get_offload_streamget_cast_buffer
pin_memorycast_to_gatheredsync_streaminterpret_gathered_liker=   )r>   r!   r"   
bias_dtyper   rI   rJ   r#   rL   offload_stream	xfer_dest	signaturerA   cast_geometry	cast_destxfer_sourcepindatageometry	dest_sizepre_castrK   paramsrB   r   rH   r   cast_bias_weight_with_vbarR   s   







$rr   c              	   C   s.  |d ur"|d u rt |tr|jj}n|j}|d u r|}|d u r"|j}tj|}t	| dr7t
| ||||||S |rQ|| jjksJ| jd urQ|| jjkrQtj|}	nd }	d }
d }|	d urtjstj| j| jg}tj|	||| }|d u rtj|}	tj|	||| }tj| j| jg|}|d }|d }
t| jdk}t| jdk}tjj| jd ||||	|d}| jd urtjj| jd ||||	|
d}
tj||	 |
}|}| jd ur|
j|d}
| jD ]}||
}
q|s|j|kr|j|d}t |tr| }| jD ]}||}q |r||
|	||ffS ||
fS )NrQ   r   r'   )r   r   r&   rr*   )r,   r-   rq   
orig_dtyper!   r"   r   r	   device_supports_non_blockinghasattrrr   r#   rL   r_   r   cuda_mallocrW   r]   r`   rd   r6   weight_functionbias_functionr    rc   r+   r.   )r>   r$   r!   r"   re   offloadablerI   rJ   r   rf   rL   r#   cast_buffer_sizecast_bufferrq   weight_has_functionbias_has_functionbias_aweight_arG   r   r   r   cast_bias_weight   s`   








r   c                 C   s   |d u rd S |\}}}d }|d ur t |tjs tj| j |}|d u r&d S |d u r;|d ur2|j}n	|d u r8d S |j}|t	j
| d S r   )r,   r   TensorrN   rO   
vbar_unpinrQ   r"   wait_streamr   r	   current_stream)r>   r#   rL   rf   osr   r   r"   r   r   r   uncast_bias_weight  s    
r   c                   @   s   e Zd ZdZg Zg ZdS )CastWeightBiasOpFN)__name__
__module____qualname__comfy_cast_weightsrx   ry   r   r   r   r   r   /  s    r   c                   @   s  e Zd Ze	dddZG dd dejjeZG dd dejj	eZ	G dd	 d	ejj
eZ
G d
d dejjeZG dd dejjeZG dd dejjeZG dd dejjeZG dd dejjeZG dd dejjeZG dd dejjeZedd ZdS )disable_weight_initNc                 C   s   | dd}t|}	| D ]?\}
}|
|	d  }|dkr,|s"| }tjj|dd| _q|d urH|dkrH|d urH|s>| }tjj|dd| _q|	|
 q| jd u rftjjt
|dd| _|	|d  |d ur| jd u rt| ddrtjjt
|dd| _|	|d  d S d S d S d S )Nassign_to_params_buffersFr#   requires_gradrL   comfy_need_lazy_init_bias)getr6   itemscloner   r   	Parameterr#   rL   appendzerosr5   )module
state_dictprefixlocal_metadatamissing_keysunexpected_keysweight_shape
bias_shaper   
prefix_lenr   r   keyr   r   r   _lazy_load_from_state_dict5  s(   
z.disable_weight_init._lazy_load_from_state_dictc                       sF   e Zd Zd fdd	Z fddZdd Zd	d
 Z fddZ  ZS )zdisable_weight_init.LinearTNc                    sv   t jjrt jjrt| jtjjurt	 
||||| d S tjj
|  || _|| _d | _d | _|| _|| _|| _d S r   )r   r	   WINDOWSrW   aimdo_enabledtype_load_from_state_dictr   Linearsuper__init__r   r   Modulein_featuresout_featuresr#   rL   r   weight_comfy_model_dtypebias_comfy_model_dtypeselfr   r   rL   r"   r!   	__class__r   r   r   R  s   
z#disable_weight_init.Linear.__init__c              
      sd   t jjrt jjrt| jtjjurt	 |||||||S tj
| |||||| j| jf| jfd d S )N)r   r   )r   r	   r   rW   r   r   r   r   r   r   r   r   r   r   r   r   r   strictr   r   
error_msgsr   r   r   r   h  s"   

z0disable_weight_init.Linear._load_from_state_dictc                 C      d S r   r   r   r   r   r   reset_parameters|     z+disable_weight_init.Linear.reset_parametersc                 C   s8   t | |dd\}}}tjj|||}t| ||| |S NTrz   )r   r   r   r   linearr   r   r$   r#   rL   rf   r@   r   r   r   forward_comfy_cast_weights     z5disable_weight_init.Linear.forward_comfy_cast_weightsc                    J   t   | jst| jdkst| jdkr| j|i |S t j|i |S Nr   r   r   r6   rx   ry   r   r   forwardr   r   r   r   r   r   r        "z"disable_weight_init.Linear.forwardTNN	r   r   r   r   r   r   r   r   __classcell__r   r   r   r   r   P  s    r   c                       ,   e Zd Zdd Zdd Z fddZ  ZS )zdisable_weight_init.Conv1dc                 C   r   r   r   r   r   r   r   r     r   z+disable_weight_init.Conv1d.reset_parametersc                 C   4   t | |dd\}}}| |||}t| ||| |S r   r   _conv_forwardr   r   r   r   r   r        z5disable_weight_init.Conv1d.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   z"disable_weight_init.Conv1d.forwardr   r   r   r   r   r   r   r   r   r   r   Conv1d      r   c                       r   )zdisable_weight_init.Conv2dc                 C   r   r   r   r   r   r   r   r     r   z+disable_weight_init.Conv2d.reset_parametersc                 C   r   r   r   r   r   r   r   r     r   z5disable_weight_init.Conv2d.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   z"disable_weight_init.Conv2d.forwardr   r   r   r   r   Conv2d  r   r   c                       s<   e Zd Zdd Zd
 fdd	Zd
ddZ fdd	Z  ZS )zdisable_weight_init.Conv3dc                 C   r   r   r   r   r   r   r   r     r   z+disable_weight_init.Conv3d.reset_parametersNc                    s   |dkr|d d d d |j d  d d d d d f }trL|jtjtjfv rLtj||| j| j| j	| j
dddd	}|d urJ||dd|jd   7 }|S t j|||g|R i |S )Ncausal_zeror   FT)	benchmarkdeterministic
allow_tf32)r'   )r'   )shape!NVIDIA_MEMORY_CONV_BUG_WORKAROUNDr!   r   float16bfloat16cudnn_convolutionpaddingstridedilationgroupsreshapendimr   r   )r   r$   r#   rL   autopadr   r   outr   r   r   r     s   .$z(disable_weight_init.Conv3d._conv_forwardc                 C   s8   t | |dd\}}}| j||||d}t| ||| |S )NTr   )r   r   )r   r$   r   r#   rL   rf   r@   r   r   r   r     r   z5disable_weight_init.Conv3d.forward_comfy_cast_weightsc                    sR   t   | jst| jdkst| jdksd|v r | j|i |S t j|i |S )Nr   r   r   r   r   r   r   r     s   *z"disable_weight_init.Conv3d.forwardr   )r   r   r   r   r   r   r   r   r   r   r   r   Conv3d  s
    
r   c                       r   )zdisable_weight_init.GroupNormc                 C   r   r   r   r   r   r   r   r     r   z.disable_weight_init.GroupNorm.reset_parametersc                 C   s@   t | |dd\}}}tjj|| j||| j}t| ||| |S r   )r   r   r   r   
group_norm
num_groupsepsr   r   r   r   r   r     s   z8disable_weight_init.GroupNorm.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   z%disable_weight_init.GroupNorm.forwardr   r   r   r   r   	GroupNorm  r   r   c                       r   )zdisable_weight_init.LayerNormc                 C   r   r   r   r   r   r   r   r     r   z.disable_weight_init.LayerNorm.reset_parametersc                 C   sX   | j d urt| |dd\}}}nd }d }d }tjj|| j||| j}t| ||| |S r   )	r#   r   r   r   r   
layer_normnormalized_shaper   r   r   r   r   r   r     s   
z8disable_weight_init.LayerNorm.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   z%disable_weight_init.LayerNorm.forwardr   r   r   r   r   	LayerNorm  s    r   c                       r   )zdisable_weight_init.RMSNormc                 C   
   d | _ d S r   rL   r   r   r   r   r        z,disable_weight_init.RMSNorm.reset_parametersc                 C   sV   | j d urt| |dd\}}}nd }d }d }tjj|| j|| j}t| ||| |S r   )	r#   r   r   r   r   rms_normr   r   r   r   r   r   r   r     s   
z6disable_weight_init.RMSNorm.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   z#disable_weight_init.RMSNorm.forwardr   r   r   r   r   RMSNorm  s    r   c                       .   e Zd Zdd ZdddZ fddZ  ZS )	z#disable_weight_init.ConvTranspose2dc                 C   r   r   r   r   r   r   r   r   	  r   z4disable_weight_init.ConvTranspose2d.reset_parametersNc           	   
   C   l   d}|  ||| j| j| j|| j}t| |dd\}}}tjj	|||| j| j|| j
| j}t| ||| |S )Nr   Tr   )_output_paddingr   r   kernel_sizer   r   r   r   r   conv_transpose2dr   r   	r   r$   output_sizenum_spatial_dimsoutput_paddingr#   rL   rf   r@   r   r   r   r        
z>disable_weight_init.ConvTranspose2d.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   z+disable_weight_init.ConvTranspose2d.forwardr   r   r   r   r   r   ConvTranspose2d      
r   c                       r   )	z#disable_weight_init.ConvTranspose1dc                 C   r   r   r   r   r   r   r   r   !  r   z4disable_weight_init.ConvTranspose1d.reset_parametersNc           	   
   C   r   )Nr'   Tr   )r   r   r   r   r   r   r   r   r   conv_transpose1dr   r   r   r   r   r   r   $  r   z>disable_weight_init.ConvTranspose1d.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r   1  r   z+disable_weight_init.ConvTranspose1d.forwardr   r   r   r   r   r   ConvTranspose1d   r   r   c                       sN   e Zd Z			d fdd	Z fddZdd	 Zdd
dZ fddZ  ZS )zdisable_weight_init.EmbeddingN       @Fc                    s   t jjrt jjrt| jtjjur#t	 
|||||||||	|
| d S tjj
|  || _|| _|| _|| _|| _|| _|| _|d urE|nt }tjjtj||fd|ddd| _d | _|| _d S )Nmetar"   r!   Fr   )r   r	   r   rW   r   r   r   r   	Embeddingr   r   r   r   r   num_embeddingsembedding_dimpadding_idxmax_norm	norm_typescale_grad_by_freqsparseget_default_dtyper   r\   r#   rL   r   )r   r  r  r  r  r  r  r	  _weight_freezer"   r!   embedding_dtyper   r   r   r   9  s0   
z&disable_weight_init.Embedding.__init__c              	      s^   t jjrt jjrt| jtjjurt	 |||||||S tj
| |||||| j| jfd d S )N)r   )r   r	   r   rW   r   r   r   r   r  r   r   r  r  r   r   r   r   r   V  s    

z3disable_weight_init.Embedding._load_from_state_dictc                 C   r   r   r   r   r   r   r   r   h  r   z.disable_weight_init.Embedding.reset_parametersc              	   C   sz   |}| j jtjks| j jtjkrd }t| |j|dd\}}}tjj	||| j
| j| j| j| jj|d}t| ||| |S )NT)r"   r!   rz   r*   )r#   r!   r   r   r   r   r"   r   r   	embeddingr  r  r  r  r	  r+   r   )r   r$   	out_dtypeoutput_dtyper#   rL   rf   r@   r   r   r   r   l  s   ,z8disable_weight_init.Embedding.forward_comfy_cast_weightsc                    s\   t   | jst| jdkst| jdkr| j|i |S d|v r%|d t j|i |S )Nr   r  )	r   r   r6   rx   ry   r   popr   r   r   r   r   r   r   v  s   "
z%disable_weight_init.Embedding.forward)	NNr   FFNFNNr   r   r   r   r   r   r  8  s    

r  c                 O   s>   |dkr| j |i |S |dkr| j|i |S td| )Nr      zunsupported dimensions: )r   r   
ValueError)r>   dimsr   r   r   r   r   conv_nd  s
   zdisable_weight_init.conv_ndr   )r   r   r   staticmethodr   r   r   r   r   r   r   r   r   r   r   r   r   r  classmethodr  r   r   r   r   r   4  s     <Gr   c                   @   s   e Zd ZG dd dejZG dd dejZG dd dejZG dd dejZG d	d
 d
ejZG dd dej	Z	G dd dej
Z
G dd dejZG dd dejZG dd dejZdS )manual_castc                   @      e Zd ZdZdS )zmanual_cast.LinearTNr   r   r   r   r   r   r   r   r         r   c                   @   r  )zmanual_cast.Conv1dTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.Conv2dTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.Conv3dTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.GroupNormTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.LayerNormTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.ConvTranspose2dTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.ConvTranspose1dTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.RMSNormTNr  r   r   r   r   r     r  r   c                   @   r  )zmanual_cast.EmbeddingTNr  r   r   r   r   r    r  r  N)r   r   r   r   r   r   r   r   r   r   r   r   r   r  r   r   r   r   r    s    r  c              	   C   sL  | j j}|tjfvrdS |j}|j}|jdk}|r!|d|d }|jdkr(dS tj	|j
}t| |||d|dd\}}}	tjd|j
tjd}
tjd|j
tjd}tj|d	d
|d}|| }tj||t|jd}t|d|}tj|
|t|jd}t|d|}tjj|||}t| |||	 |r||d |d |jd f}|S )zp
    Legacy FP8 linear function for backward compatibility.
    Uses QuantizedTensor subclass for dispatch.
    Nr  r   r   T)r!   re   rz   rI   rJ   r   r  i@i  )minmaxr   r2   rt   
orig_shapeTensorCoreFP8Layoutr   r'   )r#   r!   r   float8_e4m3fnr   r   r   r   r	   lora_compute_dtyper"   r   onesfloat32clampr+   
contiguousr   Paramstupler-   r   r   r   r   )r   r$   r!   input_dtypeinput_shape	tensor_3dr"  wrL   rf   scale_weightscale_input	input_fp8layout_params_inputquantized_inputlayout_params_weightquantized_weightor   r   r   
fp8_linear  s2   

r5  c                   @   s   e Zd ZG dd dejZdS )fp8_opsc                   @   s   e Zd Zdd Zdd ZdS )zfp8_ops.Linearc                 C   s   d | _ d | _d S r   )r-  r.  r   r   r   r   r     s   zfp8_ops.Linear.reset_parametersc              
   C   s   t | jdkr7t | jdkr7zt| |}|d ur|W S W n ty6 } ztd| W Y d }~nd }~ww t| |dd\}}}t	j
j|||}t| ||| |S )Nr   zException during fp8 op: {}Tr   )r6   rx   ry   r5  	Exceptionlogginginfoformatr   r   r   r   r   r   )r   r$   r   er#   rL   rf   r@   r   r   r   r     s   
z)fp8_ops.Linear.forward_comfy_cast_weightsN)r   r   r   r   r   r   r   r   r   r     s    r   N)r   r   r   r  r   r   r   r   r   r6    s    r6  )CublasLinearcublas_half_matmulc                   @   s    e Zd ZG dd deejZdS )
cublas_opsc                       r   )zcublas_ops.Linearc                 C   r   r   r   r   r   r   r   r     r   z"cublas_ops.Linear.reset_parametersc                 C   s:   t | |dd\}}}t|||| j| j}t| ||| |S r   )r   r=  _epilogue_strhas_biasr   r   r   r   r   r     s   z,cublas_ops.Linear.forward_comfy_cast_weightsc                    r   r   r   r   r   r   r   r     r   zcublas_ops.Linear.forwardr   r   r   r   r   r     r   r   N)r   r   r   r<  r  r   r   r   r   r   r>    s    r>  r'   )r-   QUANT_ALGOSr   get_layout_classc                   @   s2   e Zd ZdZedd Zeejjj	dd Z
dS )QuantLinearFunca  Custom autograd function for quantized linear: quantized forward, optionally FP8 backward.

    When training_fp8_bwd is enabled:
      - Forward: quantize input per layout (FP8/NVFP4), use quantized matmul
      - Backward: all matmuls use FP8 tensor cores via torch.mm dispatch
      - Cached input is FP8 (half the memory of bf16)

    When training_fp8_bwd is disabled:
      - Forward: quantize input per layout, use quantized matmul
      - Backward: dequantize weight to compute_dtype, use standard matmul
    c                 C   s  |j }| dd}|d urtj|||d}	n|}	|jr!| n|}
|d ur.|jr.| n|}tjj	|	|
|}t
|dkrI|d|d d }|| _|d u| _|| _|j| _tjj| _| jr|t|	trn|drn|	| _nt|d| _| | |S d | _| || |S )Nr   r2   r   r   TensorCoreFP8TensorCoreFP8E4M3Layout)r   detachflattenr-   r:   r   r   r   r   r   r6   	unflattenr*  r@  rI   weight_requires_gradr   r	   training_fp8_bwdfp8_bwdr,   
startswithq_inputsave_for_backward)ctxinput_floatr#   rL   r;   input_scalerI   r*  inprO  r,  boutputr   r   r   r     s0   


zQuantLinearFunc.forwardc                 C   sH  | j }|dd|}| jrE| j\}t|d}t|tr'|j	dr'|}nt|tr8t|
 |d}n	t||d}| j}n'| j\}}|}t|trY|
 |}n||}| jrj|dd|nd }t||}	t| jdkr|	d| jd d }	d }
| jrt| |}
d }| jr|jdd}|	|
|d d d fS )	Nr   rD  TensorCoreFP8E5M2LayoutrF  rG  r   r   )dim)rI   rI  r+   rM  saved_tensorsr-   r:   r,   _layout_clsrN  r.   rO  rK  r   mmr6   r*  rJ  tr@  sum)rQ  grad_outputrI   grad_2dr#   grad_mm	weight_mminput_mmrR  
grad_inputgrad_weight	grad_biasr   r   r   backwardA  s6   



zQuantLinearFunc.backwardN)r   r   r   __doc__r  r   r   autogradfunctiononce_differentiablerf  r   r   r   r   rC    s    
(rC  c                    s    G  fdddt   S )Nc                       s6   e Zd ZZZZZG  fdddejj	e
ZdS )z.mixed_precision_ops.<locals>.MixedPrecisionOpsc                	       s   e Zd Z			d dedededdf fddZd	d
 Zd!ddZ fddZdddddZ	dd Z
d"ddZdd Zd#ddZd$ddZd%ddZ  ZS )&z5mixed_precision_ops.<locals>.MixedPrecisionOps.LinearTNr   r   rL   returnc                    sl   t    | jd| _|| _|| _|r$tjtj	|fi | j| _
n| dd  d | _ j| _d| _d S )Nr  rL   F)r   r   _compute_dtypefactory_kwargsr   r   r   r   r   r\   rL   register_parametertensor_class_full_precision_mm_full_precision_mm_configr   MixedPrecisionOpsr   r   r   r   u  s   
 
z>mixed_precision_ops.<locals>.MixedPrecisionOps.Linear.__init__c                 S   r   r   r   r   r   r   r   r     r   zFmixed_precision_ops.<locals>.MixedPrecisionOps.Linear.reset_parametersc           	      S   sN   | | }| |d }|d ur%|j|d}|d ur |j|d}|| |S )Nr"   r*   )r  r+   viewr   )	r   r   r   
param_namer"   manually_loaded_keysr!   r   valuer   r   r   _load_scale_param  s   
zGmixed_precision_ops.<locals>.MixedPrecisionOps.Linear._load_scale_paramc              	      s
  | j d }|d}	| d}
||
d }|d u r&td|	  d | _d S |
g}|| dd }|d ur?t| 	 }|d u rTt
jj|j| jddd| _n|d	d | _|d
d| _| jsi| j| _| j jv rrd| _| jd u r~td|	 t| j }|d | _t| j}| jdv r| ||d||}|j| j| j| jfd}nr| jdkr| j||d||t
jd}|d u rtd|	 |t
j}|j| j| j| jfd}nB| jdkr| ||d||}| j||d||t
jd}|d u s|d u rtd|	 |j|| j| j| jfd}ntd| j t
jjt |j||d d| j|dd| _|d D ]2}|dv r=q4| | }||d }|d u rPq4| !|t
jj|j|ddd |"| q4t# $||||||| |D ]}||v r|%| qud S )Nr"   .r#   zMissing weight for layer comfy_quantr  Fr   r:  full_precision_matrix_multTz&Unknown quantization format for layer comfy_tensor_layout)r!  float8_e5m2weight_scaler  mxfp8r*   z%Missing MXFP8 block scales for layer nvfp4weight_scale_2zMissing NVFP4 scales for layer )r2   block_scalert   r  z!Unsupported quantization format: 	storage_t
parameters>   r  r  rt  )&rm  rstripr  r8  warningr#   jsonloadsnumpytobytesr   r   r   r+   rl  r   quant_formatrq  rp  	_disabledr  rA  r;   rB  ry  r'  r   r   r^   ru  float8_e8m0fnur!  r-   rn  r   r   r   remove)r   r   r   r   r   r   r   r   r"   
layer_name
weight_keyr#   rw  
layer_confqconfig
layout_clsr2   rq   r  tensor_scalerv  r?   rQ   r   rr  r   r   r     s   


"










 

zKmixed_precision_ops.<locals>.MixedPrecisionOps.Linear._load_from_state_dict )destinationr   c          
      _   s  |d ur|}ni }t | dstd| |S | jd ur%| j|d|< | jd u r,|S t| jtrz| jd|}|D ]}|| ||< q=d| j	i}| j
rRd|d< tjtt|dtjd	|d
|< t| dd }	|	d urx|	|d|< |S | j|d|< |S )Nr#   z*Warning: state dict on uninitialized op {}z{}biasz{}weightr:  Tr|  zutf-8r*   z{}comfy_quantrS  z{}input_scale)rv   r8  r  r:  rL   r#   r,   r-   r   r  rq  r   r/   listr  dumpsencoder^   r5   )
r   r  r   r   r   sdsd_outr   
quant_confrS  r   r   r   r     s0   



*z@mixed_precision_ops.<locals>.MixedPrecisionOps.Linear.state_dictc                 S   s   t jj|||S r   )r   r   r   r   )r   r$   r#   rL   r   r   r   _forward"  s   z>mixed_precision_ops.<locals>.MixedPrecisionOps.Linear._forwardFc                 S   s8   t | |d||d\}}}| |||}t| ||| |S )NTrz   rI   rJ   )r   r  r   )r   r$   rI   rJ   r#   rL   rf   r@   r   r   r   r   %  s   zPmixed_precision_ops.<locals>.MixedPrecisionOps.Linear.forward_comfy_cast_weightsc                 _   s  t   |j}d}|j}t| dd d uo1t|t o1| j o1t| dd o1t| jdko1t| j	dk}|j
rj|rjt| |d|dd\}}	}
t| dd }|d urVtj||jd }t|||	| j||}t| ||	|
 |S |r|jdkry|d	|d
 n|}|jd
kr|jdk}t| dd }|d urtj||jd }tj|| j|d}| j||t|td}|r||d |d | jjd f}|S )NFr;   comfy_force_cast_weightsr   Tr  rS  r  r   r   rE  )rJ   r'   )r   r   r!   r5   r,   r-   rp  r6   rx   ry   r   r   r   r	   cast_to_devicer"   rC  applyr;   r   r   r   r:   r   r#   )r   r$   r   r   r*  reshaped_3drI   _use_quantizedr#   rL   rf   r2   rV  input_reshapedr   r   r   r   +  sV   



 z=mixed_precision_ops.<locals>.MixedPrecisionOps.Linear.forwardc                 [   s   t |tr	| S |S r   )r,   r-   r.   )r   r#   inplacer   r   r   r   convert_weightg  s   
zDmixed_precision_ops.<locals>.MixedPrecisionOps.Linear.convert_weightc                 [   sj   t | dd d urtj|| jd|dd| jj}n|| jj}|r$|S |du s*J tjj	|dd| _d S )Nr;   r1   T)r2   r3   inplace_opsFr   )
r5   r-   r:   r;   r+   r#   r!   r   r   r   )r   r#   inplace_updater4   return_weightr   r   r   r   
set_weightm  s   "z@mixed_precision_ops.<locals>.MixedPrecisionOps.Linear.set_weightc                 S   s   |r|   D ]}|| q| j D ]!\}}|d u rq||}| r(| }| |tjj	|dd q| j
 D ]\}}|d urI||| j
|< q:| S )NFr   )children_apply_parametersr   is_inferencer   rn  r   r   r   _buffers)r   fnrecurser   r   parampbufr   r   r   r  y  s   z<mixed_precision_ops.<locals>.MixedPrecisionOps.Linear._applyr   r   )NF)F)FNF)T)r   r   r   intboolr   r   ry  r   r   r  r   r   r  r  r  r   r   )rs  r   r   r   t  s.    

g!

<
r   N)r   r   r   _quant_configrl  rp  r  r   r   r   r   r   r   rs  rI   disabledfull_precision_mmquant_configr   r   rs  n  s    rs  )r  )r  rI   r  r  r   r  r   mixed_precision_opsm  s     r  c           
      C   s   t j|}t j|}t j|}|rFt|drF|jrFtd t	 }	|s+|	
d |s2|	
d |s>|	
d |	
d t|j||	dS |rT|sPtjtjv rT|sTtS tjtjv rqtrq| tjkrq|tjksj|d u rqtd tS |d u sy| |kr{tS tS )	Nr  z Using mixed precision operationsr  r  r!  r~  )r  zUsing cublas ops)r   r	   supports_fp8_computesupports_nvfp4_computesupports_mxfp8_computerv   r  r8  r9  setaddr  r   Fp8MatrixMultiplicationr   fastr6  	CublasOpsCUBLAS_IS_AVAILABLEr   r   r>  r   r  )
weight_dtyperI   load_devicedisable_fast_fp8fp8_optimizationsmodel_configfp8_computenvfp4_computemxfp8_computer  r   r   r   pick_operations  s@   






r  )FT)NNNNFNF)NFFN)Grg  r   r8  comfy.model_managementr   comfy.cli_argsr   r   comfy.floatr  comfy.memory_managementcomfy.pinned_memorycomfy.utilscomfy_aimdo.model_vbarrN   comfy_aimdo.torchr   r   cudais_availabler	   r   torch.nn.attentionr   r   inspectrh   r  FLASH_ATTENTIONEFFICIENT_ATTENTIONMATHr   insertCUDNN_ATTENTIONr  ModuleNotFoundError	TypeErrorr   	is_nvidiabackendscudnnversioncudnn_versiontorch_version_numericr9  r    r%   rr   r   r   r   r   r  r5  r6  r  r>  r<  r=  ImportError	quant_opsr-   rA  r   rB  rh  FunctionrC  r   r  r  r   r   r   r   <module>   s    


(

 
H  W (b  !