o
    i	                     @   s"   d dl Z d dlZG dd dZdS )    Nc                   @   sD   e Zd Zedd ZdddZdd	 Zd
d ZdddZdd Z	dS )SPieceTokenizerc                 K   s   t | fi |S N)r   )pathkwargs r   B/mnt/c/Users/fbmor/ComfyUI/comfy/text_encoders/spiece_tokenizer.pyfrom_pretrained   s   zSPieceTokenizer.from_pretrainedFTNc                 C   s   || _ || _|| _dd l}t|r|  }t|t	r*|j
|| j | jd| _d S tj|s4td|j
|| j | jd| _d S )Nr   )model_protoadd_bosadd_eoszinvalid tokenizer)
model_filer
   r   )r
   r   special_tokenssentencepiecetorch	is_tensornumpytobytes
isinstancebytesSentencePieceProcessor	tokenizerosr   isfile
ValueError)selftokenizer_pathr
   r   r   r   r   r   r   __init__	   s   

zSPieceTokenizer.__init__c                 C   s,   i }t | j D ]
}||| j|< q	|S r   )ranger   get_piece_sizeid_to_piece)r   outir   r   r   	get_vocab   s   zSPieceTokenizer.get_vocabc                    s   | j d urTdd l d fdd| j  D }|rT ||rT d| d|}g }|D ]!}|s3q.|| j v rA|| j |  q.| jj|ddd}|	| q.d	|iS | j|}d	|iS )
Nr   |c                 3   s    | ]}  |V  qd S r   )escape).0tokenrer   r   	<genexpr>!   s    z+SPieceTokenizer.__call__.<locals>.<genexpr>()F)r
   r   	input_ids)
r   r(   joinkeyssearchsplitappendr   encodeextend)r   stringspecial_tokens_patternpartsresultpartencodedr    r   r'   r   __call__   s    

zSPieceTokenizer.__call__c                    s6   |r| j rt| j    fdd|D }| j|S )Nc                    s   g | ]}| vr|qS r   r   )r%   tidspecial_token_idsr   r   
<listcomp>6   s    z*SPieceTokenizer.decode.<locals>.<listcomp>)r   setvaluesr   decode)r   	token_idsskip_special_tokensr   r<   r   rA   2   s   
zSPieceTokenizer.decodec                 C   s   t t| j S r   )r   
ByteTensorlistr   serialized_model_proto)r   r   r   r   serialize_model:   s   zSPieceTokenizer.serialize_model)FTN)F)
__name__
__module____qualname__staticmethodr   r   r"   r:   rA   rG   r   r   r   r   r      s    


r   )r   r   r   r   r   r   r   <module>   s    