
    Uh7                         d dl Z d dlmZ d dlmZmZmZmZmZm	Z	 d dl
ZddlmZmZ ddlmZ ddlmZ erddlmZ  ej*                  e      Zd	d
iZdZ ed       G d de             ZdgZy)    N)copyfile)TYPE_CHECKINGAnyDictListOptionalTuple   )
AddedTokenPreTrainedTokenizer)logging)requires)	TextInput
vocab_fileztokenizer.modelu   ▁)sentencepiece)backendsc            
       `    e Zd ZdZeZddgZ	 	 	 	 	 	 	 	 	 	 ddeee	e
f      f fdZd Zd Zed        Zd	 Zd
ddee	   f fdZd Zd Zd Zd Zddee	   dee	   fdZddZ	 d dee   deee      dedee   f fdZ	 ddee   deee      dee   fdZ	 	 d!dee   dedede	fdZ xZS )"GemmaTokenizera
  
    Construct a Gemma tokenizer. Based on byte-level Byte-Pair-Encoding. The default padding token is unset as there is
    no padding token in the original model.

    Args:
        vocab_file (`str`):
            Path to the vocabulary file.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<unk>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<bos>"`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<eos>"`):
            The end of sequence token.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<pad>"`):
            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
            attention mechanisms or loss computation.
        sp_model_kwargs (`Dict[str, Any]`, `Optional`, *optional*):
            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
            to set:

            - `enable_sampling`: Enable subword regularization.
            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.

              - `nbest_size = {0,1}`: No sampling is performed.
              - `nbest_size > 1`: samples from the nbest_size results.
              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
                using forward-filtering-and-backward-sampling algorithm.

            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
              BPE-dropout.

        add_bos_token (`bool`, *optional*, defaults to `True`):
            Whether or not to add an `bos_token` at the start of sequences.
        add_eos_token (`bool`, *optional*, defaults to `False`):
            Whether or not to add an `eos_token` at the end of sequences.
        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
            Whether or not to cleanup spaces after decoding, cleanup consists in removing potential artifacts like
            extra spaces.
        use_default_system_prompt (`bool`, *optional*, defaults to `False`):
            Whether or not the default system prompt for Gemma should be used.
        spaces_between_special_tokens (`bool`, *optional*, defaults to `False`):
            Whether or not to add spaces between special tokens.
    	input_idsattention_masksp_model_kwargsc                    |i n|| _         t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}t        |t              rt        |dd      n|}|| _        || _        || _        |
| _        t        j                  di | j                   | _
        | j                  j                  |       t        | 4  d||||||||	|
|d
| y )NFT)
normalizedspecial)
	bos_token	eos_token	unk_token	pad_tokenadd_bos_tokenadd_eos_tokenr   clean_up_tokenization_spacesuse_default_system_promptspaces_between_special_tokens )r   
isinstancestrr   r   r   r    r"   spmSentencePieceProcessorsp_modelLoadsuper__init__)selfr   r   r   r   r   r   r   r    r!   r"   r#   kwargs	__class__s                ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/gemma/tokenization_gemma.pyr,   zGemmaTokenizer.__init__^   s
    &5%<r/MWXacfMgJyUDImv	MWXacfMgJyUDImv	MWXacfMgJyUDImv	MWXacfMgJyUDImv	$**)B&22JT5I5IJ:& 	
''+)E&?*G	
 	
    c                 ~    | j                   j                         }d |d<   | j                  j                         |d<   |S )Nr)   sp_model_proto)__dict__copyr)   serialized_model_proto)r-   states     r0   __getstate__zGemmaTokenizer.__getstate__   s;    ""$ j"&--"F"F"Hr1   c                     | j                   j                  |       t        j                  di | j                  | _        | j
                  j                  | j                         y )Nr$   )r4   updater'   r(   r   r)   LoadFromSerializedProtor3   )r-   ds     r0   __setstate__zGemmaTokenizer.__setstate__   sG    Q22JT5I5IJ--d.A.ABr1   c                 6    | j                   j                         S )zReturns vocab size)r)   get_piece_size)r-   s    r0   
vocab_sizezGemmaTokenizer.vocab_size   s     }}++--r1   c                     t        | j                        D ci c]  }| j                  |      | }}|j                  | j                         |S c c}w )zReturns vocab as a dict)ranger@   convert_ids_to_tokensr:   added_tokens_encoder)r-   ivocabs      r0   	get_vocabzGemmaTokenizer.get_vocab   sK    ;@;QRa++A.1RRT../ Ss   Atextr   returnc                 $    t        |   |fi |S )ze
        Args:
            text: TextInput
        Simply calls PreTrainedTokenizer's method
        )r+   tokenize)r-   rH   r.   r/   s      r0   rK   zGemmaTokenizer.tokenize   s     w///r1   c                 D    | j                   j                  |t              S )z
        Args:
            text: TextInput
        Returns a tokenized string. The Gemma tokenizer never adds a prefix space.
        )out_type)r)   encoder&   )r-   rH   r.   s      r0   	_tokenizezGemmaTokenizer._tokenize   s     }}##D3#77r1   c                 8    | j                   j                  |      S )z0Converts a token (str) in an id using the vocab.)r)   piece_to_id)r-   tokens     r0   _convert_token_to_idz#GemmaTokenizer._convert_token_to_id   s    }}((//r1   c                 <    | j                   j                  |      }|S )z=Converts an index (integer) in a token (str) using the vocab.)r)   	IdToPiece)r-   indexrR   s      r0   _convert_id_to_tokenz#GemmaTokenizer._convert_id_to_token   s    ''.r1   c                     g }d}|D ]E  }|| j                   v r$|| j                  j                  |      |z   z  }g }5|j                  |       G || j                  j                  |      z  }|S )z:Converts a sequence of tokens (string) in a single string. )_added_tokens_encoderr)   decodeappend)r-   tokenscurrent_sub_tokens
out_stringrR   s        r0   convert_tokens_to_stringz'GemmaTokenizer.convert_tokens_to_string   s    
 	1E222dmm223EFNN
%'""))%0	1 	dmm**+=>>
r1   filename_prefixc                    t         j                  j                  |      st        j	                  d| d       yt         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  rBt         j                  j                  | j                        rt        | j                  |       |fS t         j                  j                  | j                        sCt        |d      5 }| j                  j                         }|j                  |       ddd       |fS |fS # 1 sw Y   |fS xY w)a  
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        zVocabulary path (z) should be a directoryN-rY   r   wb)ospathisdirloggererrorjoinVOCAB_FILES_NAMESabspathr   isfiler   openr)   r6   write)r-   save_directoryra   out_vocab_fileficontent_spiece_models         r0   save_vocabularyzGemmaTokenizer.save_vocabulary   s'    ww}}^,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNSUSZSZSaSabfbqbqSrT__n5    0nd+ /r'+}}'K'K'M$-./     	/   s   +,E%%E0c                     | j                   r| j                  gng }| j                  r| j                  gng }||z   |z   }|||z   |z   |z   }|S N)r   bos_token_idr    eos_token_idr-   token_ids_0token_ids_1rw   rx   outputs         r0    build_inputs_with_special_tokensz/GemmaTokenizer.build_inputs_with_special_tokens   s`    .2.@.@))*b.2.@.@))*b+l:"l*[8<GFr1   rz   r{   already_has_special_tokensc                     |rt         |   ||d      S | j                  rdgng }| j                  rdgng }||dgt	        |      z  z   |z   S |dgt	        |      z  z   |z   |z   dgt	        |      z  z   |z   S )a  
        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
        special tokens using the tokenizer `prepare_for_model` method.

        Args:
            token_ids_0 (`List[int]`):
                List of IDs.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
        T)rz   r{   r~      r   )r+   get_special_tokens_maskr   r    len)r-   rz   r{   r~   rw   rx   r/   s         r0   r   z&GemmaTokenizer.get_special_tokens_mask   s    $ &72'[]a 3   #00sb"00sbA3[)9#9:\IIsS%%'  sS%%	'
 	
r1   c                     | j                   r| j                  gng }| j                  r| j                  gng }dgt	        ||z   |z         z  }||dgt	        ||z   |z         z  z  }|S )a  
        Creates a mask from the two sequences passed to be used in a sequence-pair classification task. An ALBERT
        sequence pair mask has the following format:

        ```
        0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
        | first sequence    | second sequence |
        ```

        if token_ids_1 is None, only returns the first portion of the mask (0s).

        Args:
            token_ids_0 (`List[int]`):
                List of ids.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [token type IDs](../glossary#token-type-ids) according to the given sequence(s).
        r   r   )r   rw   r    rx   r   ry   s         r0   $create_token_type_ids_from_sequencesz3GemmaTokenizer.create_token_type_ids_from_sequences  sz    . /3.@.@))*b.2.@.@))*bs<+5DEE"qcC{ :\ IJJJFr1   	token_idsskip_special_tokensr#   c                    g }g }|D ]  }|r|| j                   v r|| j                  v rW|r*|j                  | j                  j	                  |             |j                  | j                  |   j
                         g }y|j                  |        |r*|j                  | j                  j	                  |             |rdj                  |      }ndj                  |      }|j                  t        d      S )N rY   )	all_special_ids_added_tokens_decoderr\   r)   r[   contentrj   replaceSPIECE_UNDERLINE)r-   r   r   r#   r.   	sub_textscurrent_sub_textidss           r0   _decodezGemmaTokenizer._decode1  s     	 		-C"sd.B.B'Bd000#$$T]]%9%9:J%KL  !;!;C!@!H!HI#%  '',		- T]]112BCD(+I	*I  !1377r1   )
z<unk>z<bos>z<eos>z<pad>NTFFFFrv   )NF)FF) __name__
__module____qualname____doc__rk   vocab_files_namesmodel_input_namesr   r   r&   r   r,   r8   r=   propertyr@   rG   r   rK   rO   rS   rW   r`   r	   rt   r}   intboolr   r   r   __classcell__)r/   s   @r0   r   r   +   sw   ,\ *$&67
 48%*"'&+(
 "$sCx.1(
TC
 . .0[ 0tCy 080
!x} !X]^aXb !6	 sx#
9#
3;DI3F#
ko#
	c#
L JN93;DI3F	cH %*.3	898 "8 (,	8 
8r1   r   )re   shutilr   typingr   r   r   r   r   r	   r   r'   tokenization_utilsr   r   utilsr   utils.import_utilsr   tokenization_utils_baser   
get_loggerr   rh   rk   r   r   __all__r$   r1   r0   <module>r      s{   , 
  B B  A  * 4			H	%!#45   
%&`8( `8 '`8F	 
r1   