
    Uh[$                     b   d dl Z d dlZd dlmZmZ d dlmZ d dlmZm	Z	m
Z
mZ d dlZd dlmZ d dlmZ ddlmZ dd	lmZ dd
lmZmZ ddlmZmZmZmZ  ej:                  e      Z e  ejB                               Z" e#d e"D              Z$e G d d             Z% G d de      Z& G d de      Z'y)    N)	dataclassfield)Enum)DictListOptionalUnion)FileLock)Dataset   )$MODEL_FOR_QUESTION_ANSWERING_MAPPING)PreTrainedTokenizer)check_torch_load_is_safelogging   )SquadFeaturesSquadV1ProcessorSquadV2Processor"squad_convert_examples_to_featuresc              #   4   K   | ]  }|j                     y wN)
model_type).0confs     r/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/data/datasets/squad.py	<genexpr>r   "   s     EDOOEs   c                      e Zd ZU dZ eddddj                  e      z   i      Zee	d<    edddi      Z
ee	d	<    ed
ddi      Zee	d<    ed
ddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    edddi      Zee	d<    eddd i      Zee	d!<    ed"dd#i      Zee	d$<   y)%SquadDataTrainingArgumentszb
    Arguments pertaining to what data we are going to input our model for training and eval.
    Nhelpz!Model type selected in the list: z, )defaultmetadatar   zFThe input data dir. Should contain the .json files for the SQuAD task.data_dir   zThe maximum total input sequence length after tokenization. Sequences longer than this will be truncated, sequences shorter will be padded.max_seq_lengthzVWhen splitting up a long document into chunks, how much stride to take between chunks.
doc_stride@   zkThe maximum number of tokens for the question. Questions longer than this will be truncated to this length.max_query_length   zThe maximum length of an answer that can be generated. This is needed because the start and end predictions are not conditioned on one another.max_answer_lengthFz1Overwrite the cached training and evaluation setsoverwrite_cachezDIf true, the SQuAD examples contain some that do not have an answer.version_2_with_negativeg        zIIf null_score - best_non_null is greater than the threshold predict null.null_score_diff_threshold   n_best_sizer   zjlanguage id of input for language-specific xlm models (see tokenization_xlm.PRETRAINED_INIT_CONFIGURATION)lang_id   z3multiple threads for converting example to featuresthreads)__name__
__module____qualname____doc__r   joinMODEL_TYPESr   str__annotations__r"   r$   intr%   r'   r)   r*   boolr+   r,   floatr.   r/   r1        r   r   r   %   s    (KdiiXcNd(deJ  (pqHc   Q
NC  rsJ  "/
c  #J
s  ")\ ]OT  %*)o p%T  (-v'rs(u  f&qrK  C
GS  f6k-lmGSmr>   r   c                       e Zd ZdZdZy)SplittraindevN)r2   r3   r4   rA   rB   r=   r>   r   r@   r@   h   s    E
Cr>   r@   c                       e Zd ZU dZeed<   ee   ed<   eed<   e	ed<   dej                  dddfded	ed
ee   deeef   dee	   dee   dee   fdZd Zdeeej(                  f   fdZy)SquadDatasetzH
    This will be superseded by a framework-agnostic approach soon.
    argsfeaturesmodeis_language_sensitiveNFpt	tokenizerlimit_length	cache_dirdataset_formatc                    || _         || _        |j                  r
t               n	t	               | _        t        |t              r
	 t        |   }|| _
        |j                  rdnd}t        j                  j                  ||n|j                  d|j                   d|j                   j"                   d|j$                   d|       }	|	dz   }
t'        |
      5  t        j                  j)                  |	      r|j*                  st-        j,                         }t/                t1        j2                  |	d      | _        | j4                  d	   | _        | j4                  j9                  d
d       | _        | j4                  j9                  dd       | _        t>        jA                  d|	 dt-        j,                         |z
         | j:                  | j<                  dt>        jC                  d|	 d       nI|t        jD                  k(  r+| j
                  jG                  |j                        | _        n*| j
                  jI                  |j                        | _        tK        | j<                  ||j$                  |jL                  |jN                  |t        jP                  k(  |jR                  |      \  | _        | _        t-        j,                         }t1        jT                  | j6                  | j:                  | j<                  d|	       t>        jA                  d|	 dt-        j,                         |z
  dd       d d d        y # t        $ r t        d      w xY w# 1 sw Y   y xY w)Nzmode is not a valid split namev2v1cached__z.lockT)weights_onlyrF   datasetexamplesz"Loading features from cached file z [took %.3f s]zDeleting cached file z; will allow dataset and examples to be cached in future run)rU   rJ   r$   r%   r'   is_trainingr1   return_dataset)rF   rT   rU   z!Saving features into cached file z [took z.3fz s])+rE   rH   r+   r   r   	processor
isinstancer8   r@   KeyErrorrG   ospathr6   r"   value	__class__r2   r$   r
   existsr*   timer   torchloadold_featuresrF   getrT   rU   loggerinfowarningrB   get_dev_examplesget_train_examplesr   r%   r'   rA   r1   save)selfrE   rJ   rK   rG   rH   rL   rM   version_tagcached_features_file	lock_pathstarts               r   __init__zSquadDataset.__init__w   s    	%:"/3/K/K)+QaQcdC AT{ 	"::d!ww||".IDMMdjj\9#6#6#?#?"@$BUBUAVVWXcWde 
 )72	i  -	ww~~23D<P<P		(*$)JJ/CRV$W! !% 1 1* =#0044YE $ 1 1 5 5j$ G89M8Nn]_c_h_h_jmr_r <<'4==+@NN/0D/E F& &
 599$$(NN$C$CDMM$RDM$(NN$E$Edmm$TDM.P!]]'#'#6#6#%)%:%: $ 3 LL#1	/+t| 		

!%4<<UYUbUbc(
 78L7MWUYU^U^U`chUhilTmmpqW-	 -	  A?@@A-	 -	s   	M I(M M M)c                 ,    t        | j                        S r   )lenrF   )rk   s    r   __len__zSquadDataset.__len__   s    4==!!r>   returnc                 (   | j                   |   }t        j                  |j                  t        j                        }t        j                  |j
                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }t        j                  |j                  t        j                        }|||d}	| j                  j                  dv r|	d= | j                  j                  dv r|	j                  ||d       | j                  j                  r|	j                  d|i       | j                  rW|	j                  dt        j                   |j"                  t        j$                        | j                  j&                  z  i       | j(                  t*        j,                  k(  rrt        j                  |j.                  t        j                        }
t        j                  |j0                  t        j                        }|	j                  |
|d	       |	S )
N)dtype)	input_idsattention_masktoken_type_ids)xlmroberta
distilbert	camembertry   )xlnetrz   )	cls_indexp_maskis_impossiblelangs)start_positionsend_positions)rF   ra   tensorrw   longrx   ry   r   r   r<   r   rE   r   updater+   rH   onesshapeint64r/   rG   r@   rA   start_positionend_position)rk   ifeaturerw   rx   ry   r   r   r   inputsr   r   s               r   __getitem__zSquadDataset.__getitem__   s   --"LL!2!2%**E	g&<&<EJJOg&<&<EJJOLL!2!2%**E	gnnEKK@W%:%:%++N #,,
 99#PP'(99#33MM	VDEyy00>?))wIOO5;;)WZ^ZcZcZkZk)kmn99##ll7+A+ATO!LL)=)=UZZPMMMoP]^_r>   )r2   r3   r4   r5   r   r9   r   r   r@   r;   rA   r   r   r:   r	   r8   rp   rs   r   ra   Tensorr   r=   r>   r   rD   rD   m   s     %$=!!
K '+"'++05#'(,J(J 'J sm	J
 CJJ  (~J C=J !JX" S%,,%6 7  r>   rD   )(r[   r`   dataclassesr   r   enumr   typingr   r   r   r	   ra   filelockr
   torch.utils.datar   models.auto.modeling_autor   tokenization_utilsr   utilsr   r   processors.squadr   r   r   r   
get_loggerr2   re   listkeysMODEL_CONFIG_CLASSEStupler7   r   r@   rD   r=   r>   r   <module>r      s    
  (  . .   $ M 5 6 t t 
		H	%E@EEGH E0DEE ?n ?n ?nDD 
y7 yr>   