
    Uh`                        d Z ddlZddlmZ ddlmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z#m$Z$ ddl%m&Z&  e$jN                  e(      Z)da*d Z+d Z,dHdZ-dHdZ.dHdZ/d Z0 G d dejb                  jd                        Z3 G d dejb                  jd                        Z4 G d d      Z5dIdZ6d Z7	 	 	 dJdZ8 G d d e
jr                        Z: G d! d"e
jr                        Z; G d# d$e
jr                        Z< G d% d&e
jr                        Z= G d' d(e
jr                        Z> G d) d*e
jr                        Z? G d+ d,e
jr                        Z@ G d- d.e
jr                        ZA G d/ d0e
jr                        ZB G d1 d2e
jr                        ZC G d3 d4e
jr                        ZDe! G d5 d6e             ZEe! G d7 d8eE             ZFe! G d9 d:eE             ZG G d; d<e
jr                        ZH e!d=>       G d? d@eE             ZIe! G dA dBeE             ZJe! G dC dDeE             ZKe! G dE dFeE             ZLg dGZMy)KzPyTorch MRA model.    N)Path)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss)load   )ACT2FN)"BaseModelOutputWithCrossAttentionsMaskedLMOutputMultipleChoiceModelOutputQuestionAnsweringModelOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_ninja_availableis_torch_cuda_availablelogging   )	MraConfigc                      t        t              j                         j                  j                  j                  dz  dz  fd}  | g d      }t	        d|d      ay )Nkernelsmrac                 4    | D cg c]  }|z  	 c}S c c}w N )filesfile
src_folders     v/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mra/modeling_mra.pyappend_rootz&load_cuda_kernels.<locals>.append_root3   s    .34d
T!444s   )zcuda_kernel.cuzcuda_launch.cuztorch_extension.cppcuda_kernelT)verbose)r   __file__resolveparentr   mra_cuda_kernel)r(   	src_filesr&   s     @r'   load_cuda_kernelsr0   /   sQ    h'')0077>>JURJ5 WXI=)TBO    c                 N   t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d      dk7  rt        d      | j                  d      dk7  rt        d      | j                  d	
      j                  j                  dd	      }|j                         }|j                         }|j                         }t        j                  ||||      \  }}|j                  dd	      dddddddf   }||fS )z8
    Computes maximum values for softmax stability.
       z.sparse_qk_prod must be a 4-dimensional tensor.   'indices must be a 2-dimensional tensor.    z>The size of the second dimension of sparse_qk_prod must be 32.r   z=The size of the third dimension of sparse_qk_prod must be 32.dimN)
lensize
ValueErrormaxvalues	transpose
contiguousintr.   	index_max)sparse_qk_prodindicesquery_num_blockkey_num_block
index_valsmax_valsmax_vals_scatters          r'   
sparse_maxrK   ;   s    > !Q&IJJ
7<<>aBCC1#YZZ1#XYY###+22<<RDJ&&(JkkmG  "G!0!:!::wP_an!oH'11"b9!Qa-H%%%r1   c                    t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                  d   |j                  d   k7  rt        d      | j                  \  }}||z  }t	        j
                  |j                  d      t        j                  |j                        }| j                  |||      } | |dddf   ||z  j                         ddf   } | S )zN
    Converts attention mask to a sparse mask for high resolution logits.
    r4   z$mask must be a 2-dimensional tensor.r5   r   zBmask and indices must have the same size in the zero-th dimension.dtypedeviceN)	r;   r<   r=   shapetorcharangelongrO   reshape)maskrE   
block_size
batch_sizeseq_len	num_block	batch_idxs          r'   sparse_maskr[   W   s     499;1?@@
7<<>aBCCzz!}a((]^^**J:%IW\\!_EJJw~~VI<<
Iz:D	!T'"Wy%8$>$>$@!CDDKr1   c                 j   | j                         \  }}}|j                         \  }}}||z  dk7  rt        d      ||z  dk7  rt        d      | j                  |||z  ||      j                  dd      } |j                  |||z  ||      j                  dd      }t	        | j                               dk7  rt        d      t	        |j                               dk7  rt        d      t	        |j                               d	k7  rt        d
      | j                  d      dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | ||j                               S )z7
    Performs Sampled Dense Matrix Multiplication.
    r   zTquery_size (size of first dimension of dense_query) must be divisible by block_size.Pkey_size (size of first dimension of dense_key) must be divisible by block_size.r:   r7   r3   z+dense_query must be a 4-dimensional tensor.)dense_key must be a 4-dimensional tensor.r4   r5   r   r6   z.The third dimension of dense_query must be 32.z,The third dimension of dense_key must be 32.)	r<   r=   rT   r@   r;   rA   rB   r.   mm_to_sparse)	dense_query	dense_keyrE   rV   rW   
query_sizer9   _key_sizes	            r'   r_   r_   n   s    #."2"2"4J
C ~~'AxJ!#opp*!kll%%j*
2JJX[\ffgikmnK!!*h*.DjRUV``aceghI
;!#FGG
9>>!DEE
7<<>aBCCb IJJ~~aBGHH((*K$$&IkkmG  "G''YNNr1   c                 B   |j                         \  }}}||z  dk7  rt        d      | j                  d      |k7  rt        d      | j                  d      |k7  rt        d      |j                  |||z  ||      j                  dd      }t	        | j                               d	k7  rt        d
      t	        |j                               d	k7  rt        d      t	        |j                               dk7  rt        d      |j                  d      dk7  rt        d      | j                         } |j                         }|j                         }|j                         }t        j                  | |||      }|j                  dd      j                  |||z  |      }|S )zP
    Performs matrix multiplication of a sparse matrix with a dense matrix.
    r   r]   r4   zQThe size of the second dimension of sparse_query must be equal to the block_size.r   zPThe size of the third dimension of sparse_query must be equal to the block_size.r:   r7   r3   ,sparse_query must be a 4-dimensional tensor.r^   r5   r6   z8The size of the third dimension of dense_key must be 32.)	r<   r=   rT   r@   r;   rA   rB   r.   sparse_dense_mm)	sparse_queryrE   ra   rF   rV   rW   rd   r9   dense_qk_prods	            r'   rg   rg      s    !* 0J#*!kllz)lmmz)kll!!*h*.DjRUV``aceghI
<1$GHH
9>>!DEE
7<<>aBCC~~aBSTT**,LkkmG  "G$$&I#33L'9VefM!++B3;;JZdHdfijMr1   c                 `    | |z  |z  t        j                  | |d      z   j                         S )Nfloorrounding_mode)rQ   divrS   )rE   dim_1_blockdim_2_blocks      r'   transpose_indicesrq      s.    {"k1EIIg{bi4jjpprrr1   c                   >    e Zd Zed        Zed        Zedd       Zy)MraSampledDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S r"   )r_   save_for_backwardrV   )ctxr`   ra   rE   rV   rD   s         r'   forwardzMraSampledDenseMatMul.forward   s1    %k9gzRk9g>#r1   c                    | j                   \  }}}| j                  }|j                  d      |z  }|j                  d      |z  }t        |||      }t	        |j                  dd      |||      }	t	        ||||      }
|
|	d d fS Nr   r:   r7   )saved_tensorsrV   r<   rq   rg   r@   )rv   gradr`   ra   rE   rV   rF   rG   	indices_Tgrad_key
grad_querys              r'   backwardzMraSampledDenseMatMul.backward   s    *-*;*;'Y^^
%**1-;!q)Z7%gN	"4>>"b#99kS`a$T7IO
8T4//r1   c                 2    t         j                  | |||      S r"   )rs   apply)r`   ra   rE   rV   s       r'   operator_callz#MraSampledDenseMatMul.operator_call   s    $**;	7JWWr1   Nr6   __name__
__module____qualname__staticmethodrw   r   r   r#   r1   r'   rs   rs      s>      0 0 X Xr1   rs   c                   <    e Zd Zed        Zed        Zed        Zy)MraSparseDenseMatMulc                 V    t        ||||      }| j                  |||       || _        |S r"   )rg   ru   rF   )rv   rh   rE   ra   rF   rD   s         r'   rw   zMraSparseDenseMatMul.forward   s2    (w	?[lGY?-r1   c                     | j                   \  }}}| j                  }|j                  d      |j                  d      z  }t        |||      }t	        |j                  dd      |||      }t        |||      }	|	d |d fS ry   )rz   rF   r<   rq   rg   r@   r_   )
rv   r{   rh   rE   ra   rF   rG   r|   r}   r~   s
             r'   r   zMraSparseDenseMatMul.backward   s    +.+<+<(gy--!q)\->->r-BB%gN	"<#9#9"b#A9dTab!$	7;
44//r1   c                 2    t         j                  | |||      S r"   )r   r   )rh   rE   ra   rF   s       r'   r   z"MraSparseDenseMatMul.operator_call   s    #)),O\\r1   Nr   r#   r1   r'   r   r      s>      0 0 ] ]r1   r   c                       e Zd Zed        Zy)MraReduceSumc                 B   | j                         \  }}}}t        | j                               dk7  rt        d      t        |j                               dk7  rt        d      | j                         \  }}}}|j                         \  }}| j                  d      j	                  ||z  |      } t        j                  |j                  d      t
        j                  |j                        }t        j                  ||d	      j                         |d d d f   |z  z   j	                  ||z        }	t        j                  ||z  |f| j                  | j                        }
|
j                  d|	|       j	                  |||      }|j	                  |||z        }|S )
Nr3   rf   r4   r5   r8   r   rM   rk   rl   )r<   r;   r=   sumrT   rQ   rR   rS   rO   rn   zerosrN   	index_add)rh   rE   rF   rG   rW   rY   rV   rc   rZ   global_idxestempoutputs               r'   r   zMraReduceSum.operator_call   sy   /;/@/@/B,
Iz1|  "#q(KLLw||~!#FGG*//11j! '
I#''A'.66zI7MzZLLa

7>>Z	IIg}GDIIKiXY[_X_N`crNrr
'*y(
) 	 {{/):6l>P>PYeYlYl
 <>FFzSbdno
Oj,HIr1   N)r   r   r   r   r   r#   r1   r'   r   r      s     r1   r   c                 &   | j                         \  }}}||z  }d}	||j                  |||      j                  d      }
| j                  ||||      j                  d      |
dddddf   dz   z  }|j                  ||||      j                  d      |
dddddf   dz   z  }||j                  ||||      j                  d      |
dddddf   dz   z  }	n|t        j                  ||t        j
                  | j                        z  }
| j                  ||||      j                  d      }|j                  ||||      j                  d      }|$|j                  ||||      j                  d      }	t        j                  ||j                  dd            t        j                  |      z  }|j                  dd      j                  }|0|d	|
dddddf   |
dddddf   z  d
k  j                         z  z
  }||
||	fS )z/
    Compute low resolution approximation.
    Nr:   r8   r7   ư>rM   T)r9   keepdims     @g      ?)r<   rT   r   rQ   onesfloatrO   meanmatmulr@   mathsqrtr>   r?   )querykeyrV   rU   valuerW   rX   head_dimnum_block_per_row	value_hattoken_count	query_hatkey_hatlow_resolution_logitlow_resolution_logit_row_maxs                  r'   get_low_resolution_logitr     sN    %*JJL!J:-Ill:/@*MQQVXQYMM*.?XVZZ_aZb1d
#d*
	 ++j*;ZRVV[]V^1d
#d*
 j2CZQYZ^^ce^fAq$J'$.I !5::j:KSXS^S^glgsgs#ttMM*.?XV[[`b[c	++j*;ZRWW\^W_j2CZQYZ__df_gI <<	73D3DR3LMPTPYPYZbPcc#7#;#;T#;#R#Y#Y  3;q$z+B[QRTUW[Q[E\+\`c*c)j)j)l#ll 	  .JIUUr1   c                    | j                   \  }}}|dkD  rf|dz  }t        j                  ||| j                        }	t        j                  t        j
                  |	|       |      }
| |
dddddf   dz  z   } |dkD  r:| ddd|ddf   dz   | ddd|ddf<   | ddddd|f   dz   | ddddd|f<   t        j                  | j                  |d      |ddd	
      }|j                  }|dk(  rE|j                  j                  d      j                  }| |ddddf   k\  j                         }||fS |dk(  rd}||fS t        | d      )zZ
    Compute the indices of the subset of components to be used in the approximation.
    r   r4   rO   )diagonalNg     @r:   TF)r9   largestsortedfullr8   sparsez# is not a valid approx_model value.)rP   rQ   r   rO   triltriutopkrT   rE   r?   minr   r=   )r   
num_blocksapprox_modeinitial_prior_first_n_blocksinitial_prior_diagonal_n_blocksrW   total_blocks_per_rowrc   offset	temp_maskdiagonal_mask
top_k_valsrE   	thresholdhigh_resolution_masks                  r'   get_block_idxesr   6  s    +?*D*D'J$a&*0A5JJ35IRfRmRmn	

5::i6'#JU[\3mD!QJ6ORU6UU#a' $A%A$A1!DEK 	Q =!= =q@A !A'D(D'D!DEK 	Q#@$@#@@A $$Z4jbRV_dJ   Gf%%))b)188	 4	!T4-8P PWWY ((( 
	 # ((( K=(KLMMr1   c	                    t         #t        j                  |       j                         S | j	                         \  }	}
}}|	|
z  }||z  dk7  rt        d      ||z  }| j                  |||      } |j                  |||      }|j                  |||      }|-| |dddddf   z  } ||dddddf   z  }||dddddf   z  }|dk(  rt        | ||||      \  }}}}nA|dk(  r1t        j                         5  t        | |||      \  }}}}ddd       nt        d      t        j                         5  z
  }t        |||||      \  }}ddd       t        j                  | ||      t        j                  |      z  }t        ||||      \  }}||z
  }|"|dd	t!        ||      dddddddf   z
  z  z
  }t        j"                  |      }t$        j                  ||||      }t&        j                  ||||      }|dk(  ryt        j"                  z
  dz  z
        dddddf   z  }t        j(                  |      dddddddf   j+                  d	d	|d	      j                  |||      }|j-                  d
      dddddf   j+                  d	d	|      j                  ||      }|j+                  d	d	|      j                  ||      |z
  } || |z  } t        j"                  | | dk  j/                         z        }!||!dddddf   z  }||!z  }t        j"                  |  | dkD  j/                         z        }"||"dddddf   z  }||"z  }||z   |dddddf   |dddddf   z   dz   z  }#n#|dk(  r||dddddf   dz   z  }#nt        d      ||#|dddddf   z  }#|#j                  |	|
||      }#|#S # 1 sw Y   xY w# 1 sw Y   xY w)z0
    Use Mra to approximate self-attention.
    Nr   z4sequence length must be divisible by the block_size.r   r   z&approx_mode must be "full" or "sparse")rV   r   r   r:   r8   r   z-config.approx_mode must be "full" or "sparse")r.   rQ   
zeros_likerequires_grad_r<   r=   rT   r   no_grad	Exceptionr   rs   r   r   r   rK   r[   expr   r   r   repeatr   r   )$r   r   r   rU   r   r   rV   r   r   rW   num_headrX   r   
meta_batchr   r   r   r   r   rc   low_resolution_logit_normalizedrE   r   high_resolution_logitrI   rJ   high_resolution_attnhigh_resolution_attn_outhigh_resolution_normalizerlow_resolution_attnlow_resolution_attn_outlow_resolution_normalizerlog_correctionlow_resolution_corrhigh_resolution_corrcontext_layers$                                       r'   mra2_attentionr   \  s    &5577.3jjl+J'8h&Jq OPP:-MM*gx8E
++j'8
4CMM*gx8EQ4Z((DAt$$Q4Z((fUm3
D%V
Rk+G 
	 ]]_ 	QisJRN +/KQ	 	
 @AA	 
*>A]*]'(7+(+)
%%
 2??sG
 @ 		( ",,A7L]_p!qH14DD 5q;tU\C]^_abdegk^kCl?l8m m 99%:;3AAgu.?  ".!;!;g'8:K" fII*-IICRfLffg!T1*%& 	 LL,i8AtQGVAq*a(WZ(3 	   ###+Aq$J7>>q!ZPXXYcelm 	" 6<<Q:NVVWacjknvv+d2N#ii.A:M9T9T9V(VW"9<OPQSTVZPZ<["[$=@S$S!$yy.NQ<N;U;U;W)WX#;>RSTVWY]S]>^#^ %?BV%V"14KK&q!Tz25NqRSUYz5ZZ]aa
 
	 04NqRSUYz4Z]a4abGHH%Q4Z(88!))*hRMS	 	
 
s   7O
3O
OO!c                   *     e Zd ZdZ fdZddZ xZS )MraEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 p   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  dz   |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        | j#                  dt%        j&                  |j                        j)                  d      dz          t+        |dd      | _        | j#                  dt%        j.                  | j0                  j3                         t$        j4                  | j0                  j6                  	      d
       y )N)padding_idxr4   epsposition_ids)r   r:   position_embedding_typeabsolutetoken_type_idsrM   F)
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferrQ   rR   expandgetattrr   r   r   r<   rS   rO   selfconfig	__class__s     r'   r   zMraEmbeddings.__init__  s?   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NQR0RTZTfTf#g %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<= 	^U\\&:X:X-Y-`-`ah-ilm-mn'.v7PR\']$KK))..0

4K\K\KcKcd 	 	
r1   c                 T   ||j                         }n|j                         d d }|d   }|| j                  d d d |f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }|}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }	||	z   }
| j                  dk(  r| j                  |      }|
|z  }
| j                  |
      }
| j                  |
      }
|
S )Nr:   r   r   r   rM   r   )r<   r   hasattrr   r   rQ   r   rS   rO   r   r   r   r   r   r   )r   	input_idsr   r   inputs_embedsinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr   
embeddingsr   s               r'   rw   zMraEmbeddings.forward  s=    #..*K',,.s3K ^
,,Q^<L
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r1   )NNNNr   r   r   __doc__r   rw   __classcell__r   s   @r'   r   r     s    Q
( r1   r   c                   .     e Zd Zd fd	Zd ZddZ xZS )MraSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      t        d u}t               rt               r|s	 t                |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j"                  |j                  | j                        | _        t!        j*                  |j,                        | _        ||n|j0                  | _        |j2                  dz  |j4                  z  | _        t9        | j6                  t        |j2                  dz  dz              | _        |j:                  | _        |j<                  | _        |j>                  | _        y # t        $ r#}t        j                  d|        Y d }~d }~ww xY w)	Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zGCould not load the custom kernel for multi-scale deformable attention: r6   r4   ) r   r   r   num_attention_headsr   r=   r.   r   r   r0   r   loggerwarningrB   attention_head_sizeall_head_sizer   Linearr   r   r   r   attention_probs_dropout_probr   r   r   block_per_rowrY   r   r   r   r   )r   r   r   kernel_loadeder   s        r'   r   zMraSelfAttention.__init__
  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 (t3"$);)=mn!# $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'>'J#PVPnPn 	$ !88B>&BVBVVT^^S&2P2PTV2V[\1\-]^!--,2,O,O)/5/U/U,+  n!hijhklmmns   =
H, ,	I5IIc                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr:   r   r4   r   r   )r<   r  r  viewpermute)r   layernew_layer_shapes      r'   transpose_for_scoresz%MraSelfAttention.transpose_for_scores-  sO    **,s+t/G/GIaIa.bb

O,}}Q1a((r1   c           
         | j                  |      }| j                  | j                  |            }| j                  | j                  |            }| j                  |      }|j	                         \  }}}	}
d|dz  z   }|j                         j                  d|d      j                  ||z  |	      j                         }d}|
|k  r|||	||
z
  f}t        j                  |t        j                  ||j                        gd      }t        j                  |t        j                  ||j                        gd      }t        j                  |t        j                  ||j                        gd      }t        |j                         |j                         |j                         |j                         | j                  | j                   | j"                  | j$                        }|
|k  r|d d d d d d d |
f   }|j                  |||	|
      }|j'                  d	d
dd      j)                         }|j	                         d d | j*                  fz   } |j,                  | }|f}|S )N      ?r   r   r6   r   r:   r8   )r   r   r   r   r4   r   r7   )r   r  r   r   r<   squeezer   rT   rB   rQ   catr   rO   r   r   rY   r   r   r   r  rA   r  r  )r   hidden_statesattention_maskmixed_query_layer	key_layervalue_layerquery_layerrW   	num_headsrX   r   gpu_warp_sizepad_sizer   new_context_layer_shapeoutputss                   r'   rw   zMraSelfAttention.forward2  sD    JJ}5--dhh}.EF	//

=0IJ//0AB3>3C3C3E0
Iw ~77""$++Ay!<DDZR[E[]deiik 	 m#!9g}x7OOH))[%++h{OaOa2b$ciklK		9ekk(9K[K[.\"]cefI))[%++h{OaOa2b$ciklK&OO  "NN(()-)J)J,0,P,P	
 m#)!Q9H9*<=M%--j)WhW%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CD "r1   r"   )r   r   r   r   r  rw   r  r  s   @r'   r
  r
  	  s    !VF)
0r1   r
  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )MraSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )r   r   r   r  r   denser   r   r   r   r   r   s     r'   r   zMraSelfOutput.__init__g  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r1   r"  input_tensorreturnc                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r"   r1  r   r   r   r"  r2  s      r'   rw   zMraSelfOutput.forwardm  7    

=1]3}|'CDr1   r   r   r   r   rQ   Tensorrw   r  r  s   @r'   r.  r.  f  1    >U\\  RWR^R^ r1   r.  c                   .     e Zd Zd fd	Zd ZddZ xZS )MraAttentionc                     t         |           t        ||      | _        t	        |      | _        t               | _        y )N)r   )r   r   r
  r   r.  r   setpruned_heads)r   r   r   r   s      r'   r   zMraAttention.__init__u  s3    $VE\]	#F+Er1   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r8   )r;   r   r   r  r  r?  r   r   r   r   r   r1  r  union)r   headsindexs      r'   prune_headszMraAttention.prune_heads{  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r1   c                 f    | j                  ||      }| j                  |d   |      }|f|dd  z   }|S Nr   r   )r   r   )r   r"  r#  self_outputsattention_outputr,  s         r'   rw   zMraAttention.forward  s@    yy?;;|AF#%QR(88r1   r"   )r   r   r   r   rD  rw   r  r  s   @r'   r<  r<  t  s    ";$r1   r<  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r"   )r   r   r   r  r   intermediate_sizer1  
isinstance
hidden_actstrr   intermediate_act_fnr   s     r'   r   zMraIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r1   r"  r3  c                 J    | j                  |      }| j                  |      }|S r"   )r1  rP  r   r"  s     r'   rw   zMraIntermediate.forward  s&    

=100?r1   r8  r  s   @r'   rJ  rJ    s#    9U\\ ell r1   rJ  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	MraOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r0  )r   r   r   r  rL  r   r1  r   r   r   r   r   r   s     r'   r   zMraOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r1   r"  r2  r3  c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r"   r5  r6  s      r'   rw   zMraOutput.forward  r7  r1   r8  r  s   @r'   rT  rT    r:  r1   rT  c                   ,     e Zd Z fdZddZd Z xZS )MraLayerc                     t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        t        |      | _        t        |      | _
        y Nr   )r   r   chunk_size_feed_forwardseq_len_dimr<  	attentionadd_cross_attentionrJ  intermediaterT  r   r   s     r'   r   zMraLayer.__init__  sW    '-'E'E$%f-#)#=#= +F3'r1   c                     | j                  ||      }|d   }|dd  }t        | j                  | j                  | j                  |      }|f|z   }|S rF  )r]  r   feed_forward_chunkr[  r\  )r   r"  r#  self_attention_outputsrH  r,  layer_outputs          r'   rw   zMraLayer.forward  sc    !%~!N1!4(,0##T%A%A4CSCSUe
  /G+r1   c                 L    | j                  |      }| j                  ||      }|S r"   )r_  r   )r   rH  intermediate_outputrc  s       r'   ra  zMraLayer.feed_forward_chunk  s,    "//0@A{{#68HIr1   r"   )r   r   r   r   rw   ra  r  r  s   @r'   rX  rX    s    (r1   rX  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )
MraEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r   r   r   r   
ModuleListrangenum_hidden_layersrX  r  gradient_checkpointing)r   r   rc   r   s      r'   r   zMraEncoder.__init__  sN    ]]eFD\D\>]#^HV$4#^_
&+# $_s   A#c                 6   |rdnd }t        | j                        D ]Q  \  }}|r||fz   }| j                  r*| j                  r| j	                  |j
                  ||      }	n	 |||      }	|	d   }S |r||fz   }|st        d ||fD              S t        ||      S )Nr#   r   c              3   &   K   | ]	  }||  y wr"   r#   ).0vs     r'   	<genexpr>z%MraEncoder.forward.<locals>.<genexpr>  s     Xq!-Xs   )last_hidden_stater"  )	enumerater  rl  training_gradient_checkpointing_func__call__tupler   )
r   r"  r#  	head_maskoutput_hidden_statesreturn_dictall_hidden_statesilayer_modulelayer_outputss
             r'   rw   zMraEncoder.forward  s     #7BD(4 	-OA|#$58H$H!**t}} $ A A ))!"! !-]N K)!,M	-   1]4D DX]4E$FXXX1++
 	
r1   )NNFT)r   r   r   r   rw   r  r  s   @r'   rg  rg    s    , "!
r1   rg  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r0  )r   r   r   r  r   r1  rM  rN  rO  r   transform_act_fnr   r   r   s     r'   r   z#MraPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr1   r"  r3  c                 l    | j                  |      }| j                  |      }| j                  |      }|S r"   )r1  r  r   rR  s     r'   rw   z"MraPredictionHeadTransform.forward  s4    

=1--m<}5r1   r8  r  s   @r'   r  r    s$    UU\\ ell r1   r  c                   *     e Zd Z fdZd Zd Z xZS )MraLMPredictionHeadc                 H   t         |           t        |      | _        t	        j
                  |j                  |j                  d      | _        t	        j                  t        j                  |j                              | _        | j                  | j                  _        y )NF)bias)r   r   r  	transformr   r  r   r   decoder	ParameterrQ   r   r  r   s     r'   r   zMraLMPredictionHead.__init__  sm    3F; yy!3!3V5F5FUSLLV->->!?@	 !IIr1   c                 :    | j                   | j                  _         y r"   )r  r  r   s    r'   _tie_weightsz MraLMPredictionHead._tie_weights  s     IIr1   c                 J    | j                  |      }| j                  |      }|S r"   )r  r  rR  s     r'   rw   zMraLMPredictionHead.forward  s$    }5]3r1   )r   r   r   r   r  rw   r  r  s   @r'   r  r    s    &&r1   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MraOnlyMLMHeadc                 B    t         |           t        |      | _        y r"   )r   r   r  predictionsr   s     r'   r   zMraOnlyMLMHead.__init__'  s    .v6r1   sequence_outputr3  c                 (    | j                  |      }|S r"   )r  )r   r  prediction_scoress      r'   rw   zMraOnlyMLMHead.forward+  s     ,,_=  r1   r8  r  s   @r'   r  r  &  s#    7!u|| ! !r1   r  c                       e Zd ZeZdZdZd Zy)MraPreTrainedModelr    Tc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j                  d| j                  j                         |j                  2|j                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yy)zInitialize the weightsg        )r   stdNr  )rM  r   r  weightdatanormal_r   initializer_ranger  zero_r   r   r   fill_)r   modules     r'   _init_weightsz MraPreTrainedModel._init_weights7  s   fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r1   N)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r#   r1   r'   r  r  0  s     L&*#*r1   r  c                   *    e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee   dee   deeef   fd       Z xZS )MraModelc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r"   )r   r   r   r   r  rg  encoder	post_initr   s     r'   r   zMraModel.__init__J  s;     '/!&) 	r1   c                 .    | j                   j                  S r"   r  r   r  s    r'   get_input_embeddingszMraModel.get_input_embeddingsT  s    ...r1   c                 &    || j                   _        y r"   r  )r   r   s     r'   set_input_embeddingszMraModel.set_input_embeddingsW  s    */'r1   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r]  rD  )r   heads_to_pruner  rB  s       r'   _prune_headszMraModel._prune_headsZ  sE    
 +002 	CLE5LLu%//;;EB	Cr1   r   r#  r   r   rx  r   ry  rz  r3  c	                    ||n| j                   j                  }||n| j                   j                  }||t        d      |#| j	                  ||       |j                         }	n!||j                         d d }	nt        d      |	\  }
}||j                  n|j                  }|t        j                  |
|f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j                  |
|      }|}n&t        j                  |	t        j                  |      }| j                  ||	      }| j!                  || j                   j"                        }| j                  ||||      }| j%                  |||||      }|d	   }|s	|f|d
d  z   S t'        ||j(                  |j*                  |j,                        S )NzDYou cannot specify both input_ids and inputs_embeds at the same timer:   z5You have to specify either input_ids or inputs_embedsr   r   rM   )r   r   r   r   )r#  rx  ry  rz  r   r   )rr  r"  
attentionscross_attentions)r   ry  use_return_dictr=   %warn_if_padding_and_no_attention_maskr<   rO   rQ   r   r   r  r   r   r   rS   get_extended_attention_maskget_head_maskrk  r  r   r"  r  r  )r   r   r#  r   r   rx  r   ry  rz  r   rW   r  rO   r  r  extended_attention_maskembedding_outputencoder_outputsr  s                      r'   rw   zMraModel.forwardb  s    %9$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 ,,2!5# ' 
 *!,#%(;;;1-)77&11,==	
 	
r1   )NNNNNNNN)r   r   r   r   r  r  r  r   r   rQ   r9  boolr   r   r   rw   r  r  s   @r'   r  r  H  s    /0C  -11515/3,004/3&*J
ELL)J
 !.J
 !.	J

 u||,J
 ELL)J
  -J
 'tnJ
 d^J
 
u88	9J
 J
r1   r  c                   L    e Zd ZddgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee   dee   deeef   fd       Z xZS )MraForMaskedLMzcls.predictions.decoder.weightzcls.predictions.decoder.biasc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r"   )r   r   r  r    r  clsr  r   s     r'   r   zMraForMaskedLM.__init__  s4     F#!&) 	r1   c                 B    | j                   j                  j                  S r"   )r  r  r  r  s    r'   get_output_embeddingsz$MraForMaskedLM.get_output_embeddings  s    xx##+++r1   c                     || j                   j                  _        |j                  | j                   j                  _        y r"   )r  r  r  r  )r   new_embeddingss     r'   set_output_embeddingsz$MraForMaskedLM.set_output_embeddings  s,    '5$$2$7$7!r1   r   r#  r   r   rx  r   labelsry  rz  r3  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|Ft	               } ||j                  d| j                   j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        Nr#  r   r   rx  r   ry  rz  r   r:   r   losslogitsr"  r  )
r   r  r    r  r	   r  r   r   r"  r  )r   r   r#  r   r   rx  r   r  ry  rz  r,  r  r  masked_lm_lossloss_fctr   s                   r'   rw   zMraForMaskedLM.forward  s    & &1%<k$++B]B](())%'!5#  	
 "!* HH_5')H%&7&<&<RAWAW&XZ`ZeZefhZijN')GABK7F3A3M^%.YSYY$!//))	
 	
r1   	NNNNNNNNN)r   r   r   _tied_weights_keysr   r  r  r   r   rQ   r9  r  r   r   r   rw   r  r  s   @r'   r  r    s   :<Z[,8  -11515/3,004)-/3&*0
ELL)0
 !.0
 !.	0

 u||,0
 ELL)0
  -0
 &0
 'tn0
 d^0
 
un$	%0
 0
r1   r  c                   (     e Zd ZdZ fdZd Z xZS )MraClassificationHeadz-Head for sentence-level classification tasks.c                 4   t         |           t        j                  |j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _
        || _        y r"   )r   r   r   r  r   r1  r   r   r   
num_labelsout_projr   r   s     r'   r   zMraClassificationHead.__init__  sg    YYv1163E3EF
zz&"<"<=		&"4"4f6G6GHr1   c                     |d d dd d f   }| j                  |      }| j                  |      }t        | j                  j                     |      }| j                  |      }| j                  |      }|S )Nr   )r   r1  r   r   rN  r  )r   featureskwargsxs       r'   rw   zMraClassificationHead.forward  se    Q1WLLOJJqM4;;))*1-LLOMM!r1   r  r  s   @r'   r  r    s    7r1   r  z
    MRA Model transformer with a sequence classification/regression head on top (a linear layer on top of
    the pooled output) e.g. for GLUE tasks.
    )custom_introc                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   de	e
ef   fd       Z xZS )MraForSequenceClassificationc                     t         |   |       |j                  | _        t        |      | _        t        |      | _        | j                          y r"   )r   r   r  r  r    r  
classifierr  r   s     r'   r   z%MraForSequenceClassification.__init__  sA      ++F#/7 	r1   r   r#  r   r   rx  r   r  ry  rz  r3  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }| j
                  dk(  r& ||j                         |j                               }n |||      }n| j                   j                  dk(  r=t               } ||j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               } |||      }|	s|f|
dd z   }||f|z   S |S t        |||
j                   |
j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr  r   r   
regressionsingle_label_classificationmulti_label_classificationr:   r  )r   r  r    r  problem_typer  rN   rQ   rS   rB   r
   r   r	   r  r   r   r"  r  )r   r   r#  r   r   rx  r   r  ry  rz  r,  r  r  r  r  r   s                   r'   rw   z$MraForSequenceClassification.forward  s   & &1%<k$++B]B](())%'!5#  	
 "!*1{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE'!//))	
 	
r1   r  )r   r   r   r   r   r   rQ   r9  r  r   r   r   rw   r  r  s   @r'   r  r    s      -11515/3,004)-/3&*A
ELL)A
 !.A
 !.	A

 u||,A
 ELL)A
  -A
 &A
 'tnA
 d^A
 
u..	/A
 A
r1   r  c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   de	e
ef   fd       Z xZS )MraForMultipleChoicec                    t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  d      | _        | j                          y rZ  )
r   r   r  r    r   r  r   pre_classifierr  r  r   s     r'   r   zMraForMultipleChoice.__init__e  s_     F# ii(:(:F<N<NO))F$6$6: 	r1   r   r#  r   r   rx  r   r  ry  rz  r3  c
           
         |	|	n| j                   j                  }	||j                  d   n|j                  d   }
|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|!|j                  d|j	                  d            nd}|1|j                  d|j	                  d      |j	                  d            nd}| j                  ||||||||	      }|d   }|dddf   }| j                  |      } t        j                         |      }| j                  |      }|j                  d|
      }d}|t               } |||      }|	s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )a[  
        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
            1]`:

            - 0 corresponds to a *sentence A* token,
            - 1 corresponds to a *sentence B* token.

            [What are token type IDs?](../glossary#token-type-ids)
        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.max_position_embeddings - 1]`.

            [What are position IDs?](../glossary#position-ids)
        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
            model's internal embedding lookup matrix.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
            `input_ids` above)
        Nr   r:   r7   r  r   r  )r   r  rP   r  r<   r    r  r   ReLUr  r	   r   r"  r  )r   r   r#  r   r   rx  r   r  ry  rz  num_choicesr,  hidden_statepooled_outputr  reshaped_logitsr  r  r   s                      r'   rw   zMraForMultipleChoice.forwardo  s   V &1%<k$++B]B],5,Aiooa(}GZGZ[\G]>G>SINN2y~~b'9:Y]	M[Mg,,R1D1DR1HImqM[Mg,,R1D1DR1HImqGSG_|((\->->r-BCei ( r=#5#5b#9=;M;Mb;QR 	 (())%'!5#  	
 qz$QT*++M:!	-0/ ++b+6')HOV4D%''!"+5F)-)9TGf$EvE("!//))	
 	
r1   r  )r   r   r   r   r   r   rQ   r9  r  r   r   r   rw   r  r  s   @r'   r  r  c  s      -11515/3,004)-/3&*X
ELL)X
 !.X
 !.	X

 u||,X
 ELL)X
  -X
 &X
 'tnX
 d^X
 
u//	0X
 X
r1   r  c                   8    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	ee   d
ee   de	e
ef   fd       Z xZS )MraForTokenClassificationc                 ,   t         |   |       |j                  | _        t        |      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        | j                          y r"   )r   r   r  r  r    r   r   r   r   r  r   r  r  r   s     r'   r   z"MraForTokenClassification.__init__  si      ++F#zz&"<"<=))F$6$68I8IJ 	r1   r   r#  r   r   rx  r   r  ry  rz  r3  c
           
         |	|	n| j                   j                  }	| j                  ||||||||	      }
|
d   }| j                  |      }| j	                  |      }d}|t               }||j                  d      dk(  }|j                  d| j                        }t        j                  ||j                  d      t        j                  |j                        j                  |            } |||      }n2 ||j                  d| j                        |j                  d            }|	s|f|
dd z   }||f|z   S |S t        |||
j                  |
j                        S )z
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
        Nr  r   r:   r   r  )r   r  r    r   r  r	   r  r  rQ   wheretensorignore_indextype_asr   r"  r  )r   r   r#  r   r   rx  r   r  ry  rz  r,  r  r  r  r  active_lossactive_logitsactive_labelsr   s                      r'   rw   z!MraForTokenClassification.forward  sh   " &1%<k$++B]B](())%'!5#  	
 "!*,,71')H),11"5: &B @ %R%,,x?T?T2U2]2]^d2e!  }=B @&++b/RY,F)-)9TGf$EvE$!//))	
 	
r1   r  )r   r   r   r   r   r   rQ   r9  r  r   r   r   rw   r  r  s   @r'   r  r    s    	  -11515/3,004)-/3&*9
ELL)9
 !.9
 !.	9

 u||,9
 ELL)9
  -9
 &9
 'tn9
 d^9
 
u++	,9
 9
r1   r  c                   X    e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee   dee   de	e
ef   fd       Z xZS )MraForQuestionAnsweringc                     t         |   |       d|_        |j                  | _        t        |      | _        t        j                  |j                  |j                        | _        | j                          y )Nr4   )
r   r   r  r  r    r   r  r   
qa_outputsr  r   s     r'   r   z MraForQuestionAnswering.__init__  s[      ++F#))F$6$68I8IJ 	r1   r   r#  r   r   rx  r   start_positionsend_positionsry  rz  r3  c           
         |
|
n| j                   j                  }
| j                  |||||||	|
      }|d   }| j                  |      }|j	                  dd      \  }}|j                  d      }|j                  d      }d }||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|
s||f|dd  z   }||f|z   S |S t        ||||j                  |j                        S )	Nr  r   r   r:   r8   )r  r4   )r  start_logits
end_logitsr"  r  )r   r  r    r  splitr   r;   r<   clampr	   r   r"  r  )r   r   r#  r   r   rx  r   r  r  ry  rz  r,  r  r  r   r  
total_lossignored_indexr  
start_lossend_lossr   s                         r'   rw   zMraForQuestionAnswering.forward#  s    &1%<k$++B]B](())%'!5#  	
 "!*1#)<<r<#: j#++B/''+

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r1   )
NNNNNNNNNN)r   r   r   r   r   r   rQ   r9  r  r   r   r   rw   r  r  s   @r'   r  r    s   
  -11515/3,0042604/3&*<
ELL)<
 !.<
 !.	<

 u||,<
 ELL)<
  -<
 "%,,/<
  -<
 'tn<
 d^<
 
u22	3<
 <
r1   r  )r  r  r  r  r  rX  r  r  r   )NN)r6   r   r   )Nr  r   pathlibr   typingr   r   r   rQ   torch.utils.checkpointr   torch.nnr   r	   r
   torch.utils.cpp_extensionr   activationsr   modeling_outputsr   r   r   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_mrar   
get_loggerr   r  r.   r0   rK   r[   r_   rg   rq   autogradFunctionrs   r   r   r   r   r   Moduler   r
  r.  r<  rJ  rT  rX  rg  r  r  r  r  r  r  r  r  r  r  r  __all__r#   r1   r'   <module>r     s      ) )    A A * !  . l l Y Y ( 
		H	%	C&8.%OP%PsXENN33 X0]5>>22 ]. :%VP#)Z !"$%pf7BII 7tYryy YzBII 299 Bbii  		 ryy :(
 (
X $")) 0!RYY ! * * *, d
! d
 d
N D
' D
 D
PBII * L
#5 L
L
^ d
- d
 d
N F
 2 F
 F
R J
0 J
 J
Z	r1   