
    Uh$                       d Z ddlZddlZddlZddlmZmZmZmZm	Z	 ddl
Z
ddl
mZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,  e'       rddl-m.Z. ddl/m0Z0  e*jb                  e2      Z3dSde
jh                  de5de5de5de
jh                  f
dZ6de
jh                  de5de5de
jh                  fdZ7dSde
jh                  de5de5de5de
jh                  f
dZ8de5de
jh                  fdZ9de
jh                  de5de
jh                  fdZ:d e
jh                  de5d!e
jv                  de
jh                  fd"Z<d e
jh                  d#e5dee
jh                  e
jh                  f   fd$Z=d e
jh                  d#e5de
jh                  fd%Z>d&e
jh                  d'e
jh                  d(e5de
jh                  fd)Z? G d* d+ej                        ZA	 dd,lBmCZC eCZAe3j                  d-        e j                  eA        G d/ d0ej                        ZI G d1 d2ej                        ZJ G d3 d4ej                        ZK G d5 d6ej                        ZL G d7 d8ej                        ZM G d9 d:ej                        ZN G d; d<ej                        ZO G d= d>ej                        ZP G d? d@ej                        ZQ G dA dBej                        ZR G dC dDej                        ZSe& G dE dFe             ZT G dG dHeT      ZUdIZVe& G dJ dKeT             ZW e&dLM       G dN dOeTe             ZXe& G dP dQeT             ZYg dRZZy# eE$ r Y [eF$ r e3j                  d.       Y sw xY w)TzPyTorch LongT5 model.    N)AnyListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS find_pruneable_heads_and_indicesprune_linear_layer)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )LongT5Config)	BlockMask)make_flex_block_causal_maskx	block_lendim	pad_valuereturnc                 t   | j                   |    |z  }t        | j                         sCt        | j                         }||xx   |z  cc<   t        j                  || j
                        S dg| j                  z  }d|f||<   t        |ddd   d      }t        j                  j                  | |d|      } | S )	zHPad a tensor so that a sequence length will be a multiple of `block_len`dtyper   r   r   N constantpadmodevalue)shapealllisttorchzerosr+   ndimsumr   
functionalr1   )r$   r%   r&   r'   pad_len	new_shaper1   s          |/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/longt5/modeling_longt5.py_pad_to_multipler?   >   s    wws|mi'Gqww<M	#'!{{9AGG44(QVV
C7|CH
c$B$i
C
!:YGAH    c                 >   | j                   |   |z  dk7  rt        | ||d      } | j                   |   |z  }| j                   d| ||fz   | j                   |dz   d z   }d|v r,t        j                  || j                  | j
                        S | j                  |      S )zSplit an input tensor into blocks of a given `block_len` along the given `dim`. If the dimension length
    is not a multiple of `block_len`, it will be padded first with selected `pad_value`.
    r   )r'   Nr    r+   device)r4   r?   r7   emptyr+   rC   reshape)r$   r%   r&   
num_blocksoutput_shapes        r>   _split_into_blocksrH   N   s    
 	wws|i1$Q	3!<*J774C=J	#::QWWcAg[=QQLL{{<qwwqxxHH99\""r@   	block_dimsequence_dimc                    | j                   |   }dg| j                  z  }d||<   t        |ddd   d      }t        j                  j                  | |d|      } g }t        d      D ]M  }t        d	d      g| j                  z  }t        |||z         ||<   t        |      }|j                  | |          O t        j                  ||
      S )zConcatenate three consecutive blocks for each input block for local attentiont.

    For more information, see: https://arxiv.org/pdf/2112.07916.pdf.
    r,   )r    r    Nr-   r.   r/   r0   r
   r   r&   )r4   r9   r:   r   r;   r1   rangeslicetupleappendr7   cat)	r$   rI   rJ   r'   rF   r1   blocks_listiindicess	            r>   _concatenate_3_blocksrU   ]   s    
 #J(QVV
CC	N
c$B$i
C
!:YGA&(K1X ' D>"QVV+"1a*n5	.1W:&' 99[l33r@   c                     t        j                  d| z  t         j                        }|| |   }|j                  d      |j                  d      z
  }|S )z:Makes 3-blocked relative position ids for local attention.r
   r*   r   r    )r7   arangeint32	unsqueeze)r%   position_idscenter_position_idsrelative_position_idss       r>   "_make_3block_relative_position_idsr]   v   sR    <<IU[[AL&y)<(22158K8U8UVW8XX  r@   local_attention_maskc                     t        |      }t        j                  |      |k  }|ddddddf   }|j                  | j                        }t        j
                  | |      S )znMask local attention mask to enforce that tokens are not allowed to attend tokens farther than ``local_radius.N)r]   r7   abstorC   logical_and)r^   r%   r\   locality_masks       r>   _mask_local_attention_maskrd      s_    >yIII34y@M!$a"23M!$$%9%@%@AM1=AAr@   attention_maskrC   c                    t        | |d      }t        |dd      }|j                  d      }|j                  d      }t        j                  ||      }t        ||      }|j                  d      j                  |      S )z;Prepare attention mask to be applied for a local attention.r    rL      rI   rJ   r-   )rH   rU   rY   r7   rb   rd   ra   )re   r%   rC   _blocked_attention_mask_3blocked_attention_maskr^   s         r>   _get_local_attention_maskrl      s     1PQR45LXYhij5??C7AA"E ,,-DF^_56JIV))!,//77r@   global_block_sizec                    | j                   dd \  }dt        j                  dt        j                  ffd}t        j                  | | j                        z  }t        j
                  |d      |z
  }t        j                  | d	k7  d
d      j                  | j                        }t        j                  ||z   d
z
        j                  | j                        }t        j                  d|j                  |j                        }t        j                  ||kD  ||      }|| z  | dz
  z   } ||      }z  }|dkD  rBt        j                  |d      j                  j                  |d      j                  dd      }	n-t        j                  |d|j                  |j                        }	t        j
                  t        j                   ||      d      dz
  }
|
j#                  | j                        }
t        j                  |
|	k  dd      }
|j                  t        j$                        |
j                  t        j$                        fS )a  Obtain the "fixed block" global id corresponding to each input token.

    This implementation is a simplified version of the original Flaxformr implementation adopted from:
    https://github.com/google/flaxformer/blob/main/flaxformer/architectures/longt5/long_attention.py.

    In our scenario, as we use this strategy only for a decoder, orphan tokens, i.e. those tokens which do not make for
    the whole fixed block, are assigned to the preceding block.

    Padding tokens from the original sequence are represented by -1.
    Nrg   	block_idsr(   c                 X   t        j                        z  dz
  k(  }|j                  | j                        }t        j                  || dk\        }|j                  d      j                  d      j                  | j                        dz
  }t        j                  | |k  | |      } | S )Nr    r   r-   )
r7   rW   ra   rC   rb   r:   rY   typer+   where)ro   
block_endstrue_block_endsfull_blocksrm   seq_lens       r>   handle_orphan_tokensz:_make_global_fixed_block_ids.<locals>.handle_orphan_tokens   s    ll7+.??DUXYDYY
]]9#3#34
++J	QG%))"-77;@@QTUUKK	K 7KP	r@   rC   r    )axis              ?g     @r-   rB   r   rL   )r4   r7   Tensor	ones_likerC   cumsumrr   rq   r+   floortensormaxvaluesrepeat	transposer8   onesra   int)re   rm   
batch_sizerw   fixed_block_maskmaskglobal_block_ids_global_block_ids_lower_boundnum_globals_sequence_block_ids_maxglobal_segment_idsrv   s    `         @r>   _make_global_fixed_block_idsr      s    )..r2J   ~n>S>STWhh||$41=@PP;;~,c7;@@AUAUVD{{4*:#:S#@AFF~G[G[\$)LL;K;Q;QZjZqZq$r!{{88:JLi )>9nq>PQ+,<=..KQ"')),<""E"L"L"S"ST_ab"c"m"mnoqr"s"'++!1!7!7@P@W@W#
 ejj[&IrRUVV+..~/D/DE%7;R%RTUWXY  +-?-D-DUYY-OOOr@   c                     t        | |      \  }}|j                  d   }t        j                  ||j                        }||d   z
  }|j                  t        j                        S )zBCreate the relative position tensor for local -> global attention.r-   rx   .N)r   r4   r7   rW   rC   rq   int64)re   rm   ro   r   global_seq_lenglobal_positionsside_relative_positions          r>    _make_side_relative_position_idsr      sa    $@Qb$c!I!'--b1N||N9;K;KL-	)0DD!&&u{{33r@   hidden_statesro   r   c                 x   |j                  |dk\  t        j                  ||j                  |j                              }t
        j                  j                  |j                  t        j                        |dz         ddddddf   }t        j                  d| |j                  | j                              S )zFCompute individual block aggregates by summing over individual blocks.r   rB   r    Nr-   z...nd,...ng->...gd)rr   r7   r   r+   rC   r   r;   one_hotrq   r   einsum)r   ro   r   one_hot_block_idss       r>   _create_global_aggregatesr      s    
 Q^9??S\ScScdI --innU[[.I>\]K]^_`bcehfheh_hi<<,m=N=S=STaTgTg=hiir@   c                   &     e Zd Zd fd	Zd Z xZS )LongT5LayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zg
        Construct a layernorm module in the LongT5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parameterr7   r   weightvariance_epsilon)selfhidden_sizeeps	__class__s      r>   r   zLongT5LayerNorm.__init__   s1     	ll5::k#:; #r@   c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )Nrg   r-   T)keepdim)ra   r7   float32powmeanrsqrtr   r   r+   float16bfloat16)r   r   variances      r>   forwardzLongT5LayerNorm.forward   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r@   )gư>)__name__
__module____qualname__r   r   __classcell__r   s   @r>   r   r      s    $+r@   r   )FusedRMSNormzSDiscovered apex.normalization.FusedRMSNorm - will use it instead of LongT5LayerNormzFdiscovered apex but it failed to load, falling back to LongT5LayerNormc                   *     e Zd Zdef fdZd Z xZS )LongT5DenseActDenseconfigc                 ^   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _
        t        |j                     | _        y NFbias)r   r   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr   dense_act_fnactr   r   r   s     r>   r   zLongT5DenseActDense.__init__
  sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r@   c                    | j                  |      }| j                  |      }| j                  |      }t        | j                  j
                  t        j                        r|j                  | j                  j
                  j                  k7  r`| j                  j
                  j                  t        j                  k7  r/|j                  | j                  j
                  j                        }| j	                  |      }|S N)r   r   r   
isinstancer   r   r7   r|   r+   int8ra   )r   r   s     r>   r   zLongT5DenseActDense.forward  s    ./]3tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r@   r   r   r   r!   r   r   r   r   s   @r>   r   r   	  s    /| /r@   r   c                   *     e Zd Zdef fdZd Z xZS )LongT5DenseGatedActDenser   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y r   )r   r   r   r   r   r   wi_0wi_1r   r   r   r   r   r   r   r   s     r>   r   z!LongT5DenseGatedActDense.__init__   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r@   c                     | j                  | j                  |            }| j                  |      }||z  }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   )r   r   hidden_geluhidden_linears       r>   r   z LongT5DenseGatedActDense.forward(  sS    hhtyy78		-0#m3]3.r@   r   r   s   @r>   r   r     s    /| /r@   r   c                   *     e Zd Zdef fdZd Z xZS )LongT5LayerFFr   c                    t         |           |j                  rt        |      | _        nt        |      | _        t        |j                  |j                        | _	        t        j                  |j                        | _        y )Nr   )r   r   is_gated_actr   DenseReluDenser   r   r   layer_norm_epsilon
layer_normr   r   r   r   r   s     r>   r   zLongT5LayerFF.__init__3  s_    ":6"BD"5f"=D)&..f>W>WXzz&"5"56r@   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S r   )r   r   r   )r   r   forwarded_statess      r>   r   zLongT5LayerFF.forward=  s=    ??=9../?@%5E(FFr@   r   r   s   @r>   r   r   2  s    7| 7r@   r   c                   n     e Zd Z	 	 ddedee   f fdZd Zed	d       Z	d
dZ
	 	 	 	 	 	 	 	 	 ddZ xZS )LongT5Attentionr   	layer_idxc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  | j                  z  | _        || _        |9| j                  r-t        j!                  d| j"                  j$                   d       t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        | j                  r/t'        j2                  | j                  | j                        | _        t7               | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fr   )r   r   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancer   d_kvkey_value_proj_dim	num_headsn_headsr   r   	inner_dimr   loggerwarning_oncer   r   r   r   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr   r   r   r   r   s       r>   r   zLongT5Attention.__init__F  ss    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r@   c                    t        |      dk(  ry t        || j                  | j                  | j                        \  }}t        | j                  |      | _        t        | j                  |      | _        t        | j                  |      | _        t        | j                  |d      | _	        | j                  t        |      z
  | _        | j                  | j                  z  | _
        | j                  j                  |      | _        y Nr   r    rL   lenr   r   r   r   r   r   r   r   r   r   unionr   headsindexs      r>   prune_headszLongT5Attention.prune_headsi      u:?74<<!8!8$:K:K
u $DFFE2#DFFE2#DFFE2#DFFEq9||c%j0004<<? --33E:r@   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   rg   r    ra   r7   longr`   min
zeros_likelogfloatmath	full_likerr   relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r>   _relative_position_bucketz)LongT5Attention._relative_position_buckety  s(   , AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r@   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  || j                   | j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )%Compute binned relative position biasNrB   r  r  r  rg   r   r    r   )r   r   rC   r7   rW   r  ra   r  r   r   r   permuterY   )
r   query_length
key_lengthrC   cache_positioncontext_positionmemory_positionr  relative_position_bucketr   s
             r>   compute_biaszLongT5Attention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r@   c                    |j                   dd \  }}|du}| j                  |      }|j                  |d| j                  | j                        j                  dd      }|@|j                  j                  | j                        }|r|j                  }n|j                  }|r|n|}|r7|5r3j                  | j                     }|j                  | j                     }n| j                  |      }| j                  |      }|j                  |d| j                  | j                        j                  dd      }|j                  |d| j                  | j                        j                  dd      }|D|s|
nd}
j                  ||| j                  d|
i      \  }}|rd|j                  | j                  <   t!        j"                  ||j                  dd            }||j                   d   }||n|
d   dz   }| j$                  sZt!        j&                  d| j                  ||f|j(                  |j*                  	      }| j,                  rE| j.                  r9d|_        n1| j3                  |||j(                  |

      }|dddd| dddf   }|#|ddddddd|j                   d   f   }||z   }| j4                  rRt!        j6                  |j                   d         }d|t9        | j4                        <   |dd|j;                         f   }n|}||z  }t<        j>                  jA                  |jC                         d      jE                  |      }t<        j>                  jG                  || jF                  | j.                        }|||z  }t!        j"                  ||      }|j                  dd      jI                         }|j                  |d| jJ                        }| jM                  |      }|||f}|	r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nrg   r-   r    r   Tr
   ri   rC   r+   )rC   r   r   rL   ptraining)'r4   r   viewr   r   r   
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   updater7   matmulr   r8   rC   r+   r   r)  requires_gradr$  r   r   r6   boolr   r;   softmaxr  type_asr   
contiguousr   r   )r   r   r   key_value_statesposition_biaspast_key_valuelayer_head_maskr  	use_cacheoutput_attentionsr   r   
seq_lengthis_cross_attentionquery_statesr+  curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr  real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r>   r   zLongT5Attention.forward  s   $ "/!4!4Ra!8
J .T9vvm,#((RtG^G^_iijkmno%'2266t~~FJ!&4&J&J#&4&I&I#-?)]."<,66t~~FJ.::4>>JL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn= lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fff[)>/Gr@   FNT       )NN)	NNNNNNFFN)r   r   r   r!   r   r   r   r  staticmethodr  r$  r   r   r   s   @r>   r   r   E  si     %*#'	!,!, C=	!,F;  -  - ^. ir@   r   c                   b     e Zd Zddededdf fdZd Zedd       Zde	fd	Z
	 	 	 	 dd
Z xZS )LongT5LocalAttentionr   r   r(   Nc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  dz   | _        |j                  | _        | j                  | j                  z  | _        t!        j"                  | j                  | j                  d      | _        t!        j"                  | j                  | j                  d      | _        t!        j"                  | j                  | j                  d      | _        t!        j"                  | j                  | j                  d      | _        | j                  r/t!        j,                  | j                  | j                        | _        t1               | _        d| _        y )Nr    Fr   )r   r   r   r   r   r   r   r   r   r   r   local_radiusr%   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   s      r>   r   zLongT5LocalAttention.__init__*  sS    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r@   c                    t        |      dk(  ry t        || j                  | j                  | j                        \  }}t        | j                  |      | _        t        | j                  |      | _        t        | j                  |      | _        t        | j                  |d      | _	        | j                  t        |      z
  | _        | j                  | j                  z  | _
        | j                  j                  |      | _        y r   r   r   s      r>   r  z LongT5LocalAttention.prune_headsD  r  r@   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S r  r  r  s           r>   r  z.LongT5LocalAttention._relative_position_bucketT  (   . AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r@   block_lengthc                    | j                   j                  j                  j                  dk7  r | j                   j                  j                  nd}t	        j
                  d|z  t        j                  |      }|||  }|dddf   |dddf   z
  }| j                  || j                   | j                  | j                        }| j                  |      }|j                  g d      j                  d      j                  d      }|S r  metaNr
   rB   r  r  r   r   r   rC   rq   r7   rW   r  r  r   r   r   r  rY   r   rY  target_devicer"  r!  r  r#  r   s           r>   r$  z!LongT5LocalAttention.compute_bias      ++2299>>&H ((//66 	
  ,,q<'7uzzR_`*<F ,D!G47G47PP#'#A#A#.;;==	 $B $
  --.FG	*44Q7AA!Dr@   c                 6    |j                   d d \  } fd} fd} | j                  |            }	 | j                  |            }
 | j                  |            }t	        |	 j
                  d      }	t	        |
 j
                  d      }
t	        | j
                  d      }t        |
dd      }
t        |dd      }t        j                  d|	|
      }|ʉ j                  srt        j                  dd j                   j
                  d j
                  z  f|j                  |j                  	      } j                  r/ j                  r#d
|_        n j#                   j
                        }|/t        j$                  |dkD  dd      }||j'                  dd      z   }||z  }t(        j*                  j-                  |j/                         d      j1                  |      }t(        j*                  j3                  | j2                   j                        }|||z  }|j5                  |j                        } |t        j                  d||            }|d d d |d d f   } j7                  |      }d }|f|fz   |fz   }|r||fz   }|S )Nrg   c                 T    | j                  dj                  j                        S 
projectionr-   r*  r   r   statesr   r   s    r>   r4   z+LongT5LocalAttention.forward.<locals>.shape  "    ;;z2t||T=T=TUUr@   c                 Z    | j                         j                  dj                        S rE   r-   r7  r*  r   rf  s    r>   unshapez-LongT5LocalAttention.forward.<locals>.unshape  %    $$&++JDNNKKr@   r    rL   rh   ...qhd,...khd->...hqkr
   r&  Tr   rz       _r-   r'  ...hqk,...khd->...qhd)r4   r   r   r   rH   r%   rU   r7   r   r   r8   r   rC   r+   r   r)  r3  r$  rr   r   r   r;   r5  r  r6  r   rq   r   )r   r   r   r9  r;  r=  r>  r4   rl  r@  rC  rD  rE  rI  rJ  present_key_value_staterK  r   s   `                @r>   r   zLongT5LocalAttention.forward  sp    "/!4!4Ra!8
J	V	L
 TVVM23466-01
TVVM23 *,AN'
DNNJ
),AN +:QRS
,\QUVW #\:
  33 %4<<T^^9KLU[UbUbjpjvjv! ..4==26M/ $ 1 1$.. A{{4!8S%8 -q!0D D-}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9L#((););<ell+BLR^_`!![j[!"34ff[)"&.$;#==@PP/Gr@   FrM  NNNF)r   r   r   r!   r4  r   r  rP  r  r   r$  r   r   r   s   @r>   rR  rR  )  sX    ,| ,$ ,[_ ,4;  -  - ^ 6 Ir@   rR  c                        e Zd Zddededdf fdZd Zedd       Zde	fd	Z
d
ej                  dej                  dej                  fdZ	 	 	 	 ddZ xZS )LongT5TransientGlobalAttentionr   r   r(   Nc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  dz   | _        |j                  | _        |j                  | _        | j                  | j                  z  | _        t#        j$                  | j                  | j                   d      | _        t#        j$                  | j                  | j                   d      | _        t#        j$                  | j                  | j                   d      | _        t#        j$                  | j                   | j                  d      | _        | j                  r/t#        j.                  | j                  | j                        | _        t3               | _        | j                  r/t#        j.                  | j                  | j                        | _        t9        |j                  |j:                        | _        y )Nr    Fr   r   )r   r   r   r   r   r   r   r   r   r   r   rT  r%   rm   r   r   r   r   r   r   r   r   r   r   r   r   r   global_relative_attention_biasr   r   global_input_layer_normrU  s      r>   r   z'LongT5TransientGlobalAttention.__init__  s    +++F(.4.S.S+/5/U/U,~~"(++''"//**Q.!'!9!9**(?(?? 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E ++24,,t?b?bdhdpdp2qD/'6v~~6KdKd'e$r@   c                    t        |      dk(  ry t        || j                  | j                  | j                        \  }}t        | j                  |      | _        t        | j                  |      | _        t        | j                  |      | _        t        | j                  |d      | _	        | j                  t        |      z
  | _        | j                  | j                  z  | _
        | j                  j                  |      | _        y r   r   r   s      r>   r  z*LongT5TransientGlobalAttention.prune_heads	  r  r@   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S r  r  r  s           r>   r  z8LongT5TransientGlobalAttention._relative_position_bucket  rX  r@   rY  c                    | j                   j                  j                  j                  dk7  r | j                   j                  j                  nd}t	        j
                  d|z  t        j                  |      }|||  }|dddf   |dddf   z
  }| j                  || j                   | j                  | j                        }| j                  |      }|j                  g d      j                  d      j                  d      }|S r[  r]  r^  s           r>   r$  z+LongT5TransientGlobalAttention.compute_biasJ  r`  r@   r   r   c                 v   t        j                  |d   |d d d d d f         d d d df   }t        j                  |dkD  dd      }t        || j                        }| j                  || j                   | j                  | j                        }| j                  |      }|j                  g d      }||z   }|S )Nr   .r   rz   ro  r  )r   r
   r    rg   )r7   eqrr   r   rm   r  r   r   r   rw  r  )r   r   r   side_attention_maskattention_side_biasr   side_relative_position_bucket	side_biass           r>   compute_side_biasz0LongT5TransientGlobalAttention.compute_side_biasb  s    #hhtI8J1dTU:8VWXY[_adXde#kk*=*A3N!A$H^H^!_(,(F(F"#.;;==	 )G )
% 778UV	 %%l3	1I=""r@   c                 V	    |j                   d d \  } fd} fd}t        ||n!t        j                  |j                   d d        j                        \  }	}
|
j                   d   }t        ||	|      } j                  |      } | j                  |            } | j                  |            } | j                  |            } | j                  |            } | j                  |            }t        | j                  d      }t        | j                  d      }t        | j                  d      }t        |dd      }t        |dd      }dg|j                  dz   z  }|j                   d   |d<   |j                  d      j                  |      }|j                  d      j                  |      }t        j                   ||gd      }t        j                   ||gd      }t        j"                  d||      }|<t%        | j                  |j&                        }t        j(                  |d	kD  d
d      }nd }|j j*                  srt        j,                  dd j.                   j                  d j                  z  f|j&                  |j0                        } j2                  r/ j4                  r#d|_        n j9                   j                        }|||j;                  dd      z   }|j=                  |j0                        }|t        j                  |      } j?                  ||
      }t        | j                  d      j;                  dd      }|j=                  |j0                        jA                  |j&                        }t        j                   ||gd      }||z  }tB        jD                  jG                  |jI                         d      jK                  |      }tB        jD                  jM                  | jL                   j4                        }|||z  }|j=                  |j0                        } |t        j"                  d||            }|d d d |d d f   } jO                  |      }d }|f|fz   |fz   }|r||fz   }|S )Nrg   c                 T    | j                  dj                  j                        S rc  re  rf  s    r>   r4   z5LongT5TransientGlobalAttention.forward.<locals>.shape  rh  r@   c                 Z    | j                         j                  dj                        S rj  rk  rf  s    r>   rl  z7LongT5TransientGlobalAttention.forward.<locals>.unshape  rm  r@   r-   r    rL   rh   rn  r   rz   ro  r
   r&  Tri   r'  rp  )(r4   r   r7   r   rm   r   rx  r   r   r   rH   r%   rU   r9   rY   r   rQ   r   rl   rC   rr   r   r8   r   r+   r   r)  r3  r$  r   rq   r  ra   r   r;   r5  r  r6  r   r   )r   r   r   r9  r;  r=  r>  r4   rl  ro   r   _global_seq_lenglobal_inputsr@  rC  rD  side_key_statesside_value_statesrepsrE  r^   side_position_biasrI  rJ  rq  rK  r   s   `                         @r>   r   z&LongT5TransientGlobalAttention.forwardw  sV    "/!4!4Ra!8
J	V	L )E$D%**]5H5H"5M*N"")
%	%
 -22261-O\44]C TVVM23466-01
TVVM23} 56!$&&"78 *,AN'
DNNJ
),AN +:QRS
,\QUVW so**Q./""1%Q)33A6==dC-77:AA$G YY
O<!D
yy,0A!BJ 5|ZP#<T4>>S`SgSg#h #(;;/Ca/Ge#T #'  33 %4<<T^^9KL!== ,,!
 ..4==26M/ $ 1 1$.. A#/ -0D0N0NqRS0T T)..v||<M |zz*j9!%!7!7>P!Q!34F\^!_!i!ijkmn!o!3!8!8!F!I!I&--!X!II}6H&IrRM-}},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9L#((););<ell+BLR^_`!![j[!"34ff[)"&.$;#==@PP/Gr@   rr  rM  rs  )r   r   r   r!   r4  r   r  rP  r  r   r$  r7   r|   r  r   r   r   s   @r>   ru  ru    s    f| f$ f[_ f>;  -  - ^ 0#ell # #Y^YeYe #0 vr@   ru  c                   B     e Zd Zddee   f fdZ	 	 	 	 	 	 	 ddZ xZS )LongT5LayerSelfAttentionr   c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nr   r   r   )r   r   r   SelfAttentionr   r   r   r   r   r   r   r   r   s       r>   r   z!LongT5LayerSelfAttention.__init__  sT    ,0KW`
 *&..f>W>WXzz&"5"56r@   c	           
          | j                  |      }	| j                  |	|||||||      }
|| j                  |
d         z   }|f|
dd  z   }|S )N)r   r9  r;  r:  r<  r=  r   r   r    )r   r  r   )r   r   re   r9  r;  r:  r<  r=  r   normed_hidden_statesattention_outputrK  s               r>   r   z LongT5LayerSelfAttention.forward  sv      $}=-- '+)/) . 	
 &5Ea5H(II "%5ab%99r@   rL  )NNNNFFNr   r   r   r   r   r   r   r   r   s   @r>   r  r    s0    7XVY] 7 r@   r  c                   F     e Zd ZdZddee   f fdZ	 	 	 	 ddefdZ xZ	S )LongT5LayerLocalSelfAttentionz$Local self attention used in encoderr   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y N)r   r   )r   r   rR  LocalSelfAttentionr   r   r   r   r   r   r   r   r   s       r>   r   z&LongT5LayerLocalSelfAttention.__init__  sL    "6v[v"w)&..f>W>WXzz&"5"56r@   kwargsc                     | j                  |      }| j                  |||||      }|| j                  |d         z   }|f|dd  z   }	|	S N)r   r9  r;  r=  r   r    )r   r  r   
r   r   re   r9  r;  r=  r  r  r  rK  s
             r>   r   z%LongT5LayerLocalSelfAttention.forward  sm      $}=22 '+/ 3 
 &5Ea5H(II "%5ab%99r@   rL  rs  
r   r   r   __doc__r   r   r   r   r   r   r   s   @r>   r  r    s4    .7XVY] 7  r@   r  c                   F     e Zd ZdZddee   f fdZ	 	 	 	 ddefdZ xZ	S )'LongT5LayerTransientGlobalSelfAttentionz/Transient-Global self attention used in encoderr   c                     t         |           t        ||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y r  )r   r   ru  TransientGlobalSelfAttentionr   r   r   r   r   r   r   r   r   s       r>   r   z0LongT5LayerTransientGlobalSelfAttention.__init__7  sQ    ,J0K-
) *&..f>W>WXzz&"5"56r@   r  c                     | j                  |      }| j                  |||||      }|| j                  |d         z   }|f|dd  z   }	|	S r  )r   r  r   r  s
             r>   r   z/LongT5LayerTransientGlobalSelfAttention.forward?  sm      $}=<< '+/ = 
 &5Ea5H(II "%5ab%99r@   rL  rs  r  r   s   @r>   r  r  4  s4    97XVY] 7  r@   r  c                   D     e Zd Zddee   f fdZ	 	 	 	 	 	 	 	 ddZ xZS )LongT5LayerCrossAttentionr   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFr  r   )r   r   r   EncDecAttentionr   r   r   r   r   r   r   r   )r   r   r   r   s      r>   r   z"LongT5LayerCrossAttention.__init__W  sO    .vSXdmn)&..f>W>WXzz&"5"56r@   c                     | j                  |      }| j                  |||||||||	|

      }|| j                  |d         z   }|f|dd  z   }|S )N)	r   r8  r9  r;  r:  r<  r  r=  r   r   r    )r   r  r   )r   r   r8  re   r9  r;  r:  r<  r  r=  r   r  r  layer_outputrK  s                  r>   r   z!LongT5LayerCrossAttention.forward]  s{      $}=// -'+)%/) 0 
 %t||4DQ4G'HH/$4QR$88r@   r   )NNNNFNFNr  r   s   @r>   r  r  V  s2    7(3- 7 r@   r  c                   L     e Zd Zddee   f fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )LongT5Blockr   c                    t         |           |j                  | _        |j                  rt        }nE|j                  dk(  rt
        }n/|j                  dk(  rt        }nt        d|j                   d      t        j                         | _
        | j                  j                   ||||             | j                  r&| j                  j                  t        ||             | j                  j                  t        |             y )Nlocalztransient-globalzjFor encoder attention mechanism, either `local` or `transient-global` attention type is expected, but got .r  )r   )r   r   r   r  encoder_attention_typer  r  
ValueErrorr   
ModuleListlayerrP   r  r   )r   r   r   r   attention_layerr   s        r>   r   zLongT5Block.__init__}  s     ++6O**g5;O**.@@EO!889<  ]]_


F@[gpq	
 ??JJ7)TU

-/0r@   c                 ^    | j                   d   |||||	|
||      }|d d \  }}	|dd  }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }| j                  xr |d u}|r | j                   d   ||||||	|d   dz   |
||
      }|d d \  }}	|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }||dd  z   } | j                   d   |      }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|f}|
r||	fz   |z   }|S ||z   }|S )	Nr   )re   r9  r;  r:  r<  r=  r   rg   i  )r	  r   r    r-   )	r8  re   r9  r;  r:  r  r<  r=  r   )
r  r+   r7   r   isinfanyfinfor   clampr   )r   r   re   r9  encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr;  cross_attn_layer_head_maskr:  r<  r=  return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsrK  s                       r>   r   zLongT5Block.forward  s.     "/A)'+)/)	"
 )?r(B%~2126 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM!__R1Fd1R&3djjm!65; :-+B/!3#"3-'# -DBQ,G)M> ""emm3M8R8V8V8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O '

2}5 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM " 114EEG   11Gr@   rL  )NNNNNNNNFFTNr  r   s   @r>   r  r  |  s@    1XVY] 14 "#&*#'Ir@   r  c                   B    e Zd ZeZdZdZdgZdZdZ	e
d        Zd Zd Zy)	LongT5PreTrainedModeltransformerTr  Fc                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r7   r   r   r   )r   r  
input_maskdummy_inputss       r>   r  z"LongT5PreTrainedModel.dummy_inputs  s8     LL.	\\*-
!*"&0

 r@   c                 N   | j                   j                  }t        |t              r)|j                  j
                  j                  |dz         yt        |t        t        t        f      r|j                  j                  j
                  j                  d|dz         t        |d      rL| j                   j                  s5|j                  j                  j
                  j                  d|dz         yyyt        |t              rM|j                   j                  j
                  j                  d|| j                   j"                  dz  z         t        |j                   d      rD|j                   j$                  .|j                   j$                  j
                  j'                          |j(                  j                  j
                  j                  d|| j                   j*                  dz  z         t        |j(                  d      rF|j(                  j$                  /|j(                  j$                  j
                  j'                          yyyt        |t,              r|j.                  j                  j
                  j                  d|| j                   j"                  dz  z         t        |j.                  d      rD|j.                  j$                  .|j.                  j$                  j
                  j'                          |j0                  j                  j
                  j                  d|| j                   j"                  dz  z         t        |j0                  d      rD|j0                  j$                  .|j0                  j$                  j
                  j'                          |j(                  j                  j
                  j                  d|| j                   j*                  dz  z         t        |j(                  d      rF|j(                  j$                  /|j(                  j$                  j
                  j'                          yyyt        |t2        t4        t6        f      r| j                   j"                  }| j                   j8                  }| j                   j:                  }|j<                  j                  j
                  j                  d|||z  dz  z         |j>                  j                  j
                  j                  d||dz  z         |j@                  j                  j
                  j                  d||dz  z         |jB                  j                  j
                  j                  d|||z  dz  z         |jD                  r|jF                  j                  j
                  j                  d||dz  z         t        |t6              r8|jH                  j                  j
                  j                  d||dz  z         yyyy)zInitialize the weightsr{   rz   )r   stdlm_head      r   N)%r   initializer_factorr   r   r   datafill_LongT5ModelLongT5ForConditionalGenerationLongT5EncoderModelsharednormal_hasattrtie_word_embeddingsr  r   r   r   r   zero_r   r   r   r   r   r   rR  ru  r   r   r   r   r   r   r   r   rw  )r   modulefactorr   r   r   s         r>   _init_weightsz#LongT5PreTrainedModel._init_weights  s   //fo.MM$$Vc\2.LN` ab MM  %%--3FSL-Ivy)$++2Q2Q%%**22#2N 3R) 34 II!!))s4;;CVCV[_B_8`)avyy&)fiinn.H		##))+II!!))s4;;CSCSX\B\8])^vyy&)fiinn.H		##))+ /I) 89KK##++&T[[EXEX]aDa:b+cv{{F+0@0@0L  %%++-KK##++&T[[EXEX]aDa:b+cv{{F+0@0@0L  %%++-II!!))s4;;CSCSX\B\8])^vyy&)fiinn.H		##))+ /I)2FHf gh kk))G!%!1!1kk++GHHOO  ((cv'L^B^cgAg7h(iHHOO  ((cv$7O(PHHOO  ((cv$7O(PHHOO  ((cv'L^B^cgAg7h(i11..55::BBQW\chl[lQmBnf&DE99@@EEMM fT0A&B N  F 2 ir@   c                    | j                   j                  }| j                   j                  }|t        d      t	        |      rGt        j                  |j                  d d dz   |      }t        j                  ||dd df   gd      }n>|j                  |j                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |d	k(  |       |S )
Nzself.model.config.decoder_start_token_id has to be defined. In LongT5 it is usually set to the pad_token_id. See LongT5 docs for more information.r-   )r    .rL   r    ).r   z1self.model.config.pad_token_id has to be defined.)r   decoder_start_token_idpad_token_idr  r   r7   fullr4   rQ   	new_zerosclonemasked_fill_)r   r  r  r  shifted_input_idss        r>   _shift_rightz"LongT5PreTrainedModel._shift_right&  s    !%!C!C{{//!)8  Y' %

9??3B+?$+FH^ _ %		+<iSbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r@   N)r   r   r   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_cache_class_supports_static_cachepropertyr  r  r  r.   r@   r>   r  r    sD    L%&*#& " .b!r@   r  c                       e Zd Zd fd	Zd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 ddeej                  df   dej                  dej                  d	e
d
ef
dZedej                  dededej                  dej                  defd       Z xZS )LongT5Stackc                    t         |   |       t        j                  |j                  |j
                        | _        ||j                  | j                  _        |j                  | _        |j                  | _	        | j                  dz   | _
        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t#        |j
                  |j$                        | _        t        j(                  |j*                        | _        d| _        | j1                          y c c}w )Nr    r   r  r   F)r   r   r   r   
vocab_sizer   embed_tokensr   r   rT  r%   r  rM   
num_layersr  r4  blockr   r   final_layer_normr   r   r   r   	post_init)r   r   r  rS   r   s       r>   r   zLongT5Stack.__init__C  s    LL):):FNNK#'3':':D$ ++"//**Q.]] v001 FQ!VXYZ

 !0FD]D] ^zz&"5"56&+# 	s   9!Ec                     | j                   S r   r  r   s    r>   get_input_embeddingsz LongT5Stack.get_input_embeddings]  s       r@   c                     || _         y r   r  r   new_embeddingss     r>   set_input_embeddingsz LongT5Stack.set_input_embeddingsa  s
    *r@   c                 
   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|$|"| j
                  rdnd}t        d| d| d      |&|j                         }|j                  d|d         }n8||j                         d d }n"| j
                  rdnd}t        d| d| d	      | j                  r%| j                  r|	rt        j                  d
       d}	|$| j                  J d       | j                  |      }|\  }}d}d}| j
                  r|	s|t        |t              r't        |t               sd}t!        |t#                     }njt        |t               s-d}t        j                  d       t!        j$                  |      }n-|+t!        t#               t#                     }n| j
                  sd }||j'                         nd}|%t)        j*                  |||z   |j,                        }|1t/               s'||z   }t)        j0                  |||j,                        }| j
                  r$| j3                  |||||j4                  nd |
      }n=| j                   j6                  dk(  r"t9        || j:                  |j,                        }n|}| j
                  rO|M|j                         \  }}}||f}|!t)        j0                  ||j,                        }| j=                  |      }nd }| j?                  || j                   j@                        }| j?                  || j                   j@                        }|rdnd }|
rdnd }|
r| j
                  rdnd }d }d } | jC                  |      }!tE        | jF                        D ]  \  }"}#||"   }$||"   }%|r||!fz   }| j                  r5| j                  r)| jI                  |#jJ                  |!||||| |$|%d |	|
||      }&n |#|!||||| |$|%||	|
||      }&|	du r|&d d dz   |&dd  z   }&|&d d \  }!}'|&d   }| j
                  r|	|&|
rdnd   } |
s||&d   fz   }| j
                  s||&d   fz   } | jM                  |!      }!| jC                  |!      }!|r||!fz   }|	r'nd }(|r|j4                  }(|r|jO                         }(|stQ        d |!|(|||fD              S tS        |!|(|||      S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer-   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   rx   r  r.   )re   r9  r  r  r  r;  r  r:  r<  r=  r  r   r    r   rg      r
      c              3   $   K   | ]  }|| 
 y wr   r.   ).0r   s     r>   	<genexpr>z&LongT5Stack.forward.<locals>.<genexpr>%  s      
 = 
s   )last_hidden_statepast_key_valuesr   
attentionscross_attentions)*r   r<  r=  output_hidden_statesuse_return_dictr   r  sizer*  r   r)  r   r   r  r   r   r   r   from_legacy_cacheget_seq_lengthr7   rW   rC   r   r   _update_causal_maskr.  r  rl   r%   invert_attention_maskget_head_maskr  r   	enumerater  _gradient_checkpointing_funcr   r  to_legacy_cacherO   r   ))r   r  re   r  r  r  	head_maskcross_attn_head_maskr  r<  r=  r  r  r   err_msg_prefixinput_shaper   r>  return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthrG  encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr9  r  r   rS   layer_moduler;  r  layer_outputsnext_decoder_cache
next_caches)                                            r>   r   zLongT5Stack.forwardd  s     "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	 $$0p2pp0 --i8M!,
J $&+#??	_-H/51*_Vi:j.2+"5o|~"V1DE&*###`
 #6"G"G"X ("5lnln"U #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D4zAO"ZZ
OML`L`aN??228G8S44Y]!K [[//7:3NDNNTaThThiK(K ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y$++2H2HI	#112FH^H^_"6BD0d&7DOOrRV(,%]3(4 :	VOA|'lO)=a)@&#$58H$H!**t}} $ A A ((!!)31#.%"!" !-!#."/*?+J2O$3/I#2'&7 +#1!$ E! -bq 1G ;mAB>O O0=bq0A-M-
 *!,M#8#D0=CTaZ[0\- !/=3C2E!E??+?=QRCSBU+U(u:	Vx --m<]3   1]4D D+4'$
&(==J(88:J 
 "%"(
 
 
 9+&+%1
 	
r@   re   r"   input_tensorr   r  r=  c           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )Nflash_attention_2rz   flex_attentionr   Fsdpa)r  r  is_trainingr    r-   )sequence_lengthtarget_lengthr+   r   r   )cudaxpunpu)r   _attn_implementationr  r   r7   r|   r#   r  is_compileabler   _ignore_causal_mask_sdpar)  r+   r4   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionrC   rq   r  r	  _unmask_unattended)r   re   r-  r   r  r=  past_seen_tokensusing_compilable_cacher+   r3  r4  rG  	min_dtypes                r>   r  zLongT5Stack._update_causal_mask9  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr@   r3  r4  r+   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer+   rC   r    )diagonalrx   r-   r   )r&   r7   r  r	  r  rC   triurW   rE   expandr  r4   ra   masked_fill)re   r3  r4  r+   r   r   r  rG  r@  mask_lengthpadding_masks              r>   r<  zALongT5Stack._prepare_4d_causal_attention_mask_with_cache_position}  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r@   r   )NNNNNNNNNNNNNrr  )r   r   r   r   r  r   r   r   r7   r|   r   r4  r  rP  r   r+   r<  r   r   s   @r>   r  r  B  s    4!+
 "#!!R
v #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r@   r  a_  
The input argument `head_mask` was split into two arguments `head_mask` and `decoder_head_mask`. Currently,
`decoder_head_mask` is set to copy `head_mask`, but this feature is deprecated and will be removed in future versions.
If you do not want to use any `decoder_head_mask` now, please set `decoder_head_mask = torch.ones(num_layers,
num_heads)`.
c            &       Z    e Zd ZdgZddgZdef fdZd Zd Zd Z	d	 Z
d
 Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                      deej"                     deej                      deej$                     deej"                     deej"                     deej&                     deeeej"                           deeeej"                           deej&                     deej&                     dee   dee   dee   dee   deej                      deeej"                     ef   f"d       Z xZS )r  Fdecoder.block.0.layer.1.EncDecAttention.relative_attention_bias.weightencoder.embed_tokens.weightdecoder.embed_tokens.weightr   c                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        d|_
        d|_        t        || j                        | _        t        j                  |      }d|_	        d|_        |j                  |_        t        || j                        | _        | j#                          y )NFT)r   r   r   r   r  r   r  copydeepcopyr   r<  is_encoder_decoderr  encodernum_decoder_layersr  decoderr  r   r   encoder_configdecoder_configr   s       r>   r   zLongT5Model.__init__  s     ll6#4#4fnnEv.$)!#( ,1)">4;;?v.$(!,1)$*$=$=!">4;;? 	r@   c                     | j                   S r   r  r  s    r>   r  z LongT5Model.get_input_embeddings      {{r@   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r   r  rQ  r   rS  r  s     r>   r   z LongT5Model.set_input_embeddings  -    $)).9)).9r@   c                     | j                   j                  ra| j                  | j                  j                  | j
                         | j                  | j                  j                  | j
                         y y r   r   r  _tie_or_clone_weightsrQ  r  r  rS  r  s    r>   _tie_weightszLongT5Model._tie_weights  P    ;;**&&t||'@'@$++N&&t||'@'@$++N +r@   c                     | j                   S r   rQ  r  s    r>   get_encoderzLongT5Model.get_encoder      ||r@   c                     | j                   S r   rS  r  s    r>   get_decoderzLongT5Model.get_decoder  re  r@   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 yz
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        NitemsrQ  r  	attentionr  r   heads_to_pruner  r  s       r>   _prune_headszLongT5Model._prune_heads  E    
 +002 	CLE5LLu%//;;EB	Cr@   r  re   r  r  r  decoder_head_maskr  encoder_outputsr  r  decoder_inputs_embedsr<  r=  r  r  r   r(   c                    ||n| j                   j                  }||n| j                   j                  }|O|M| j                   j                  | j                   j                  k(  r t        j                  t        t               |}|| j                  |||
||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }| j                  ||||	|||||||||      }|s||z   S t        |j                  |j                   |j"                  |j$                  |j&                  |j                  |j"                  |j$                        S )	ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5Model

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5Model.from_pretrained("google/long-t5-local-base")

        >>> # Let's try a very long encoder input.
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you", return_tensors="pt"
        ... ).input_ids  # Batch size 1

        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1

        >>> # forward pass
        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```Nr  re   r  r  r=  r  r  r   r    rg   r
  r   r  r  re   r  r  r  r  r  r  r<  r=  r  r  r   )r
  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)r   r<  r  r  rR  warningswarn#_LongT5Model__HEAD_MASK_WARNING_MSGFutureWarningrQ  r   r   r   rS  r   r
  r  r   r  r  )r   r  re   r  r  r  rr  r  rs  r  r  rt  r<  r=  r  r  r   r   decoder_outputss                      r>   r   zLongT5Model.forward  s   b "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (* ,,'1/+"/#1'!5/!5#) ' 
  "_44!-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r@   )NNNNNNNNNNNNNNNN)r   r   r   "_keys_to_ignore_on_load_unexpected_tied_weights_keysr!   r   r  r   r`  rd  rh  rp  r   r   r7   
LongTensorFloatTensor
BoolTensorr|   r   r4  r   r   r   r   r   s   @r>   r  r    s    	R*& 89VW| &:
O
C  156:8<=A159=7;EIEI048<$(,0/3&*59#J
E,,-J
 !!2!23J
 $E$4$45	J

 !))9)9 :J
 E--.J
 $E$5$56J
 'u||4J
 "%e.?.?(@"ABJ
 "%e.?.?(@"ABJ
  -J
  (5J
 D>J
 $D>J
 'tnJ
  d^!J
" !!1!12#J
$ 
uU&&');;	<%J
 J
r@   r  z>
    LONGT5 Model with a `language modeling` head on top.
    )custom_introc            (           e Zd ZdgZg dZdef fdZd Zd Zd Z	d Z
d	 Zd
 Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d!deej"                     deej$                     deej"                     deej&                     deej$                     deej$                     deej(                     deeeej(                           deeeej(                           deej$                     deej$                     deej"                     dee   dee   dee   dee   deej"                     deeej$                     ef   f$d       Zdej(                  fdZd  Z xZS )"r  rJ  )rK  rL  zlm_head.weightr   c                 N   t         |   |       |j                  | _        t	        j
                  |j                  |j                        | _        t        j                  |      }d|_
        d|_        d|_        t        || j                        | _        t        j                  |      }d|_
        d|_        |j                  |_        t        || j                        | _        t	        j$                  |j                  |j                  d      | _        | j)                          y )NFTr   )r   r   r   	model_dimr   r   r  r  rN  rO  r   r<  rP  r  rQ  rR  r  rS  r   r  r  rT  s       r>   r   z'LongT5ForConditionalGeneration.__init__  s     ll6#4#4fnnEv.$)!#( ,1)">4;;?v.$(!,1)$*$=$=!">4;;?yy1B1BO 	r@   c                     | j                   S r   rX  r  s    r>   r  z3LongT5ForConditionalGeneration.get_input_embeddings  rY  r@   c                 ~    || _         | j                  j                  |       | j                  j                  |       y r   r[  r  s     r>   r   z3LongT5ForConditionalGeneration.set_input_embeddings  r\  r@   c                     | j                   j                  ra| j                  | j                  j                  | j
                         | j                  | j                  j                  | j
                         y y r   r^  r  s    r>   r`  z+LongT5ForConditionalGeneration._tie_weights  ra  r@   c                     || _         y r   r  r  s     r>   set_output_embeddingsz4LongT5ForConditionalGeneration.set_output_embeddings  s	    %r@   c                     | j                   S r   r  r  s    r>   get_output_embeddingsz4LongT5ForConditionalGeneration.get_output_embeddings  re  r@   c                     | j                   S r   rc  r  s    r>   rd  z*LongT5ForConditionalGeneration.get_encoder  re  r@   c                     | j                   S r   rg  r  s    r>   rh  z*LongT5ForConditionalGeneration.get_decoder  re  r@   r  re   r  r  r  rr  r  rs  r  r  rt  labelsr<  r=  r  r  r   r(   c                 l   ||n| j                   j                  }||n| j                   j                  }|O|M| j                   j                  | j                   j                  k(  r t        j                  t        t               |}|| j                  |||
||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|||| j                  |      }| j                  ||||	|||||||||      }|d   }| j                   j                  r|| j                   dz  z  }| j#                  |      }d}|^t%        d	
      }|j'                  |j(                        } ||j+                  d|j-                  d            |j+                  d            }|s|f|dd z   |z   }||f|z   S |S t/        |||j0                  |j2                  |j4                  |j6                  |j8                  |j2                  |j4                  	      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            LONGT5 uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [LONGT5
            Training](./longt5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("Stancld/longt5-tglobal-large-16384-pubmed-3k_steps")
        >>> model = LongT5ForConditionalGeneration.from_pretrained(
        ...     "Stancld/longt5-tglobal-large-16384-pubmed-3k_steps"
        ... )

        >>> # Let's try a very long input.
        >>> inputs = tokenizer(100 * "studies have shown that owning a dog is good for you ", return_tensors="pt")
        >>> input_ids = inputs.input_ids

        >>> outputs = model.generate(input_ids)
        >>> print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        abstractthe aim of this article is to provide an overview of the literature on the role of dog
        ```Nrv  r   r    rg   rw  rx  r  r  )ignore_indexr-   )	losslogitsr  ry  rz  r  r{  r  r|  )r   r<  r  r  rR  r}  r~  6_LongT5ForConditionalGeneration__HEAD_MASK_WARNING_MSGr  rQ  r   r   r   r  rS  r  r  r  r	   ra   rC   r*  r  r   r  r   r  r  r
  )r   r  re   r  r  r  rr  r  rs  r  r  rt  r  r<  r=  r  r  r   r   r  sequence_output	lm_logitsr  loss_fctoutputs                            r>   r   z&LongT5ForConditionalGeneration.forward  sl   j "+!6IDKK<Q<Q	%0%<k$++B]B]  %6%>{{%%)G)GG5}E$-! ""ll#-+#"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1'!5/!5#) ' 
  *!,;;** .1EFOLL1	'T:HYYy//0FINN2y~~b/ABFKKPROTD \OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r@   c                 $    | j                  |      S r   )r  )r   r  s     r>   %prepare_decoder_input_ids_from_labelszDLongT5ForConditionalGeneration.prepare_decoder_input_ids_from_labelsf  s      ((r@   c           	      :   |t         j                  d       |S d}|D ]z  }d}|D ]1  }||j                  d|j                  |j                              fz   }3 |d   j
                  |d   j
                  k(  sJ t        |      t        |      k(  sJ ||fz   }| |S )NzHYou might want to consider setting `use_cache=True` to speed up decodingr.   r   )r   warningindex_selectra   rC   r4   r   )r   r  beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r>   _reorder_cachez-LongT5ForConditionalGeneration._reorder_cachei  s     "NNef""!#!0 	] +-'$5  .I$11!X[[AQAXAX5YZM /+ /q177;LQ;O;U;UUUU23s;L7MMMM%;?Z>\%\"	] &%r@   )NNNNNNNNNNNNNNNNN)r   r   r   r  r  r!   r   r  r   r`  r  r  rd  rh  r   r   r7   r  r  r  r|   r   r4  r   r   r   r  r  r   r   s   @r>   r  r    s1    	R*& j| .:
O
&  156:8<=A159=7;@D@D59=A-1$(,0/3&*59%f
E,,-f
 !!2!23f
 $E$4$45	f

 !))9)9 :f
 E--.f
 $E$5$56f
 'u||4f
 "%ell(;"<=f
 "%ell(;"<=f
   1 12f
  ((9(9:f
 ))*f
 D>f
 $D>f
  'tn!f
" d^#f
$ !!1!12%f
& 
uU&&'8	9'f
 f
P)ELL )&r@   r  c                   .    e Zd ZdgZdgZdef fdZd Zd Zd Z	d Z
d	 Ze	 	 	 	 	 	 	 dd
eej                     deej                      deej                      deej                      dee   dee   dee   deeej                      ef   fd       Z xZS )r  rK  rS  r   c                    t         |   |       t        j                  |j                  |j
                        | _        t        j                  |      }d|_	        d|_
        t        || j                        | _        | j                          y )NF)r   r   r   r   r  r   r  rN  rO  r<  rP  r  rQ  r  )r   r   rU  r   s      r>   r   zLongT5EncoderModel.__init__  sh     ll6#4#4fnnEv.#( ,1)">4;;? 	r@   c                     | j                   S r   rX  r  s    r>   r  z'LongT5EncoderModel.get_input_embeddings  rY  r@   c                 H    || _         | j                  j                  |       y r   )r  rQ  r   r  s     r>   r   z'LongT5EncoderModel.set_input_embeddings  s    $)).9r@   c                     | j                   j                  r1| j                  | j                  j                  | j
                         y y r   )r   r  r_  rQ  r  r  r  s    r>   r`  zLongT5EncoderModel._tie_weights  s2    ;;**&&t||'@'@$++N +r@   c                     | j                   S r   rc  r  s    r>   rd  zLongT5EncoderModel.get_encoder  re  r@   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 yrj  rk  rn  s       r>   rp  zLongT5EncoderModel._prune_heads  rq  r@   r  re   r  r  r=  r  r  r(   c           	      j    ||n| j                   j                  }| j                  |||||||      }|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. LongT5 is a model with relative position embeddings so
            you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            To know more on how to prepare `input_ids` for pretraining take a look a [LONGT5
            Training](./longt5#training).

        Example:

        ```python
        >>> from transformers import AutoTokenizer, LongT5ForConditionalGeneration

        >>> tokenizer = AutoTokenizer.from_pretrained("google/long-t5-local-base")
        >>> model = LongT5EncoderModel.from_pretrained("google/long-t5-local-base")
        >>> input_ids = tokenizer(
        ...     100 * "Studies have been shown that owning a dog is good for you ", return_tensors="pt"
        ... ).input_ids  # Batch size 1
        >>> outputs = model(input_ids=input_ids)
        >>> last_hidden_states = outputs.last_hidden_state
        ```rv  )r   r  rQ  )	r   r  re   r  r  r=  r  r  rs  s	            r>   r   zLongT5EncoderModel.forward  sK    F &1%<k$++B]B],,)'/!5# ' 
 r@   )NNNNNNN)r   r   r   r  r  r!   r   r  r   r`  rd  rp  r   r   r7   r  r  r4  r   r   r   r   r   r   s   @r>   r  r    s   78*4&
| 
:OC  156:1559,0/3&*.E,,-. !!2!23. E--.	.
   1 12. $D>. 'tn. d^. 
uU&&'8	9. .r@   r  )r  r  r  r  )r   )[r  rN  r  r}  typingr   r   r   r   r   r7   r   torch.nnr	   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   r   r   configuration_longt5r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerr   r   r|   r   r?   rH   rU   r]   rd   rC   rl   r   r   r   Moduler   apex.normalizationr   infoImportError	Exceptionr  rP   r   r   r   r   rR  ru  r  r  r  r  r  r  r  __HEAD_MASK_WARNING_MSGr  r  r  __all__r.   r@   r>   <module>r     s       4 4   % ! C C ) >  . g g   /  !;J 
		H	%  3 3 W\WcWc  #%,, #3 #S #U\\ #4U\\ 4c 4 4Y\ 4ejeqeq 42!# !%,, !BU\\ Bc BV[VbVb B8ell 8s 8TYT`T` 8ejeqeq 8 .PLL.P58.P
5<<%&.Pb4U\\ 4VY 4^c^j^j 4	j<<	j,1LL	jJM	j
\\	j+bii +2	/"O
KKef    O ,")) ,ryy &BII &abii aH}299 }@DRYY DP!ryy !HBII >bii D#		 #La")) aH ^!O ^! ^!Bq' qj  
' 
 
D 
x&%:O x&
x&v U. U Up k{>  	 	
NN[\	s   -M M:!M:9M: