
    Uh             
          d Z ddlZddlmZmZmZmZ ddlZddlZddlm	Z	 ddl
mZmZmZmZ ddl
mZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZ ddlmZm Z m!Z!m"Z"m#Z# ddl$m%Z%m&Z& ddl'm(Z( ddl)m*Z*m+Z+ ddl,m-Z- erddl.m/Z/  e       rddlm0Z0  e+jb                  e2      Z3 G d de	jh                        Z5d Z6d>dZ7 G d de	jp                        Z9dejt                  de;dejx                  dejt                  fdZ=d ejt                  d!ejt                  d"e>d#e?dejt                  f
d$Z@ G d% d&e	jp                        ZA G d' d(eA      ZB G d) d*e	jp                        ZCeAeAeBd+ZD G d, d-e	jp                        ZEe* G d. d/e(             ZFe* G d0 d1eF             ZG e*d23       G d4 d5eFe             ZH e*d63       G d7 d8eF             ZIe* G d9 d:eF             ZJe* G d; d<eF             ZKg d=ZLy)?zPyTorch Falcon model.    N)TYPE_CHECKINGOptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLoss	LayerNormMSELoss)
functional   )get_activation)CacheDynamicCacheStaticCache)GenerationMixin)AttentionMaskConverter)!flash_attn_supports_top_left_maskis_flash_attn_available))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentionsQuestionAnsweringModelOutput SequenceClassifierOutputWithPastTokenClassifierOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)PreTrainedModel)auto_docstringlogging   )FalconConfig)PretrainedConfig)_flash_attention_forwardc                   D    e Zd Zdej                  dej                  fdZy)FalconLinearinputreturnc                 n    || j                   j                  z  }| j                  |S || j                  z   S N)weightTbias)selfr&   hidden_statess      |/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/falcon/modeling_falcon.pyforwardzFalconLinear.forward=   s3    -99  tyy((    N)__name__
__module____qualname__torchTensorr0    r1   r/   r%   r%   <   s    )U\\ )ell )r1   r%   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..N   dim)shaper5   cat)xx1x2s      r/   rotate_halfrB   E   sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r1   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezerB   )qkcossinposition_idsunsqueeze_dimq_embedk_embeds           r/   apply_rotary_pos_embrM   M   sY    ( --
&C
--
&C3w;q>C/0G3w;q>C/0GGr1   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )FalconRotaryEmbeddingconfigc                    t         |           t        |d      rG|j                  ;|j                  j	                  d|j                  j	                  d            | _        nd| _        |j                  | _        |j                  | _        || _	        t        | j
                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                  | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)super__init__hasattrrR   getrS   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenrP   r   rope_init_fnattention_scalingregister_bufferrV   original_inv_freq)r-   rP   devicerV   	__class__s       r/   rY   zFalconRotaryEmbedding.__init__j   s    6>*v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r1   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   r9   r    mpscpuF)device_typeenabledr:   r;   dtype)rV   floatexpandr=   torc   
isinstancerT   strr5   autocast	transposer>   rG   r`   rH   rk   )
r-   r?   rI   inv_freq_expandedposition_ids_expandedrh   freqsembrG   rH   s
             r/   r0   zFalconRotaryEmbedding.forward{   sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r)   )
r2   r3   r4   r!   rY   r5   no_gradr   r0   __classcell__rd   s   @r/   rO   rO   i   s3    /| /" U]]_<  <r1   rO   attention_mask	num_headsrk   r'   c                    | j                   \  }}dt        j                  t        j                  |            z  }t	        j
                  ddt        j                  |      dz
   z   z  | j                  t        j                        }t	        j                  dd|z   | j                  t        j                        }t	        j                  ||      }||k7  rt	        j
                  ddt        j                  d|z        dz
   z   z  | j                  t        j                        }	t        |||z
        }
t	        j                  ddd|
z  z   d| j                  t        j                        }t	        j                  |t	        j                  |	|      gd      }| j                  d      dz
  | z  d d d d d f   }|d   j                         |z  }|j                  ||z  d|      j!                  |      S )	Nr:   r   rc   rk   r    r   r;   r9   ).N)r=   mathfloorlog2r5   tensorrc   float32arangeint32powminr>   cumsumbfloat16reshapern   )rz   r{   rk   
batch_size
seq_lengthclosest_power_of_2basepowersslopes
extra_basenum_remaining_headsextra_powersarange_tensoralibis                 r/   build_alibi_tensorr      s   +11J
djj9)=>><<	tyy!34q899:;NDYDYafananD \\!Q!33N<Q<QY^YdYdeFYYtV$FY&\\A499Q);%;<q@AABCNLaLainiviv

 ""4iBT6TU||Aq1/B+B'BAnNcNckpkvkvwFEIIj,$GHaP %+++3a7>I1dTU:VM9&&(=8E==i/J?BB5IIr1   r?   residualprobtrainingc                 @    t        j                  | ||      }||z   }|S )a
  
    Dropout add function

    Args:
        x (`torch.tensor`):
            input tensor
        residual (`torch.tensor`):
            residual tensor
        prob (`float`):
            dropout probability
        training (`bool`):
            training mode
    )pr   )Fdropout)r?   r   r   r   outs        r/   dropout_addr      s$     ))A
1C
S.CJr1   c                       e Zd Zddef fdZdej                  deej                  ej                  ej                  f   fdZdej                  dej                  fdZ		 	 	 	 	 	 	 ddej                  d	e
ej                     d
ej                  de
ej                     de
e   de
ej                     dedede
ej                     de
eej                  ej                  f      fdZ xZS )FalconAttentionrP   c                    t         |           || _        |j                  | _        |j                  | _        | j                  | j
                  z  | _        | j                  | _        |j                  | _        |j                  | _	        |j                  | _
        d| _        |j                  dk(  | _        || _        |-t        j!                  d| j"                  j$                   d       | j                  | j
                  z  | j                  k7  r&t'        d| j                   d| j
                   d      dt)        j*                  | j                        z  | _        | j,                  | _        |j0                  r*|j2                  d	z  |j                  z   | j                  z  }n8|j4                  r| j                  d	| j                  z  z   }nd
| j                  z  }t7        | j                  ||j8                        | _        |j0                  | _        |j4                  | _        t7        | j                  | j                  |j8                        | _        t?        j@                  |jB                        | _!        | j0                  s| j4                  s|j2                  nd| _        |jD                  rtG        | j                        | _$        y y )NTsdpazInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.zA`hidden_size` must be divisible by num_heads (got `hidden_size`: z and `num_heads`: z).      ?r:   r   r,   r    rP   )%rX   rY   rP   hidden_sizenum_attention_headsr{   head_dim
split_sizehidden_dropoutr\   
rope_theta	is_causal_attn_implementation	_use_sdpa	layer_idxloggerwarning_oncerd   r2   
ValueErrorr~   sqrtinv_norm_factorbetanew_decoder_architecturenum_kv_headsmulti_queryr%   r,   query_key_valuedenser   Dropoutattention_dropoutrotaryrO   
rotary_emb)r-   rP   r   qkv_out_dimrd   s       r/   rY   zFalconAttention.__init__   sX   !--33((DNN:**$33'-'E'E$ ++44>" !8!8 9 :, , ==4>>)T-=-==STXTdTdSe fNN#2'   #TYYt}}%==((	**!..2V5O5OOSWS`S``K**Q->>Kd...K+D,<,<kPVP[P[\(.(G(G%!--!$"2"2D4D4D6;;W
!#F,D,D!E484Q4QY]YiYiF//pq ==34;;GDO r1   	fused_qkvr'   c                 l   | j                   r|j                  \  }}}|j                  ||d| j                  | j                  z  dz   | j
                        }|ddddddddf   }|dddddddgf   }|dddddddgf   }t        j                  ||j                        }t        j                  ||j                        }|||fD 	cg c]  }	|	j                  dd       c}	\  }}}|||fS | j                  sV|j                  \  }
}}|j                  |
|| j                  d| j
                        }|ddddf   |ddddf   |ddddf   fS |j                  \  }
}}|j                  |
|| j                  dz   | j
                        }|dddddf   |ddgddf   |ddgddf   fS c c}	w )	a  
        Split the last dimension into (num_heads, head_dim), results share same memory storage as `fused_qkv`

        Args:
            fused_qkv (`torch.tensor`): [batch_size, seq_length, num_heads * 3 * head_dim]

        Returns:
            query: [batch_size, seq_length, num_heads, head_dim] key: [batch_size, seq_length, num_heads, head_dim]
            value: [batch_size, seq_length, num_heads, head_dim]
        r9   r:   Nr   .r   r    )
r   r=   viewr{   r   r   r5   broadcast_toflattenr   )r-   r   batchseq_len_qkvquerykeyvaluer?   r   r   three_times_hidden_sizes                r/   _split_headszFalconAttention._split_heads   s    (( )E7A..T^^tGXGX5X[\5\^b^k^klC1a"%EaAtm$C1a"&E$$S%++6C&&uekk:E;@#u:M NQ1a NE3#u$$!!>Goo;J
$;!z:t~~qRVR_R_`IS!QY'319)=yaQR?SSS>Goo;J
$;!z:t~~PQ?QSWS`S`aIS#2#q[)9S2$\+BIcTVSWYZlD[[[ !Os   F1r?   c                    |j                   \  }}}|| j                  z  }|j                  || j                  || j                        }|j	                  dddd      }|j                  ||| j                  | j                  z        S )z
        Merge heads together over the last dimension

        Args:
            x (`torch.tensor`): [batch_size * num_heads, seq_length, head_dim]

        Returns:
            torch.tensor: [batch_size, seq_length, num_heads * head_dim]
        r   r:   r    r   )r=   r{   r   r   permuter   )r-   r?   batch_size_and_num_headsr   r   r   s         r/   _merge_headszFalconAttention._merge_heads  sy     34''/ *a-?
 FF:t~~z4==I IIaAq! yyZ$--1OPPr1   r.   r   rz   rI   
layer_past	head_mask	use_cacheoutput_attentionscache_positionposition_embeddingsc                 @
   | j                  |      }| j                  r| j                  n| j                  }| j	                  |      \  }}}|j
                  \  }}}}|j                  dd      j                  || j                  || j                        }|j                  dd      j                  |||| j                        }|j                  dd      j                  |||| j                        }||
\  }}t        ||||      \  }}|;d|	i}||j                  d       |j                  ||| j                  |      \  }}|j
                  d   }| j                  rK|j                  j                  dk(  r2|0|j                         }|j                         }|j                         }||d d d d d d d |j
                  d   f   }|:| j                  rK|sI| j                   r	||dkD  rdnd}t"        j$                  j&                  j)                  ||||d	|
      }d }na||j                  dd      z  }|t+        j,                  | j                        z  }t/        j0                  ||z   d|j2                        }||z  }|j5                  || j                  || j                        }|j7                  dddd      }|j                  ||| j                  | j                  z        }| j9                  |      }|r|||fS ||fS | j                  r|s|| j                   r	||dkD  rdnd}t"        j$                  j&                  j)                  ||||| j:                  r| j<                  j>                  nd	|
      }|j                  dd      }|j                  ||| j                  | j                  z        }| j9                  |      }nF||j                  dd      z  }|j5                  || j                  ||      }|j2                  }|t"        j@                  k(  s|t"        jB                  k(  r|jE                  t"        jF                        }||j5                  || j                  dd      z   }|| jH                  z  }t/        j0                  ||z   d|j2                        }| j=                  |      }|||z  }|j5                  || j                  ||      }||z  jK                  dd      }| jM                  |      }| j9                  |      }|r||fS ||fS )Nr    r:   r   rH   rG   r   cudaTF        )	attn_mask	dropout_pr   r9   )r<   rk   r   r   )'r   r   r{   r   r   r=   rr   r   r   rM   updater   r   rc   rT   
contiguousr   r5   r   r   scaled_dot_product_attentionr~   r   r   softmaxrk   r   r   r   r   r   r   float16r   rn   r   r   r   r   )r-   r.   r   rz   rI   r   r   r   r   r   r   r   r   query_layer	key_layervalue_layerr   query_lengthr   rG   rH   cache_kwargs	kv_lengthr   attn_outputattention_scoresmatmul_resultinput_dtypeattention_logitsattention_probsattention_probs_reshapeds                                  r/   r0   zFalconAttention.forward$  s!    ((7	)-)F)Ft~~DL]L]040A0A)0L-i)4):):&
L!Q!++Aq199*dnnVbdhdqdqr''1-55j,P\^b^k^kl	!++Aq199*lT`bfbobop=*HC%9+yRUWZ%["K!,n=L}##C$<=%/%6%6y+t~~_k%l"I{OOB'	>>k0055?ND^ &002K!,,.I%002K%+Aq!5Jyr7J5J,JKN=~~&7
 %)NN~7MR^abRbDhm	#hh11NN,!' O  $( #.1D1DR1L#L  DIIdmm$<< #$99-=-NTV^k^q^q#r .<%**:t~~|UYUbUbcK%--aAq9K%--j,Y]YfYfHfgK**[1K "J0@@@"J.. ~~&7I<M %)NN~7MR^abRbDhm	#hh11NN,:>--d4466S' O  *33Aq9)11*lDNN]a]j]jLjk"jj5 +i.A.A"b.I I $1#5#5j$..R^`i#j  /44%--/;%..3P'7':':5=='I$#3ejjT^^]^`b6c#c  D$8$88 "#)),<~,MSU]j]p]p"q"&"8"8"I(&5	&AO ,;+?+?
DNN\hjs+t(  8+ENNqRST #//<"jj5 "J??"J..r1   r)   NNNFFNN)r2   r3   r4   r!   rY   r5   r6   r   r   r   r   
LongTensorr   boolr0   rx   ry   s   @r/   r   r      s:   -H| -H^\ell \uU\\5<<Y^YeYe=e7f \@Qell Qu|| Q< 48&*,0"'59KOD/||D/ %D/ 	D/
 u//0D/ UOD/ ELL)D/ D/  D/ !!1!12D/ &eELL%,,,F&GHD/r1   r   c                   0    e Zd ZdZ fdZ	 	 	 	 	 	 	 ddej                  deej                     dej                  deej                     dee	   deej                     d	e
d
e
deej                     deeej                  ej                  f      fdZ xZS )FalconFlashAttention2aH  
    Falcon flash attention module. This module inherits from `FalconAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 B    t        |   |i | t               | _        y r)   )rX   rY   r   _flash_attn_uses_top_left_mask)r-   argskwargsrd   s      r/   rY   zFalconFlashAttention2.__init__  s#    $)&)
 /P.Q+r1   r.   r   rz   rI   r   r   r   r   r   r   c                    | j                  |      }| j                  r| j                  n| j                  }| j	                  |      \  }}}|j
                  \  }}}}|j                  dd      j                  || j                  || j                        }|j                  dd      j                  |||| j                        }|j                  dd      j                  |||| j                        }||
\  }}t        ||||      \  }}|;d|	i}||j                  d       |j                  ||| j                  |      \  }}|j                  dd      }|j                  dd      }|j                  dd      }|t        d      | j                  r| j                  j                  nd}|j                   }|t"        j$                  k(  rt#        j&                         rt#        j(                         }nMt+        | j                  d      r| j                  j,                  }n | j                   j.                  j                   }t0        j3                  d| d	       |j5                  |      }|j5                  |      }|j5                  |      }t7        |||||||| j8                  | j:                  
	      }|j                  ||| j                  | j                  z        }| j=                  |      }|sd }|||fS )Nr    r:   r   r   z6`alibi` is not supported when `use_flash_attn` is Truer   _pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .)rI   r   r   use_top_left_mask)r   r   r{   r   r   r=   rr   r   r   rM   r   r   r   r   rP   r   rk   r5   r   is_autocast_enabledget_autocast_gpu_dtyperZ   r   r*   r   r   rn   r#   r   r   r   )r-   r.   r   rz   rI   r   r   r   r   r   r   r   r   r   r   r   r   r   r   rG   rH   r   attn_dropoutr   target_dtyper   attn_weightss                              r/   r0   zFalconFlashAttention2.forward  s    ((7	)-)F)Ft~~DL]L]040A0A)0L-i)4):):&
L!Q!++Aq199*dnnVbdhdqdqr''1-55j,P\^b^k^kl	!++Aq199*lT`bfbobop=*HC%9+yRUWZ%["K!,n=L}##C$<=%/%6%6y+t~~_k%l"I{ "++Aq1''1-	!++Aq1UVV8<t{{443
 "''%--'((*$;;=&?@#{{BB#33::@@ >$ &..6K!\2I%..6K.% nn"AA

 #**:|T^^VZVcVcEcdjj. LJ44r1   r   )r2   r3   r4   __doc__rY   r5   r6   r   r   r   r   r   r0   rx   ry   s   @r/   r   r     s    R 48&*,0"'59KOV5||V5 %V5 	V5
 u//0V5 UOV5 ELL)V5 V5  V5 !!1!12V5 &eELL%,,,F&GHV5r1   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )	FalconMLPrP   c                 ,   t         |           |j                  }t        ||j                  |j
                        | _        t        |j                        | _	        t        |j                  ||j
                        | _
        |j                  | _        y )Nr   )rX   rY   r   r%   ffn_hidden_sizer,   dense_h_to_4hr   
activationactdense_4h_to_hr   )r-   rP   r   rd   s      r/   rY   zFalconMLP.__init__  su    (()+v7M7MTZT_T_`!&"3"34)&*@*@+TZT_T_`$33r1   r?   r'   c                 h    | j                  | j                  |            }| j                  |      }|S r)   )r
  r  r  )r-   r?   s     r/   r0   zFalconMLP.forward  s0    HHT''*+q!r1   )	r2   r3   r4   r!   rY   r5   r6   r0   rx   ry   s   @r/   r  r    s*    4| 4 %,, r1   r  )eagerr   flash_attention_2c                   p    e Zd Zddef fdZ	 	 	 	 	 	 	 ddej                  deej                     dej                  deej                     dee	e
eej                  ej                  f   f      deej                     d	ed
edeej                     deeej                  ej                  f      fdZ xZS )FalconDecoderLayerrP   c                 x   t         |           |j                  }|j                  | _        t        |j                     ||      | _        t        |      | _	        |j                  | _
        || _        |j                  |j                  rd|_        |j                  s9t        ||j                         | _        t        ||j                         | _        y |j                  dk(  r9t        ||j                         | _        t        ||j                         | _        y t        ||j                         | _        y )Nr:   eps)rX   rY   r   r   r{   FALCON_ATTENTION_CLASSESr   self_attentionr  mlpr   rP   num_ln_in_parallel_attnr   parallel_attnr
   layer_norm_epsilonpost_attention_layernorminput_layernormln_attnln_mlp)r-   rP   r   r   rd   s       r/   rY   zFalconDecoderLayer.__init__+  s    ((336v7R7RSTZ\efV$$33))1f6U6U-.F*##,5kvG`G`,aD)#,[f>W>W#XD --2(&:S:ST'9R9RS'0&B[B['\$r1   r.   r   rz   rI   r   r   r   r   r   r   c                    |}| j                   j                  r<| j                   j                  dk(  r#| j                  |      }| j	                  |      }n| j                  |      }| j                  |||||||||	|

      }|d   }| j                   j                  sW| j                   j                  r|}n>t        ||| j                   j                  | j                        }| j                  |      }| j                   j                  r1| j                   j                  r| j                   j                  dk(  r|}|dd  }| j                        }| j                   j                  s| j                   j                  r||z  }t        ||| j                   j                  | j                        }|r|f|z   }|S |f|dd  z   }|S )Nr:   )	r   rz   rI   r   r   r   r   r   r   r   )r   r    )rP   r   r  r  r  r  r  r  r   r   r   r  r  r   )r-   r.   r   rz   rI   r   r   r   r   r   r   r   r   attention_layernorm_outmlp_layernorm_outattn_outputsattention_outputoutputs
mlp_outputoutputs                       r/   r0   zFalconDecoderLayer.forwardD  s    !;;//DKK4W4W[\4\&*ll=&A# $M :&*&:&:=&I# **#!)%/) 3 + 
 (?{{33{{(($;!&$h0M0MX\XeXe %)$A$A($K! KK00))33q8 7qr" XX/0
;;//4;;3L3L**JZ4;;3M3MX\XeXefi')G  i'!"+-Gr1   r)   r   )r2   r3   r4   r!   rY   r5   r6   r   r   r   r   r   r   r0   rx   ry   s   @r/   r  r  *  s   ]| ]< 48PT,0"'59KOE||E %E 	E
 u//0E U5%ell0J*K#KLME ELL)E E  E !!1!12E &eELL%,,,F&GHEr1   r  c                        e Zd ZeZdZdZdgZdZdZ	dZ
dZdZ fdZdej                  fdZeddedd	fd
       Z xZS )FalconPreTrainedModeltransformerTr  c                 $    t        |   |i | y r)   )rX   rY   )r-   inputsr   rd   s      r/   rY   zFalconPreTrainedModel.__init__  s    &+F+r1   modulec                    t        |t        j                        st        |t              rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rz|j                  j
                  j                  d| j                  j                         |j                  2|j                  j
                  |j                     j                          yyt        |t              rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weights.r   )meanstdNr   )ro   r   Linearr%   r*   datanormal_rP   initializer_ranger,   zero_	Embeddingpadding_idxr
   fill_)r-   r+  s     r/   _init_weightsz#FalconPreTrainedModel._init_weights  s   fbii(Jv|,L MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .	*KK""$MM$$S) +r1   hard_check_onlyr'   r"   c                 :    t        | dd      }|r|S |sd|_        |S )Nuse_bettertransformerFr   )getattrr   )clsrP   r8  _is_bettertransformers       r/   _check_and_enable_sdpaz,FalconPreTrainedModel._check_and_enable_sdpa  s*     '-De L M*0F'r1   )F)r2   r3   r4   r!   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_quantized_cache_supports_static_cacherY   r   Moduler7  classmethodr   r>  rx   ry   s   @r/   r'  r'    sr    L%&*#-.!N  $!,*BII *" T N`  r1   r'  c                       e Zd Zdef fdZd Zdej                  fdZe		 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
eeeeej                  ej                  f   df   f      d	e
ej                     d
e
ej                     de
ej                     de
ej                     de
e   de
e   de
e   de
e   de
ej                     deeej                  df   ef   fd       Zd	ej                  dej                  dej                  dededej                  dej                  fdZed	ej                  dededej*                  dej                  defd       Z xZS )FalconModelrP   c           	      j   t         |   |       |j                  | _        |j                  | _        |j                  | _        t        j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        |j"                  dk(  | _        |j"                  dk(  | _        t)        | j                  |j*                        | _        t/        |      | _        d| _        | j5                          y c c}w )N)r   r  r   r  r   F)rX   rY   r   	embed_dimr   r{   r   	use_alibir   r4  
vocab_sizeword_embeddings
ModuleListrangenum_hidden_layersr  hr   _use_flash_attention_2r   r
   r  ln_frO   r   gradient_checkpointing	post_init)r-   rP   ird   s      r/   rY   zFalconModel.__init__  s     ++33  "||F,=,=t~~N QVW]WoWoQpqA 26Q Gqr&,&A&AEX&X#44> dnn&2K2KL	/v>&+# 	  rs   D0c                     | j                   S r)   rP  r-   s    r/   get_input_embeddingsz FalconModel.get_input_embeddings  s    ###r1   new_embeddingsc                     || _         y r)   r[  r-   r^  s     r/   set_input_embeddingsz FalconModel.set_input_embeddings  s
    -r1   	input_idspast_key_values.rz   rI   r   inputs_embedsr   r   output_hidden_statesreturn_dictr   r'   c                 J   ||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }|
|
n| j                   j                  }
|du |duz  rt        d      | j                  r%| j                  r|rt        j                  d       d}|| j                  |      }d}|rIt        |t              s9d}|t               }n*t        j                  |      }t        j                  d       d}||j                         nd}|j                   \  }}}| j"                  r[|5t%        j&                  |||z   f|j(                  t$        j*                        n|}t-        || j.                  |j0                  	      }|%t%        j2                  |||z   |j(                  
      }||j5                  d      }| j7                  |||||||      }| j9                  || j                   j:                        }|}| j=                  ||      }d}|rdnd}|	rdnd}t?        | j@                        D ]  \  }}|	r||fz   }| j                  r5| j                  r)| jC                  |jD                  ||||||   |||||      }n |||||||   |||||
      }|d   }|du r|d   }|su|||rdnd   fz   } | jG                  |      }|	r||fz   }|r|nd}|r|jI                         }|
stK        d ||||fD              S tM        ||||      S )  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        Nz:You must specify exactly one of input_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r   r}   rj   rc   r7   )	r   rz   rI   r   r   r   r   r   r   r    r:   c              3   &   K   | ]	  }||  y wr)   r7   ).0vs     r/   	<genexpr>z&FalconModel.forward.<locals>.<genexpr>p  s      bcbos   )last_hidden_staterc  r.   
attentions)'rP   r   re  r   use_return_dictr   rW  r   r   r   rP  ro   r   r   from_legacy_cacheget_seq_lengthr=   rN  r5   onesrc   longr   r{   rk   r   rD   _update_causal_maskget_head_maskrS  r   	enumeraterT  _gradient_checkpointing_func__call__rV  to_legacy_cachetupler   )r-   rb  rc  rz   rI   r   rd  r   r   re  rf  r   return_legacy_cacher   past_key_values_lengthr   r   r   maskcausal_maskr.   r   next_decoder_cacheall_self_attentionsall_hidden_statesrY  blockr#  
next_caches                                r/   r0   zFalconModel.forward  s   8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]-t";<YZZ&&4==##p "	  00;M $Z?"&&".."."@"@"Q##^ ETE`!?!?!Afg$1$7$7!
J>>
 ") 

.D!DEmNbNbjojtjt $  'tT^^=CVCVWE!"\\&(>(KTaThThN )33A6L..M>?L]_hjo
 &&y$++2O2OP	% #oom\J!$5b4"6BD!$&&) %	^HAu#$58H$H!**t}};;NN! aL#%"'  !.#.!-'l'&7#1(; $AJMD %,QZ" &9W)QYZ=[<]&]#K%	^P 		-0 1]4D D+4'$
#335J ):7HJ]^   9+&+*	
 	
r1   input_tensorr   c           
         | j                   j                  dk(  r	|d|v r|S y ||j                         nd}t        |t              }	| j                   j                  dk(  r,|	s*|s(|&|$t        j                  |||| j                        ry |j                  |j                  }}
t        j                  |
      j                  }|j                  \  }}}|	r|j                         }n.t        |t        j                        r|j                  d   n||z   }| j!                  ||||
|||j                  d         }|t|r |j"                  |dg|j                  dd   }t        j$                  |t'        j(                  | j                   j*                  | j,                  z        z  |dk  |      }| j                   j                  dk(  r2|0|j                  j.                  d	v r|st        j0                  ||      }|S )
Nr  r   r   r   )rd  r}  is_trainingr9   )sequence_lengthtarget_lengthrk   rc   r   r   r    )r   xpunpu)rP   r   rr  ro   r   r   _ignore_causal_mask_sdpar   rk   rc   r5   finfor   r=   get_max_cache_shaper6   5_prepare_4d_causal_attention_mask_with_cache_positionr   masked_fillr~   r   r   r{   rT   _unmask_unattended)r-   rz   r  r   rc  r   r   r   past_seen_tokensusing_static_cacherk   rc   	min_dtyper   r  r   r  r  s                     r/   ru  zFalconModel._update_causal_mask{  s    ;;++/BB)c^.C%%
 @O?Z?99;`a'E KK,,6&%!%>>*'7 MM	 $**L,?,?vKK&**	)5););&
OQ+??AM nell; $$R(%7  PP+')#))!, Q 
 !2!EMM*bC5;;qr?CE++		$++"9"9T^^"KLLb K KK,,6*%%**.DD%
 1CCKQZ[Kr1   r  r  rk   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerk   rc   r    )diagonalri  r9   r   )r<   r5   r  r   fullrc   triur   r   rm   cloner=   rn   r  )rz   r  r  rk   r   r   r   r  r  mask_lengthpadding_masks              r/   r  zAFalconModel._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r1   )NNNNNNNNNNN)r2   r3   r4   r!   rY   r]  r5   r6   ra  r   r   r   r   r   r   r   r   r0   ru  staticmethodintrk   r  rx   ry   s   @r/   rK  rK    s3   | 2$.5<< .  15ae15370448$(,0/3&*59`
E,,-`
 "%uU5<<;U5VX[5[/\(\"]^`
 !.	`

 u//0`
 E,,-`
   0 01`
 D>`
 $D>`
 'tn`
 d^`
 !!1!12`
 
uU\\3&')RR	S`
 `
DUU llU 	U
 U  U <<U ||Un 444 4 {{	4
 4 4 4r1   rK  z
    The Falcon Model transformer with a language modeling head on top (linear layer with weights tied to the input embeddings).
    )custom_introc                        e Zd ZdgZdef fdZd Zdej                  fdZ	e
	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deeeeeej                  ej                  f   d	f   f      d
eej                     deej                     deej                     deej                     deej                     dee   dee   dee   dee   deej                     deeej                  f   deeej                     ef   fd       Zdeeej                  ej                  f   d	f   dej                  deeej                  ej                  f   d	f   fdZ xZS )FalconForCausalLMzlm_head.weightrP   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y NFr   )
rX   rY   rK  r(  r   r/  r   rO  lm_headrX  r-   rP   rd   s     r/   rY   zFalconForCausalLM.__init__  sI     &v.yy!3!3V5F5FUS 	r1   c                     | j                   S r)   r  r\  s    r/   get_output_embeddingsz'FalconForCausalLM.get_output_embeddings  s    ||r1   r^  c                     || _         y r)   r  r`  s     r/   set_output_embeddingsz'FalconForCausalLM.set_output_embeddings  s	    %r1   rb  rc  .rz   rI   r   rd  labelsr   r   re  rf  r   logits_to_keepr'   c                    ||n| j                   j                  }| j                  ||||||||	|
||      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|* | j                  ||fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                        S )aZ  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
        N)
rc  rz   rI   r   rd  r   r   re  rf  r   r   rO  r    losslogitsrc  r.   ro  )rP   rp  r(  ro   r  slicer  loss_functionrO  r   rc  r.   ro  )r-   rb  rc  rz   rI   r   rd  r  r   r   re  rf  r   r  r   transformer_outputsr.   slice_indices	lm_logitsr  r%  s                        r/   r0   zFalconForCausalLM.forward!  s2   H &1%<k$++B]B]"..+)%'/!5#) / 
 ,A.8B>SV8W~ot4]kLLq-/B!CD	%4%%  ;;11 	D \$7$;;F)-)9TGf$EvE0/??-;;*55
 	
r1   pastbeam_idxc           	          |D ci c]/  }|D ](  }|j                   |j                  |j                         * 1 c}}t        fd|D              }|S c c}}w )aL  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.

        Output shares the same memory storage as `past`.
        c              3      K   | ]N  }|d    j                  d |d    j                           |d   j                  d |d    j                           f P yw)r   r    N)index_selectrc   )rk  r   device_to_beam_idxs     r/   rm  z3FalconForCausalLM._reorder_cache.<locals>.<genexpr>}  se      

  1**1.@AAUAU.VW1**1.@AAUAU.VW
s   AA)rc   rn   r{  )r-   r  r  r   
past_statereordered_pastr  s         @r/   _reorder_cachez FalconForCausalLM._reorder_cachen  sr     QU
BLgq
YcJx{{:+<+<==

  

 #
 
 
s   4A)NNNNNNNNNNNNr   )r2   r3   r4   _tied_weights_keysr!   rY   r  r5   r6   r  r   r   r   r   r   r   r   r  r   r0   r  rx   ry   s   @r/   r  r    s    ++| &ELL &  15ae1537,004)-$(,0/3&*5934J
E,,-J
 "%uU5<<;U5VX[5[/\(\"]^J
 !.	J

 u//0J
 ELL)J
  -J
 &J
 D>J
 $D>J
 'tnJ
 d^J
 !!1!12J
 c5<</0J
  
uU\\"$EE	F!J
 J
X%ell :;S@AMRM]M]	uU\\5<</0#5	6r1   r  a  
    The Falcon Model transformer with a sequence classification head on top (linear layer).

    [`FalconForSequenceClassification`] uses the last token in order to do the classification, as other causal models
    (e.g. GPT-1) do.

    Since it does classification on the last token, it requires to know the position of the last token. If a
    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
    each row of the batch).
    c                   x    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     dee	e	ej                  ej                  f   df      deej                     deej                     deej                     d	eej                     d
ee   dee   dee   dee   dee	ej                     ef   fd       Z xZS )FalconForSequenceClassificationrP   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y r  )
rX   rY   
num_labelsrK  r(  r   r/  r   scorerX  r  s     r/   rY   z(FalconForSequenceClassification.__init__  sV      ++&v.YYv1163D3D5Q
 	r1   rb  rc  .rz   r   rd  r  r   r   re  rf  r'   c                 t   |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }||j                  d   }n|j                  d   }| j                   j
                  |dk7  rt        d      | j                   j
                  d}n||| j                   j
                  k7  j                  |j                  t        j                        }t        j                  |j                  d   |j                  t        j                        }||z  j                  d      }n.d}t        j                  | j                  j                    d       |t        j                  ||j                  	      |f   }d}|^| j                   j"                  | j$                  dk(  rd
| j                   _        nl| j$                  dkD  rL|j&                  t        j(                  k(  s|j&                  t        j*                  k(  rd| j                   _        nd| j                   _        | j                   j"                  d
k(  rIt-               }| j$                  dk(  r& ||j/                         |j/                               }nc |||      }nY| j                   j"                  dk(  rt1               } |||      }n,| j                   j"                  dk(  rt3               } |||      }|
s|f|dd z   }||f|z   S |S t5        |||j6                  |j8                  |j:                        S )4  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0][0].shape[2]`
            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nrc  rz   r   rd  r   r   re  rf  r   r    z=Cannot handle batch sizes > 1 if no padding token is defined.r9   r}   z will not detect padding tokens in `inputs_embeds`. Results may be unexpected if using padding tokens in conjunction with `inputs_embeds.`ri  
regressionsingle_label_classificationmulti_label_classificationr  )rP   rp  r(  r  r=   pad_token_idr   rn   rc   r5   r   r   argmaxr   r   rd   r2   problem_typer  rk   rt  r  r   squeezer	   r   r   rc  r.   ro  )r-   rb  rc  rz   r   rd  r  r   r   re  rf  r  r.   r  r   last_non_pad_tokennon_pad_masktoken_indicespooled_logitsr  loss_fctr%  s                         r/   r0   z'FalconForSequenceClassification.forward  s   @ &1%<k$++B]B]"..+)'/!5# / 

 ,A.M* "+J&,,Q/J;;##+
a\]];;##+!#"%)A)AAEEfmmUZU`U`aL!LL)<V]]Z_ZeZefM"/,">!F!Fr!J!#>>**+ ,Z Z
 u||Jv}}MOaab{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#M$9$9$;V^^=MND#M6:D))-JJ+-v6))-II,.v6#%(;AB(??F)-)9TGf$EvE/ /??-;;*55
 	
r1   
NNNNNNNNNN)r2   r3   r4   r!   rY   r   r   r5   r   r   r6   r   r   r   r0   rx   ry   s   @r/   r  r    s0   |   15SW15,004)-$(,0/3&*g
E,,-g
 "%ellELL.H(I3(N"OPg
 !.	g

 ELL)g
  -g
 &g
 D>g
 $D>g
 'tng
 d^g
 
uU\\"$DD	Eg
 g
r1   r  c                   x    e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 ddeej                     dee	e	ej                  ej                  f   df      deej                     deej                     deej                     d	eej                     d
ee   dee   dee   dee   dee	ej                     ef   fd       Z xZS )FalconForTokenClassificationrP   c                    t         |   |       |j                  | _        t        |      | _        t        |dd       |j                  }nt        |dd       |j                  }nd}t        j                  |      | _
        t        j                  |j                  |j                        | _        | j                          y )Nclassifier_dropoutr   g?)rX   rY   r  rK  r(  r;  r  r   r   r   r   r/  r   
classifierrX  )r-   rP   r  rd   s      r/   rY   z%FalconForTokenClassification.__init__  s      ++&v.6/6B!'!:!:V-t4@!'!6!6!$zz"45))F$6$68I8IJ 	r1   rb  rc  .rz   r   rd  r  r   r   re  rf  r'   c                    |
|
n| j                   j                  }
| j                  ||||||||	|
	      }|d   }| j                  |      }| j	                  |      }d}|Q|j
                  \  }}t               } ||j                  ||z  | j                        |j                  ||z              }|
s|f|dd z   }||f|z   S |S t        |||j                  |j                        S )r  Nr  r   r:   )r  r  r.   ro  )rP   rp  r(  r   r  r=   r	   r   r  r   r.   ro  )r-   rb  rc  rz   r   rd  r  r   r   re  rf  r  r.   r  r  r   r   r  r%  s                      r/   r0   z$FalconForTokenClassification.forward  s   @ &1%<k$++B]B]"..+)'/!5# / 

 ,A.]3/%+\\"J
')HJ3T__Ev{{S]`jSjGkD Y!4QR!88F)-)9TGf$EvE$-;;*55	
 	
r1   r  )r2   r3   r4   r!   rY   r   r   r5   r   r   r6   r   r   r   r0   rx   ry   s   @r/   r  r  
  s0   | "  15SW15,004)-$(,0/3&*B
E,,-B
 "%ellELL.H(I3(N"OPB
 !.	B

 ELL)B
  -B
 &B
 D>B
 $D>B
 'tnB
 d^B
 
uU\\"$99	:B
 B
r1   r  c                   $    e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     dee	   d	ee	   d
ee	   de
eef   fd       Z xZS )FalconForQuestionAnsweringc                     t         |   |       t        |      | _        t	        j
                  |j                  d      | _        | j                          y )Nr:   )	rX   rY   rK  r(  r   r/  r   
qa_outputsrX  r  s     r/   rY   z#FalconForQuestionAnswering.__init__e  sA     &v.))F$6$6: 	r1   rb  rz   r   rd  start_positionsend_positionsr   re  rf  r'   c
           	      $   |	|	n| j                   j                  }	| j                  |||||||	      }
|
d   }| j                  |      }|j	                  dd      \  }}|j                  d      j                         }|j                  d      j                         }d}||t        |j                               dkD  r|j                  d      }t        |j                               dkD  r|j                  d      }|j                  d      }|j                  d|      }|j                  d|      }t        |      } |||      } |||      }||z   dz  }|	s||f|
dd z   }||f|z   S |S t        ||||
j                  |
j                  	      S )
rh  N)rz   r   rd  r   re  rf  r   r    r9   r;   )ignore_indexr:   )r  start_logits
end_logitsr.   ro  )rP   rp  r(  r  splitr  r   lensizeclampr	   r   r.   ro  )r-   rb  rz   r   rd  r  r  r   re  rf  r#  sequence_outputr  r  r  
total_lossignored_indexr  
start_lossend_lossr%  s                        r/   r0   z"FalconForQuestionAnswering.forwardm  s   4 &1%<k$++B]B]"")'/!5# # 
 "!*1#)<<r<#: j#++B/::<''+668

&=+D?'')*Q."1"9"9""==%%'(1, - 5 5b 9(--a0M-33A}EO)//=AM']CH!,@J
M:H$x/14J"J/'!"+=F/9/EZMF*Q6Q+%!!//))
 	
r1   )	NNNNNNNNN)r2   r3   r4   rY   r   r   r5   r   FloatTensorr   r   r   r   r0   rx   ry   s   @r/   r  r  c  s      156:15596:48,0/3&*G
E,,-G
 !!2!23G
 E--.	G

   1 12G
 "%"2"23G
   0 01G
 $D>G
 'tnG
 d^G
 
u22	3G
 G
r1   r  )r  rK  r'  r  r  r  )Nr    )Mr  r~   typingr   r   r   r   r5   torch.utils.checkpointr   torch.nnr   r	   r
   r   r   r   activationsr   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   r   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   utilsr   r   configuration_falconr!   configuration_utilsr"   r#   
get_loggerr2   r   r/  r%   rB   rM   rH  rO   r6   r  rk   r   rl   r   r   r   r   r  r  r  r'  rK  r  r  r  r  __all__r7   r1   r/   <module>r      sE     8 8    L L $ ) ; ; ) i  L - / 7J			H	%
)299 )(8<BII <DJu|| J JEKK J\a\h\h J:5<< 5<< u PT Y^YeYe &m/bii m/`e5O e5P		 " . _ _D 'O ' 'T P' P Pf
 
t- t
tn r
&; r
r
j U
#8 U
 U
p Q
!6 Q
 Q
hr1   