
    Uh.A                        d dl mZmZmZmZmZ d dlZd dlZd dlmZ ddl	m
Z
mZmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e       rd dl)m*Z* ddl+m,Z,  ejZ                  e.      Z/dZ0 G d de"      Z1 G d de      Z2 G d de      Z3 G d de$      Z4 G d de#      Z5 G d de      Z6 G d d e!      Z7 G d! d"e       Z8 G d# d$e      Z9g d%Z:y)&    )CallableListOptionalTupleUnionN)nn   )CacheSlidingWindowCacheStaticCache)AttentionMaskConverter)FlashAttentionKwargs)BaseModelOutputWithPastQuestionAnsweringModelOutput)ALL_ATTENTION_FUNCTIONS)Unpack)is_torch_flex_attn_availablelogging   )LlamaAttentionLlamaDecoderLayerLlamaForCausalLMLlamaForQuestionAnsweringLlamaForSequenceClassificationLlamaForTokenClassificationLlamaMLP
LlamaModelLlamaPreTrainedModelapply_rotary_pos_embeager_attention_forward   )MistralConfig)	BlockMask)make_flex_block_causal_maskzmistralai/Mistral-7B-v0.1c                        e Zd Z fdZ xZS )
MistralMLPc                 J   t         |   |       t        j                  | j                  | j
                  d      | _        t        j                  | j                  | j
                  d      | _        t        j                  | j
                  | j                  d      | _        y )NFbias)	super__init__r   Linearhidden_sizeintermediate_size	gate_projup_proj	down_projselfconfig	__class__s     }/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mistral/modular_mistral.pyr+   zMistralMLP.__init__*   ss     4#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWX    )__name__
__module____qualname__r+   __classcell__r5   s   @r6   r&   r&   )   s    Y Yr7   r&   c                   2    e Zd Zdedef fdZ	 	 ddej                  deej                  ej                  f   de	ej                     de	e
   de	ej                     d	ee   d
eej                  e	ej                     e	eej                        f   fdZ xZS )MistralAttentionr4   	layer_idxc                 l   t         |           t        |dd       xs |j                  |j                  z  | _        t        j                  |j                  |j                  | j
                  z  d      | _        t        j                  |j                  |j                  | j
                  z  d      | _
        t        j                  |j                  |j                  | j
                  z  d      | _        t        j                  |j                  | j
                  z  |j                  d      | _        y )Nhead_dimFr(   )r*   r+   getattrr-   num_attention_headsrA   r   r,   q_projnum_key_value_headsk_projv_projo_projr3   r4   r?   r5   s      r6   r+   zMistralAttention.__init__2   s    
D9mV=O=OSYSmSm=mii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii 2 2F4N4NQUQ^Q^4^ejkii : :T]] JFL^L^ejkr7   hidden_statesposition_embeddingsattention_maskpast_key_valuecache_positionkwargsreturnc           
         |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }	| j                  |      j                  |      j	                  dd      }
| j                  |      j                  |      j	                  dd      }|\  }}t        |	|
||      \  }	}
|'|||d}|j                  |
|| j                  |      \  }
}t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j                  d	       nt         | j                  j                     } || |	|
||f| j"                  sd
n| j$                  | j&                  t)        | j                  dd       d|\  }} |j*                  g |d j-                         }| j/                  |      }||fS )Nr!   r   )sincosrN   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        sliding_window)dropoutscalingrY   )shaperA   rD   view	transposerF   rG   r   updater?   r    r4   _attn_implementationgetloggerwarning_oncer   trainingattention_dropoutr[   rB   reshape
contiguousrH   )r3   rJ   rK   rL   rM   rN   rO   input_shapehidden_shapequery_states
key_statesvalue_statesrT   rS   cache_kwargsattention_interfaceattn_outputattn_weightss                     r6   forwardzMistralAttention.forward:   s    $))#2.88b8$--8{{=166|DNNqRST[[/44\BLLQPQR
{{=166|DNNqRST&S#7jRUWZ#[ j%#&snUL'5'<'<ZW[WeWegs't$J(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HLL"4;;0@$G
%
 
%
!\ *k));;;;FFHkk+.L((r7   )NN)r8   r9   r:   r"   intr+   torchTensorr   r   r
   
LongTensorr   r   rq   r;   r<   s   @r6   r>   r>   1   s    l} l l +/590)||0) #5<<#=>0) !.	0)
 !0) !!1!120) -.0) 
u||Xell3XeELL>Q5RR	S0)r7   r>   c                   (     e Zd Zdedef fdZ xZS )MistralDecoderLayerr4   r?   c                 j    t         |   ||       t        ||      | _        t	        |      | _        y )N)r4   r?   )r*   r+   r>   	self_attnr&   mlprI   s      r6   r+   zMistralDecoderLayer.__init__n   s,    +)9Mf%r7   )r8   r9   r:   r"   rr   r+   r;   r<   s   @r6   rw   rw   m   s    &} & & &r7   rw   c                       e Zd Zy)MistralPreTrainedModelNr8   r9   r:    r7   r6   r|   r|   t       r7   r|   c                        e Zd Zdef fdZ	 ddeej                  df   dej                  dej                  dede	f
d	Z
edej                  d
ededej                  dej                  dededefd       Z xZS )MistralModelr4   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w N)r*   r+   r   
ModuleListrangenum_hidden_layersrw   layersrI   s      r6   r+   zMistralModel.__init__y   sD     mmEJ6KcKcEde	 3e
es   ArL   r#   input_tensorrN   past_key_valuesrW   c           
      l   | j                   j                  dk(  rS|H|F|d d df   j                         j                         |j	                         d   k7  }|rt        d      |d|v r|S y | j                   j                  dk(  r't        |t        j                        rt        |      }|S ||j                         nd}t        |t              }t        |t              }	| j                   j                  dk(  r?|s=|	s;|s9t        j                  |||| j                   j                  | j                         ry |j"                  }
t        j$                  |
      j&                  }|j(                  d	   }|	s|r|j+                         }n1t        |t        j                        r|j(                  d   n||z   d	z   }| j-                  ||||
||j(                  d   | j                   |
      }| j                   j                  dk(  r2|0|j.                  j0                  dv r|st        j2                  ||      }|S )Nflash_attention_2rR   r   zYou are attempting to perform batched generation with padding_side='right' this may lead to unexpected behaviour for Flash Attention version of Mistral. Make sure to  call `tokenizer.padding_side  = 'left'` before tokenizing the input. rX   flex_attentionrV   )inputs_embedspast_key_values_lengthrY   is_trainingr!   )sequence_lengthtarget_lengthdtyperN   
batch_sizer4   r   )cudaxpunpu)r4   r`   sumitemsize
ValueError
isinstancers   rt   r$   get_seq_lengthr   r   r   _ignore_causal_mask_sdparY   rd   r   finfominr\   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positiondevicetype_unmask_unattended)r3   rL   r   rN   r   rW   is_padding_rightpast_seen_tokensusing_static_cacheusing_sliding_window_cacher   	min_dtyper   r   causal_masks                  r6   _update_causal_maskz MistralModel._update_causal_mask   s:    ;;++/BB)o.I#1!R%#8#<#<#>#C#C#EIZIZI\]^I_#_ #$a 
 )c^.C%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`a'E%/AS%T" KK,,6'+E%%>>*'7#{{99 MM ""KK&**	&,,Q/%);+??AM
 nell; $$R(%7!;  PP+')#))!,;;+ Q 	
 KK,,6*%%**.DD%
 1CCKQZ[Kr7   r   r   r   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }	t        j                  ||f|	||j
                        }t        j                  ||j
                        |j                  dd      kD  }
|j                         }t        |dd      rs|j                  gt        |t              r||kD  rRt        j                  ||j
                        |j                  dd      |j                  z
  k  }|
j                  |       ||
z  }|ddddddf   j                  |ddd      }| |j                         }| j                   d   |kD  r| ddd|f   } | j                   d   }|ddddddd|f   | ddddddf   j#                  |j
                        z   }|d	k(  }|ddddddd|f   j%                  ||	      |ddddddd|f<   |S )
a  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache, to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
            config (`MistralConfig`):
                The model's configuration class
            past_key_values (`Cache`):
                The cache class that is being used currently to generate
        N   )
fill_valuer   r   )r   rR   r!   use_sliding_windowTr   )dimrs   r   r   fullr   arangerf   get_text_configrB   rY   r   r   bitwise_or_expandcloner\   tomasked_fill)rL   r   r   r   rN   r   r4   r   r   r   diagonal_attend_masktext_configsliding_attend_maskmask_lengthpadding_masks                  r6   r   zBMistralModel._prepare_4d_causal_attention_mask_with_cache_position   s%   B %.*<*<*>!*C(K@ = E*..I** -0Ye\j\q\qK $)<<nF[F[#\_m_u_uA` $  !002K{$8$?KD^D^Dj "/3EF/\iJi*/,,}^MbMb*c&..r158R8RR+' )445HI//K%dD!Q&67>>z1bRTUK))//1!''+m;%3A~~4E%FN,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r7   )F)r8   r9   r:   r"   r+   r   rs   rt   r
   boolr   staticmethodrr   r   r   r;   r<   s   @r6   r   r   x   s    
} 
 #(TellK78T llT 	T
 T  Tl BBB B {{	B
 B B B B Br7   r   c                       e Zd Zy)MistralForCausalLMNr}   r~   r7   r6   r   r     r   r7   r   c                       e Zd Zy)MistralForTokenClassificationNr}   r~   r7   r6   r   r     r   r7   r   c                       e Zd Zy) MistralForSequenceClassificationNr}   r~   r7   r6   r   r   #  r   r7   r   c                   D    e Zd ZdZ fdZd Zd Z	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deeeeej                     f      d	eej                     d
eej                     deej                     dee   dee   defdZ xZS )MistralForQuestionAnsweringmodelc                 H    t         |   |       t        |      | _        | `y r   )r*   r+   r   r   transformerr2   s     r6   r+   z$MistralForQuestionAnswering.__init__*  s"     !&)
r7   c                 .    | j                   j                  S r   r   embed_tokens)r3   s    r6   get_input_embeddingsz0MistralForQuestionAnswering.get_input_embeddings/  s    zz&&&r7   c                 &    || j                   _        y r   r   )r3   values     r6   set_input_embeddingsz0MistralForQuestionAnswering.set_input_embeddings2  s    "'

r7   	input_idsrL   position_idsr   r   start_positionsend_positionsrW   output_hidden_statesrP   c
           	         | j                  |||||||	      }|j                  }| j                  |      }|j                  dd      \  }}|j	                  d      j                         }|j	                  d      j                         }d}|| | j                  ||||fi |
}t        ||||j                  |j                        S )a  
        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the start of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for position (index) of the end of the labelled span for computing the token classification loss.
            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
            are not taken into account for computing the loss.
        )rL   r   r   r   rW   r   r!   rR   )r   N)lossstart_logits
end_logitsrJ   
attentions)
r   last_hidden_state
qa_outputssplitsqueezerg   loss_functionr   rJ   r   )r3   r   rL   r   r   r   r   r   rW   r   rO   outputssequence_outputlogitsr   r   r   s                    r6   rq   z#MistralForQuestionAnswering.forward5  s    0 ,0::)%+'/!5 ,6 ,
 "331#)<<r<#: j#++B/::<''+668
&=+D%4%%lJQ^ibhiD+%!!//))
 	
r7   )	NNNNNNNNN)r8   r9   r:   base_model_prefixr+   r   r   r   rs   ru   rt   r   r
   r   FloatTensorr   r   rq   r;   r<   s   @r6   r   r   '  s    
'(
 151537KO596:48,0/33
E,,-3
 !.3
 u//0	3

 "%tE4E4E/F(F"GH3
   1 123
 "%"2"233
   0 013
 $D>3
 'tn3
 
&3
r7   r   )r   r   r   r|   r   r   );typingr   r   r   r   r   rs   torch.utils.checkpointr   cache_utilsr
   r   r   modeling_attn_mask_utilsr   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   llama.modeling_llamar   r   r   r   r   r   r   r   r   r   r    configuration_mistralr"   !torch.nn.attention.flex_attentionr#   integrations.flex_attentionr$   
get_loggerr8   rb   _CHECKPOINT_FOR_DOCr&   r>   rw   r|   r   r   r   r   r   __all__r~   r7   r6   <module>r      s    9 9    A A > B U 5 & :    1  !;J 
		H	%1 Y Y9)~ 9)x&+ &	1 	`: `F	) 		$? 		'E 	A
"; A
Hr7   