
"""PyTorch Qwen3 model."""

from typing import Callable, Optional, Tuple

import torch
import torch.utils.checkpoint

from ...cache_utils import Cache
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import CausalLMOutputWithPast
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
from ...processing_utils import Unpack
from ...utils import LossKwargs, logging
from ..gemma.modeling_gemma import GemmaMLP
from ..llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaForCausalLM,
    LlamaForQuestionAnswering,
    LlamaForSequenceClassification,
    LlamaForTokenClassification,
    LlamaRMSNorm,
    apply_rotary_pos_emb,
    eager_attention_forward,
)
from ..mistral.modeling_mistral import MistralModel
from .configuration_qwen3 import Qwen3Config


logger = logging.get_logger(__name__)

_CHECKPOINT_FOR_DOC = "Qwen/Qwen3-8B"


class Qwen3RMSNorm(LlamaRMSNorm):
    pass


class Qwen3MLP(GemmaMLP):
    pass


class Qwen3Attention(LlamaAttention):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__(config, layer_idx)
        # Qwen3 applies RMSNorm over the head dimension of the query and key projections.
        self.q_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.k_norm = Qwen3RMSNorm(self.head_dim, eps=config.rms_norm_eps)
        self.sliding_window = config.sliding_window
        if not (
            self.config.use_sliding_window
            and getattr(self.config, "sliding_window", None) is not None
            and self.layer_idx >= self.config.max_window_layers
        ):
            self.sliding_window = None

    def forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor],
        past_key_value: Optional[Cache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        input_shape = hidden_states.shape[:-1]
        hidden_shape = (*input_shape, -1, self.head_dim)

        query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)

        cos, sin = position_embeddings
        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)

        if past_key_value is not None:
            # sin and cos are specific to RoPE models; cache_position is needed for the static cache
            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
            key_states, value_states = past_key_value.update(key_states, value_states, self.layer_idx, cache_kwargs)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and kwargs.get("output_attentions", False):
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            sliding_window=self.sliding_window,
            **kwargs,
        )

        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output, attn_weights


class Qwen3DecoderLayer(LlamaDecoderLayer):
    def __init__(self, config: Qwen3Config, layer_idx: int):
        super().__init__()
        self.self_attn = Qwen3Attention(config=config, layer_idx=layer_idx)
        self.mlp = Qwen3MLP(config)
        if config.sliding_window and config._attn_implementation != "flash_attention_2":
            logger.warning_once(
                f"Sliding Window Attention is enabled but not implemented for `{config._attn_implementation}`; "
                "unexpected results may be encountered."
            )


class Qwen3Model(MistralModel):
    pass


class KwargsForCausalLM(FlashAttentionKwargs, LossKwargs): ...


class Qwen3ForCausalLM(LlamaForCausalLM):
    def forward(
        self,
        **super_kwargs: Unpack[KwargsForCausalLM],
    ) -> CausalLMOutputWithPast:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import AutoTokenizer, Qwen3ForCausalLM

        >>> model = Qwen3ForCausalLM.from_pretrained("Qwen/Qwen3-8B")
        >>> tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-8B")

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        return super().forward(**super_kwargs)


class Qwen3ForSequenceClassification(LlamaForSequenceClassification):
    pass


class Qwen3ForTokenClassification(LlamaForTokenClassification):
    pass


class Qwen3ForQuestionAnswering(LlamaForQuestionAnswering):
    pass


__all__ = [
    "Qwen3ForCausalLM",
    "Qwen3ForQuestionAnswering",
    "Qwen3Model",
    "Qwen3PreTrainedModel",
    "Qwen3ForSequenceClassification",
    "Qwen3ForTokenClassification",
]