
    Uh^                        d dl mZmZmZmZ d dlZd dlmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZmZ ddlmZ dd	lmZmZ d
dlmZmZmZmZmZmZmZ d
dlm Z  d
dl!m"Z"m#Z#  ejH                  e%      Z& G d de	      Z' G d de      Z( G d de"      Z) G d de      Z* G d de      Z+ G d de      Z, G d de      Z- G d de      Z.e G d de             Z/ G d  d!e      Z0g d"Z1y)#    )CallableOptionalTupleUnionN   )PretrainedConfig)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringlogging   )CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayerCLIPVisionEmbeddingsCLIPVisionModelCLIPVisionTransformer)eager_attention_forward)VisionRotaryEmbeddingapply_rotary_pos_emb_visionc                   F     e Zd ZdZdZdZ	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z xZS )MLCDVisionConfiga  
    This is the configuration class to store the configuration of a [`MLCDVisionModel`]. It is used to instantiate a MLCD
    vision encoder according to the specified arguments, defining the model architecture. Instantiating a configuration
    with the defaults will yield a similar configuration to that of the vision encoder of the MLCD
    [DeepGlint-AI/mlcd-vit-bigG-patch14-336](https://huggingface.co/DeepGlint-AI/mlcd-vit-bigG-patch14-336) architecture.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        hidden_size (`int`, *optional*, defaults to 1664):
            Dimensionality of the encoder layers and the pooler layer.
        intermediate_size (`int`, *optional*, defaults to 8192):
            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
        projection_dim (`int`, *optional*, defaults to 1024):
            Dimensionality of text and vision projection layers.
        num_hidden_layers (`int`, *optional*, defaults to 48):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (`int`, *optional*, defaults to 16):
            Number of attention heads for each attention layer in the Transformer encoder.
        num_channels (`int`, *optional*, defaults to 3):
            The number of input channels.
        image_size (`int`, *optional*, defaults to 336):
            The size (resolution) of each image.
        patch_size (`int`, *optional*, defaults to 14):
            The size (resolution) of each patch.
        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"selu"` and `"gelu_new"` `"quick_gelu"` are supported.
        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
            The epsilon used by the layer normalization layers.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        initializer_factor (`float`, *optional*, defaults to 1.0):
            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
            testing).

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mlcd_vision_modelvision_configc                     t        |   di | || _        || _        || _        || _        || _        || _        || _        || _	        || _
        || _        || _        |
| _        |	| _        y )N )super__init__hidden_sizeintermediate_sizenum_hidden_layersnum_attention_headsnum_key_value_groupsnum_channels
patch_size
image_sizeinitializer_rangeinitializer_factorattention_dropoutlayer_norm_eps
hidden_act)selfr$   r%   r&   r'   r(   r)   r+   r*   r0   r/   r.   r,   r-   kwargs	__class__s                  w/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mlcd/modular_mlcd.pyr#   zMLCDVisionConfig.__init__d   sz    " 	"6"&!2!2#6 $8!($$!2"4!2,$    )i  i    0         r   iP     gelugh㈵>        g{Gz?      ?)__name__
__module____qualname____doc__
model_typebase_config_keyr#   __classcell__r3   s   @r4   r   r   *   sH    4l %J%O % %r5   r   c                       e Zd Zy)MLCDMLPN)r=   r>   r?   r!   r5   r4   rF   rF      s    r5   rF   c                   4    e Zd Zdededej
                  fdZy)MLCDRotaryEmbeddingnum_patches_heightnum_patches_widthreturnc                    t        j                  || j                  j                        j	                  d      j                  d|      }t        j                  || j                  j                        j	                  d      j                  |d      }t        j                  |j                         |j                         gd      }t        ||      }t        j                  || j                  j                  | j                  j                        }t        j                  || j                        }||   j                  d      }	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer8   r   dim)rM   dtype)torcharangeinv_freqrM   	unsqueezeexpandstackflattenmaxrQ   outer)
r1   rI   rJ   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             r4   forwardzMLCDRotaryEmbedding.forward   s	    LL+DMM4H4HISSTUV]]^`bst 	 LL*4==3G3GHRRSTU\\]oqst 	
 ++x//183C3C3EFBO .0ABll=1E1ET]]M`M`a#kk#t}}= -W5==a@r5   N)r=   r>   r?   intrR   Tensorrb   r!   r5   r4   rH   rH      s     # # %,, r5   rH   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )MLCDVisionEmbeddingsconfigc                 (    t         |   |       | `y N)r"   r#   position_embeddingr1   rg   r3   s     r4   r#   zMLCDVisionEmbeddings.__init__   s     #r5   pixel_valuesrK   c                 T   |j                   d   }| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }|S )Nr   )rQ   r   r8   rN   rO   )shapepatch_embeddingweightrQ   torX   	transposeclass_embeddingrV   rR   cat)r1   rl   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingss          r4   rb   zMLCDVisionEmbeddings.forward   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
r5   )
r=   r>   r?   r   r#   rR   FloatTensorrd   rb   rC   rD   s   @r4   rf   rf      s-    $/ $
E$5$5 
%,, 
r5   rf   c                        e Zd ZdZdef fdZ	 d
dej                  deej                  ej                  f   de	ej                     de
e   deej                  e	ej                     f   f
d	Z xZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://arxiv.org/abs/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://arxiv.org/abs/2104.09864
    rg   c                 T    t         |   |       |j                  | _        d| _        y NF)r"   r#   r(   	is_causalrk   s     r4   r#   zMLCDAttention.__init__   s%     $*$?$?!r5   hidden_statesposition_embeddingsattention_maskr2   rK   c                 <   |j                   d d \  }}| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }	|d   j                  d      j                         }
|d   j                  d      j                         }t        |||
|      \  }}|j                  dddd      j                         }|j                  dddd      j                         }|	j                  dddd      j                         }	t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd	      rt         j#                  d
       nt$        | j                  j                     } || |||	|f| j&                  sdn| j(                  | j*                  | j,                  d|\  }}|j                  dddd      j                         }|j/                  ||d      }| j1                  |      }|j                  ddd      j                         }||fS )NrN   r   r8   r   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r;   )dropoutscalingr   )rn   q_projreshape	num_headshead_dimk_projv_projrU   floatr   permute
contiguousr   rg   _attn_implementationgetloggerwarning_oncer   trainingr   scaler   viewout_proj)r1   r   r   r   r2   ru   
seq_lengthquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  r4   rb   zMLCDAttention.forward   s|    "/!4!4Sb!9
J {{=199:zSWSaSacgcpcp:qr[[/77ZQUQ_Q_aeanan8op
{{=199:zSWSaSacgcpcp:qr "!$..q1779!!$..q1779#>|ZY\^a#b j $++Aq!Q7BBD''1a3>>@
#++Aq!Q7BBD(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$,,JJnn
%
 
%
!\ "))!Q15@@B!&&z:rBmmK0!))!Q2==?L((r5   ri   )r=   r>   r?   r@   r   r#   rR   rd   r   r   r   r	   rb   rC   rD   s   @r4   r|   r|      s    /  26	2)||2) #5<<#=>2) !.	2)
 -.2) 
u||Xell33	42)r5   r|   c                        e Zd Zdef fdZ	 	 d	dej                  deej                  ej                  f   deej                     dee	   deej                     f
dZ xZS )
MLCDEncoderLayerrg   c                 D    t         |   |       t        |      | _        y ri   )r"   r#   r|   	self_attnrk   s     r4   r#   zMLCDEncoderLayer.__init__  s     &v.r5   r   r   r   r   rK   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        r   r   r   r   )layer_norm1r   layer_norm2mlp)r1   r   r   r   r   residualr   outputss           r4   rb   zMLCDEncoderLayer.forward  s    * !((7&*nn' 3)/	 '5 '
#| !=0 ((7/ =0 "&Gr5   r~   )r=   r>   r?   r   r#   rR   rd   r   r   boolrz   rb   rC   rD   s   @r4   r   r      s{    // / 26,1*||* #5<<#=>* !.	*
 $D>* 
u  	!*r5   r   c                        e Zd ZdZdef fdZ	 	 	 	 ddej                  deej                  ej                  f   de
ej                     de
e   de
e   d	e
e   d
eeef   fdZ xZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    rg   c                 $    t         |   |       y)z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.N)r"   r#   rk   s     r4   r#   zMLCDEncoder.__init__;  s     r5   inputs_embedsr   r   r   output_hidden_statesreturn_dictrK   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]b  \  }
}|r||	fz   }| j                  r,| j                  r | j                  |j                  |	|||      }n ||	|||      }|d   }	|sZ||d   fz   }d |r||	fz   }|st        d |	||fD              S t        |	||      S )aj  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`Tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr!   r   r   r8   c              3   &   K   | ]	  }||  y wri   r!   ).0vs     r4   	<genexpr>z&MLCDEncoder.forward.<locals>.<genexpr>  s     eqWXWdes   )last_hidden_stater   
attentions)rg   r   use_return_dictr   	enumeratelayersgradient_checkpointingr   _gradient_checkpointing_func__call__tupler
   )r1   r   r   r   r   r   r   encoder_statesall_attentionsr   idxencoder_layerlayer_outputss                r4   rb   zMLCDEncoder.forward?  sH   D %9$D $++JjJj 	 &1%<k$++B]B]1B1N-TXT_T_TqTq30d%"+DKK"8 	FC#!/=2B!B**t}} $ A A!**!'"%! !."/(;#1&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+(%
 	
r5   NNNN)r=   r>   r?   r@   r   r#   rR   rz   r   rd   r   r   r   r
   rb   rC   rD   s   @r4   r   r   2  s    !/ ! 26,0/3&*L
((L
 #5<<#=>L
 !.	L

 $D>L
 'tnL
 d^L
 
uo%	&L
r5   r   c                        e Zd Zdef fdZe	 	 	 	 d	deej                     dee	   dee	   dee	   de
eef   f
d       Z xZS )
MLCDVisionTransformerrg   c                    t         |   |       t        |j                  |j                  z  dz        | _        t        j                  t        j                  d|j                  |j                  z  dz              | _
        y )Nr   r8   )r"   r#   rH   r$   r'   vision_rotary_embeddingnn	ParameterrR   randnclass_pos_embrk   s     r4   r#   zMLCDVisionTransformer.__init__  sh     ':6;M;MQWQkQk;kop;p'q$\\%++a9K9KvOiOi9imn9n*opr5   rl   r   r   r   rK   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j
                  d   | j                   j                  z  }|j
                  d   | j                   j                  z  }| j                  ||      }|j                  | j                  j                        }t        j                  | j                  |gd      }t        j                  ||fd      }|j                         |j                         f}	| j                  |      }
| j!                  |
      }
| j#                  |
|	|||      }|d   }|d d dd d f   }| j%                  |      }|s
||f|dd  z   S t'        |||j(                  |j*                        S )	Nz You have to specify pixel_valuesrN   r   rO   )r   r   r   r   r   r8   )r   pooler_outputr   r   )rg   r   r   r   
ValueErrorrn   r*   r   rq   r   rM   rR   rt   r   r   ry   pre_layrnormencoderpost_layernormr   r   r   )r1   rl   r   r   r   rI   rJ   ra   embr   r   encoder_outputsr   pooled_outputs                 r4   rb   zMLCDVisionTransformer.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]1B1N-TXT_T_TqTq?@@)//3t{{7M7MM(..r2dkk6L6LL556HJ[\'**4+=+=+D+DED$6$6#GQOii8bA"wwy#'')45))-8,,' 3/!5# ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r5   r   )r=   r>   r?   r   r#   r   r   rR   rz   r   r   r   r   rb   rC   rD   s   @r4   r   r     s    q/ q
  59,0/3&*/
u001/
 $D>/
 'tn	/

 d^/
 
u00	1/
 /
r5   r   c                   &    e Zd ZeZdZdZdZdZd Z	y)MLCDPreTrainedModelmlcdTc                 L   | j                   j                  }t        |t              r| j                   j                  }t        j
                  j                  |j                  d|j                  dz  |z         t        j
                  j                  |j                  j                  |j                   j                  |z         yt        |t              r,| j                   j                  }|j                  dz  d|j                   j                  z  dz  z  |z  }|j                  dz  |z  }t        j
                  j                  |j                  j                  |       t        j
                  j                  |j                  j                  |       t        j
                  j                  |j                   j                  |       t        j
                  j                  |j"                  j                  |       yt        |t$              r| j                   j                  }|j                   j&                  dz  d|j                   j                  z  dz  z  |z  }d|j                   j&                  z  dz  |z  }t        j
                  j                  |j(                  j                  |       t        j
                  j                  |j*                  j                  |       yt        |t,              ry| j                   j                  }|j                   j&                  |j                   j.                  z  dz  dz  |z  }t        j
                  j                  |j0                  d|       yt        |t        j2                        rJ|j4                  j6                  j9                          |j                  j6                  j;                  d       yt        |t        j<                        r2|j4                  %|j4                  j6                  j9                          yyy)zInitialize the weightsr;   g      )meanstd)r   r   r<   N)rg   r-   
isinstancerf   r   initnormal_rs   	embed_dimro   rp   r,   r|   r&   r   r   r   r   rF   r$   fc1fc2r   r'   r   	LayerNormbiasdatazero_fill_Linear)r1   modulefactorin_proj_stdout_proj_stdfc_stdpos_emb_stds          r4   _init_weightsz!MLCDPreTrainedModel._init_weights  s   //f23[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOh.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O? 56[[33F!==448Y8YY]^^cggjppKGGOOF00sOL-KK""$MM$$S)		*v{{/FKK""$ 0G*r5   N)
r=   r>   r?   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpar   r!   r5   r4   r   r     s#    #L&*#!N%r5   r   c                   r    e Zd Ze	 	 	 	 ddeej                     dee   dee   dee   dee	e
f   f
d       Zy)	MLCDVisionModelNrl   r   r   r   rK   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||      S )a  
        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```)rl   r   r   r   )rg   r   r   r   vision_model)r1   rl   r   r   r   s        r4   rb   zMLCDVisionModel.forward  su    > %9$D $++JjJj 	 &1%<k$++B]B]1B1N-TXT_T_TqTq  %/!5#	 ! 
 	
r5   r   )r=   r>   r?   r   r   rR   rz   r   r   r   r   rb   r!   r5   r4   r   r     st     59,0/3&*(
u001(
 $D>(
 'tn	(

 d^(
 
u00	1(
 (
r5   r   )r   r   r   )2typingr   r   r   r   rR   torch.nnr   configuration_utilsr   modeling_flash_attention_utilsr	   modeling_outputsr
   r   modeling_utilsr   r   processing_utilsr   utilsr   r   clip.modeling_clipr   r   r   r   r   r   r   llama.modeling_llamar   qwen2_vl.modeling_qwen2_vlr   r   
get_loggerr=   r   r   rF   rH   rf   r|   r   r   r   r   r   __all__r!   r5   r4   <module>r
     s    4 3   3 B K F & ,   ; [ 
		H	%Y%' Y%x	g 	/ D/ $?)M ?)D/' /dY
+ Y
x6
1 6
r $%/ $% $%N*
o *
Zr5   