
import math
import warnings
from dataclasses import dataclass
from typing import Any, Callable, Optional, Tuple, Union

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
from torch.nn.init import _calculate_fan_in_and_fan_out

from ...activations import ACT2FN
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
from ...modeling_layers import GradientCheckpointingLayer
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...utils import ModelOutput, auto_docstring, can_return_tuple, logging
from .configuration_siglip2 import Siglip2Config, Siglip2TextConfig, Siglip2VisionConfig


logger = logging.get_logger(__name__)


@dataclass
class Siglip2VisionOutput(ModelOutput):
    """
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    image_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class Siglip2TextOutput(ModelOutput):
    """
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    """

    text_embeds: Optional[torch.FloatTensor] = None
    last_hidden_state: Optional[torch.FloatTensor] = None
    hidden_states: Optional[Tuple[torch.FloatTensor, ...]] = None
    attentions: Optional[Tuple[torch.FloatTensor, ...]] = None


@dataclass
class Siglip2Output(ModelOutput):
    """
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`Siglip2TextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`Siglip2VisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Siglip2TextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Siglip2VisionModel`].
    """

    loss: Optional[torch.FloatTensor] = None
    logits_per_image: Optional[torch.FloatTensor] = None
    logits_per_text: Optional[torch.FloatTensor] = None
    text_embeds: Optional[torch.FloatTensor] = None
    image_embeds: Optional[torch.FloatTensor] = None
    text_model_output: BaseModelOutputWithPooling = None
    vision_model_output: BaseModelOutputWithPooling = None

    def to_tuple(self) -> Tuple[Any]:
        return tuple(
            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
            for k in self.keys()
        )


class Siglip2VisionEmbeddings(nn.Module):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.patch_size = config.patch_size

        self.patch_embedding = nn.Linear(
            in_features=config.num_channels * self.patch_size * self.patch_size,
            out_features=self.embed_dim,
        )

        self.num_patches = config.num_patches
        self.position_embedding_size = int(self.num_patches**0.5)
        self.position_embedding = nn.Embedding(self.num_patches, self.embed_dim)

    @staticmethod
    def resize_positional_embeddings(
        positional_embeddings: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        max_length: int,
    ) -> torch.Tensor:
        """
        Resize positional embeddings to image-specific size and pad to a fixed size.

        Args:
            positional_embeddings (`torch.Tensor`):
                Position embeddings of shape (height, width, embed_dim)
            spatial_shapes (`torch.LongTensor`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
            max_length (`int`):
                Maximum length of the positional embeddings to pad resized positional embeddings to

        Returns:
            `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
        """
        batch_size = spatial_shapes.shape[0]
        embed_dim = positional_embeddings.shape[-1]
        source_dtype = positional_embeddings.dtype

        resulted_positional_embeddings = torch.empty(
            (batch_size, max_length, embed_dim),
            device=positional_embeddings.device,
            dtype=source_dtype,
        )

        # (height, width, embed_dim) -> (1, embed_dim, height, width) for interpolation
        positional_embeddings = positional_embeddings.permute(2, 0, 1).unsqueeze(0)

        # Upcast to float32 on CPU because antialiased interpolation is not supported for half precision there
        if positional_embeddings.device.type == "cpu":
            positional_embeddings = positional_embeddings.to(torch.float32)

        for i in range(batch_size):
            # (1, dim, height, width) -> (1, dim, target_height, target_width)
            height, width = spatial_shapes[i]
            resized_embeddings = F.interpolate(
                positional_embeddings,
                size=(height, width),
                mode="bilinear",
                align_corners=False,
                antialias=True,
            )

            # (1, dim, target_height, target_width) -> (target_height * target_width, dim)
            resized_embeddings = resized_embeddings.reshape(embed_dim, height * width).transpose(0, 1)

            # Cast back to the original dtype
            resized_embeddings = resized_embeddings.to(source_dtype)

            resulted_positional_embeddings[i, : height * width] = resized_embeddings
            resulted_positional_embeddings[i, height * width :] = resized_embeddings[0]

        return resulted_positional_embeddings

    def forward(self, pixel_values: torch.FloatTensor, spatial_shapes: torch.LongTensor) -> torch.Tensor:
        """
        Args:
            pixel_values (`torch.FloatTensor`):
                Pixel values of shape (batch_size, max_num_patches, num_channels * patch_size * patch_size)
            spatial_shapes (`List[Tuple[int, int]]`):
                Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        """
        # Apply patch embeddings to the already patchified pixel values
        target_dtype = self.patch_embedding.weight.dtype
        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))

        # Get positional embeddings resized to each image's shape and padded to max_length
        positional_embeddings = self.position_embedding.weight.reshape(
            self.position_embedding_size, self.position_embedding_size, -1
        )
        resized_positional_embeddings = self.resize_positional_embeddings(
            positional_embeddings, spatial_shapes, max_length=pixel_values.shape[1]
        )

        # Add positional embeddings to patch embeddings
        embeddings = patch_embeds + resized_positional_embeddings
        return embeddings


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling

    if attention_mask is not None:
        attn_weights = attn_weights + attention_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)

    attn_output = torch.matmul(attn_weights, value)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class Siglip2Attention(nn.Module):
    """Multi-headed attention from 'Attention Is All You Need' paper"""

    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
        super().__init__()
        self.config = config
        self.embed_dim = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        if self.head_dim * self.num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {self.num_heads})."
            )
        self.scale = self.head_dim**-0.5
        self.dropout = config.attention_dropout
        self.is_causal = False

        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
        """Input shape: Batch x Time x Channel"""

        batch_size, seq_length, embed_dim = hidden_states.shape

        queries = self.q_proj(hidden_states)
        keys = self.k_proj(hidden_states)
        values = self.v_proj(hidden_states)

        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            if self.config._attn_implementation == "sdpa" and output_attentions:
                logger.warning_once(
                    "`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. "
                    "Falling back to eager attention. This warning can be removed using the argument "
                    '`attn_implementation="eager"` when loading the model.'
                )
            else:
                attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]

        attn_output, attn_weights = attention_interface(
            self,
            queries,
            keys,
            values,
            attention_mask,
            is_causal=self.is_causal,
            scaling=self.scale,
            dropout=0.0 if not self.training else self.dropout,
        )

        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
        attn_output = self.out_proj(attn_output)

        if not output_attentions:
            attn_weights = None
        return attn_output, attn_weights


class Siglip2MLP(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.activation_fn = ACT2FN[config.hidden_act]
        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        hidden_states = self.fc1(hidden_states)
        hidden_states = self.activation_fn(hidden_states)
        hidden_states = self.fc2(hidden_states)
        return hidden_states


class Siglip2EncoderLayer(GradientCheckpointingLayer):
    def __init__(self, config: Union[Siglip2VisionConfig, Siglip2TextConfig]):
        super().__init__()
        self.embed_dim = config.hidden_size
        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.self_attn = Siglip2Attention(config)
        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
        self.mlp = Siglip2MLP(config)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: torch.Tensor,
        output_attentions: Optional[bool] = False,
    ) -> Tuple[torch.FloatTensor]:
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states = self.layer_norm1(hidden_states)
        hidden_states, attn_weights = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
        )
        hidden_states = residual + hidden_states

        residual = hidden_states
        hidden_states = self.layer_norm2(hidden_states)
        hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        outputs = (hidden_states,)
        if output_attentions:
            outputs += (attn_weights,)

        return outputs


class Siglip2Encoder(nn.Module):
    """
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Siglip2EncoderLayer`].

    Args:
        config: Siglip2Config
    """

    def __init__(self, config: Siglip2Config):
        super().__init__()
        self.config = config
        self.layers = nn.ModuleList([Siglip2EncoderLayer(config) for _ in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        inputs_embeds,
        attention_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutput:
        """
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        encoder_states = () if output_hidden_states else None
        all_attentions = () if output_attentions else None

        hidden_states = inputs_embeds
        for encoder_layer in self.layers:
            if output_hidden_states:
                encoder_states = encoder_states + (hidden_states,)
            layer_outputs = encoder_layer(
                hidden_states,
                attention_mask,
                output_attentions=output_attentions,
            )
            hidden_states = layer_outputs[0]
            if output_attentions:
                all_attentions = all_attentions + (layer_outputs[1],)

        if output_hidden_states:
            encoder_states = encoder_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
        )


class Siglip2VisionTransformer(nn.Module):
    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size

        self.embeddings = Siglip2VisionEmbeddings(config)
        self.encoder = Siglip2Encoder(config)
        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
        self.use_head = True if not hasattr(config, "vision_use_head") else config.vision_use_head
        if self.use_head:
            self.head = Siglip2MultiheadAttentionPoolingHead(config)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        hidden_states = self.embeddings(pixel_values, spatial_shapes)

        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            encoder_attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
        else:
            encoder_attention_mask = attention_mask

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=encoder_attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        pooler_output = self.head(last_hidden_state, attention_mask) if self.use_head else None

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooler_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class Siglip2TextEmbeddings(nn.Module):
    def __init__(self, config: Siglip2TextConfig):
        super().__init__()
        embed_dim = config.hidden_size

        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)

        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
        self.register_buffer(
            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
        )

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
        max_position_embedding = self.position_embedding.weight.shape[0]

        if seq_length > max_position_embedding:
            raise ValueError(
                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
                f"{seq_length} and max_position_embeddings: {max_position_embedding})"
            )

        if position_ids is None:
            position_ids = self.position_ids[:, :seq_length]

        if inputs_embeds is None:
            inputs_embeds = self.token_embedding(input_ids)

        position_embeddings = self.position_embedding(position_ids)
        embeddings = inputs_embeds + position_embeddings

        return embeddings


def _trunc_normal_(tensor, mean, std, a, b):
    # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf
    def norm_cdf(x):
        # Computes standard normal cumulative distribution function
        return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0

    if (mean < a - 2 * std) or (mean > b + 2 * std):
        warnings.warn(
            "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. "
            "The distribution of values may be incorrect.",
            stacklevel=2,
        )

    # Values are generated by using a truncated uniform distribution and
    # then using the inverse CDF for the normal distribution.
    # Get upper and lower cdf values
    l = norm_cdf((a - mean) / std)
    u = norm_cdf((b - mean) / std)

    # Uniformly fill tensor with values from [l, u], then translate to [2l-1, 2u-1]
    tensor.uniform_(2 * l - 1, 2 * u - 1)

    # Use inverse cdf transform for normal distribution to get truncated standard normal
    tensor.erfinv_()

    # Transform to proper mean, std
    tensor.mul_(std * math.sqrt(2.0))
    tensor.add_(mean)

    # Clamp to ensure it's in the proper range
    tensor.clamp_(min=a, max=b)


def trunc_normal_tf_(
    tensor: torch.Tensor, mean: float = 0.0, std: float = 1.0, a: float = -2.0, b: float = 2.0
) -> torch.Tensor:
    """Fills the input Tensor with values drawn from a truncated
    normal distribution. The values are effectively drawn from the
    normal distribution :math:`\\mathcal{N}(\\text{mean}, \\text{std}^2)`
    with values outside :math:`[a, b]` redrawn until they are within
    the bounds. The method used for generating the random values works
    best when :math:`a \\leq \\text{mean} \\leq b`.

    NOTE: this 'tf' variant behaves closer to Tensorflow / JAX impl where the
    bounds [a, b] are applied when sampling the normal distribution with mean=0, std=1.0
    and the result is subsequently scaled and shifted by the mean and std args.

    Args:
        tensor: an n-dimensional `torch.Tensor`
        mean: the mean of the normal distribution
        std: the standard deviation of the normal distribution
        a: the minimum cutoff value
        b: the maximum cutoff value
    """
    with torch.no_grad():
        _trunc_normal_(tensor, 0, 1.0, a, b)
        tensor.mul_(std).add_(mean)


def variance_scaling_(tensor, scale=1.0, mode="fan_in", distribution="normal"):
    fan_in, fan_out = _calculate_fan_in_and_fan_out(tensor)
    if mode == "fan_in":
        denom = fan_in
    elif mode == "fan_out":
        denom = fan_out
    elif mode == "fan_avg":
        denom = (fan_in + fan_out) / 2

    variance = scale / denom

    if distribution == "truncated_normal":
        # constant is stddev of standard normal truncated to (-2, 2)
        trunc_normal_tf_(tensor, std=math.sqrt(variance) / 0.87962566103423978)
    elif distribution == "normal":
        with torch.no_grad():
            tensor.normal_(std=math.sqrt(variance))
    elif distribution == "uniform":
        bound = math.sqrt(3 * variance)
        with torch.no_grad():
            tensor.uniform_(-bound, bound)
    else:
        raise ValueError(f"invalid distribution {distribution}")


def lecun_normal_(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="truncated_normal")


def default_flax_embed_init(tensor):
    variance_scaling_(tensor, mode="fan_in", distribution="normal")


class Siglip2TextTransformer(nn.Module):
    def __init__(self, config: Siglip2TextConfig):
        super().__init__()
        self.config = config
        embed_dim = config.hidden_size
        self.embeddings = Siglip2TextEmbeddings(config)
        self.encoder = Siglip2Encoder(config)
        self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)

        self.head = nn.Linear(embed_dim, config.projection_size)
        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        if input_ids is None:
            raise ValueError("You have to specify input_ids")

        input_shape = input_ids.size()
        input_ids = input_ids.view(-1, input_shape[-1])

        hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids)

        # note: Siglip2's text model does not use a causal mask, unlike the original CLIP model.
        # expand attention_mask
        if attention_mask is not None and not self._use_flash_attention_2:
            # [batch_size, seq_len] -> [batch_size, 1, tgt_seq_len, src_seq_len]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            attention_mask=attention_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.final_layer_norm(last_hidden_state)

        # Assuming "sticky" EOS tokenization, last token is always EOS.
        pooled_output = last_hidden_state[:, -1, :]
        pooled_output = self.head(pooled_output)

        return BaseModelOutputWithPooling(
            last_hidden_state=last_hidden_state,
            pooler_output=pooled_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


@auto_docstring
class Siglip2PreTrainedModel(PreTrainedModel):
    config_class = Siglip2Config
    base_model_prefix = "siglip2"
    supports_gradient_checkpointing = True

    _no_split_modules = [
        "Siglip2TextEmbeddings",
        "Siglip2EncoderLayer",
        "Siglip2VisionEmbeddings",
        "Siglip2MultiheadAttentionPoolingHead",
    ]
    _supports_flash_attn_2 = True
    _supports_sdpa = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, Siglip2VisionEmbeddings):
            width = (
                self.config.vision_config.hidden_size
                if isinstance(self.config, Siglip2Config)
                else self.config.hidden_size
            )
            nn.init.normal_(module.position_embedding.weight, std=1 / np.sqrt(width))
        elif isinstance(module, nn.Embedding):
            default_flax_embed_init(module.weight)
        elif isinstance(module, Siglip2Attention):
            nn.init.xavier_uniform_(module.q_proj.weight)
            nn.init.xavier_uniform_(module.k_proj.weight)
            nn.init.xavier_uniform_(module.v_proj.weight)
            nn.init.xavier_uniform_(module.out_proj.weight)
            nn.init.zeros_(module.q_proj.bias)
            nn.init.zeros_(module.k_proj.bias)
            nn.init.zeros_(module.v_proj.bias)
            nn.init.zeros_(module.out_proj.bias)
        elif isinstance(module, Siglip2MLP):
            nn.init.xavier_uniform_(module.fc1.weight)
            nn.init.xavier_uniform_(module.fc2.weight)
            nn.init.normal_(module.fc1.bias, std=1e-6)
            nn.init.normal_(module.fc2.bias, std=1e-6)
        elif isinstance(module, Siglip2MultiheadAttentionPoolingHead):
            nn.init.xavier_uniform_(module.probe.data)
            nn.init.xavier_uniform_(module.attention.in_proj_weight.data)
            nn.init.zeros_(module.attention.in_proj_bias.data)
        elif isinstance(module, Siglip2Model):
            logit_scale_init = torch.log(torch.tensor(1.0))
            module.logit_scale.data.fill_(logit_scale_init)
            module.logit_bias.data.zero_()
        elif isinstance(module, Siglip2ForImageClassification):
            nn.init.normal_(
                module.classifier.weight,
                std=self.config.vision_config.hidden_size**-0.5 * self.config.initializer_factor,
            )
        elif isinstance(module, (nn.Linear, nn.Conv2d)):
            lecun_normal_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


@auto_docstring(
    custom_intro="""
    The text model from Siglip2 without any head or projection on top.
    """
)
class Siglip2TextModel(Siglip2PreTrainedModel):
    config_class = Siglip2TextConfig

    def __init__(self, config: Siglip2TextConfig):
        super().__init__(config)
        self.text_model = Siglip2TextTransformer(config)
        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.text_model.embeddings.token_embedding

    def set_input_embeddings(self, value):
        self.text_model.embeddings.token_embedding = value

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, Siglip2TextModel

        >>> model = Siglip2TextModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```"""
        return self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


class Siglip2MultiheadAttentionPoolingHead(nn.Module):
    """Multihead Attention Pooling."""

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__()

        self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size))
        self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True)
        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.mlp = Siglip2MLP(config)
        self.num_heads = config.num_attention_heads

    def forward(self, hidden_state: torch.Tensor, attention_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
        batch_size = hidden_state.shape[0]
        probe = self.probe.repeat(batch_size, 1, 1)

        if attention_mask is not None:
            target_len, source_len = probe.shape[1], hidden_state.shape[1]
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_state.dtype, target_len)
            attention_mask = attention_mask.repeat(1, self.num_heads, target_len, 1)
            attention_mask = attention_mask.reshape(-1, target_len, source_len)

        hidden_state = self.attention(probe, hidden_state, hidden_state, attn_mask=attention_mask)[0]

        residual = hidden_state
        hidden_state = self.layernorm(hidden_state)
        hidden_state = residual + self.mlp(hidden_state)

        return hidden_state[:, 0]


@auto_docstring(
    custom_intro="""
    The vision model from Siglip2 without any head or projection on top.
    """
)
class Siglip2VisionModel(Siglip2PreTrainedModel):
    config_class = Siglip2VisionConfig
    main_input_name = "pixel_values"

    def __init__(self, config: Siglip2VisionConfig):
        super().__init__(config)

        self.vision_model = Siglip2VisionTransformer(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self) -> nn.Module:
        return self.vision_model.embeddings.patch_embedding

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.FloatTensor,
        pixel_attention_mask: torch.Tensor,
        spatial_shapes: torch.LongTensor,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> BaseModelOutputWithPooling:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Siglip2VisionModel

        >>> model = Siglip2VisionModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled features
        ```"""
        return self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )


@auto_docstring
class Siglip2Model(Siglip2PreTrainedModel):
    config_class = Siglip2Config

    def __init__(self, config: Siglip2Config):
        super().__init__(config)

        if not isinstance(config.text_config, Siglip2TextConfig):
            raise TypeError(
                "config.text_config is expected to be of type Siglip2TextConfig but is of type"
                f" {type(config.text_config)}."
            )

        if not isinstance(config.vision_config, Siglip2VisionConfig):
            raise TypeError(
                "config.vision_config is expected to be of type Siglip2VisionConfig but is of type"
                f" {type(config.vision_config)}."
            )

        text_config = config.text_config
        vision_config = config.vision_config

        # First, initialize the text and vision models with proper attention implementation
        text_model = Siglip2TextModel._from_config(text_config)
        vision_model = Siglip2VisionModel._from_config(vision_config)

        # Second, get the text and vision submodules (for backward compatibility)
        self.text_model = text_model.text_model
        self.vision_model = vision_model.vision_model

        self.logit_scale = nn.Parameter(torch.randn(1))
        self.logit_bias = nn.Parameter(torch.randn(1))

        # Initialize weights and apply final processing
        self.post_init()

    @auto_docstring
    def get_text_features(
        self,
        input_ids: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2TextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> tokenizer = AutoTokenizer.from_pretrained("google/siglip2-base-patch16-224")

        >>> # important: make sure to set padding="max_length" as that's how the model was trained
        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding="max_length", return_tensors="pt")
        >>> with torch.no_grad():
        ...     text_features = model.get_text_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        pooled_output = text_outputs.pooler_output

        return pooled_output

    @auto_docstring
    def get_image_features(
        self,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> torch.FloatTensor:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.

        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim)`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Siglip2VisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     image_features = model.get_image_features(**inputs)
        ```"""
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        pooled_output = vision_outputs.pooler_output

        return pooled_output

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        pixel_values: Optional[torch.FloatTensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        return_loss: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Siglip2Output:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AutoModel
        >>> import torch

        >>> model = AutoModel.from_pretrained("google/siglip2-base-patch16-224")
        >>> processor = AutoProcessor.from_pretrained("google/siglip2-base-patch16-224")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> texts = ["a photo of 2 cats", "a photo of 2 dogs"]
        >>> # important: we pass `padding=max_length` since the model was trained with this
        >>> inputs = processor(text=texts, images=image, padding="max_length", return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> logits_per_image = outputs.logits_per_image
        >>> probs = torch.sigmoid(logits_per_image) # these are the probabilities
        >>> print(f"{probs[0][0]:.1%} that image 0 is '{texts[0]}'")
        31.9% that image 0 is 'a photo of 2 cats'
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        vision_outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values=pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        text_outputs: BaseModelOutputWithPooling = self.text_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        image_embeds = vision_outputs.pooler_output
        text_embeds = text_outputs.pooler_output

        # normalized features
        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)

        # cosine similarity as logits
        logits_per_text = torch.matmul(text_embeds, image_embeds.t().to(text_embeds.device))

        logit_scale, logit_bias = self.logit_scale.to(text_embeds.device), self.logit_bias.to(text_embeds.device)
        logits_per_text = logits_per_text * logit_scale.exp() + logit_bias

        logits_per_image = logits_per_text.t()

        loss = None
        if return_loss:
            # pairwise sigmoid loss: the target is +1 on the diagonal (matched pairs) and -1 elsewhere
            eye = torch.eye(logits_per_text.size(0), device=logits_per_text.device)
            m1_diag1 = -torch.ones_like(logits_per_text) + 2 * eye
            loglik = torch.nn.functional.logsigmoid(m1_diag1 * logits_per_text)
            nll = -torch.sum(loglik, dim=-1)
            loss = nll.mean()

        return Siglip2Output(
            loss=loss,
            logits_per_image=logits_per_image,
            logits_per_text=logits_per_text,
            text_embeds=text_embeds,
            image_embeds=image_embeds,
            text_model_output=text_outputs,
            vision_model_output=vision_outputs,
        )


@auto_docstring(
    custom_intro="""
    Siglip2 vision encoder with an image classification head on top (a linear layer on top of the pooled final hidden states of
    the patch tokens) e.g. for ImageNet.
    """
)
class Siglip2ForImageClassification(Siglip2PreTrainedModel):
    main_input_name = "pixel_values"

    def __init__(self, config: Siglip2Config) -> None:
        super().__init__(config)

        self.num_labels = config.num_labels

        # Create the vision model with proper attention implementation
        # and take only the vision_model submodule (for backward compatibility)
        vision_model = Siglip2VisionModel._from_config(config.vision_config)
        self.vision_model = vision_model.vision_model

        # Classifier head
        self.classifier = (
            nn.Linear(config.vision_config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
        )

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: Optional[torch.Tensor] = None,
        pixel_attention_mask: Optional[torch.Tensor] = None,
        spatial_shapes: Optional[torch.LongTensor] = None,
        labels: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> ImageClassifierOutput:
        r"""
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        spatial_shapes (`torch.LongTensor` of shape `(batch_size, 2)`):
            Tensor containing the spatial dimensions (height, width) of the input images.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, Siglip2ForImageClassification
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> torch.manual_seed(3)  # doctest: +IGNORE_RESULT
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> # note: we are loading a `Siglip2Model` from the hub here,
        >>> # so the head will be randomly initialized, hence the predictions will be random if seed is not set above.
        >>> image_processor = AutoImageProcessor.from_pretrained("google/siglip2-base-patch16-224")
        >>> model = Siglip2ForImageClassification.from_pretrained("google/siglip2-base-patch16-224")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        >>> # model predicts one of the two classes
        >>> predicted_class_idx = logits.argmax(-1).item()
        >>> print("Predicted class:", model.config.id2label[predicted_class_idx])
        Predicted class: LABEL_1
        ```
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        outputs: BaseModelOutputWithPooling = self.vision_model(
            pixel_values,
            attention_mask=pixel_attention_mask,
            spatial_shapes=spatial_shapes,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        sequence_output = outputs.last_hidden_state

        # average pool the patch tokens, masking out padding patches
        if pixel_attention_mask is not None:
            pool_mask = pixel_attention_mask[..., None].to(sequence_output.device)
            sequence_output = torch.sum(sequence_output * pool_mask, dim=1) / torch.sum(pool_mask, dim=1)
        else:
            sequence_output = torch.mean(sequence_output, dim=1)

        # apply classifier
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            if self.config.problem_type is None:
                if self.num_labels == 1:
                    self.config.problem_type = "regression"
                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
                    self.config.problem_type = "single_label_classification"
                else:
                    self.config.problem_type = "multi_label_classification"

            if self.config.problem_type == "regression":
                loss_fct = MSELoss()
                if self.num_labels == 1:
                    loss = loss_fct(logits.squeeze(), labels.squeeze())
                else:
                    loss = loss_fct(logits, labels)
            elif self.config.problem_type == "single_label_classification":
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            elif self.config.problem_type == "multi_label_classification":
                loss_fct = BCEWithLogitsLoss()
                loss = loss_fct(logits, labels)

        return ImageClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


__all__ = [
    "Siglip2Model",
    "Siglip2PreTrainedModel",
    "Siglip2TextModel",
    "Siglip2VisionModel",
    "Siglip2ForImageClassification",
]
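
# A minimal usage sketch, kept as a comment so the module's import-time behavior is
# unchanged. It illustrates how the variable-resolution inputs documented above fit
# together; in practice `AutoProcessor` builds these tensors from images, and the
# hand-rolled shapes below are assumptions derived from the docstrings in this file
# (pixel_values: (batch, max_num_patches, channels * patch_size**2), spatial_shapes:
# (batch, 2) as (height, width) in patches):
#
#     import torch
#     from transformers import Siglip2VisionConfig, Siglip2VisionModel
#
#     config = Siglip2VisionConfig()
#     model = Siglip2VisionModel(config)
#
#     height, width = 16, 16  # image size in patches, matching spatial_shapes below
#     num_patches = height * width
#     pixel_values = torch.randn(1, num_patches, config.num_channels * config.patch_size**2)
#     pixel_attention_mask = torch.ones(1, num_patches, dtype=torch.long)
#     spatial_shapes = torch.tensor([[height, width]])
#
#     outputs = model(pixel_values, pixel_attention_mask, spatial_shapes)
#     print(outputs.pooler_output.shape)  # -> (1, config.hidden_size)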