
    Uh                     `   d dl Zd dlmZ d dlmZmZmZmZm	Z	 d dl
Z
d dlmZ ddlmZ ddlmZ ddlmZ ddlmZ dd	lmZmZmZ dd
lmZmZ ddlmZ ddlmZm Z m!Z!m"Z"m#Z#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*  e$jV                  e,      Z- ed       G d dej\                               Z/	 dAdej\                  de
j`                  de
j`                  de
j`                  dee
j`                     de1de1fdZ2 G d dej\                        Z3e! G d de             Z4e G d  d!e             Z5 G d" d#ej\                        Z6 G d$ d%ej\                        Z7 G d& d'ej\                        Z8ejr                  e/d(Z: G d) d*ej\                        Z; G d+ d,ej\                        Z<e! G d- d.e4             Z=e! G d/ d0e             Z> G d1 d2ej\                        Z?e G d3 d4e             Z@ e!d56       G d7 d8e>             ZAe G d9 d:e              ZB G d; d<ee      ZC e!d=6       G d> d?e>e             ZDg d@ZEy)B    N)	dataclass)CallableListOptionalTupleUnion   )ACT2FN)GenerationMixin)use_kernel_forward_from_hub)FlashAttentionKwargs)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tupleis_torchdynamo_compilinglogging	torch_int   )	AutoModel   )InternVLConfigInternVLVisionConfigRMSNormc                   ,     e Zd Zd fd	Zd Zd Z xZS )InternVLVisionRMSNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zD
        InternVLVisionRMSNorm is equivalent to T5LayerNorm
        N)super__init__nn	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/internvl/modeling_internvl.pyr%   zInternVLVisionRMSNorm.__init__7   s1     	ll5::k#:; #    c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   T)keepdim)	dtypetor(   float32powmeanrsqrtr+   r*   )r,   hidden_statesinput_dtypevariances       r0   forwardzInternVLVisionRMSNorm.forward?   sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r1   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)tupler*   shaper+   r,   s    r0   
extra_reprz InternVLVisionRMSNorm.extra_reprF   s*    ))*+6$2G2G1HIIr1   )gư>)__name__
__module____qualname__r%   r>   rC   __classcell__r/   s   @r0   r"   r"   5   s    $;Jr1   r"   modulequerykeyvalueattention_maskscalingdropoutc                    |}|}	t        j                  ||j                  dd            |z  }
|#|d d d d d d d |j                  d   f   }|
|z   }
t        j
                  j                  |
d      }
t        j
                  j                  |
|| j                        }
t        j                  |
|	      }|j                  dd      j                         }||
fS )Nr   r	   r3   dim)ptrainingr   )
r(   matmul	transposerA   r&   
functionalsoftmaxrO   rU   
contiguous)rI   rJ   rK   rL   rM   rN   rO   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r0   eager_attention_forwardra   J   s     JL<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1 ==((2(>L==((6??([L,,|\:K''1-88:K$$r1   c            
            e Zd ZdZdef fdZ	 	 d	dej                  deej                     deej                     de	e
   fdZ xZS )
InternVLVisionAttentionz+Attention Class for InternVL Vision Encoderconfigc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                  | j                  z  |j                         | _        t        j                  | j                  | j                        | _        |dkD  rt        j*                  |      nt        j,                         | _        |rt/        | j                        nt        j,                         | _        |rt/        | j                        | _        y t        j,                         | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Fbiasr   )r$   r%   rd   r-   	embed_dimnum_attention_heads	num_headshead_dim
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr&   Linearattention_biasq_projk_projv_projprojection_layerDropoutIdentityr"   q_normk_norm)r,   rd   proj_dropoutqk_normr/   s       r0   r%   z InternVLVisionAttention.__init__h   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta?F+DNN;BKKM?F+DNN;BKKMr1   r;   rM   output_attentionsr[   c                 $   |j                         \  }}}| j                  |      }| j                  |      }	| j                  |      }
| j	                  |      }| j                  |	      }	|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	|
j                  ||| j                  | j                        j                  dd      }
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j!                  d       nt"        | j                  j                     } || ||	|
|f| j$                  sdn| j&                  | j(                  dd	|\  }}|j                  ||| j*                        }| j-                  |      }| j/                  |      }|r||f}|S |d f}|S )
Nr   r   eagersdpar~   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )rO   rN   rq   )sizert   ru   rv   rz   r{   reshaperj   rk   rW   viewra   rd   _attn_implementationgetloggerwarning_oncer   rU   rn   rm   rh   rw   ro   )r,   r;   rM   r~   r[   
batch_sizeseq_len_query_statesr\   r]   attention_interfacer`   r^   outputoutputss                   r0   r>   zInternVLVisionAttention.forward   s    "/!3!3!5
GQ{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJ
%
 
%
!\ "))*gt~~N&&{3((0,=6<( EKD>r1   )NN)rD   rE   rF   __doc__r   r%   r(   Tensorr   r   r   r>   rG   rH   s   @r0   rc   rc   e   sd    5Z3 Z> 2648	/||/ !./ $ELL1	/
 -./r1   rc   c                   0    e Zd ZeZdZdZdZdgZdZ	dZ
d Zy)InternVLVisionPreTrainedModelinternvl_visionpixel_valuesTInternVLVisionLayerc                 r   t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j
                  j                  j                  d| j                  j                         |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yt        |t               r|j"                  j                  j                          |j$                  $|j$                  j                  j                          |j&                  %|j&                  j                  j                          yyt        |t(              rs|j*                  j                  j                  | j                  j,                         |j.                  j                  j                  | j                  j,                         yy)zInitialize the weightsr   r9   stdN      ?)
isinstancer&   rr   Conv2dConvTranspose2dr*   datanormal_rd   initializer_rangerg   zero_	Embeddingpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsr   lambda_1layer_scale_init_valuelambda_2)r,   rI   s     r0   _init_weightsz+InternVLVisionPreTrainedModel._init_weights   s   fryy"))R5G5GHI MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) 89!!'')  ,!!&&,,.))5**//557 6 34OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r1   N)rD   rE   rF   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2r    r1   r0   r   r      s2    'L)$O&*#./N!Kr1   r   c                       e Zd ZdZy)$InternVLVisionModelOutputWithPoolinga  
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    N)rD   rE   rF   r   r   r1   r0   r   r      s    r1   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    c                 ^   t         |           |j                  |j                  }}|j                  |j
                  }}|d   |d   z  |d   |d   z  z  }|d   |d   z  |d   |d   z  f}|| _        || _        || _        || _        || _        t        j                  ||||      | _
        y )Nr   r   )kernel_sizestride)r$   r%   
image_size
patch_sizenum_channelsr-   num_patchespatch_shaper&   r   
projection)	r,   rd   r   r   r   r-   r   r   r/   s	           r0   r%   z&InternVLVisionPatchEmbeddings.__init__   s    !'!2!2F4E4EJ
$*$7$79K9Kk!!}
15*Q-:VW=:XY!!}
15z!}
ST7UV$$(&&))L+:^hir1   r   returnc                    |j                   \  }}}}|| j                  k7  rt        d      | j                  |      }|j                   d   |j                   d   }}|j	                  d      j                  dd      }|||ffS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.r   r	   r   )rA   r   rl   r   flattenrW   )	r,   r   r   r   heightwidth
embeddingspatch_heightpatch_widths	            r0   r>   z%InternVLVisionPatchEmbeddings.forward
  s    2>2D2D/
L&%4,,,w  __\2
$.$4$4Q$79I9I!9Lk''*44Q:
L+666r1   )	rD   rE   rF   r   r%   r(   r   r>   rG   rH   s   @r0   r   r      s)    j7ELL 7U\\ 7r1   r   c                        e Zd ZdZdeddf fdZdej                  dededej                  fd	Z		 dd
ej                  de
ej                     dej                  fdZ xZS )r   zc
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    rd   r   Nc                 2   t         |           t        j                  t	        j
                  dd|j                              | _        |j                  r:t        j                  t	        j
                  dd|j                              | _	        nd | _	        t        |      | _        |j                  | _        t        |j                  t        j                   j"                        r|j                  n|j                  |j                  f| _        | j                  j$                  }|j&                  r=t        j                  t	        j
                  d|dz   |j                              | _        nd | _        t        j*                  |j,                        | _        y )Nr   )r$   r%   r&   r'   r(   zerosr-   r   use_mask_tokenr   r   patch_embeddingsr   r   r   collectionsabcIterabler    use_absolute_position_embeddingsr   rx   hidden_dropout_probrO   )r,   rd   r   r/   s      r0   r%   z!InternVLVisionEmbeddings.__init__   s$   ekk!Q8J8J&KL   ll5;;q!V=O=O+PQDO"DO =f E ++ &++[__-E-EF ##V%6%67 	
 ++7722')||EKK;QR?TZTfTf4g'hD$'+D$zz&"<"<=r1   r   r   r   c                    |j                   d   dz
  }| j                  j                   d   dz
  }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  ddddf   }| j                  ddddf   }|j                   d   }|| j
                  d   z  }	|| j
                  d   z  }
t        |dz        }|j                  d|||      }|j                  dddd      }t        j                  j                  ||	|
fdd	
      }|j                  dddd      j                  dd|      }t        j                  ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   Nr3   r         ?r	   r   bicubicF)r   modealign_cornersrR   )rA   r   r(   jit
is_tracingr   r   r   permuter&   rX   interpolater   cat)r,   r   r   r   r   num_positionsclass_pos_embedpatch_pos_embedrS   
new_height	new_widthsqrt_num_positionss               r0   interpolate_pos_encodingz1InternVLVisionEmbeddings.interpolate_pos_encoding6  sj    !&&q)A-0066q9A= yy##%+*F6UZ?+++221bqb59221ab59r"tq11
T__Q//	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr1   r   bool_masked_posc                    |j                   \  }}}}| j                  |      \  }\  }}|j                         \  }	}
}|K| j                  j	                  |	|
d      }|j                  d      j                  |      }|d|z
  z  ||z  z   }| j                  j	                  |	dd      }t        j                  ||fd      }| j                  || j                  |||      z   }| j                  |      }|||ffS )Nr3   r   rR   )rA   r   r   r   expand	unsqueezetype_asr   r(   r   r   r   rO   )r,   r   r   r   r   r   r   r   r   r   r   mask_tokensw
cls_tokenss                 r0   r>   z InternVLVisionEmbeddings.forward^  s   
 +001fe262G2G2U/
/\;!+!2
GQ&//00WbIK))"-55kBA#q1u-a?J^^**:r2>
YY
J7Q?
##/#d&C&CJPVX]&^^J\\*-
L+666r1   N)rD   rE   rF   r   r   r%   r(   r   intr   r   
BoolTensorr>   rG   rH   s   @r0   r   r     s    
>3 > >,&D5<< &D &DUX &D]b]i]i &DV 7;7ll7 "%"2"237 
	7r1   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )InternVLVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r$   r%   rd   r
   
hidden_actactivation_fnr&   rr   r-   intermediate_sizefc1fc2r,   rd   r/   s     r0   r%   zInternVLVisionMLP.__init__y  sd    #F$5$5699V//1I1IJ99V55v7I7IJr1   r;   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )r,   r;   s     r0   r>   zInternVLVisionMLP.forward  s4    /**=9/r1   )rD   rE   rF   r%   r(   r   r>   rG   rH   s   @r0   r   r   x  s$    KU\\ ell r1   r   )
layer_normrms_normc                        e Zd ZdZdeddf fdZ	 d	dej                  dede	e
ej                     e
ej                  ej                  f   f   fdZ xZS )
r   z?This corresponds to the Block class in the timm implementation.rd   r   Nc                    t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                     |j                  |j                        | _        t        |j                     |j                  |j                        | _        |j                  }t        j                   |t#        j$                  |j                        z  d      | _        t        j                   |t#        j$                  |j                        z  d      | _        t        j*                  |j,                        | _        y )Nr   r.   T)requires_grad)r$   r%   chunk_size_feed_forwardseq_len_dimrc   	attentionr   mlpNORM2FN	norm_typer-   layer_norm_epslayernorm_beforelayernorm_afterr   r&   r'   r(   r)   r   r   rx   r   rO   )r,   rd   init_valuesr/   s      r0   r%   zInternVLVisionLayer.__init__  s    '-'E'E$08$V, '(8(8 9&:L:LRXRgRg h&v'7'789K9KQWQfQfg33[5::v?Q?Q3S%Scgh[5::v?Q?Q3S%Scghzz&"<"<=r1   r;   r~   c                 "   | j                  | j                  |      |      \  }}| j                  |z  }||z   }| j                  |      }| j	                  |      }| j                  |      }| j                  | j                  |z  }||z   }||fS )N)r~   )r  r	  r   r
  r  rO   r   )r,   r;   r~   attention_outputattention_weightslayer_outputs         r0   r>   zInternVLVisionLayer.forward  s    
 /3nn!!-0/ /= /
++
  ==+;; )=8 ++M:xx-||L1==$==<7L $m3...r1   )F)rD   rE   rF   r   r   r%   r(   r   boolr   r   r>   rG   rH   s   @r0   r   r     si    I>3 > >$ #(/||/  / 
uU\\"E%,,*D$EE	F	/r1   r   c                   n     e Zd Zdeddf fdZe	 	 d	dej                  dedede	e
ef   fd       Z xZS )
InternVLVisionEncoderrd   r   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r$   r%   rd   r&   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r,   rd   ir/   s      r0   r%   zInternVLVisionEncoder.__init__  sO    ]]vOgOgIh#iA$7$?#ij
&+# $js   A#r;   r~   output_hidden_statesc                 0   |rdnd }|rdnd }t        | j                        D ]]  \  }}|r||fz   }| j                  r*| j                  r| j	                  |j
                  ||      }n	 |||      }|d   }|sU||d   fz   }_ |r||fz   }t        |||      S )Nr   r   r   last_hidden_stater;   
attentions)	enumerater  r  rU   _gradient_checkpointing_func__call__r   )	r,   r;   r~   r  all_hidden_statesall_self_attentionsr  layer_modulelayer_outputss	            r0   r>   zInternVLVisionEncoder.forward  s     #7BD$5b4(4 	POA|#$58H$H!**t}} $ A A ))=:K! !-]<M N)!,M &9]1=M<O&O#	P    1]4D D++*
 	
r1   )FF)rD   rE   rF   r   r%   r   r(   r   r  r   r@   r   r>   rG   rH   s   @r0   r  r    sg    ,3 , ,  #(%*	 
|| 
   
 #	 

 
uo%	& 
  
r1   r  c                        e Zd Zdeddf fdZd Zee	 	 	 ddej                  de
ej                     de
e   d	e
e   deeef   f
d
              Z xZS )InternVLVisionModelrd   r   Nc                 2   t         |   |       || _        t        |      | _        t        |      | _        |j                  rt        j                         n*t        j                  |j                  |j                        | _        | j                          y )Nr   )r$   r%   rd   r   r   r  encoderuse_mean_poolingr&   ry   r   r-   r  	layernorm	post_initr   s     r0   r%   zInternVLVisionModel.__init__  so     26:,V4 $44BKKM",,vGYGY_e_t_t:u 	
 	r1   c                 .    | j                   j                  S r   )r   r   rB   s    r0   get_input_embeddingsz(InternVLVisionModel.get_input_embeddings  s    ///r1   r   r   r~   r  c                 .   ||n| j                   j                  }||n| j                   j                  }| j                  ||      \  }}| j	                  |||      }|d   }| j                  |      }t        ||j                  |j                        S )z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        )r   )r~   r  r   r  )	rd   r~   r  r   r)  r+  r   r;   r  )	r,   r   r   r~   r  embedding_outputr   encoder_outputssequence_outputs	            r0   r>   zInternVLVisionModel.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 #oolOo\!,,/!5 ' 

 *!,..93-)77&11
 	
r1   )NNN)rD   rE   rF   r   r%   r.  r   r   r(   r   r   r   r  r   r@   r   r>   rG   rH   s   @r0   r'  r'    s    3  0  7;,0/3
ll
 "%"2"23
 $D>	

 'tn
 
u::	;
  
r1   r'  c                   :    e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZd Zy)InternVLPreTrainedModel Tpast_key_valuesc                 $   t        | j                  d| j                  j                         j                        }t	        |t
        j                        rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                        rJ|j                  j                  j                          |j                  j                  j                  d       y y )Nr   r   r   r   )getattrrd   get_text_configr   r   r&   rr   r*   r   r   rg   r   r   r   )r,   rI   r   s      r0   r   z%InternVLPreTrainedModel._init_weights(  s    dkk#68S8S8U8g8ghfbii(MM&&CS&9{{&  &&( '-KK""$MM$$S) .r1   N)rD   rE   rF   r   r   r   r   _skip_keys_device_placement_supports_cache_classr   r   _supports_quantized_cache_supports_static_cache_supports_attention_backendr   r   r1   r0   r4  r4    sA    !L&*#"3 !N $!"&	*r1   r4  c                   *     e Zd Zdef fdZd Z xZS )InternVLMultiModalProjectorrd   c                 *   t         |           t        j                  |j                  j
                  t        d|j                  z        dz  z        | _        t        j                  |j                  j
                  t        d|j                  z        dz  z  |j                  j
                        | _        t        |j                     | _        t        j                  |j                  j
                  |j                  j
                        | _        y )Nr   r   )r$   r%   r&   r   vision_configr-   r   downsample_ratior   rr   text_configlinear_1r
   projector_hidden_actactlinear_2r   s     r0   r%   z$InternVLMultiModalProjector.__init__5  s    ,,v';';'G'G#aRXRiRiNiJjnoJo'op		  ,,s1v7N7N3N/OST/TTV\VhVhVtVt
 &556		&"4"4"@"@&BTBTB`B`ar1   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r   )r   rE  rG  rH  )r,   image_featuresr;   s      r0   r>   z#InternVLMultiModalProjector.forward>  s@    7m4/m4r1   )rD   rE   rF   r   r%   r>   rG   rH   s   @r0   r@  r@  4  s    b~ br1   r@  c                   :    e Zd ZU dZdZeej                     ed<   y)InternVLModelOutputWithPasta  
    Base class for InternVL outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	rD   rE   rF   r   rM  r   r(   FloatTensor__annotations__r   r1   r0   rL  rL  F  s    8 8<%"3"34;r1   rL  zx
    The InternVL model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc            %           e Zd ZddiZdef fdZd Zd Zdej                  de
eee   f   d	efd
Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej"                  dej                  deej&                     deej"                     deeej                        deej                     dee
eee   f      d	ee   dee   dee   dee   dee   deej"                     dej&                  dee   de
eef   f d              Zddej&                  defdZ xZS )InternVLModelzlanguage_model.modellanguage_modelrd   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y r   )r$   r%   r   from_configrB  vision_towerr@  multi_modal_projectorrD  rS  r,  r   s     r0   r%   zInternVLModel.__init__o  sY     %11&2F2FG%@%H"'33F4F4FGr1   c                 6    | j                   j                         S r   )rS  r.  rB   s    r0   r.  z"InternVLModel.get_input_embeddingsw  s    ""7799r1   c                 :    | j                   j                  |       y r   )rS  set_input_embeddingsr,   rL   s     r0   rZ  z"InternVLModel.set_input_embeddingsz  s    007r1   r   vision_feature_layervision_feature_select_strategyc                    | j                   j                  }|dk(  r| j                  |      j                  }n| j	                  |      j
                  |   }|dk(  r|ddddddf   }|j                  d   }t        |dz        }|j                  d   }	|j                  |	||d      }| j                  ||      }|j                  |	d|j                  d         }| j                  |      }|S )	a%  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `List[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        r3   )r   defaultNr   r   r   )scale_factor)rd   rC  rV  r  vision_modelr;   rA   r   r   pixel_shufflerW  )
r,   r   r\  r]  r[   rC  vision_featureschannelsfeature_sizer   s
             r0   get_image_featuresz InternVLModel.get_image_features}  s	   $  ;;772%"//\/J\\O"//\/JXXYmnO)Y6-aQh7O #((+8S=)$**1-
 *11*lLZ\] ,,_K[,\ *11*b/BWBWXZB[\ 44_Er1   	input_idsrM   position_idsr6  inputs_embeds	use_cacher~   r  return_dictcache_positionimage_sizesr[   r   c                    |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j
                  }|d u |d uz  rt        d      | | j                         |      }|| j                  ||||      }|| | j                         t        j                  | j                   j                  t        j                  |j                              k(  }|j                  d      j                  d      d   }ny|| j                   j                  k(  j                  d      }|j!                  |      j#                  |j                        }|| j                   j                  k(  j                         }t%               s{||   j'                         |j'                         k7  rW|| j                   j                  k(  j                         }|j(                  d   |j(                  d   z  }t        d| d	|       |j#                  |j                  |j*                        }|j-                  ||      } | j.                  d|||||	|
|d
|d	|}t1        |j2                  |j4                  |j6                  |j8                  |      S d       S )Nz:You must specify exactly one of input_ids or inputs_embeds)r   r\  r]  rm  )r5   devicer   rR   r   r3   z6Image features and image tokens do not match: tokens: z, features T)	rM   rh  r6  ri  rj  r~   r  rk  rl  )r  r6  r;   r  rM  r   )rd   r~   r  use_return_dictr\  r]  rl   r.  rf  r(   tensorimage_token_idlongro  sumr   	expand_asr6   r   numelrA   r5   masked_scatterrS  rL  r  r6  r;   r  )r,   rg  r   rM   rh  r6  ri  r\  r]  rj  r~   r  rk  rl  rm  r[   rJ  special_image_maskn_image_tokensn_image_featuresr   s                        r0   r>   zInternVLModel.forward  s   ( 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	
 .9 +;; 	' -t";<YZZ 7D557	BM#!44)%9/M'	 5 N  %26Qd6O6O6QLL!;!;5::VcVjVjk7 &" #5!9!9a!9!@!D!D!D!KA!N&/4;;3M3M&M%X%XY[%\"%7%A%A-%P%S%STaThTh%i""+t{{/I/I"I!N!N!P+--@R2S2Y2Y2[_m_s_s_u2u"+t{{/I/I"I!N!N!P#1#7#7#:^=Q=QRS=T#T  L^L\\ghxgyz  ,..}/C/C]EXEXYN)889K^\M%$%% 
)%+'/!5)
 
 +%77#33!//))2>2J
 	

 QU
 	
r1   rc  r`  c           
         |j                         \  }}}}||z  dk7  s||z  dk7  rt        d      |j                  ||t        ||z        t        ||z              }|j	                  dddd      j                         }|j                  |t        ||z        t        ||z        t        ||dz  z              }|j	                  dddd      j                         }|S )a&  Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        r   zKHeight and width must be divisible by scale_factor for proper downsampling.r   r   r	   )r   rl   r   r   r   rZ   )r,   rc  r`  r   r   r   rd  s          r0   rb  zInternVLModel.pixel_shuffle  s     />.B.B.D+
E68L A%)=)Bjkk *..s6L#893x,?V;W
 *11!Q1=HHJ *..F\12C8L4MsS[_kmn_nSoOp

 *11!Q1=HHJr1   )NNNNNNNNNNNNNN)r   )rD   rE   rF   _checkpoint_conversion_mappingr   r%   r.  rZ  r(   rN  r   r   r   strrf  r   r   
LongTensorr   r   r  r   r   r   rL  r>   floatrb  rG   rH   s   @r0   rR  rR  g  s    '=>N%O"~ :8+''+ $CcN3+ ),	+Z  '+*.1537=A59@D8<$(,0/3&*59$(T
##T
 ''T
 !.	T

 u//0T
 "$u'8'8"9:T
   1 12T
 'uS$s)^'<=T
 )1T
 D>T
 $D>T
 'tnT
 d^T
 !!1!12T
 \\T
  -.!T
" 
u11	2#T
  T
l!U\\ ! !r1   rR  c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	InternVLCausalLMOutputWithPasta  
    Base class for InternVL causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitsr6  r;   r  rM  )rD   rE   rF   r   r  r   r(   rN  rO  r  r6  r   r;   r   r  rM  r   r1   r0   r  r  &  s    < )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r1   r  c                       e Zd Zy)KwargsForCausalLMN)rD   rE   rF   r   r1   r0   r  r  N  s    r1   r  zV
    The INTERNVL model which consists of a vision backbone and a language model.
    c            )           e Zd ZdddddZdgZdef fdZd	 Zd
 Zde	j                  fdZd Zed        Zed        Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d)dej(                  dej*                  deej.                     deej(                     deeej*                        deej*                     deeeee   f      dee   deej(                     dee   dee   dee   dee   deej(                     deeej.                  f   d eej.                     d!ee   deee f   f$d"              Z!	 	 	 	 	 	 d* fd#	Z"e#dej.                  d$ed%ed&ejH                  dej.                  d'efd(       Z% xZ&S )+ InternVLForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightrd   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFrf   )r$   r%   rR  modelr&   rr   rD  r-   
vocab_sizer  r,  r   s     r0   r%   z)InternVLForConditionalGeneration.__init___  sS     "6*
yy!3!3!?!?ASASA^A^ejkr1   c                 6    | j                   j                         S r   )r  r.  rB   s    r0   r.  z5InternVLForConditionalGeneration.get_input_embeddingse  s    zz..00r1   c                 :    | j                   j                  |       y r   )r  rZ  r[  s     r0   rZ  z5InternVLForConditionalGeneration.set_input_embeddingsh  s    

''.r1   r   c                     | j                   S r   r  rB   s    r0   get_output_embeddingsz6InternVLForConditionalGeneration.get_output_embeddingsk  s    ||r1   c                     || _         y r   r  )r,   new_embeddingss     r0   set_output_embeddingsz6InternVLForConditionalGeneration.set_output_embeddingsn  s	    %r1   c                 .    | j                   j                  S r   )r  rS  rB   s    r0   rS  z/InternVLForConditionalGeneration.language_modelr  s    zz(((r1   c                 .    | j                   j                  S r   )r  rV  rB   s    r0   rV  z-InternVLForConditionalGeneration.vision_towerv  s    zz&&&r1   c                 .    | j                   j                  S r   )r  rW  rB   s    r0   rW  z6InternVLForConditionalGeneration.multi_modal_projectorz  s    zz///r1   rg  r   rM   rh  r6  ri  r\  r]  labelsrj  r~   r  rk  rl  logits_to_keeprm  r[   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }||n| j                   j
                  } | j                  d|||||||||
||d||d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                   |j"                  |j$                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```NT)rg  r   rM   rh  r6  ri  r\  r]  rj  r~   r  rk  rl  rm  r   )r  r  r  )r  r  r6  r;   r  rM  r   )rd   r~   r  rp  r\  r]  r  r   r   slicer  loss_functionrD  r  r  r6  r;   r  rM  )r,   rg  r   rM   rh  r6  ri  r\  r]  r  rj  r~   r  rk  rl  r  rm  r[   r   r;   slice_indicesr  r  s                          r0   r>   z(InternVLForConditionalGeneration.forward~  s   | 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$8$D $++JjJj 	
 .9 +;; 	' $** 
%)%+'!5+I/!5)#
 
$  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD .#33!//)) ' ; ;
 	
r1   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r6  ri  rM   rl  r  r   r   )r$   prepare_inputs_for_generation)r,   rg  r6  ri  r   rM   rl  r  r[   model_inputsr/   s             r0   r  z>InternVLForConditionalGeneration.prepare_inputs_for_generation  sV     w<
+')))
 
 !! ,8L(r1   sequence_lengthtarget_lengthr5   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer5   ro  r   )diagonal)ro  r3   r   )rS   r(   finfominfullro  triuaranger   r   clonerA   r6   masked_fill)rM   r  r  r5   rl  r   r[   r_   	min_dtypemask_lengthpadding_masks              r0   5_prepare_4d_causal_attention_mask_with_cache_positionzVInternVLForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r1   )NNNNNNNNNNNNNNr   N)NNNNNN)'rD   rE   rF   r|  _tied_weights_keysr   r%   r.  rZ  r&   Moduler  r  propertyrS  rV  rW  r   r   r(   r~  rN  r   r   r   r   r   r}  r  r   r  r   r  r>   r  staticmethodr5   r  rG   rH   s   @r0   r  r  Q  s    "8-"?#,	&" ++~ 1/ryy & ) ) ' ' 0 0  '+*.1537=A59@D8<-1$(,0/3&*5934.2#n
##n
 ''n
 !.	n

 u//0n
 "$u'8'8"9:n
   1 12n
 'uS$s)^'<=n
 )1n
 ))*n
 D>n
 $D>n
 'tnn
 d^n
 !!1!12n
  c5<</0!n
" ell+#n
$ *+%n
& 
u44	5'n
  n
f < 444 4 {{	4
 4 4 4r1   r  )r   r'  r4  rR  r  )r   )Fcollections.abcr   dataclassesr   typingr   r   r   r   r   r(   torch.nnr&   activationsr
   
generationr   integrationsr   modeling_flash_attention_utilsr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   r   autor   configuration_internvlr   r   
get_loggerrD   r   r  r"   r   r  ra   rc   r   r   r   r   r   r   r  r   r  r'  r4  r@  rL  rR  r  r  r  __all__r   r1   r0   <module>r     s  .  ! 9 9   ! ) 7 B d d F &    H 
		H	% Y'JBII J (J6 %II%<<% 
% <<	%
 U\\*% % %6Nbii Nb  KO  K  KF +E  2!7BII !7L[7ryy [7|		  3H
I-/")) -/`(
BII (
V 2
7 2
 2
j *o * *0")) $ <"9 < <@ 
w+ w
wt $<[ $< $<N ?,j > 
m'> m
m`r1   