import collections.abc
from dataclasses import dataclass
from typing import Callable, List, Optional, Tuple, Union

import torch
import torch.nn as nn
import torch.utils.checkpoint

from ...activations import ACT2FN
from ...modeling_flash_attention_utils import FlashAttentionKwargs
from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from ...processing_utils import Unpack
from ...utils import auto_docstring, can_return_tuple, logging, torch_int
from ..clip.modeling_clip import CLIPMLP
from ..janus.modeling_janus import JanusVisionAttention
from ..llama.modeling_llama import LlamaRMSNorm
from ..llava.modeling_llava import (
    LlavaCausalLMOutputWithPast,
    LlavaForConditionalGeneration,
    LlavaModel,
    LlavaPreTrainedModel,
)
from .configuration_internvl import InternVLConfig, InternVLVisionConfig


logger = logging.get_logger(__name__)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs,
):
    key_states = key
    value_states = value

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights
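

# Usage sketch (illustrative; the shapes below follow from the reshapes in
# `InternVLVisionAttention.forward`, not from anything checkpoint-specific):
# `query`/`key`/`value` are laid out as (batch, num_heads, seq_len, head_dim) and the
# returned attention output is transposed back to (batch, seq_len, num_heads, head_dim),
# alongside softmaxed weights of shape (batch, num_heads, seq_len, seq_len).
#
#     q = k = v = torch.randn(2, 4, 16, 32)
#     out, weights = eager_attention_forward(nn.Identity(), q, k, v, None, scaling=32**-0.5)
#     out.shape      # torch.Size([2, 16, 4, 32])
#     weights.shape  # torch.Size([2, 4, 16, 16])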
__module____qualname__ r>   r<   r@   r@   I       r>   r@   c            
            e Zd Zdef fdZ	 	 ddej                  deej                     deej                     dee	   fdZ
 xZS )	InternVLVisionAttentionconfigc                    t         |           | `d| _        |j                  }|rt        | j                        nt        j                         | _	        |rt        | j                        | _
        y t        j                         | _
        y NF)super__init__num_key_value_groups	is_causaluse_qk_normr@   	embed_dimr2   Identityq_normk_norm)selfrI   qk_norm	__class__s      r<   rM   z InternVLVisionAttention.__init__N   sb    % $$?F+DNN;BKKM?F+DNN;BKKMr>   hidden_statesr$   output_attentionsr6   c                 $   |j                         \  }}}| j                  |      }| j                  |      }	| j                  |      }
| j	                  |      }| j                  |	      }	|j                  ||| j                  | j                        j                  dd      }|	j                  ||| j                  | j                        j                  dd      }	|
j                  ||| j                  | j                        j                  dd      }
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j!                  d       nt"        | j                  j                     } || ||	|
|f| j$                  sdn| j&                  | j(                  dd	|\  }}|j                  ||| j*                        }| j-                  |      }| j/                  |      }|r||f}|S |d f}|S )
Nr   r   eagersdparY   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )r&   r%   rO   )sizeq_projk_projv_projrS   rT   reshape	num_headshead_dimr0   viewr=   rI   _attn_implementationgetloggerwarning_oncer   r-   attention_dropoutscalerQ   projection_layerprojection_dropout)rU   rX   r$   rY   r6   
batch_sizeseq_len_query_statesr7   r8   attention_interfacer;   r9   outputoutputss                   r<   forwardzInternVLVisionAttention.forwardY   s    "/!3!3!5
GQ{{=1[[/
{{=1{{<0[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJ
%
 
%
!\ "))*gt~~N&&{3((0,=6<( EKD>r>   )NN)rB   rC   rD   r   rM   r.   Tensorr   r   r   ru   __classcell__rW   s   @r<   rH   rH   M   sa    	Z3 	Z 2648	/||/ !./ $ELL1	/
 -./r>   rH   c                   0    e Zd ZeZdZdZdZdgZdZ	dZ
d Zy)InternVLVisionPreTrainedModelinternvl_visionpixel_valuesTInternVLVisionLayerc                 r   t        |t        j                  t        j                  t        j                  f      rm|j
                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j
                  j                  j                  d| j                  j                         |j                  2|j
                  j                  |j                     j                          yyt        |t        j                        rJ|j                  j                  j                          |j
                  j                  j                  d       yt        |t               r|j"                  j                  j                          |j$                  $|j$                  j                  j                          |j&                  %|j&                  j                  j                          yyt        |t(              rs|j*                  j                  j                  | j                  j,                         |j.                  j                  j                  | j                  j,                         yy)zInitialize the weightsr]   meanstdN      ?)
isinstancer2   LinearConv2dConvTranspose2dweightdatanormal_rI   initializer_rangebiaszero_	Embeddingpadding_idx	LayerNormfill_InternVLVisionEmbeddings	cls_token
mask_tokenposition_embeddingsr}   lambda_1layer_scale_init_valuelambda_2)rU   r    s     r<   _init_weightsz+InternVLVisionPreTrainedModel._init_weights   s   fryy"))R5G5GHI MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) 89!!'')  ,!!&&,,.))5**//557 6 34OO  &&t{{'I'IJOO  &&t{{'I'IJ 5r>   N)rB   rC   rD   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_sdpa_supports_flash_attn_2r   rE   r>   r<   rz   rz      s2    'L)$O&*#./N!Kr>   rz   c                       e Zd ZdZy)$InternVLVisionModelOutputWithPoolinga  
    Class for outputs of [`InternVLVisionModel`].

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            Average of the last layer hidden states of the patch tokens (excluding the *[CLS]* token) if
            *config.use_mean_pooling* is set to True. If set to False, then the final hidden state of the *[CLS]* token
            will be returned.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    N)rB   rC   rD   __doc__rE   r>   r<   r   r      s    r>   r   c                   Z     e Zd ZdZ fdZdej                  dej                  fdZ xZS )InternVLVisionPatchEmbeddingsz
    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
    Transformer.
    """

    def __init__(self, config):
        super().__init__()
        image_size, patch_size = config.image_size, config.patch_size
        num_channels, hidden_size = config.num_channels, config.hidden_size

        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
        patch_shape = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
        self.image_size = image_size
        self.patch_size = patch_size
        self.num_channels = num_channels
        self.num_patches = num_patches
        self.patch_shape = patch_shape

        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)

    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
        batch_size, num_channels, height, width = pixel_values.shape
        if num_channels != self.num_channels:
            raise ValueError(
                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
            )

        embeddings = self.projection(pixel_values)
        patch_height, patch_width = embeddings.shape[2], embeddings.shape[3]
        embeddings = embeddings.flatten(2).transpose(1, 2)

        return embeddings, (patch_height, patch_width)


class InternVLVisionEmbeddings(nn.Module):
    """
    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.

    """

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        if config.use_mask_token:
            self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
        else:
            self.mask_token = None
        self.patch_embeddings = InternVLVisionPatchEmbeddings(config)
        self.patch_size = config.patch_size
        self.image_size = (
            config.image_size
            if isinstance(config.image_size, collections.abc.Iterable)
            else (config.image_size, config.image_size)
        )
        num_patches = self.patch_embeddings.num_patches
        if config.use_absolute_position_embeddings:
            self.position_embeddings = nn.Parameter(torch.zeros(1, num_patches + 1, config.hidden_size))
        else:
            self.position_embeddings = None

        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
        """
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        """
        num_patches = embeddings.shape[1] - 1
        num_positions = self.position_embeddings.shape[1] - 1

        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
            return self.position_embeddings

        class_pos_embed = self.position_embeddings[:, :1]
        patch_pos_embed = self.position_embeddings[:, 1:]

        dim = embeddings.shape[-1]

        new_height = height // self.patch_size[0]
        new_width = width // self.patch_size[1]

        sqrt_num_positions = torch_int(num_positions**0.5)
        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)

        patch_pos_embed = nn.functional.interpolate(
            patch_pos_embed,
            size=(new_height, new_width),
            mode="bicubic",
            align_corners=False,
        )

        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)

        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)

    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
    ) -> torch.Tensor:
        _, _, height, width = pixel_values.shape
        embeddings, (patch_height, patch_width) = self.patch_embeddings(pixel_values)
        batch_size, seq_len, _ = embeddings.size()

        if bool_masked_pos is not None:
            mask_tokens = self.mask_token.expand(batch_size, seq_len, -1)
            # replace the masked visual tokens by mask_tokens
            w = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
            embeddings = embeddings * (1 - w) + mask_tokens * w

        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        embeddings = torch.cat((cls_tokens, embeddings), dim=1)

        if self.position_embeddings is not None:
            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)

        embeddings = self.dropout(embeddings)

        return embeddings, (patch_height, patch_width)


class InternVLVisionMLP(CLIPMLP):
    pass


NORM2FN = {"layer_norm": nn.LayerNorm, "rms_norm": InternVLVisionRMSNorm}


class InternVLVisionLayer(nn.Module):
    """This corresponds to the Block class in the timm implementation."""

    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.chunk_size_feed_forward = config.chunk_size_feed_forward
        self.seq_len_dim = 1
        self.attention = InternVLVisionAttention(config)
        self.mlp = InternVLVisionMLP(config)
        self.layernorm_before = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)
        self.layernorm_after = NORM2FN[config.norm_type](config.hidden_size, eps=config.layer_norm_eps)

        init_values = config.layer_scale_init_value
        self.lambda_1 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.lambda_2 = nn.Parameter(init_values * torch.ones(config.hidden_size), requires_grad=True)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)

    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
    ) -> Union[Tuple[torch.Tensor], Tuple[torch.Tensor, torch.Tensor]]:
        attention_output, attention_weights = self.attention(
            # in InternVLVision, layernorm is applied before self-attention
            self.layernorm_before(hidden_states),
            output_attentions=output_attentions,
        )
        attention_output = self.lambda_1 * attention_output

        # first residual connection
        hidden_states = attention_output + hidden_states

        # in InternVLVision, layernorm is also applied after self-attention
        layer_output = self.layernorm_after(hidden_states)

        layer_output = self.mlp(layer_output)
        layer_output = self.dropout(layer_output)

        if self.lambda_2 is not None:
            layer_output = self.lambda_2 * layer_output

        # second residual connection
        layer_output = layer_output + hidden_states

        return layer_output, attention_weights


class InternVLVisionEncoder(nn.Module):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__()
        self.config = config
        self.layer = nn.ModuleList([InternVLVisionLayer(config) for i in range(config.num_hidden_layers)])
        self.gradient_checkpointing = False

    @can_return_tuple
    def forward(
        self,
        hidden_states: torch.Tensor,
        output_attentions: bool = False,
        output_hidden_states: bool = False,
    ) -> Union[tuple, BaseModelOutput]:
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        for i, layer_module in enumerate(self.layer):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    layer_module.__call__,
                    hidden_states,
                    output_attentions,
                )
            else:
                layer_outputs = layer_module(hidden_states, output_attentions)

            hidden_states = layer_outputs[0]

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


@auto_docstring
class InternVLVisionModel(InternVLVisionPreTrainedModel):
    def __init__(self, config: InternVLVisionConfig) -> None:
        super().__init__(config)
        self.config = config
        self.embeddings = InternVLVisionEmbeddings(config)
        self.encoder = InternVLVisionEncoder(config)

        self.layernorm = (
            nn.Identity() if config.use_mean_pooling else nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        )

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.embeddings.patch_embeddings

    @can_return_tuple
    @auto_docstring
    def forward(
        self,
        pixel_values: torch.Tensor,
        bool_masked_pos: Optional[torch.BoolTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
    ) -> Union[tuple, InternVLVisionModelOutputWithPooling]:
        r"""
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )

        embedding_output, _ = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)

        encoder_outputs = self.encoder(
            embedding_output,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]
        sequence_output = self.layernorm(sequence_output)

        return InternVLVisionModelOutputWithPooling(
            last_hidden_state=sequence_output,
            hidden_states=encoder_outputs.hidden_states,
            attentions=encoder_outputs.attentions,
        )


class InternVLPreTrainedModel(LlavaPreTrainedModel):
    def _init_weights(self, module):
        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)

        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


INTERNVL_INPUTS_DOCSTRING = None


class InternVLMultiModalProjector(nn.Module):
    def __init__(self, config: InternVLConfig):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2)
        self.linear_1 = nn.Linear(
            config.vision_config.hidden_size * int(1 / config.downsample_ratio) ** 2, config.text_config.hidden_size
        )
        self.act = ACT2FN[config.projector_hidden_act]
        self.linear_2 = nn.Linear(config.text_config.hidden_size, config.text_config.hidden_size)

    def forward(self, image_features):
        hidden_states = self.layer_norm(image_features)
        hidden_states = self.linear_1(hidden_states)
        hidden_states = self.act(hidden_states)
        hidden_states = self.linear_2(hidden_states)
        return hidden_states


class InternVLModel(LlavaModel):
    def pixel_shuffle(self, vision_features: torch.Tensor, scale_factor: float = 0.5):
        """Perform pixel shuffle downsampling on vision features.

        Args:
            vision_features (`torch.Tensor`):
                Input tensor of shape (batch_size, width, height, channels).
            scale_factor (`float`, *optional*, defaults to `0.5`):
                Factor by which to downsample. Default is 0.5, which halves the dimensions.

        Returns:
            vision_features (`torch.Tensor`):
                Downsampled tensor of shape (batch_size, height*scale_factor, width*scale_factor, channels/(scale_factor^2)).
        """
        batch_size, width, height, channels = vision_features.size()

        if height % scale_factor != 0 or width % scale_factor != 0:
            raise ValueError("Height and width must be divisible by scale_factor for proper downsampling.")

        # Reshape to allow downsampling
        vision_features = vision_features.view(
            batch_size, width, int(height * scale_factor), int(channels / scale_factor)
        )
        # Permute dimensions to rearrange spatial blocks into channels
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()
        # Reshape to achieve final downsampled dimensions
        vision_features = vision_features.view(
            batch_size, int(height * scale_factor), int(width * scale_factor), int(channels / (scale_factor**2))
        )
        # Swap height and width back for proper orientation
        vision_features = vision_features.permute(0, 2, 1, 3).contiguous()

        return vision_features

    def get_image_features(
        self,
        pixel_values: torch.FloatTensor,
        vision_feature_layer: Union[int, List[int]],
        vision_feature_select_strategy: str,
        **kwargs,
    ):
        """
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layer (`int` or `List[int]`):
                Layer index or list of layer indices to extract features from.
        Returns:
            vision_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`.
        """
        downsample_ratio = self.config.downsample_ratio
        if vision_feature_layer == -1:
            vision_features = self.vision_tower(pixel_values=pixel_values).last_hidden_state
        else:
            vision_features = self.vision_tower(pixel_values=pixel_values).hidden_states[vision_feature_layer]
        if vision_feature_select_strategy == "default":
            vision_features = vision_features[:, 1:]

        # Calculate dimensions based on vision features
        channels = vision_features.shape[1]
        feature_size = int(channels**0.5)
        batch_size = vision_features.shape[0]

        # Reshape tensor to spatial dimensions
        vision_features = vision_features.reshape(batch_size, feature_size, feature_size, -1)

        # Apply downsampling using pixel shuffle
        vision_features = self.pixel_shuffle(vision_features, scale_factor=downsample_ratio)

        # Reshape tensor to final shape
        vision_features = vision_features.reshape(batch_size, -1, vision_features.shape[-1])

        vision_features = self.multi_modal_projector(vision_features)

        return vision_features


class InternVLCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
    pass


class InternVLForConditionalGeneration(LlavaForConditionalGeneration):
    def forward(self, **super_kwargs):
        r"""
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, AutoModelForImageTextToText

        >>> torch_device = "cuda"
        >>> processor = AutoProcessor.from_pretrained("OpenGVLab/InternVL3-1B-hf")
        >>> model = AutoModelForImageTextToText.from_pretrained(
        ...     "OpenGVLab/InternVL3-1B-hf", torch_dtype=torch.bfloat16, device_map=torch_device
        ... )

        >>> messages = [
        ...     {
        ...         "role": "user",
        ...         "content": [
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
        ...             },
        ...             {
        ...                 "type": "image",
        ...                 "url": "https://thumbs.dreamstime.com/b/golden-gate-bridge-san-francisco-purple-flowers-california-echium-candicans-36805947.jpg",
        ...             },
        ...             {"type": "text", "text": "These images depict two different landmarks. Can you identify them?"},
        ...         ],
        ...     },
        ... ]

        >>> inputs = processor.apply_chat_template(messages, add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt").to(torch_device)
        >>> generate_ids = model.generate(**inputs, max_new_tokens=200)
        >>> print(processor.decode(generate_ids[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True))
        The images depict the Statue of Liberty and the Golden Gate Bridge.
        ```"""
        super().forward(**super_kwargs)
* ! ")) $OJ Od	%@ 	%('D %(Pr>   