
    Uh                       d Z ddlZddlmZ ddlmZmZmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZmZ ddlmZm Z m!Z! ddl"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)  e%jT                  e+      Z,e G d de#             Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1de0iZ2 G d dej\                        Z3 G d dej\                        Z4 G d dej\                        Z5 G d  d!ej\                        Z6 G d" d#ej\                        Z7e$ G d$ d%e             Z8 G d& d'ej\                        Z9 G d( d)ej\                        Z:	 dGd*ej\                  d+e
jv                  d,e
jv                  d-e
jv                  d.ee
jv                     d/e<d0e<fd1Z= G d2 d3ej\                        Z> G d4 d5ej\                        Z? G d6 d7ej\                        Z@ G d8 d9ej\                        ZA e$d:;       G d< d=e8             ZB G d> d?ej\                        ZC e$d@;       G dA dBe8             ZD e$dC;       G dD dEe8e             ZEg dFZFy)HzPyTorch GIT model.    N)	dataclass)CallableListOptionalTupleUnion)nn   )ACT2FN)CacheDynamicCache)GenerationMixin)_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )	GitConfigGitVisionConfigc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)GitVisionModelOutputa  
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r"   r   torchFloatTensor__annotations__r#   r$   r   r%        v/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/git/modeling_git.pyr!   r!   2   sr    * 15L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r.   r!   c                        e Zd ZdZ fdZ	 	 	 	 d	deej                     deej                     deej                     de	dej                  f
dZ xZS )
GitEmbeddingsz;Construct the embeddings from word and position embeddings.c                 B   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        t        |dd      | _        | j#                  dt%        j&                  |j                        j)                  d      d       y )	N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistent)super__init__r	   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutgetattrr6   register_bufferr*   arangeexpandselfconfig	__class__s     r/   r>   zGitEmbeddings.__init__S   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c  f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
r.   	input_idsr8   inputs_embedspast_key_values_lengthreturnc                 J   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|| j                  |      }n|}| j                  dk(  r| j	                  |      }||z  }| j                  |      }| j                  |      }|S )Nr:   r   r7   )sizer8   rC   r6   rE   rF   rJ   )	rP   rS   r8   rT   rU   input_shape
seq_length
embeddingsrE   s	            r/   forwardzGitEmbeddings.forwardb   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL --i8J&J'':5"&":":<"H--J^^J/
\\*-
r.   )NNNr   )r&   r'   r(   r)   r>   r   r*   
LongTensorr+   intTensorr\   __classcell__rR   s   @r/   r1   r1   P   ss    E
" 153759&'E,,- u//0   1 12	
 !$ 
r.   r1   c                        e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 ddej
                  deej                     deej                     dee	   d	ee
   d
ee
   deej
                     fdZ xZS )GitSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |-t        j                  d| j                  j                   d       |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        |j                  j                  |j                  j                   z  dz  d	z         | _        |j$                  | xj"                  |j$                  z  c_        t'        j(                  |j                  | j                        | _        t'        j(                  |j                  | j                        | _        t'        j(                  |j                  | j                        | _        t'        j0                  |j2                        | _        |xs t7        |d
d      | _        | j8                  dk(  s| j8                  dk(  rG|j:                  | _        t'        j<                  d|j:                  z  d	z
  | j                        | _        y y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   r6   r7   relative_keyrelative_key_query) r=   r>   rA   num_attention_headshasattr
ValueError	layer_idxloggerwarning_oncerR   r&   r^   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr	   LinearquerykeyvaluerH   attention_probs_dropout_probrJ   rK   r6   rD   r?   distance_embeddingrP   rQ   r6   rm   rR   s       r/   r>   zGitSelfAttention.__init__   s(    : ::a?PVXhHi#F$6$6#7 8 445Q8  # !8!8 9 :, , $*#=#= #&v'9'9F<V<V'V#W !558P8PP"%v';';'F'FI]I]IhIh'hmn&nqr&r"s**6##v'F'FF#YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# >rr.   xrV   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr:   r   rg   r   r
   )rX   rj   rp   viewpermute)rP   r~   new_x_shapes      r/   transpose_for_scoresz%GitSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r.   r$   attention_mask	head_maskpast_key_valueoutput_attentionspixel_values_presentc           	      x   | j                  |      }|r| j                  nd}| j                  | j                  |            }	| j                  | j	                  |            }
||j                  |	d d d d |d d d f   |
d d d d |d d d f   | j                        \  }}t        j                  |	d d d d d |d d f   |gd      }	t        j                  |
d d d d d |d d f   |gd      }
| j                  |      }t        j                  ||	j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |	j                  d   }}|Dt        j                  |dz
  t        j                  |j                  	      j!                  dd      }n@t        j"                  |t        j                  |j                  	      j!                  dd      }t        j"                  |t        j                  |j                  	      j!                  dd      }||z
  }| j%                  || j&                  z   dz
        }|j)                  |j*                  
      }| j                  dk(  rt        j,                  d||      }||z   }nE| j                  dk(  r6t        j,                  d||      }t        j,                  d|	|      }||z   |z   }|t/        j0                  | j2                        z  }|||z   }t4        j6                  j9                  |d      }| j;                  |      }|||z  }t        j                  ||
      }|j=                  dddd      j?                         }|jA                         d d | jB                  fz   }|j!                  |      }|r||fn|f}||fz   }|S )Nr   rg   dimr:   rh   ri   r   dtypedevicer   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   )"rx   ru   r   ry   rz   updaterm   r*   catmatmul	transposer6   shapetensorlongr   r   rM   r|   rD   tor   einsummathsqrtrp   r	   
functionalsoftmaxrJ   r   
contiguousrX   rq   )rP   r$   r   r   r   r   r   mixed_query_layercutoff	key_layervalue_layerkey_layer_pastvalue_layer_pastquery_layerattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r/   r\   zGitSelfAttention.forward   s    !JJ}5,@((a--dhh}.EF	//

=0IJ%/=/D/D!Q*+[Avw9I-JDNN0,N, 		9Q7F7A-=#>"OUVWI))[Aww1A%BDT$U[\]K//0AB !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L)!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]^--r.   NNNNNFF)r&   r'   r(   r>   r*   r_   r   r   r+   r   boolr   r\   r`   ra   s   @r/   rc   rc      s     uD%ell %u|| % 7;15*.,1/4J||J !!2!23J E--.	J
 !J $D>J 'tnJ 
u||	Jr.   rc   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )GitSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr4   )r=   r>   r	   rw   rA   denserF   rG   rH   rI   rJ   rO   s     r/   r>   zGitSelfOutput.__init__   s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r.   r$   input_tensorrV   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S Nr   rJ   rF   rP   r$   r   s      r/   r\   zGitSelfOutput.forward   7    

=1]3}|'CDr.   r&   r'   r(   r>   r*   r_   r\   r`   ra   s   @r/   r   r      1    >U\\  RWR^R^ r.   r   eagerc                        e Zd Zd fd	Zd Z	 	 	 	 	 ddej                  deej                     deej                     dee	   dee
   dee
   d	eej                     fd
Z xZS )GitAttentionc                     t         |           t        |j                     |||      | _        t        |      | _        t               | _        y )N)r6   rm   )	r=   r>   GIT_SELF_ATTENTION_CLASSES_attn_implementationrP   r   outputsetpruned_headsr}   s       r/   r>   zGitAttention.__init__
  sE    .v/J/JK,Cy
	 $F+Er.   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   rP   rj   rp   r   r   rx   ry   rz   r   r   rq   union)rP   headsindexs      r/   prune_headszGitAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r.   r$   r   r   r   r   r   rV   c                 n    | j                  ||||||      }| j                  |d   |      }|f|dd  z   }	|	S )Nr   r   )rP   r   )
rP   r$   r   r   r   r   r   self_outputsattention_outputr   s
             r/   r\   zGitAttention.forward%  sT     yy 
  ;;|AF#%QR(88r.   r   r   )r&   r'   r(   r>   r   r*   r_   r   r+   r   r   r   r\   r`   ra   s   @r/   r   r   	  s    ";* 7;15*.,1/4|| !!2!23 E--.	
 ! $D> 'tn 
u||	r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GitIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r=   r>   r	   rw   rA   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrO   s     r/   r>   zGitIntermediate.__init__=  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r.   r$   rV   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rP   r$   s     r/   r\   zGitIntermediate.forwardE  s&    

=100?r.   r   ra   s   @r/   r   r   <  s#    9U\\ ell r.   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	GitOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r=   r>   r	   rw   r   rA   r   rF   rG   rH   rI   rJ   rO   s     r/   r>   zGitOutput.__init__M  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r.   r$   r   rV   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r/   r\   zGitOutput.forwardS  r   r.   r   ra   s   @r/   r   r   L  r   r.   r   c                        e Zd Zd fd	Z	 	 	 	 	 ddej
                  deej                     deej                     dee   dee	   dee	   de
ej
                     fd	Zd
 Z xZS )GitLayerc                     t         |           |j                  | _        d| _        t	        ||      | _        t        |      | _        t        |      | _	        y )Nr   )rm   )
r=   r>   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   )rP   rQ   rm   rR   s      r/   r>   zGitLayer.__init__[  sK    '-'E'E$%f	B+F3'r.   r$   r   r   r   r   r   rV   c                     | j                  ||||||      }|d   }|dd }	|d   }
t        | j                  | j                  | j                  |      }|f|	z   }	|	|
fz   }	|	S )N)r   r   r   r   r   r:   )r   r   feed_forward_chunkr   r   )rP   r$   r   r   r   r   r   self_attention_outputsr   r   present_key_valuelayer_outputs               r/   r\   zGitLayer.forwardc  s     "&/)!5 "0 "
 2!4 )2.2260##T%A%A4CSCSUe
  /G+ .00r.   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )rP   r   intermediate_outputr   s       r/   r   zGitLayer.feed_forward_chunk  s,    "//0@A{{#68HIr.   r   r   )r&   r'   r(   r>   r*   r_   r   r+   r   r   r   r\   r   r`   ra   s   @r/   r   r   Z  s    ( 7;15*.,1/4 ||  !!2!23  E--.	 
 !  $D>  'tn  
u||	 Dr.   r   c                       e Zd Z fdZ	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deee	e
e
ej                        f      dee   dee   dee   d	ee   d
ee   dee
ej
                     ef   fdZ xZS )
GitEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w NF)
r=   r>   rQ   r	   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)rP   rQ   irR   s      r/   r>   zGitEncoder.__init__  sP    ]]vG_G_A`#aAHVQ$7#ab
&+# $bs   A$r$   r   r   past_key_values	use_cacher   output_hidden_statesr   return_dictrV   c
           	         | j                   r%| j                  r|rt        j                  d       d}d}
|rIt	        |t
              s9d}
|t               }n*t        j                  |      }t        j                  d       |rdnd }|rdnd }d }t        | j                        D ]t  \  }}|r||fz   }|||   nd }| j                   r-| j                  r!| j                  |j                  |||||      }n |||||||      }|d   }|r|d   }|sl||d   fz   }v |r||fz   }|r|nd }|
r|j                         }|	st        d	 ||||fD              S t        ||||
      S )NzZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FTzWe detected that you are passing `past_key_values` as a tuple of tuples. This is deprecated and will be removed in v4.47. Please convert your cache or use an appropriate `Cache` class (https://huggingface.co/docs/transformers/kv_cache#legacy-cache-format)r-   r   r:   r   c              3   $   K   | ]  }|| 
 y wr   r-   .0vs     r/   	<genexpr>z%GitEncoder.forward.<locals>.<genexpr>  s      	 = 	s   r#   r   r$   r%   )r   trainingrn   ro   r   r   r   from_legacy_cache	enumerater   _gradient_checkpointing_func__call__to_legacy_cachetupler   )rP   r$   r   r   r   r   r   r   r   r   return_legacy_cacheall_hidden_statesall_self_attentionsnext_decoder_cacher   layer_modulelayer_head_masklayer_outputs
next_caches                      r/   r\   zGitEncoder.forward  s    &&4==##p "	 $Z?"&&".."."@"@"Q##^ #7BD$5b4!(4 	POA|#$58H$H!.7.CilO**t}} $ A A ))!"##%! !-!"##%(! *!,M%22%6" &9]1=M<O&O#;	P>   1]4D D+4'$
#335J 	 "%'		 	 	 '+&+*	
 	
r.   )NNNNFFFT)r&   r'   r(   r>   r*   r_   r   r+   r   r   r   r   r   r\   r`   ra   s   @r/   r   r     s    , 7;15SW$(,1/4/4&*Z
||Z
 !!2!23Z
 E--.	Z

 "%uU5;L;L5M/N(N"OPZ
 D>Z
 $D>Z
 'tnZ
 'tnZ
 d^Z
 
uU\\"$;;	<Z
r.   r   c                   &    e Zd ZeZdZdZdZdZd Z	y)GitPreTrainedModelgitTc                    t        |t              rt        j                  j	                  |j
                  d| j                  j                         t        j                  j	                  |j                  j                  | j                  j                         t        j                  j	                  |j                  j                  | j                  j                         t        |t        j                        rm|j                  j                  j	                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rz|j                  j                  j	                  d| j                  j                         |j                   2|j                  j                  |j                      j                          yyt        |t        j"                        rJ|j                  j                  j                          |j                  j                  j%                  d       yy)zInitialize the weights        )meanstd)r  Ng      ?)r   GitVisionEmbeddingsr	   initnormal_class_embeddingrQ   initializer_rangepatch_embeddingweightposition_embeddingrw   databiaszero_r?   r3   rF   fill_)rP   modules     r/   _init_weightsz GitPreTrainedModel._init_weights  s   f12GGOOF22$++B_B_O`GGOOF2299t{{?\?\O]GGOOF55<<$++B_B_O`fbii( MM&&CT[[5R5R&S{{&  &&( '-MM&&CT[[5R5R&S!!-""6#5#56<<> .-KK""$MM$$S) .r.   N)
r&   r'   r(   r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_cache_class_supports_quantized_cacher*  r-   r.   r/   r  r    s$    L&*#  $*r.   r  c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )r  rQ   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestrider&  rg   r   r8   r9   r;   )r=   r>   rQ   rA   	embed_dimrs   rt   r	   	Parameterr*   randnr   Conv2dnum_channelsr"  num_patchesnum_positionsr?   r$  rL   rM   rN   rO   s     r/   r>   zGitVisionEmbeddings.__init__  s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr.   r[   heightwidthrV   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr:   g      ?r
   rg   bicubicF)rX   modealign_cornersr   )r   r$  r#  	unsqueezer*   jit
is_tracingr8   rt   r   reshaper   r	   r   interpolater   r   )rP   r[   r=  r>  r;  r$  r<  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r/   interpolate_pos_encodingz,GitVisionEmbeddings.interpolate_pos_encoding$  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr.   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().r   rg   r   r:   r   )r   rs   rl   r"  r#  r   r   flattenr   r   rN   r*   r   rM  r$  r8   )rP   rN  rM  
batch_size_r=  r>  target_dtypepatch_embedsclass_embedsr[   s              r/   r\   zGitVisionEmbeddings.forwardM  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr.   F)r&   r'   r(   r   r>   r*   r_   r^   rM  r+   r\   r`   ra   s   @r/   r  r    sd    q q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r.   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GitVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r=   r>   rQ   r   r   activation_fnr	   rw   rA   r   fc1fc2rO   s     r/   r>   zGitVisionMLP.__init__a  sd    #F$5$5699V//1I1IJ99V55v7I7IJr.   r$   rV   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r]  r\  r^  r   s     r/   r\   zGitVisionMLP.forwardh  s4    /**=9/r.   r   ra   s   @r/   rZ  rZ  `  s$    KU\\ ell r.   rZ  r)  rx   ry   rz   r   scalingrJ   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr:   r   )r   r   )pr  r   rg   )r*   r   r   r	   r   r   float32r   r   rJ   r  r   )
r)  rx   ry   rz   r   r`  rJ   kwargsattn_weightsattn_outputs
             r/   eager_attention_forwardrg  p  s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r.   c                        e Zd ZdZ fdZ	 	 	 d	dej                  deej                     deej                     dee   de	ej                  eej                     f   f
dZ
 xZS )
GitVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: rQ  g      F)r=   r>   rQ   rA   r6  rj   	num_headshead_dimrl   scaleattention_dropoutrJ   	is_causalr	   rw   k_projv_projq_projout_projrO   s     r/   r>   zGitVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar.   r$   r   causal_attention_maskr   rV   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
| j                  j                  dk7  r||||z   }n||}n	|du| _
        t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     } || ||	|
|| j                  | j                  | j                   sdn| j"                  	      \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS )
z#Input shape: Batch x Time x Channelr   rg   flash_attention_2Nr   sdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r  )ro  r`  rJ   )r   rr  rp  rq  r   rk  rl  r   rQ   r   ro  rg  rn   ro   r   rm  r  rJ   rF  r   rs  )rP   r$   r   rt  r   rS  rZ   r6  querieskeysvaluesattention_interfacerf  re  s                 r/   r\   zGitVisionAttention.forward  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/2G!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r.   )NNF)r&   r'   r(   r)   r>   r*   r_   r   r   r   r\   r`   ra   s   @r/   ri  ri    s}    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45)r.   ri  c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
GitVisionEncoderLayerrQ   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )r=   r>   rA   r6  ri  	self_attnr	   rF   rG   layer_norm1rZ  mlplayer_norm2rO   s     r/   r>   zGitVisionEncoderLayer.__init__  sm    +++F3<<F<Q<QR'<<F<Q<QRr.   r$   r   rt  r   rV   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r$   r   rt  r   )r  r  r  r  )rP   r$   r   rt  r   residualre  r   s           r/   r\   zGitVisionEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr.   rX  )r&   r'   r(   r   r>   r*   r_   r   r   r   r+   r\   r`   ra   s   @r/   r}  r}    sf    S S -2&||& &  %||	&
 $D>& 
u  	!&r.   r}  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                     deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )GitVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    rQ   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
r=   r>   rQ   r	   r   r   r   r}  layersr   )rP   rQ   rT  rR   s      r/   r>   zGitVisionEncoder.__init__  sP    mmERXRjRjLk$lq%:6%B$lm&+# %ms   A#r   rt  r   r   r   rV   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]b  \  }
}|r||	fz   }| j                  r,| j                  r | j                  |j                  |	|||      }n ||	|||      }|d   }	|sZ||d   fz   }d |r||	fz   }|st        d |	||fD              S t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        Nr-   )r   r   r   c              3   &   K   | ]	  }||  y wr   r-   r  s     r/   r  z+GitVisionEncoder.forward.<locals>.<genexpr>d  s     eqWXWdes   r#   r$   r%   )rQ   r   r   use_return_dictr	  r  r   r  r
  r  r  r   )rP   rT   r   rt  r   r   r   encoder_statesall_attentionsr$   idxencoder_layerr  s                r/   r\   zGitVisionEncoder.forward  sH   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+>Vd
 	
r.   )NNNNN)r&   r'   r(   r)   r   r>   r   r*   r_   r   r   r   r   r\   r`   ra   s   @r/   r  r  	  s    , , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
r.   r  c                        e Zd Zdef fdZe	 	 	 	 	 d
deej                     dee	   dee	   dee	   dee	   de
eef   fd	       Z xZS )GitVisionTransformerrQ   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )r=   r>   rQ   rA   r  r[   r	   rF   rG   pre_layrnormr  encoderpost_layernorm)rP   rQ   r6  rR   s      r/   r>   zGitVisionTransformer.__init__l  sj    &&	-f5LL8M8MN'/ ll9&:O:OPr.   rN  r   r   rM  r   rV   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }| j                  |      }|s	|f|dd  z   S t        ||j                  |j                        S )Nz You have to specify pixel_valuesrM  )rT   r   r   r   r   r   r  )rQ   r   r   r  rl   r[   r  r  r  r   r$   r%   )	rP   rN  r   r   rM  r   r$   encoder_outputsr#   s	            r/   r\   zGitVisionTransformer.forwardv  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A. //0AB%'/!"*===/)77&11
 	
r.   NNNFN)r&   r'   r(   r   r>   r   r   r*   r+   r   r   r   r   r\   r`   ra   s   @r/   r  r  j  s    Q Q  59,0/338&*&
u001&
 $D>&
 'tn	&

 #+4.&
 d^&
 
uo%	&&
 &
r.   r  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    )custom_introc                        e Zd ZeZdZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   ded	ee   deeef   fd
       Z xZS )GitVisionModelrN  rQ   c                 d    t         |   |       t        |      | _        | j	                          y r   )r=   r>   r  vision_model	post_initrO   s     r/   r>   zGitVisionModel.__init__  s'     08r.   rV   c                 B    | j                   j                  j                  S r   )r  r[   r"  rP   s    r/   get_input_embeddingsz#GitVisionModel.get_input_embeddings  s      ++;;;r.   r   r   rM  r   c                 b    ||n| j                   j                  }| j                  |||||      S )a{  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```)rN  r   r   rM  r   )rQ   r  r  )rP   rN  r   r   rM  r   s         r/   r\   zGitVisionModel.forward  sA    8 &1%<k$++B]B]  %/!5%=# ! 
 	
r.   r  )r&   r'   r(   r   r+  main_input_namer>   r	   Moduler  r   r   r*   r+   r   r   r   r   r\   r`   ra   s   @r/   r  r    s     #L$O <bii <  59,0/3).&*#
u001#
 $D>#
 'tn	#

 #'#
 d^#
 
uo%	&#
 #
r.   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )GitProjectionrQ   c                 0   t         |           || _        t        j                  t        j
                  |j                  j                  |j                        t        j                  |j                  |j                  j                              | _
        y r   )r=   r>   rQ   r	   
Sequentialrw   rr   rA   rF   rG   visual_projectionrO   s     r/   r>   zGitProjection.__init__  sf    !#IIf**668J8JKLL++1E1E1T1TU"
r.   r[   rV   c                 $    | j                  |      S r   )r  )rP   r[   s     r/   r\   zGitProjection.forward  s    %%j11r.   )	r&   r'   r(   r   r>   r*   r_   r\   r`   ra   s   @r/   r  r    s*    
y 
2%,, 25<< 2r.   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                       e Zd Z fdZd Zd Zd Zdedej                  dej                  dej                  fd	Zdd
Ze	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deeeeej&                     f      dee   dee   dee   dedee   deeej                     ef   fd       Z xZS )GitModelc                 l   t         |          | _        t              | _        t        j                        | _        t              | _	        t              | _        j                  6t        j                  fdt        j                        D              | _        | j#                          y )Nc              3      K   | ]B  }t        j                  t        j                  d d j                  j
                               D yw)r   N)r	   r7  r*   zerosrr   rA   )r  rT  rQ   s     r/   r  z$GitModel.__init__.<locals>.<genexpr>  s;      ; U[[Av/C/C/O/OPQ;s   AA)r=   r>   rQ   r1   r[   r  rr   image_encoderr   r  r  r  rv   r	   ParameterListr   img_temperal_embeddingr  rO   s    `r/   r>   zGitModel.__init__  s     '/+F,@,@A!&)!.v!6**6*,*:*: ;v>>?; +D' 	r.   c                 .    | j                   j                  S r   r[   rC   r  s    r/   r  zGitModel.get_input_embeddings   s    ...r.   c                 &    || j                   _        y r   r  )rP   rz   s     r/   set_input_embeddingszGitModel.set_input_embeddings  s    */'r.   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   r   )rP   heads_to_pruner   r   s       r/   _prune_headszGitModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr.   rX   r   r   rV   c                     t        j                  t        j                  ||||      d      }|j                  |dk(  t	        d            }|S )Nr   r   r   )diagonal-inf)r*   triuonesmasked_fillfloat)rP   rX   r   r   masks        r/   _generate_future_maskzGitModel._generate_future_mask  sA    zz%**T4eLWXY	5=9r.   c                    |j                   d   }|j                   d   }|j                  }|j                  }	t        j                  ||f||	      }
t        j
                  |||z   ft        d      |j                  |	      }t        j                  ||f|	|j                        }|dkD  rAt        j                  |j                   d   |j                   d   |z   f|	|j                        }t        j                  |
|fd      }t        j                  ||j                  |	      fd      }t        j                  ||fd      d d d f   }|4t        j
                  |j                   d   |j                   d   fd|      }|j                  t        j                  k7  rt        d	      t        j                  ||j                  
      }t        d      ||<   |j                  |j                   d   ||z   ||z   |z   f      }|j                         }|d d d d d |f   }|d d d d d f   }||z   |d d d d d |f<   |d d d d d d d f   }|S )Nr   r  r  r   r   r   F)
fill_valuer   z1Memory key padding mask must be a boolean tensor.r   )r   r   r   r*   r  fullr  r   r   r   rl   
zeros_likerN   clone)rP   tgtmemorytgt_maskrU   memory_key_padding_masknum_tgt
num_memoryr   r   top_left	top_rightbottom_leftleftrightfull_attention_maskzero_negative_infinityorigin_leftr   s                      r/   create_attention_maskzGitModel.create_attention_mask  sF   ))A,\\!_
		;;
J7eTJJ#99:&M::	
	 kkj!??
 "A%{{"HNN1$58N$NOH yy(K0a8		9hkk%&89qA#iiu1=dAgF"*&+jj&,,q/6<<PQ?1S`ent&u#"((EJJ6PQQ!&!1!12IQTQZQZ![:?-67188$**1-zG/CZRhEhkrErs
 2779)!Q*;<'4
31<v1EAq+:+-. 2!T1a-@""r.   rS   r   r8   rN  r   rT   r   r   r   r   rM  r   c                 f   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t        d      |#| j                  ||       |j                         }n!||j                         dd }nt        d      |d   }d}|5t        |t              s|d   d   j                  d   n|j                         }| j                  || j                   j                        }d}||j                  dk(  r| j                  ||	      j                   }n|j                  d
k(  rg }t#        |j                  d         D ]O  }| j                  |dd|ddddf   |	      j                   }|| j$                  |   z  }|j'                  |       Q t)        j*                  |d      }nt        d      | j-                  |      }| j/                  ||||      }|It)        j0                  |j                  d   d|j                  d   f|j2                  |j4                        }|j7                  |j                  d      |j                  d      z  dd      }t)        j*                  ||fd      }| j9                  ||j2                  |j4                        }| j;                  ||||      }|mt=        ||j2                  |d         j?                  |j4                        }|dkD  r|dddd| dddf   }n!|dddd|d    d|d    dfxx   |z  cc<   | jA                  ||||||	|
||du	      }|d   }|s	|f|dd z   S tC        ||jD                  |jF                  |jH                        S )a  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer:   z5You have to specify either input_ids or inputs_embedsr   r   rg      r     r   z#pixel_values must be of rank 4 or 5)rS   r8   rT   rU   r   )r  r  r  rU   )tgt_len)r   r   r   r   r   r   r   r   r  )%rQ   r   r   r   r  rl   %warn_if_padding_and_no_attention_maskrX   r   r   r   get_seq_lengthget_head_maskr   ndimr  r#   r   r  appendr*   r   r  r[   r  r   r   repeatr  r  r   r   r  r   r   r$   r%   )rP   rS   r   r8   rN  r   rT   r   r   r   r   rM  r   rY   rZ   rU   projected_visual_featuresvisual_features	frame_idxvisual_features_frameembedding_outputr$   r  combined_attention_maskexpanded_attn_maskr  sequence_outputs                              r/   r\   zGitModel.forwardF  sC   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU ^
 "#& "/59  "1%++A.$335 # &&y$++2O2OP	$(!#  A%"&"4"4 ;S #5 ###   ""a'"$!&|'9'9!'<!= BI,0,>,>$Q	1a%78Sk -? -'' * *T-H-H-SS)#**+@AB #())O"C !!FGG(,(>(>(O%??%'#9	 + 
 %,(-!''*A/?/E/Ea/HI&,,'..)% %>$D$D!!!$(A(F(Fq(II1a%
!
 		#<>N"OUVW --j:J:P:PRbRiRij #'"<"< ,#9	 #= #
 % "< 0 6 6B"b!(()  &)%71?U>U>VXY8Y%Z"'1{1~o.?+a.AR(RSWiiS,,2+/!5#!-T!9 ' 

 *!,#%(;;;&-+;;)77&11	
 	
r.   r   )NNNNNNNNNNFN)r&   r'   r(   r>   r  r  r  r^   r*   r   r   r_   r  r  r   r   r   r   r   r+   r   r   r   r\   r`   ra   s   @r/   r  r    s   &/0C# ekk 5<< \a\h\h 0#d  -115/3/3,004KO$(,0/3).&*c
ELL)c
 !.c
 u||,	c

 u||,c
 ELL)c
  -c
 "%tE4E4E/F(F"GHc
 D>c
 $D>c
 'tnc
 #'c
 d^c
 
uU\\"$>>	?c
 c
r.   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c                        e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     deeeee	j                     f      dee   dee   dee   dedee   deee	j                     ef   fd       Z	 ddZd Z xZS )GitForCausalLMzoutput.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
r=   r>   r  r  r	   rw   rA   r@   r   r  rO   s     r/   r>   zGitForCausalLM.__init__  sF     F#ii 2 2F4E4EF 	r.   c                     | j                   S r   r   r  s    r/   get_output_embeddingsz$GitForCausalLM.get_output_embeddings  s    {{r.   c                     || _         y r   r  )rP   new_embeddingss     r/   set_output_embeddingsz$GitForCausalLM.set_output_embeddings  s	    $r.   rS   r   r8   rN  r   rT   labelsr   r   r   r   rM  r   rV   c                    ||n| j                   j                  }|d}	| j                  ||||||||	|
|||      }|d   }| j                  |      }d}|| j                  j                  j
                  d   j                  j                  j                  }|dd|dddf   j                         }|ddddf   j                         } | j                  |j                  d| j                   j                        |j                  d      fd| j                   j                  i|}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                         S )	a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import requests
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`List[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`List[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```
        NF)r   r8   rN  r   rT   r   r   r   r   rM  r   r   r:   r   r@   )losslogitsr   r$   r%   )rQ   r  r  r   r  r   r   rP   ru   r   loss_functionr   r@   r   r   r$   r%   )rP   rS   r   r8   rN  r   rT   r  r   r   r   r   rM  r   rd  r   r  r  r  num_image_tokensshifted_logitsr   s                         r/   r\   zGitForCausalLM.forward  s   j &1%<k$++B]B]I(()%%'+/!5%=#  
 "!*_-#xx//55a8BBGGZZ#A'7':A$=>IIKNAqrE]--/F%4%%##B(>(>?B  ;;11 	D Y,F)-)9TGf$EvE%#33!//))
 	
r.   c                     |B|j                         }|j                  d   |kD  r|}n|j                  d   dz
  }|d d |d f   }|j                  }||j                  |      }|||j                  dd       ||dS )Nr   rN  )rS   r   rN  r   r   )r  r   new_onesget)	rP   rS   r   r   r   rd  past_lengthremove_prefix_lengthrY   s	            r/   prepare_inputs_for_generationz,GitForCausalLM.prepare_inputs_for_generation  s     &)88:K q!K/'2$ (1q'9A'=$!!%9%:":;I  oo!&//<N #,"JJ~t<."
 	
r.   c                 J    d}|D ]  }|t        fd|D              fz  } |S )Nr-   c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectr   r   )r  
past_statebeam_idxs     r/   r  z0GitForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)r  )rP   r   r  reordered_past
layer_pasts     `  r/   _reorder_cachezGitForCausalLM._reorder_cache  s=    ) 	Jncmnn N	 r.   )NNNNNNNNNNNFN)NNN)r&   r'   r(   _tied_weights_keysr>   r  r  r   r   r*   r_   r   r   r   r   r   r   r\   r  r
  r`   ra   s   @r/   r  r    s    **%  -115/3/3,004)-FJ$(,0/3).&*A
ELL)A
 !.A
 u||,	A

 u||,A
 ELL)A
  -A
 &A
 "%tELL/A(A"BCA
 D>A
 $D>A
 'tnA
 #'A
 d^A
  
uU\\"$::	;!A
 A
H OS
>r.   r  )r  r  r  r  )r  )Gr)   r   dataclassesr   typingr   r   r   r   r   r*   torch.utils.checkpointr	   activationsr   cache_utilsr   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_gitr   r   
get_loggerr&   rn   r!   r  r1   rc   r   r   r   r   r   r   r   r  r  rZ  r_   r  rg  ri  r}  r  r  r  r  r  r  __all__r-   r.   r/   <module>r     s      ! 9 9    ! . ) B  G l l  : 
		H	% ?; ? ?8-BII -`rryy rlBII   
/299 /fbii  		 .ryy .ba
 a
H * * *:P")) Pf299 . %II%<<% 
% <<	%
 U\\*% % %,L) L)`/BII /f^
ryy ^
B3
299 3
l 
2
' 2

2
j
2BII 
2 
~
! ~

~
B 
{' {
{| Qr.   