
    Uh{*                    t   d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddlmZ ddlmZ ddlm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(m)Z)m*Z*  e#       rddl+m,Z, ddl-m.Z.  e&j^                  e0      Z1 G d de
jd                        Z3	 ddl4m5Z5 e5Z3e1jm                  d        ejt                  e3        G d de
jd                        Z; G d de
jd                        Z< G d de
jd                        Z= G d de
jd                        Z> G d d e
jd                        Z?e" G d! d"e             Z@e" G d# d$e@             ZA G d% d&e
jd                        ZB G d' d(e
jd                        ZC G d) d*e
jd                        ZD G d+ d,e
jd                        ZE G d- d.e
jd                        ZF G d/ d0e
jd                        ZG e"d12       G d3 d4e@             ZH e"d52       G d6 d7e@e             ZIg d8ZJy# e7$ r Y Ue8$ r e1js                  d       Y mw xY w)9zPix2Struct modeling file    N)DictListOptionalTupleUnion)nn   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutputBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS)DUMMY_INPUTS
DUMMY_MASKauto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )Pix2StructConfigPix2StructTextConfigPix2StructVisionConfig)	BlockMask)make_flex_block_causal_maskc                   &     e Zd Zd fd	Zd Z xZS )Pix2StructLayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zc
        Construct a layernorm module in the T5 style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/pix2struct/modeling_pix2struct.pyr(   zPix2StructLayerNorm.__init__>   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor*   float32powmeanrsqrtr-   r,   dtypefloat16bfloat16)r.   hidden_statesvariances      r2   forwardzPix2StructLayerNorm.forwardF   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r3   )gư>__name__
__module____qualname__r(   rB   __classcell__r1   s   @r2   r%   r%   =   s    $+r3   r%   )FusedRMSNormzWDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pix2StructLayerNormzJDiscovered apex but it failed to load, falling back to Pix2StructLayerNormc                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )Pix2StructVisionEmbeddingsa-  
    Construct the embeddings from patch. In `Pix2Struct` the input is different from classic Vision-transformer models.
    Here the input is a sequence of `seq_len` flattened patches that also combines padding patches (tokens). Each patch
    is represented by a vector of `hidden_size` values.
    configreturnNc                    t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _
        t        j                  |j                        | _        y N)r'   r(   r   Linearpatch_embed_hidden_sizer/   patch_projection	Embeddingseq_lenrow_embeddercolumn_embedderDropoutdropout_ratedropoutr.   rL   r1   s     r2   r(   z#Pix2StructVisionEmbeddings.__init__m   s}     "		&*H*H&J\J\ ]LL9K9KL!||FNNF<N<NOzz&"5"56r3   flattened_patchesc                 "   |d d d d df   j                         }|d d d d df   j                         }|d d d d dd f   }| j                  |      }| j                  |      }| j                  |      }||z   |z   }| j	                  |      }|S )Nr   r   r5   )longrR   rU   rV   rY   )r.   r[   row_indicescol_indices
embeddingsrow_embeddingscol_embeddingss          r2   rB   z"Pix2StructVisionEmbeddings.forwardv   s     (1a0557'1a0557-aABh7**+<=
**;7--k:  .0>A
\\*-
r3   )
rD   rE   rF   __doc__r   r(   r*   TensorrB   rG   rH   s   @r2   rK   rK   f   s7    7/ 7D 7 %,, r3   rK   c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )Pix2StructVisionAttentionc                 |   t         |           |j                  | _        |j                  | _        |j
                  | _        |j                  | _        | j                  | j                  z  | _	        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        d| _        y NFbias)r'   r(   r/   d_kvkey_value_proj_dimnum_attention_headsn_headsattention_dropoutrY   	inner_dimr   rP   querykeyvalueoutputgradient_checkpointingrZ   s     r2   r(   z"Pix2StructVisionAttention.__init__   s    !--"(++11//(?(?? YYt//eL
99T--t~~EJYYt//eL
ii0@0@uM&+#r3   c                 6    |j                   dd \  } fd} | j                  |            } | j                  |            }	 | j                  |            }
t	        j
                  ||	j                  dd            }|t	        j                  d j                  ||f|j                  |j                        } j                  r j                  rd|_        |j                         dk(  r*||ddddddf   j                  |j                        z   }nw|||j                  |j                        z   }nVt!               sLt	        j"                  |f|j                  |j                        }||j                  |j                        z   }d|z
  }|j%                  |dk(  t	        j&                  |j                        j(                        }||z  }t	        j*                  |t	        j,                  t	        j&                  |j                        j(                              }t.        j0                  j3                  |dt        j4                  	      j7                  |      }t.        j0                  j9                  | j8                   j                  
      }|||z  }t	        j
                  ||
      }|j                  dd      j;                         j=                  d j>                        } jA                  |      }|f|fz   }|r||fz   }|S )z&
        Self-attention block
        Nr5   c                     | j                         j                  dj                  j                        j	                  dd      S )
projectionr6   r   r5   )
contiguousviewrn   rl   	transpose)states
batch_sizer.   s    r2   to_projection_shapez>Pix2StructVisionAttention.forward.<locals>.to_projection_shape   s<    $$&++JDLL$JaJabllmnpqrrr3   r	   r   devicer=   Tr6   )dimr=   ptraining)!shaperq   rr   rs   r*   matmulr{   zerosrn   r   r=   ru   r   requires_gradr   r8   r   r+   masked_fillfinfominmaxtensorr   
functionalsoftmaxr9   type_asrY   ry   rz   rp   rt   )r.   r@   attention_maskposition_biaslayer_head_maskoutput_attentions
seq_lengthr~   query_states
key_statesvalue_statesscoresposition_bias_maskedattn_weightsattn_outputoutputsr}   s   `               @r2   rB   z!Pix2StructVisionAttention.forward   s    "/!4!4Ra!8
J	s +4::m+DE )-)@A
*4::m+DE lJ,@,@A,FG !KKDLL*j9&--W]WcWcM **t}}.2+!!#q( -q$a?O0P0S0STaThTh0i i+ -0A0A-BVBV0W W-/!&,]5I5IQ^QdQd" !.0A0A-BVBV0W W-M,88!9KU[[Y_YeYeMfMjMjk&&65<<FLL0I0M0M#NO }},,V5==,QYYZ`a }},,\T\\TXTaTa,b &'/9Lll<> "++Aq1<<>CCJPRTXTbTbckk+..M#33/Gr3   )NNNFrC   rH   s   @r2   rf   rf      s    ,& Mr3   rf   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructVisionMlprL   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y rh   r'   r(   r   rP   r/   d_ffwi_0wi_1worW   rX   rY   r
   dense_act_fnactrZ   s     r2   r(   zPix2StructVisionMlp.__init__       IIf00&++EJ	IIf00&++EJ	))FKK););%Hzz&"5"56&--.r3   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rO   r   r   r   rY   
isinstancer   r,   r*   rd   r=   int8r8   r.   r@   hidden_geluhidden_linears       r2   rB   zPix2StructVisionMlp.forward       hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r3   )rD   rE   rF   r!   r(   rB   rG   rH   s   @r2   r   r      s    /5 /r3   r   c                        e Zd Zdeddf fdZ	 	 	 d
dej                  deej                     deej                     dede	e
ej                  ej                  f   e
ej                     f   f
d	Z xZS )Pix2StructVisionLayerrL   rM   Nc                 *   t         |           |j                  | _        d| _        t	        |      | _        t        |      | _        t        |j                  |j                        | _        t        |j                  |j                        | _        y )Nr   r0   )r'   r(   chunk_size_feed_forwardseq_len_dimrf   	attentionr   mlpr%   r/   layer_norm_epspre_mlp_layer_normpre_attention_layer_normrZ   s     r2   r(   zPix2StructVisionLayer.__init__
  ss    '-'E'E$26:&v."5f6H6HfNcNc"d(;F<N<NTZTiTi(j%r3   r@   r   	head_maskr   c                     |}| j                  |      }| j                  ||||      }|d   }|dd  }||z   }| j                  |      }	| j                  |	      |z   }	|	f|z   }|S )N)r   r   r   r   r   )r   r   r   r   )
r.   r@   r   r   r   residualself_attention_outputsattention_outputr   layer_outputs
             r2   rB   zPix2StructVisionLayer.forward  s     ! 55mD!%)%/	 "0 "
 2!4(, )83 ..}=xx-=/G+r3   )NNF)rD   rE   rF   r   r(   r*   rd   r   boolr   r   rB   rG   rH   s   @r2   r   r   	  s    k/ kD k 26,0"'|| !. ELL)	
   
uU\\5<</0%2EE	Fr3   r   c                        e Zd Zdeddf fdZ	 	 	 	 	 ddej                  deej                     deej                     ded	ed
ede	e
ef   fdZ xZS )Pix2StructVisionEncoderrL   rM   Nc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r'   r(   rL   r   
ModuleListrangenum_hidden_layersr   layerru   )r.   rL   _r1   s      r2   r(   z Pix2StructVisionEncoder.__init__5  sP    ]]5QWQiQiKj#ka$9&$A#kl
&+# $ls   A#r@   r   r   r   output_hidden_statesreturn_dictc                 x   |rdnd }|rdnd }t        | j                        D ]j  \  }	}
|r||fz   }|||	   nd }| j                  r,| j                  r | j	                  |
j
                  ||||      }n |
||||      }|d   }|sb||d   fz   }l |r||fz   }|st        d |||fD              S t        |||      S )N r   r   c              3   &   K   | ]	  }||  y wrO   r   .0vs     r2   	<genexpr>z2Pix2StructVisionEncoder.forward.<locals>.<genexpr>a  s     mq_`_lms   last_hidden_stater@   
attentions)	enumerater   ru   r   _gradient_checkpointing_func__call__tupler   )r.   r@   r   r   r   r   r   all_hidden_statesall_self_attentionsilayer_moduler   layer_outputss                r2   rB   zPix2StructVisionEncoder.forward;  s    #7BD$5b4(4 	POA|#$58H$H!.7.CilO**t}} $ A A ))!"#%! !-]NO]n o)!,M &9]1=M<O&O#)	P,   1]4D Dm]4EGZ$[mmm++*
 	
r3   )NNFFT)rD   rE   rF   r   r(   r*   rd   r   r   r   r   r   rB   rG   rH   s   @r2   r   r   4  s    ,/ ,D , 26,0"'%* +
||+
 !.+
 ELL)	+

  +
 #+
 +
 
uo%	&+
r3   r   c                   4    e Zd ZeZdZdZed        Zd Z	d Z
y)Pix2StructPreTrainedModelTFc                 v    t        j                  t              }t        j                  t              }|||d}|S )N)decoder_input_ids	input_idsdecoder_attention_mask)r*   r   r   r   )r.   r   
input_maskdummy_inputss       r2   r   z&Pix2StructPreTrainedModel.dummy_inputso  s6    LL.	\\*-
!*"&0

 r3   c                 4   | j                   j                  }t        |t              r)|j                  j
                  j                  |dz         yt        |t              rVt        | j                   t              r | j                   j                  j                  n| j                   j                  }t        | j                   t              r | j                   j                  j                  n| j                   j                  }|j                  j                  j
                  j                  d||dz  z         t        |j                  d      rD|j                  j                  .|j                  j                  j
                  j!                          |j"                  j                  j
                  j                  d||dz  z         t        |j"                  d      rD|j"                  j                  .|j"                  j                  j
                  j!                          |j$                  j                  j
                  j                  d||dz  z         t        |j$                  d      rF|j$                  j                  /|j$                  j                  j
                  j!                          yyyt        |t&              rt        | j                   t              r | j                   j                  j                  n| j                   j                  }t        | j                   t              r | j                   j                  j(                  n| j                   j                  }t        | j                   t              r | j                   j                  j*                  n| j                   j*                  }|j,                  j                  j
                  j                  d|||z  dz  z         |j.                  j                  j
                  j                  d||dz  z         |j0                  j                  j
                  j                  d||dz  z         |j2                  j                  j
                  j                  d|||z  dz  z         |j4                  r8|j6                  j                  j
                  j                  d||dz  z         yyt        |t8        j:                        rt        | j                   t              r | j                   j                  j                  n| j                   j                  }|j                  j
                  j                  d||dz  z         |j<                  2|j                  j
                  |j<                     j!                          yyt        |t>              rt        | j                   t              r | j                   j                  j                  n| j                   j                  }|j@                  j                  j
                  j                  d||dz  z         yt        |t8        jB                  t8        jD                  f      rt8        jF                  jI                  |j                  j
                  jK                  tL        jN                        d| j                   jP                        jK                  |j                  jR                        |j                  _        |j                  %|j                  j
                  j!                          yyt        |t              r3|j                  &|j                  j
                  j                  d       yyt        |t8        j:                        rz|j                  j
                  j                  d| j                   jP                         |j<                  2|j                  j
                  |j<                     j!                          yyy)zInitialize the weights      ?        g      )r;   stdrj   N)*rL   initializer_factorr   r%   r,   datafill_ Pix2StructTextDenseGatedActDenser   text_configr/   r   r   normal_hasattrrj   zero_r   r   Pix2StructTextAttentionrk   	num_headsrq   rr   rs   rt   has_relative_attention_biasrelative_attention_biasr   rS   padding_idxPix2StructTextModellm_headrP   Conv2dinittrunc_normal_r8   r*   r9   initializer_ranger=   )r.   modulefactorr/   r   rl   rn   s          r2   _init_weightsz'Pix2StructPreTrainedModel._init_weightsz  s   //f12MM$$Vc\2 @A dkk+;< ''33[[,, 
 4>dkkK[3\4;;**//bfbmbmbrbrDKK##++&[UYDY:Z+[v{{F+0@0@0L  %%++-KK##++&[UYDY:Z+[v{{F+0@0@0L  %%++-II!!))s4D.8Q)Rvyy&)fiinn.H		##))+ /I) 78
 dkk+;< ''33[[,,  1;4;;HX0Y'',,_c_j_j_v_v 
 dkk+;< ''11[[**  LL$$,,#6kTfFfkoEo;p,qJJ""**;PTCT9U*VLL$$,,#6[RVEV;W,XMM  %%--3FwQcGchlFl<m-n11..55::BBQW\glp[pQqBr 2- dkk+;< ''33[[,,  MM&&CVPT?T5U&V!!-""6#5#56<<> . 34 dkk+;< ''33[[,,  NN!!&&..CVX\G\=].^BII 67 "$!6!6""%%emm43DKKDaDa "7 "b$$% MM {{&  &&( ' 34}}(""((- )-MM&&CT[[5R5R&S!!-""6#5#56<<> . .r3   c                    | j                   j                  }| j                   j                  }|t        d      t	        |      rGt        j                  |j                  d d dz   |      }t        j                  ||dd df   gd      }n>|j                  |j                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |d	k(  |       |S )
Nzself.model.config.decoder_start_token_id has to be defined. In Pix2Struct it is usually set to the pad_token_id. See Pix2Struct docs for more information.r6   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rL   decoder_start_token_idpad_token_id
ValueErrorr   r*   fullr   cat	new_zerosclonemasked_fill_)r.   r   r  r  shifted_input_idss        r2   _shift_rightz&Pix2StructPreTrainedModel._shift_right  s    !%!C!C{{//!)<  Y' %

9??3B+?$+FH^ _ %		+<iSbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r3   N)rD   rE   rF   r   config_class_supports_cache_class_supports_static_cachepropertyr   r  r  r   r3   r2   r   r   i  s1    #L " M?`!r3   r   c                       e Zd ZeZdZdZdgZdef fdZ	d Z
deeee   f   dd	fd
Ze	 	 	 	 	 	 ddeej$                     deej$                     deej$                     dee   dee   dee   deeef   fd       Z xZS )Pix2StructVisionModelr[   Tr   rL   c                     t         |   |       || _        t        |      | _        t        |      | _        t        |j                  |j                        | _
        | j                          y Nr   )r'   r(   rL   rK   r`   r   encoderr%   r/   r   	layernorm	post_initrZ   s     r2   r(   zPix2StructVisionModel.__init__  sU     4V<.v6,V-?-?VEZEZ[ 	r3   c                 .    | j                   j                  S rO   )r`   rR   r.   s    r2   get_input_embeddingsz*Pix2StructVisionModel.get_input_embeddings  s    ///r3   heads_to_prunerM   Nc                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r   r   prune_heads)r.   r  r   headss       r2   _prune_headsz"Pix2StructVisionModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr3   r   r   r   r   r   c                 (   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |#|j                  d      dk7  j                         }| j                  || j                   j                        }| j                  |      }| j                  ||||||      }|d   }	| j                  |	      }	|s|	f}
|
|dd z   S t        |	|j                  |j                        S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_channels x patch_height x patch_width)`):
            Flattened and padded pixel values. These values can be obtained using [`AutoImageProcessor`]. See
            [`Pix2StructVisionImageProcessor.__call__`] for details. Check the [original
            paper](https://arxiv.org/abs/2210.03347) (figure 5) for more details.

        Example:

        ```python
        >>> import requests
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, Pix2StructVisionModel

        >>> image_processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructVisionModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> last_hidden_states = outputs.last_hidden_state
        >>> list(last_hidden_states.shape)
        [1, 2048, 768]
        ```
        Nz%You have to specify flattened_patchesr6   r  r   )r   r   r   r   r   r   r   )rL   r   r   use_return_dictr  sumfloatget_head_maskr   r`   r  r  r   r@   r   )r.   r[   r   r   r   r   r   embedding_outputencoder_outputssequence_outputhead_outputss              r2   rB   zPix2StructVisionModel.forward  s8   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$DEE!/333;q@GGIN &&y$++2O2OP	??+<=,,)/!5# ' 
 *!,..9+-L/!""555-)77&11
 	
r3   )NNNNNN)rD   rE   rF   r!   r  main_input_namesupports_gradient_checkpointing_no_split_modulesr   r(   r  r   intr   r"  r   r   r*   rd   r   r   r   r   rB   rG   rH   s   @r2   r  r    s    )L)O&*#01
/ 
0C4T#Y+? CD C  5915,0,0/3&*N
#ELL1N
 !.N
 ELL)	N

 $D>N
 'tnN
 d^N
 
u00	1N
 N
r3   r  c                   *     e Zd Zdef fdZd Z xZS )r   rL   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y rh   r   rZ   s     r2   r(   z)Pix2StructTextDenseGatedActDense.__init__X  r   r3   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rO   r   r   s       r2   rB   z(Pix2StructTextDenseGatedActDense.forward`  r   r3   rD   rE   rF   r    r(   rB   rG   rH   s   @r2   r   r   W  s    /3 /r3   r   c                   *     e Zd Zdef fdZd Z xZS )Pix2StructTextLayerFFrL   c                     t         |           t        |      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y r  )r'   r(   r   DenseReluDenser%   r/   layer_norm_epsilon
layer_normr   rW   rX   rY   rZ   s     r2   r(   zPix2StructTextLayerFF.__init__u  sK    >vF-f.@.@fF_F_`zz&"5"56r3   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S rO   )r9  r7  rY   )r.   r@   forwarded_statess      r2   rB   zPix2StructTextLayerFF.forward}  s=    ??=9../?@%5E(FFr3   r3  rH   s   @r2   r5  r5  t  s    73 7r3   r5  c                   f     e Zd Z	 ddedee   f fdZedd       Zd	dZ		 	 	 	 	 	 	 	 	 d
dZ
 xZS )r   rL   	layer_idxc                    t         |           || _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _	        |j                  | _        | j                  | j                  z  | _        || _        |-t        j                  d| j                   j"                   d       t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        t%        j&                  | j
                  | j
                  d      | _        | j                  r/t%        j0                  | j                  | j                        | _        t5               | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.Fri   )r'   r(   r   relative_attention_num_bucketsrelative_attention_max_distancer/   rk   rl   r   rn   rX   rY   rp   r=  loggerwarning_oncer1   rD   r   rP   rq   rr   rs   rt   rS   r   setpruned_headsru   r.   rL   r   r=  r1   s       r2   r(   z Pix2StructTextAttention.__init__  ss    	+F(.4.S.S+/5/U/U,!--"(++''**(?(??"*4>>+B+B*C D, , YYt//1A1AN
99T--t/?/?eLYYt//1A1AN
ii 0 0$2B2BO+++-<<8[8[]a]i]i+jD(E&+#r3   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r5   r   )r8   r*   r]   absr   
zeros_likelogr&  math	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r2   _relative_position_bucketz1Pix2StructTextAttention._relative_position_bucket  s(   . AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r3   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  |d| j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r=   r   F)rN  rO  rP  )r5   r   r   r   )r   r,   r   r*   aranger]   r8   rU  r?  r@  permute	unsqueeze)
r.   query_length
key_lengthr   cache_positioncontext_positionmemory_positionrM  relative_position_bucketvaluess
             r2   compute_biasz$Pix2StructTextAttention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A;;==	 $B $
  --.FG	*44Q7r3   c                    |j                   dd \  }}|du}| j                  |      }|j                  |d| j                  | j                        j                  dd      }|@|j                  j                  | j                        }|r|j                  }n|j                  }|r|n|}|r7|r5r3j                  | j                     }|j                  | j                     }n| j                  |      }| j                  |      }|j                  |d| j                  | j                        j                  dd      }|j                  |d| j                  | j                        j                  dd      }|D|s|
nd}
j                  ||| j                  d|
i      \  }}|rd|j                  | j                  <   t!        j"                  ||j                  dd            }||j                   d   }||n|
d   dz   }| j$                  sZt!        j&                  d| j                  ||f|j(                  |j*                  	      }| j,                  rE| j.                  r9d|_        n1| j3                  |||j(                  |

      }|dddd| dddf   }|#|ddddddd|j                   d   f   }||z   }| j4                  rRt!        j6                  |j                   d         }d|t9        | j4                        <   |dd|j;                         f   }n|}||z  }t<        j>                  jA                  |jC                         d      jE                  |      }t<        j>                  jG                  || jF                  | j.                        }|||z  }t!        j"                  ||      }|j                  dd      jI                         }|j                  |d| jJ                        }| jM                  |      }|||f}|	r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr5   r6   r   r\  Tr	   r   )r   r\  r   r  r   )'r   rq   rz   rn   rl   r{   
is_updatedgetr=  cross_attention_cacheself_attention_cache	key_cachevalue_cacherr   rs   updater*   r   r   r   r   r=   ru   r   r   ra  rD  r+   listr   r   r   r   r&  r   rY   ry   rp   rt   )r.   r@   maskkey_value_statesr   past_key_valuer   rZ  	use_cacher   r\  r}   r   is_cross_attentionr   rd  curr_past_key_valuecurrent_statesr   r   r   r[  real_seq_lengthcausal_maskr   r   r   r   s                               r2   rB   zPix2StructTextAttention.forward  s   $ "/!4!4Ra!8
J .T9zz-0#((RtG^G^_iijkmno%'2266t~~FJ!&4&J&J#&4&I&I#-?)].Z,66t~~FJ.::4>>JL.1J::n5L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn= lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fkk+.>/Gr3   FN)T       )NN)	NNNNNNFFN)rD   rE   rF   r    r   r/  r(   staticmethodrU  ra  rB   rG   rH   s   @r2   r   r     s^    jn,*,ZbcfZg,> -  - `0 ir3   r   c                   B     e Zd Zddee   f fdZ	 	 	 	 	 	 	 ddZ xZS ) Pix2StructTextLayerSelfAttentionr=  c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nr   r=  r   r'   r(   r   r   r%   r/   r8  r9  r   rW   rX   rY   rE  s       r2   r(   z)Pix2StructTextLayerSelfAttention.__init__Y  sU    00KW`
 .f.@.@fF_F_`zz&"5"56r3   c	           
          | j                  |      }	| j                  |	|||||||      }
|| j                  |
d         z   }|f|
dd  z   }|S )N)rl  r   r   rn  ro  r   r\  r   r   r9  r   rY   )r.   r@   r   r   r   rn  ro  r   r\  normed_hidden_statesr   r   s               r2   rB   z(Pix2StructTextLayerSelfAttention.forwarda  st      $}=>> '+)/) * 	
 &5Ea5H(II "%5ab%99r3   ru  )NNNNFFNrD   rE   rF   r   r/  r(   rB   rG   rH   s   @r2   rz  rz  X  s0    7XVY] 7 r3   rz  c                   D     e Zd Zddee   f fdZ	 	 	 	 	 	 	 	 ddZ xZS )!Pix2StructTextLayerCrossAttentionr=  c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFr|  r   r}  )r.   rL   r=  r1   s      r2   r(   z*Pix2StructTextLayerCrossAttention.__init__~  sP    0UZfop-f.@.@fF_F_`zz&"5"56r3   c                     | j                  |      }| j                  |||||||||	|

      }|| j                  |d         z   }|f|dd  z   }|S )N)	rl  rm  r   r   rn  ro  rZ  r   r\  r   r   r  )r.   r@   rm  r   r   r   rn  ro  rZ  r   r\  r  r   r   r   s                  r2   rB   z)Pix2StructTextLayerCrossAttention.forward  sy      $}=>> -'+)%/) * 
 %t||4DQ4G'HH/$4QR$88r3   rO   )NNNNFNFNr  rH   s   @r2   r  r  }  s2    7(3- 7 r3   r  c                   L     e Zd Zddee   f fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )Pix2StructTextBlockr=  c                     t         |           t        |||      | _        t	        ||      | _        t        |      | _        y )Nr|  )r=  )r'   r(   rz  self_attentionr  encoder_decoder_attentionr5  r   rE  s       r2   r(   zPix2StructTextBlock.__init__  sH    >(C
 *K*
&
 )0r3   c                 (   | j                  |||||	|
||      }|d d \  }}	|dd  }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|d u}|r| j                  ||||||	|d   dz   |
|	      }|d d \  }}	|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }||dd  z   }| j                  |      }|j                  t        j                  k(  rht        j                  |      j                         rEt        j                  |j                        j                  dz
  }t        j                  || |      }|f}|
r||	fz   |z   }|S ||z   }|S )N)r   r   r   rn  ro  r   r\  r5   i  )r   r   r6   r   )rm  r   r   r   rn  rZ  ro  r   )r  r=   r*   r>   isinfanyr   r   clampr  r   )r.   r@   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskrn  ro  r   r   r\  r   attention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                       r2   rB   zPix2StructTextBlock.forward  s     "&!4!4)'+)/) "5 	"
 )?r(B%~2126 %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM2$>&*&D&D!65; :-+B/!3#"3 'E 
'# -DBQ,G)M> ""emm3M8R8V8V8X#kk-*=*=>BBTI %M|Q\ ] !24KAB4O O / %--/EKK4N4R4R4T++m&9&9:>>EK!KKK<[YM " 114EEG   11Gr3   ru  )NNNNNNNNFFTNr  rH   s   @r2   r  r    s@    1XVY] 1& "#&*#'Hr3   r  z3
    The standalone text decoder of Pix2Struct
    )custom_introc            #           e Zd ZeZdgZdgZdZ fdZd Z	d Z
d Zd Zd	 Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 d#d
eej"                     deej$                     deej$                     deej$                     deej"                     deej$                     deej&                     deeeej$                           dee   dee   dee   deej"                     dee   deej"                     deeej$                  df   ef   fd       Z	 d$deej&                  df   dej&                  dej&                  dedef
dZedej&                  deded ej:                  dej&                  d!efd"       Z xZS )%r   r  zlm_head.weightTc                 V   t         |   |       t        j                  |j                  |j
                        | _        t        j                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _        t        |j
                  |j                        | _        t        j                   |j"                        | _        t        j&                  |j
                  |j                  d      | _        | j+                          d| _        y c c}w )Nr   r|  r   Fri   )r'   r(   r   rS   
vocab_sizer/   embed_tokensr   r   
num_layersr  r   r   r%   r8  final_layer_normrW   rX   rY   rP   r   r  ru   )r.   rL   r   r1   s      r2   r(   zPix2StructTextModel.__init__
  s     LL):):F<N<NO]] v001 $FQRSV`ab

 !4F4F4FFLeLe fzz&"5"56yy!3!3V5F5FUS 	&+#s   &!D&c           	         |t         j                  d       |S d}|D ]  }d}|D ]1  }||j                  d|j                  |j                              fz   }3 |d   j
                  |d   j
                  k7  r,t        d|d   j
                   d|d   j
                   d      t        |      t        |      k7  r$t        dt        |       dt        |       d      ||fz   } |S )	NzHYou might want to consider setting `use_cache=True` to speed up decodingr   r   z%reordered_layer_past_states[0] shape z  and layer_past_states[0] shape z mismatchedz&length of reordered_layer_past_states z! and length of layer_past_states )rA  warningindex_selectr8   r   r   r  len)r.   past_key_valuesbeam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r2   _reorder_cachez"Pix2StructTextModel._reorder_cache  su    "NNef""!#!0 	] +-'$5  .I$11!X[[AQAXAX5YZM /+ +1-337H7K7Q7QQ ;<WXY<Z<`<`;a  bB  CT  UV  CW  C]  C]  B^  ^i  j  ./37H3II <SA\=]<^^  AD  EV  AW  @X  Xc  d  &<?Z>\%\"'	]( &%r3   c                     | j                   S rO   r  r  s    r2   r  z(Pix2StructTextModel.get_input_embeddings<  s       r3   c                     || _         y rO   r  r.   new_embeddingss     r2   set_input_embeddingsz(Pix2StructTextModel.set_input_embeddings?  s
    *r3   c                     | j                   S rO   r   r  s    r2   get_output_embeddingsz)Pix2StructTextModel.get_output_embeddingsB      ||r3   c                     || _         y rO   r  r  s     r2   set_output_embeddingsz)Pix2StructTextModel.set_output_embeddingsE  s	    %r3   r   r   r  r  inputs_embedsr   cross_attn_head_maskr  ro  r   r   labelsr   r\  rM   .c                 
   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t        d      |&|j                         }|j                  d|d         }n!||j                         dd }nt        d      |$| j                  J d       | j                  |      }|\  }}d}d}|	s|t        |t              r't        |t              sd}t        |t                     }n[t        |t              s-d}t        j                  d       t        j                  |      }n|t        t               t                     }d	}||d	   }n||j!                         }|%t#        j$                  |||z   |j&                  
      }|9||j!                         |z   n|}t#        j(                  |||j&                  
      }| j                   j*                  r$| j-                  |||||j.                  nd|
      }nX|ddddddf   }|j1                  |j2                        }d|z
  t#        j4                  |j2                        j6                  z  }|M|j                         \  }}}||f}|!t#        j(                  ||j&                  
      }| j9                  |      }nd}| j;                  || j                   j<                        }| j;                  || j                   j<                        }|rdnd}|
rdnd}|
rdnd}d} d}!| j?                  |      }"tA        | jB                        D ]  \  }#}$||#   }%||#   }&|r||"fz   }| jD                  rM| jF                  rA|	rt        jI                  d       d}	| jK                  |$jL                  |"|| |||!|%|&d|	|
|      }'n |$|"|| |||!|%|&||	|
|      }'|	du r|'dd dz   |'dd z   }'|'dd \  }"}(|'d   } |	|'|
rdnd   }!|
s||'d   fz   }|||'d   fz   } | jO                  |"      }"| j?                  |"      }"| jQ                  |"      })|r||"fz   }d}*||j1                  |)j&                        }tS        jT                  dd      }+ |+|)jW                         j                  d|)j                  d            |jW                         j                  d            }*|	r(nd},|r|j.                  },|r|jY                         },|st[        d |*|)|,|||fD              S t]        |*|)|,|||      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pix2StructText is a model with relative position
            embeddings so you should be able to pad the inputs on both the right and the left.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for detail.

            [What are input IDs?](../glossary#input-ids)

            To know more on how to prepare `input_ids` for pretraining take a look a [Pix2StructText
            Training](./t5#training).
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.

        Example:

        ```python
        >>> from transformers import AutoProcessor, Pix2StructTextModel

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructTextModel.from_pretrained("google/pix2struct-textcaps-base")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> loss = outputs.loss
        ```
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer6   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsz<You have to initialize the model with valid token embeddingsFTzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   )r=   r   r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...)r   r   r  r  r  r   r  rn  ro  r   r\  r   rO   r5      r	      r  r;   )ignore_index	reductionc              3   $   K   | ]  }|| 
 y wrO   r   r   s     r2   r   z.Pix2StructTextModel.forward.<locals>.<genexpr>/  s       = s   )losslogitsr  r@   r   cross_attentions)/rL   ro  r   r   r$  r  sizerz   r  r   r   r   r   rA  rB  from_legacy_cacheget_seq_lengthr*   rW  r   r+   
is_decoder_update_causal_maskrg  r8   r=   r   r   invert_attention_maskr'  r  rY   r   r   ru   r   r  r   rB   r  r   r   CrossEntropyLossry   to_legacy_cacher   r   )-r.   r   r   r  r  r  r   r  r  ro  r   r   r  r   r\  kwargsinput_shaper}   r   return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthrt  encoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskr   all_attentionsall_cross_attentionsr   r  r@   r   r   r   r  r   next_decoder_cacher  r  loss_fct
next_caches-                                                r2   rB   zPix2StructTextModel.forwardH  s   f "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>stt"#..*K!r;r?;I&',,.s3Kdee $$0p2pp0 --i8M!,
J $&+#3/51*_Vi:j.2+"5o|~"V1DE&*###`
 #6"G"G"X ("5lnln"U!"%%3A%6"(%4%C%C%E"!"\\&(>(KTaThThN ! BQA\..0:=bl  #ZZ
OML`L`aN;;!!228G8S44Y]!K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK !,=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y$++2H2HI	#112FH^H^_"6BD0d&7rd(,%]3(4 <	VOA|'lO)=a)@&#$58H$H!**t}}NNt !&I $ A A ((!!)31#.%"!  !-!#."/*?+J2O$3/I#2'&7#1!" E! -bq 1G ;mAB>O O0=bq0A-M-
 *!,M$00=CTaZ[0\- !/=3C2E!E(4+?=QRCSBU+U(y<	V| --m<]3m,   1]4D DYYv}}-F**OHF--/44RRI6K\K\K^KcKcdfKghD+4'$
&(==J(88:J  %"(   1&+%1
 	
r3   r"   input_tensorc           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )Nflash_attention_2r   flex_attentionr   Fsdpa)r  r  is_trainingr   r6   )sequence_lengthtarget_lengthr=   r\  r}   )cudaxpunpu)rL   _attn_implementationr  r   r*   rd   r#   r  is_compileabler   _ignore_causal_mask_sdpar   r=   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r.   r   r  r\  r  r   past_seen_tokensusing_compilable_cacher=   r  r  rt  	min_dtypes                r2   r  z'Pix2StructTextModel._update_causal_maskE  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr3   r  r  r=   r}   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nr  )
fill_valuer=   r   r   )diagonalr  r6   r   )r   r*   r   r   r  r   triurW  reshapeexpandr  r   r8   r   )r   r  r  r=   r\  r}   r  rt  r  mask_lengthpadding_masks              r2   r  zIPix2StructTextModel._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r3   )NNNNNNNNNNNNNN)F) rD   rE   rF   r    r  r.  _tied_weights_keysr-  r(   r  r  r  r  r  r   r   r*   
LongTensorFloatTensorrd   r   r   r   r   rB   r   r  rx  r/  r=   r  rG   rH   s   @r2   r   r     sv    (L./*+&*#,(&<!+&  156:=A>B48157;EI$(,0/3-1&*59y
E,,-y
 !!2!23y
  ((9(9:	y

 !)):): ;y
   0 01y
 E--.y
 'u||4y
 "%e.?.?(@"ABy
 D>y
 $D>y
 'tny
 ))*y
 d^y
 !!1!12y
" 
uU&&+,.OO	P#y
 y
D #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r3   r   zr
    A conditional generation model with a language modeling head. Can be used for sequence generation tasks.
    c            &       t    e Zd ZeZdZdgZdef fdZd Zd Z	de
j                  fdZd	 Zd
 Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej&                     deej&                     deej(                     deej*                     deej&                     deej&                     deej,                     deeeej&                           deeeej&                           deej(                     deej,                     dee   dee   dee   dee   deej(                     deeej&                     ef   f"d       Z xZS )"Pix2StructForConditionalGenerationr[   zdecoder.lm_head.weightrL   c                     t         |   |       t        |j                        | _        t        |j                        | _        |j                  | _        | j                          y rO   )
r'   r(   r  vision_configr  r   r   decoderis_vqar  rZ   s     r2   r(   z+Pix2StructForConditionalGeneration.__init__  sK     ,V-A-AB*6+=+=>mm 	r3   c                 6    | j                   j                         S rO   )r  r  r  s    r2   r  z7Pix2StructForConditionalGeneration.get_input_embeddings  s    ||0022r3   c                 :    | j                   j                  |       y rO   )r  r  r  s     r2   r  z7Pix2StructForConditionalGeneration.set_input_embeddings  s    )).9r3   rM   c                 6    | j                   j                         S rO   )r  r  r  s    r2   r  z8Pix2StructForConditionalGeneration.get_output_embeddings  s    ||1133r3   c                 :    | j                   j                  |       y rO   )r  r  r  s     r2   r  z8Pix2StructForConditionalGeneration.set_output_embeddings  s    **>:r3   c                     | j                   S rO   )r  r  s    r2   get_decoderz.Pix2StructForConditionalGeneration.get_decoder  r  r3   c                     | j                   S rO   )r  r  s    r2   get_encoderz.Pix2StructForConditionalGeneration.get_encoder  r  r3   r   r   r   r   decoder_head_maskr  r)  r  r  decoder_inputs_embedsro  r   r   r   r\  c                 $   ||n| j                   j                  j                  }||n| j                   j                  }|| j	                  ||||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|
U|S|Q| j                  |
      }||n2|j                  | j                   j                        j                         }d|dddf<   | j                  ||||	||||||||
||      }|s||z   S t        |j                  |j                  |j                   |j"                  |j$                  |j&                  |j(                  |j"                  |j$                  	      S )	a  
        flattened_patches (`torch.FloatTensor` of shape `(batch_size, seq_length, hidden_size)`):
            Flattened pixel patches. the `hidden_size` is obtained by the following formula: `hidden_size` =
            `num_channels` * `patch_size` * `patch_size`

            The process of flattening the pixel patches is done by `Pix2StructProcessor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            Pix2StructText uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            To know more on how to prepare `decoder_input_ids` for pretraining take a look at [Pix2StructText
            Training](./t5#training).
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss for the decoder.

        Example:

        Inference:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-textcaps-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-textcaps-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> # autoregressive generation
        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A stop sign is on a street corner.

        >>> # conditional generation
        >>> text = "A picture of"
        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_special_tokens=False)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=50)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_text)
        A picture of a stop sign with a red stop sign
        ```

        Training:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Pix2StructForConditionalGeneration

        >>> processor = AutoProcessor.from_pretrained("google/pix2struct-base")
        >>> model = Pix2StructForConditionalGeneration.from_pretrained("google/pix2struct-base")

        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "A stop sign is on the street corner."

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> labels = processor(text=text, return_tensors="pt").input_ids

        >>> # forward pass
        >>> outputs = model(**inputs, labels=labels)
        >>> loss = outputs.loss
        >>> print(f"{loss.item():.5f}")
        5.94282
        ```N)r[   r   r   r   r   r   r   r   r5   r   )r   r   r  r  r  r  r   r  ro  r   r   r  r   r\  )	r  r  r  decoder_hidden_statesdecoder_attentionsr  encoder_last_hidden_stater  encoder_attentions)rL   r   ro  r$  r  r   r   r  r  ner  r&  r  r   r  r  r  r@   r   r  r   )r.   r[   r   r   r   r   r  r  r)  r  r  r  ro  r   r   r   r\  r@   decoder_outputss                      r2   rB   z*Pix2StructForConditionalGeneration.forward  s   d "+!6IDKK<S<S<]<]	%0%<k$++B]B] ""ll"3-#"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 *5 '&))$++*B*BCIIK # ,-"1a4( ,,'1/+"/#1'!5/!5#) ' 
" "_44 %%"))+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r3   )NNNNNNNNNNNNNNNN)rD   rE   rF   r   r  r,  r  r(   r  r  r   Moduler  r  r  r  r   r   r*   r  r  
BoolTensorrd   r   r   r   r   rB   rG   rH   s   @r2   r  r    s	    $L)O23	/ 	3:4ryy 4;  :>6:8<=A159=7;EIEI-18<$(,0/3&*59#q
#E$5$56q
 !!2!23q
 $E$4$45	q

 !))9)9 :q
 E--.q
 $E$5$56q
 'u||4q
 "%e.?.?(@"ABq
 "%e.?.?(@"ABq
 ))*q
  (5q
 D>q
 $D>q
 'tnq
  d^!q
" !!1!12#q
$ 
uU&&');;	<%q
 q
r3   r  )r   r  r  r   )Krc   rJ  typingr   r   r   r   r   r*   torch.utils.checkpointr   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   r   r   modeling_utilsr   pytorch_utilsr   utilsr   r   r   r   r   r   r   configuration_pix2structr   r    r!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerrD   rA  r  r%   apex.normalizationrI   infoImportError	Exceptionr  appendrK   rf   r   r   r   r   r  r   r5  r   rz  r  r  r   r  __all__r   r3   r2   <module>r&     s=     5 5    ! C C ) >  . 1   e d  !;J 
		H	%+")) +2	/&
KKij    / 0! !H^		 ^D")) :(BII (V2
bii 2
j y! y! y!x l
5 l
 l
`ryy :BII  Pbii Ph!ryy !J#		 #LY")) Yx 
{3 {
{| 
T
)BO T

T
nE2  	 	
NN_`	s   0H H7H76H7