
    Uh                     >   d Z ddlZddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZ ddlmZ  ej8                  e      Z G d dej>                        Z  G d dej>                        Z! G d dejD                        Z# G d dejD                        Z$ G d dejD                        Z%e G d de             Z& G d de&      Z' ed       G d d e&             Z( ed!       G d" d#e&e             Z)d#dgZ*y)$z/PyTorch TrOCR decoder model (based on RoBERTa).    N)OptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FN)GenerationMixin)_prepare_4d_attention_mask!_prepare_4d_causal_attention_mask))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions)PreTrainedModel)auto_docstringlogging   )TrOCRConfigc                   n     e Zd ZdZdedef fdZd	dej                  dedej                  f fdZ xZ	S )
TrOCRLearnedPositionalEmbeddingzN
    This module learns positional embeddings up to a fixed maximum size.
    num_embeddingsembedding_dimc                 N    d| _         t        | 	  || j                   z   |       y )N   )offsetsuper__init__)selfr   r   	__class__s      z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/trocr/modeling_trocr.pyr   z(TrOCRLearnedPositionalEmbedding.__init__.   s$     $++5}E    	input_idspast_key_values_lengthposition_idsc                 $   |a|j                   dd \  }}t        j                  |||z   t        j                  | j                  j
                        j                  |d      }n|j                  d      }t        | %  || j                  z         S )z3`input_ids' shape is expected to be [bsz x seqlen].Nr   )dtypedevicer   )shapetorcharangelongweightr&   expand	unsqueezer   forwardr   )r   r!   r"   r#   bszseq_lenr   s         r   r/   z'TrOCRLearnedPositionalEmbedding.forward4   s     $??2A.LC <<&(>(HPUPZPZcgcncncucufS"o  (11!4Lw|dkk9::r    r   N)
__name__
__module____qualname____doc__intr   r)   Tensorr/   __classcell__r   s   @r   r   r   )   sH    Fs F3 F; ;s ;^c^j^j ; ;r    r   c            
       `     e Zd ZdZd	dedededee   f fdZdej                  f fdZ
 xZS )
TrOCRScaledWordEmbeddingz\
    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
    r   r   padding_idxembed_scalec                 6    t         |   |||       || _        y N)r   r   r>   )r   r   r   r=   r>   r   s        r   r   z!TrOCRScaledWordEmbedding.__init__H   s    D&r    r!   c                 <    t         |   |      | j                  z  S r@   )r   r/   r>   )r   r!   r   s     r   r/   z TrOCRScaledWordEmbedding.forwardL   s    wy)D,<,<<<r    )      ?)r3   r4   r5   r6   r7   r   floatr   r)   r8   r/   r9   r:   s   @r   r<   r<   C   sE    's '3 'S '_ghm_n '= = =r    r<   c            	            e Zd ZdZddededee   f fdZeddededee   fd       Z e	j                         dde	j                  d	efd
       Z	 dde	j                  ded	ee   fdZ xZS )"TrOCRSinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.num_positionsr   r=   c                     t         |           d| _        || _        || _        | j                  |||      | _        | j                  dt        j                  d             y )Nr   _float_tensorr   )
r   r   r   r   r=   get_embeddingweightsregister_bufferr)   FloatTensor)r   rF   r   r=   r   s       r   r   z+TrOCRSinusoidalPositionalEmbedding.__init__S   sV    *&))-T_e.?.?.BCr    r   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        r   i'  r   )r%   r   dimr'   N)mathlogr)   expr*   int64rC   r.   catsincosviewzerostoget_default_dtype)r   r   r=   half_dimembs        r   rI   z0TrOCRSinusoidalPositionalEmbedding.get_embedding[   s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00r    r!   r"   c                 P   |j                         \  }}| j                  || j                  |      j                  |j                        }| j                  dz   |z   }| j
                  || j
                  j                  d      kD  r,| j                  || j                  | j                        | _        | j
                  j                  | j                        | _        | j
                  j                  d|j                  d            j                  ||d      j                         }|S )Nr   r   r'   )size"create_position_ids_from_input_idsr=   rY   r&   rJ   rI   r   rH   index_selectrW   detach)r   r!   r"   r0   r1   r#   max_posxs           r   r/   z*TrOCRSinusoidalPositionalEmbedding.forwardn   s     ~~'W>>y$JZJZ\rsvv

 ""Q&0<<7T\\->->q-A#A--gt7I7I4K[K[\DL||t'9'9:LL%%a):):2)>?DDS'SUV]]_r    c                     |j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )z
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
        r   rN   )ner7   r)   cumsumtype_asr+   )r   r!   r=   r"   maskincremental_indicess         r   r_   zETrOCRSinusoidalPositionalEmbedding.create_position_ids_from_input_ids   sW     ||K(,,.$||Da8@@FI__cgg"'')K77r    r@   )r   )r3   r4   r5   r6   r7   r   r   staticmethodrI   r)   no_gradr8   r/   r_   r9   r:   s   @r   rE   rE   P   s    NDc D# DHUXM D 1c 1# 1HUXM 1 1$ U]]_ s  & bc
8
847
8QYZ]Q^
8r    rE   c                       e Zd ZdZ	 	 	 	 	 	 ddededee   dee   dededed	ef fd
Zde	j                  dedefdZ	 	 	 	 	 dde	j                  dee	j                     deee	j                        dee	j                     dee	j                     dedee	j                  ee	j                     eee	j                        f   fdZ xZS )TrOCRAttentionz>Multi-headed attention from 'Attention Is All You Need' paper.	embed_dim	num_headskdimvdimdropout
is_decoderbiasis_cross_attentionc
                 B   t         
|           || _        ||n|| _        ||n|| _        || _        || _        ||z  | _        | j                  |z  | j                  k(  st        d| j                   d| d      | j                  dz  | _	        || _
        t        j                  | j                  ||      | _        t        j                  | j                  ||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      ࿩rt   )r   r   rn   rp   rq   ro   rr   head_dim
ValueErrorscalingrs   r   Lineark_projv_projq_projout_proj)r   configrn   ro   rp   rq   rr   rs   rt   ru   r   s             r   r   zTrOCRAttention.__init__   s     	" ,D)	 ,D)	"!Y.	)T^^;MdnnM] ^;b"  }}d*$ii		94@ii		94@ii	94@		)YTBr    tensorr1   r0   c                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r   )rW   ro   rx   	transpose
contiguous)r   r   r1   r0   s       r   _shapezTrOCRAttention._shape   s7    {{3GQQRSUVWbbddr    hidden_stateskey_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 	   |du}|j                         \  }}	}
| j                  |      | j                  z  }|r||d   }|d   }n
|rE| j                  | j	                  |      d|      }| j                  | j                  |      d|      }n|}| j                  | j	                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j	                  |      d|      }| j                  | j                  |      d|      }| j                  r||f}|| j                  z  d| j                  f} | j                  ||	|      j                  | } |j                  | } |j                  | }|j                  d      }t        j                  ||j                  dd            }|j                         || j                  z  |	|fk7  r/t        d|| j                  z  |	|f d|j                                |{|j                         |d|	|fk7  r#t        d	|d|	|f d|j                                |j                  || j                  |	|      |z   }|j                  || j                  z  |	|      }t        j                   j#                  |d      }||j                         | j                  fk7  r*t        d
| j                  f d|j                                |j                  dddd      |j                  || j                  |	|      z  }|j                  || j                  z  |	|      }|r?|j                  || j                  |	|      }|j                  || j                  z  |	|      }nd}t        j                   j%                  || j$                  | j&                        }t        j                  ||      }|j                         || j                  z  |	| j                  fk7  r7t        d|| j                  |	| j                  f d|j                                |j                  || j                  |	| j                        }|j                  dd      }|j)                  ||	|
      }| j+                  |      }|||fS )z#Input shape: Batch x Time x ChannelNr   r   r'   r   rN   z$Attention weights should be of size z	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size ptrainingz `attn_output` should be of size )r^   r~   rz   r   r|   r}   r)   rT   rs   ro   rx   rW   bmmr   ry   r   
functionalsoftmaxrr   r   reshaper   )r   r   r   r   r   r   r   ru   r0   tgt_lenrn   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r   r/   zTrOCRAttention.forward   s    .T9"/"4"4"6Wi {{=1DLL@."<'*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? ),7NDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1!))#w	BmmK01>AAr    )NN        FTF)NNNNF)r3   r4   r5   r6   r7   r   rC   boolr   r)   r8   r   r   r/   r9   r:   s   @r   rm   rm      sa   H #" #(C C 	C
 smC smC C C C !CBeU\\ eC ec e 488<1526"'kB||kB #5<<0kB !u||!45	kB
 !.kB "%,,/kB  kB 
u||Xell3XeELL>Q5RR	SkBr    rm   c                   $    e Zd Zdef fdZ	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     deej                     d	eeej                        d
ee	   dee	   fdZ
 xZS )TrOCRDecoderLayerr   c           
      ^   t         |           |j                  | _        t	        || j                  |j
                  |j                  d      | _        |j                  | _        t        |j                     | _        |j                  | _        t        j                  | j                        | _        |j                   rnt	        || j                  |j
                  |j"                  |j"                  |j                  dd      | _        t        j                  | j                        | _        t        j(                  | j                  |j*                        | _        t        j(                  |j*                  | j                        | _        t        j                  | j                        | _        y )NT)rn   ro   rr   rs   )rn   ro   rp   rq   rr   rs   ru   )r   r   hidden_sizern   rm   decoder_attention_headsattention_dropout	self_attnrr   r	   activation_functionactivation_fnactivation_dropoutr   	LayerNormself_attn_layer_normrs   cross_attention_hidden_sizeencoder_attnencoder_attn_layer_normr{   decoder_ffn_dimfc1fc2final_layer_normr   r   r   s     r   r   zTrOCRDecoderLayer.__init__$  s7   ++'nn44,,
 ~~#F$>$>?"(";";$&LL$@! ... 88777700#'	!D ,.<<+GD(99T^^V-C-CD99V33T^^D "T^^ <r    r   r   encoder_hidden_statesencoder_attention_maskr   cross_attn_layer_head_maskr   r   	use_cachec
                 t   |}
||dd nd}| j                  |||||      \  }}}t        j                  j                  || j                  | j                        }|
|z   }| j                  |      }d}d}|w|}
||dd nd}| j                  ||||||      \  }}}t        j                  j                  || j                  | j                        }|
|z   }| j                  |      }||z   }|}
| j                  | j                  |            }t        j                  j                  || j                  | j                        }| j                  |      }t        j                  j                  || j                  | j                        }|
|z   }| j                  |      }|f}|r|||fz  }|	r||fz  }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size *(decoder_attention_heads,)*.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        Nr   )r   r   r   r   r   r   )r   r   r   r   r   r   )r   r   r   rr   r   r   r   r   r   r   r   r   r   )r   r   r   r   r   r   r   r   r   r   residualself_attn_past_key_valueself_attn_weightspresent_key_valuecross_attn_present_key_valuecross_attn_weightscross_attn_past_key_valueoutputss                     r   r/   zTrOCRDecoderLayer.forwardF  s   < ! :H9S>"1#5Y] >Bnn'3)+/ ?M ?
;(*; --mt||VZVcVc-d =011-@ (,$! ,$H @N?Yrs(;_c%NRN_N_+!65 :8"3 O` OKM-/K MM11-4<<Z^ZgZg1hM$}4M 88GM !24P P !**488M+BC--mt?V?Vaeanan-o/--mt||VZVcVc-d =0--m< ")+=>>G)++Gr    )NNNNNNFT)r3   r4   r5   r   r   r)   r8   r   r   r   r/   r9   r:   s   @r   r   r   #  s     ={  =J 268<9=26=A8<,1$([||[ !.[  (5	[
 !) 6[ "%,,/[ %-U\\$:[ !u||!45[ $D>[ D>[r    r   c                   $    e Zd ZeZdZdZdgZd Zy)TrOCRPreTrainedModelmodelTr   c                 6   | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y y )Nr   )meanstd)r   init_std
isinstancer   r{   Conv1dr,   datanormal_rt   zero_	Embeddingr=   )r   moduler   s      r   _init_weightsz"TrOCRPreTrainedModel._init_weights  s    kk""fryy"))45MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r    N)	r3   r4   r5   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr    r    r   r   r     s!    L&*#,-	?r    r   c                   T     e Zd ZdZdef fdZd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZ	S )TrOCRDecoderz
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TrOCRDecoderLayer`]

    Args:
        config: TrOCRConfig
    r   c                 \   t         |   |       |j                  | _        |j                  | _        |j
                  | _        |j                  rt        j                  |j                        nd}t        |j                  |j                  | j                  |      | _        |j                  r&t        |j                   |j                        | _        n@t%        |j                   | j                  z   dz   |j                  | j                        | _        |j&                  r%t)        j*                  |j                        | _        nd | _        t)        j,                  t/        |j0                        D cg c]  }t3        |       c}      | _        d| _        | j9                          y c c}w )NrB   )r>   r   F)r   r   rr   decoder_layerdrop	layerdroppad_token_idr=   scale_embeddingrP   sqrtr   r<   
vocab_sizeembed_tokensuse_learned_position_embeddingsr   max_position_embeddingsembed_positionsrE   layernorm_embeddingr   r   
ModuleListrangedecoder_layersr   layersgradient_checkpointing	post_init)r   r   r>   _r   s       r   r   zTrOCRDecoder.__init__  sK    ~~11!..7=7M7Mdii 2 23SV4v1143C3CQ\
 11#B6CaCacicucu#vD #E..1A1AAAE""  $D  %%')||F4F4F'GD$'+D$mmfNcNcHd$e1%6v%>$ef&+#	 %fs   3F)c                     | j                   S r@   r   r   s    r   get_input_embeddingsz!TrOCRDecoder.get_input_embeddings  s       r    c                     || _         y r@   r   r   values     r   set_input_embeddingsz!TrOCRDecoder.set_input_embeddings  s
    !r    c                 n   |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	||n| j                   j                  }||t        d      |"|}|j                  d|j                  d         }n-| |j                         dd }|dddddf   }nt        d      ||d   d   j                  d   nd}|| j                  |      }| j                   j                  r| j                  ||      }n| j                  ||      }||z   }| j                  | j                  |      }t        j                  j                  || j                  | j                         }|j                  }t#        ||||      }||t%        ||j&                  |d   	      }| j(                  r%| j                   r|	rt*        j-                  d
       d}	|rdnd}|
rdnd}|
r|dnd}|	rdnd}t/        ||gddg      D ]j  \  }}|	|j                         d   t1        | j2                        k7  s3t        d| dt1        | j2                         d|j                         d    d       t5        | j2                        D ]  \  }}|r||fz  }| j                   r%t7        j8                  g       }|| j:                  k  r?|||   nd}| j(                  r?| j                   r3| j=                  |j>                  |||||||   nd|||   ndd|
|	
      }n ||||||||   nd|||   nd||
|		      }|d   }|	r|||
rdnd   fz  }|
s||d   fz  }|||d   fz  } |r||fz  }|	r|nd}|stA        d |||||fD              S tC        |||||      S )a  
        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
                provide it.

                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
                [`PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
                on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzTYou cannot specify both decoder_input_ids and decoder_inputs_embeds at the same timer'   zEYou have to specify either decoder_input_ids or decoder_inputs_embedsr   r   )r"   r   )r   z^`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`...Fr   	head_maskcross_attn_head_maskzThe `z` should be specified for z layers, but it is for .)r   r   r   r   r   r   r   r   r   r   c              3   $   K   | ]  }|| 
 y wr@   r   ).0vs     r   	<genexpr>z'TrOCRDecoder.forward.<locals>.<genexpr>  s      = s   )last_hidden_statepast_key_valuesr   
attentionscross_attentions)"r   r   output_hidden_statesr   use_return_dictry   rW   r(   r^   r   r   r   r   r   r   rr   r   r   r   r%   r   loggerwarning_onceziplenr   	enumerater)   randr   _gradient_checkpointing_func__call__tupler   )r   r!   r   r   r   r   r   r   inputs_embedsr   r   r   return_dictinputinput_shaper"   	embed_posr   all_hidden_statesall_self_attnsall_cross_attentionsnext_decoder_cache	attn_mask	mask_nameidxdecoder_layerdropout_probabilityr   layer_outputs
next_caches                                 r   r/   zTrOCRDecoder.forward  s   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]  ]%>stt"E!r5;;r?;I&',,.s3K!!Q(+Edee DSC^!3A!6!<!<Q!?de  --i8M;;66,,UKa,bI,,YOe,fI%	1##/ 44]CM--mt||VZVcVc-dkk:K8N

 !,1G1S%?&(;(;[QS_&" &&4==##t "	 #7BD0d&7<Q<]rdh#,R$ %(4H(IKYoKp$q 	 Iy$>>#A&3t{{+;<$	{*DSEUDV W%NN,Q/03 	 #,DKK"8 /	@C#!m%55!}}&+jjn#&75D5P_S1VZN**t}} $ A A!**!")*&/&;IcN1E1Q(-W[%! !.!#1*?+A7@7LYs^RV5I5U,S1[_#1&7'! *!,M"}:KQQR'S&UU" =#3"55(4(]1-=,??(_/	@d  -!11+4'$
 '5FXlm  
 9+&+%1
 	
r    )NNNNNNNNNNNN)
r3   r4   r5   r6   r   r   r   r   r/   r9   r:   s   @r   r   r     sJ    { >!"
 "#!!Y
r    r   a  
    The TrOCR Model with a language modeling head. Can be used for summarization.
    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
    used in combination with the [`EncoderDecoderModel`] framework.
    )custom_introc                   $     e Zd Z fdZd Z xZS )TrOCRDecoderWrapperc                 D    t         |   |       t        |      | _        y r@   )r   r   r   decoderr   s     r   r   zTrOCRDecoderWrapper.__init__  s     #F+r    c                 &     | j                   |i |S r@   )r  )r   argskwargss      r   r/   zTrOCRDecoderWrapper.forward  s    t||T,V,,r    )r3   r4   r5   r   r/   r9   r:   s   @r   r  r    s    ,-r    r  zy
    The TrOCR Decoder with a language modeling head. Can be used as the decoder part of [`EncoderDecoderModel`] and
    c                        e Zd ZdgZ fdZd Zd Zd Zd Zd Z	d Z
e	 	 	 	 	 	 	 	 	 	 	 	 	 dd	eej                     d
eej                     deej                      deej                     deej                     deej                     deeeej                            deej                      deej                     dee   dee   dee   dee   deeef   fd       Zed        Z xZS )TrOCRForCausalLMzoutput_projection.weightc                    t        j                  |      }d|_        d|_        t        |   |       t        |      | _        t        j                  |j                  |j                  d      | _        | j                          y )NTFrw   )copydeepcopyrs   is_encoder_decoderr   r   r  r   r   r{   r   r   output_projectionr   r   s     r   r   zTrOCRForCausalLM.__init__  sh    v& $)! (0
!#6+=+=v?P?PW\!] 	r    c                 B    | j                   j                  j                  S r@   r   r  r   r   s    r   r   z%TrOCRForCausalLM.get_input_embeddings  s    zz!!...r    c                 :    || j                   j                  _        y r@   r+  r   s     r   r   z%TrOCRForCausalLM.set_input_embeddings  s    */

'r    c                     | j                   S r@   r)  r   s    r   get_output_embeddingsz&TrOCRForCausalLM.get_output_embeddings  s    %%%r    c                     || _         y r@   r.  )r   new_embeddingss     r   set_output_embeddingsz&TrOCRForCausalLM.set_output_embeddings  s
    !/r    c                 &    || j                   _        y r@   r   r  )r   r  s     r   set_decoderzTrOCRForCausalLM.set_decoder  s    $

r    c                 .    | j                   j                  S r@   r4  r   s    r   get_decoderzTrOCRForCausalLM.get_decoder  s    zz!!!r    r!   r   r   r   r   r   r   r  labelsr   r   r   r  r   c                 D   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  j                  |||||||||
|||      }| j                  |d         }d}|	Ft               } ||j                  d| j                   j                        |	j                  d            }|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                        S )a
  
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import (
        ...     TrOCRConfig,
        ...     TrOCRProcessor,
        ...     TrOCRForCausalLM,
        ...     ViTConfig,
        ...     ViTModel,
        ...     VisionEncoderDecoderModel,
        ... )
        >>> import requests
        >>> from PIL import Image

        >>> # TrOCR is a decoder model and should be used within a VisionEncoderDecoderModel
        >>> # init vision2text model with random weights
        >>> encoder = ViTModel(ViTConfig())
        >>> decoder = TrOCRForCausalLM(TrOCRConfig())
        >>> model = VisionEncoderDecoderModel(encoder=encoder, decoder=decoder)

        >>> # If you want to start from the pretrained model, load the checkpoint with `VisionEncoderDecoderModel`
        >>> processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
        >>> model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")

        >>> # load image from the IAM dataset
        >>> url = "https://fki.tic.heia-fr.ch/static/img/a01-122-02.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> pixel_values = processor(image, return_tensors="pt").pixel_values
        >>> text = "industry, ' Mr. Brown commented icily. ' Let us have a"

        >>> # training
        >>> model.config.decoder_start_token_id = processor.tokenizer.eos_token_id
        >>> model.config.pad_token_id = processor.tokenizer.pad_token_id
        >>> model.config.vocab_size = model.config.decoder.vocab_size

        >>> labels = processor.tokenizer(text, return_tensors="pt").input_ids
        >>> outputs = model(pixel_values, labels=labels)
        >>> loss = outputs.loss
        >>> round(loss.item(), 2)
        5.30

        >>> # inference
        >>> generated_ids = model.generate(pixel_values)
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> generated_text
        'industry, " Mr. Brown commented icily. " Let us have a'
        ```N)r!   r   r   r   r   r   r   r  r   r   r   r  r   r'   r   )losslogitsr   r   r   r   )r   r   r   r  r   r  r)  r   rW   r   r   r   r   r   r   )r   r!   r   r   r   r   r   r   r  r8  r   r   r   r  r   r;  r:  loss_fctoutputs                      r   r/   zTrOCRForCausalLM.forward  sH   X 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] **$$)"7#9!5+'/!5# % 
 ''
3')HFKKDKK,B,BCV[[QS_UDY,F'+'7D7V#CVC0#33!//))$55
 	
r    c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr   c              3   t   K   | ]/  }|j                  d j                  |j                               1 ywr2   )r`   rY   r&   )r   
past_statebeam_idxs     r   r   z2TrOCRForCausalLM._reorder_cache.<locals>.<genexpr>q  s.     nU_j--aZ=N=N1OPns   58)r
  )r   rA  reordered_past
layer_pasts    `  r   _reorder_cachezTrOCRForCausalLM._reorder_cachel  s=    ) 	Jncmnn N	 r    )NNNNNNNNNNNNN)r3   r4   r5   _tied_weights_keysr   r   r   r/  r2  r5  r7  r   r   r)   
LongTensorr8   rL   r   r   r   r   r/   rj   rD  r9   r:   s   @r   r$  r$    s    55
/0&0%"  1515=A=A,07;EI59-1$(,0/3&*s
E,,-s
 !.s
  ((9(9:	s

 !))9)9 :s
 ELL)s
 'u||4s
 "%e.?.?(@"ABs
   1 12s
 ))*s
 D>s
 $D>s
 'tns
 d^s
 
u77	8s
 s
j  r    r$  )+r6   r&  rP   typingr   r   r   r)   r   torch.nnr   activationsr	   
generationr
   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   configuration_trocrr   
get_loggerr3   r  r   r   r<   ModulerE   rm   r   r   r   r  r$  __all__r   r    r   <module>rS     s0   6   ) )   % ! ) m - , , 
		H	%;bll ;4
=r|| 
=;8 ;8|RBRYY RBj~		 ~B ?? ? ?$F
' F
R -. -- 
^+_ ^
^B 5
6r    