
    Uh*                     X   d Z ddlZddlZddlZddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZmZmZ ddl m!Z!m"Z"m#Z# ddl$m%Z%  e"jL                  e'      Z(d Z) G d dejT                        Z+ G d dejT                        Z, G d dejT                        Z- G d dejT                        Z.e! G d de             Z/e! G d de/             Z0 e!d       G d d e/e             Z1 e!d!       G d" d#e/             Z2g d$Z3y)%zPyTorch OpenAI ImageGPT model.    N)AnyOptionalTupleUnion)nn)autocast)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)GenerationMixin))BaseModelOutputWithPastAndCrossAttentions!CausalLMOutputWithCrossAttentions SequenceClassifierOutputWithPast)PreTrainedModel)Conv1D find_pruneable_heads_and_indicesprune_conv1d_layer)auto_docstringloggingtorch_float   )ImageGPTConfigc                    	 ddl }ddl}t
        j                  j                  |      }t        j                  dj                  |             |j                  j                  |      }g }g }|D ]v  \  }	}
t        j                  dj                  |	|
             |j                  j                  ||	      }|j                  |	       |j                  |j                                x t        ||      D ]  \  }	}|	dd }	|	j!                  d      }	t#        d |	D              s|	d	   d
v r4t        j                  dj                  dj%                  |	                   j| }|	d	   dvrt'        |d      }|	D ]W  }|j)                  d|      r|j!                  d|      }n|g}|d   dk(  s|d   dk(  rt'        |d      }n|d   dk(  rt'        |d      }n|d   dk(  s|d   dk(  rt'        ||d         }t'        |d      }n|d   dv rt'        |d      }t'        |d      }nt+        |	      dk(  r,|	d   dk(  r$|d   dk(  rt'        ||d         }t'        |d      }nQ|d   dk(  rt'        |d      }t'        |d      }n0|d   dk(  rt'        |d      }t'        |d      }nt'        ||d         }t+        |      d k\  sEt-        |d         }||   }Z t+        |	      dkD  r|	d   dk(  s|	d	   dk(  s|	d	   dk(  s|	d	   dk(  rn	 |j.                  |j.                  k(  sJ 	 t        j                  d!j                  |	             |	d	   d"k(  rbt5        j6                  |j9                  |j:                  |j:                              j<                  |j>                  ddd|j:                  f<   |	d	   d#k(  rot5        j6                  |j9                  |j:                  |j:                              j<                  |j>                  dd|j:                  d |j:                  z  f<   -|	d	   d$k(  ret5        j6                  |j9                  |j:                  |j:                              j<                  |j>                  ddd |j:                  z  df<   t+        |	      dk(  rP|	d   dk(  rH|	d    dk(  r@t5        j6                  |j9                  |j:                  |j:                              |_        |	d	   dk(  rt5        j6                  |      |_        |	d	   dk(  r7t5        j6                  |      |j>                  d|j@                  dz
  ddf<   [|	d	   dk(  r$t5        j6                  |      |j>                  d	<   t5        j6                  |      |_         | S # t        $ r t        j	                  d        w xY w# t0        $ r1}|xj2                  |j.                  |j.                  fz  c_         d}~ww xY w)%z0
    Load tf checkpoints in a pytorch model
    r   NzLoading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see https://www.tensorflow.org/install/ for installation instructions.z(Converting TensorFlow checkpoint from {}z"Loading TF weight {} with shape {}   /c              3   $   K   | ]  }|d v  
 yw))adam_vadam_mAdamWeightDecayOptimizerAdamWeightDecayOptimizer_1global_stepN ).0ns     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py	<genexpr>z.load_tf_weights_in_imagegpt.<locals>.<genexpr>Q   s      
 nn
   )_stepzSkipping {})wtettransformerz[A-Za-z]+\d+z(\d+)wgweightbbiaswpewte)q_projk_projv_projc_attnr   r   attnc_projr,   lm_headsos   zInitialize PyTorch weight {}r5   r6   r7   )!re
tensorflowImportErrorloggererrorospathabspathinfoformattrainlist_variablesload_variableappendsqueezezipsplitanyjoingetattr	fullmatchlenintshapeAssertionErrorargstorch
from_numpyreshapen_embdTdata
vocab_size)modelconfigimagegpt_checkpoint_pathr>   tftf_path	init_varsnamesarraysnamerU   arraypointerm_namescope_namesnumes                    r'   load_tf_weights_in_imagegptrn   0   s   	 ggoo67G
KK:AA'JK''0IEF  'e8??eLM&&w5Temmo&	' 5&) L3eABxzz#  

 
 "X"KK,,SXXd^<=88#g}5G 	'F||OV4 hhx8%h1~$A#(=!'84Q3&!'62Q5(KNe,C!';q>:!'84Q#AA!'84!'84TaDGv$5+a.H:T!';q>:!'84Q6)!'95!'84Q5(!'51!'84!';q>:;1$+a.)!#,;	'> t9q=T!W.$r(f2DRTYHY]abd]ein]n}}333
 	299$?@8x/4/?/?fmm]c]j]j@k/l/n/nGLLOfmmO+,"X!AFAQAQfmmV]];Ba LLFMMA,===> "X!383C3CEMMRXR_R_agananDo3p3r3rGLLA-//0Y!^Q6 1d1g6I ++EMM&--,WXGL"X ++E2GL"X7<7G7G7NGLL06,,q00!34"X$//6GLL ++E2GLYL3\ LC  Q	
 	P " 7==%++66s#   V ?V< V9<	W6,W11W6c                   h     e Zd Zddee   def fdZdej                  dej                  fdZ	 xZ
S )ImageGPTLayerNormhidden_sizeepsc                     t         |           || _        t        j                  t        j                  |            | _        y N)super__init__rr   r   	ParameterrX   Tensorr0   )selfrq   rr   	__class__s      r'   rv   zImageGPTLayerNorm.__init__   s.    ll5<<#<=    tensorreturnc                     |t        j                  t        j                  t        j                  |      dd      | j                  z         z  }|| j
                  z  }|S )Nr*   T)axiskeepdim)rX   sqrtmeansquarerr   r0   )ry   r|   s     r'   forwardzImageGPTLayerNorm.forward   sK    %**UZZV0D2W[%\_c_g_g%ghh$++%r{   )gh㈵>)__name__
__module____qualname__r   rT   floatrv   rX   rx   r   __classcell__rz   s   @r'   rp   rp      s5    >E#J >U >
ell u|| r{   rp   c                   "    e Zd Zddee   dee   f fdZd ZddZddZ	d Z
d Z	 	 	 	 	 	 	 dd	ej                  d
ee   deej                     deej                     deej                     deej                     dee   dee   defdZ xZS )ImageGPTAttentionis_cross_attention	layer_idxc           	         t         |           |j                  }| j                  dt	        j
                  t	        j                  ||ft        j                              j                  dd||      d       | j                  dt	        j                  d      d       |j                  | _        |j                  | _        | j                  | j                  z  | _        | j                  | _        | j                  | j                  z  | j                  k7  r&t!        d| j                   d	| j                   d
      |j"                  | _        || _        |j&                  | _        || _        |j*                  | _        | j$                  rNt-        d| j                  z  | j                        | _        t-        | j                  | j                        | _        n(t-        d| j                  z  | j                        | _        t-        | j                  | j                        | _        t5        j6                  |j8                        | _        t5        j6                  |j<                        | _        tA               | _!        y )Nr2   dtyper   F)
persistentmasked_biasg     z=`embed_dim` must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r=   r   )"ru   rv   max_position_embeddingsregister_bufferrX   trilonesboolviewr|   rq   	embed_dimnum_attention_heads	num_headshead_dim
split_size
ValueErrorscale_attn_weightsr   scale_attn_by_inverse_layer_idxr   reorder_and_upcast_attnr   r8   q_attnr:   r   Dropout
attn_pdropattn_dropoutresid_pdropresid_dropoutsetpruned_heads)ry   r`   r   r   max_positionsrz   s        r'   rv   zImageGPTAttention.__init__   s   66JJuzz=-"@

STYY1m]  	 	
 	]ELL,>5Q++33$..8..==4>>)T^^;OPTP^P^O_ `NN#2' 
 #)";";"4 06/U/U,"'-'E'E$"" T^^!3T^^DDK @DK T^^!3T^^DDKT^^T^^<JJv'8'89ZZ(:(:;Er{   c                 F   t        |      dk(  ry t        || j                  | j                  | j                        \  }}t        j                  ||| j                  z   |d| j                  z  z   g      }t        | j                  |d      | _	        t        | j                  |d      | _
        | j                  | j                  z  | j                  t        |      z
  z  | _        | j                  t        |      z
  | _        | j                  j                  |      | _        y )Nr   r=   r   dim)rS   r   r   r   r   rX   catr   r   r8   r:   union)ry   headsindex
index_attns       r'   prune_headszImageGPTAttention.prune_heads   s    u:?7t~~t}}^b^o^opuYYut'>T__I\@]^_
 )jaH(eC  ??dnn<RUV[R\A\]#e*4 --33E:r{   c                 D   t        j                  ||j                  dd            }| j                  r |t	        |j                  d      dz        z  }| j                  r|t        | j                  dz         z  }| j                  s|j                  d      |j                  d      }}| j                  d d d d ||z
  |d |f   }	t        j                  |j                        j                  }
t        j                  |
|j                  |j                        }
t        j                   |	||
      }|||z   } t#        j$                  d      |      }|j'                  |j                        }| j)                  |      }|||z  }t        j                  ||      }||fS )Nr*         ?r   r   devicer   )rX   matmul	transposer   r   sizer   r   r   r   r2   finfor   minr|   r   wherer   Softmaxtyper   )ry   querykeyvalueattention_mask	head_maskattn_weightsquery_length
key_lengthcausal_mask
mask_valueattn_outputs               r'   _attnzImageGPTAttention._attn   st   ||E3==R+@A""'+ejjn6K*LLL //'%0B*CCL&&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'.8L)rzzb),7 $((5((6  ')3Lll<7L((r{   c                 N   |j                         \  }}}}	|j                         \  }
}
}}
t        j                  ||z  ||t        j                  |j                        }d}| j
                  r |t        |j                  d            dz  z  }| j                  r|t        | j                  dz         z  }t        d      5  |j                  d||	      |j                  dd      j                  d|	|      }}t        j                  ||j                         |j                         d	|
      }|j                  ||||      }d d d        | j                  s|j                  d      |j                  d      }}| j                  d d d d ||z
  |d |f   }t        j                  |j                         j"                  }t        j$                  ||j                   |j                        }t        j&                  |||      }|||z   } t)        j*                  d      |      }|j                   t        j                  k7  rt-        d      |j/                  |j                         }| j1                  |      }|||z  }t        j2                  ||      }||fS # 1 sw Y   ZxY w)Nr         ?r*   r   r   F)enabledr   r   )betaalphar   zDError with upcasting, attn_weights does not have dtype torch.float32)r   rX   emptyfloat32r   r   r   r   r   r   rZ   r   baddbmmr   r2   r   r   r   r|   r   r   r   RuntimeErrorr   r   r   )ry   r   r   r   r   r   bszr   	q_seq_lendk_	k_seq_lenr   scale_factorqkr   r   r   r   r   s                        r'   _upcast_and_reordered_attnz,ImageGPTAttention._upcast_and_reordered_attn
  sa   (-

%Y	2 XXZ1i {{3?IyPUP]P]fkfrfrs ""E%**R.1S88L//E$..1"455L e$ 	V==Y3S]]2r5J5R5RSUWY[d5eqA ==qwwy!'')RS[ghL'//Y	9UL	V
 &&',zz"~sxx|*L))Aq*|*Cj*PR]S]R]$]^K\%7%78<<J j8J8JS_SfSfgJ ;;{L*ML%'.8L)rzzb),7 .eff#((5((6  ')3Lll<7L((C	V 	Vs   BJJ$c                 x    |j                         dd ||fz   } |j                  | }|j                  dddd      S )zJ
        Splits hidden_size dim into attn_head_size and num_heads
        Nr*   r   r=   r   r   )r   r   permutery   r|   r   attn_head_size	new_shapes        r'   _split_headszImageGPTAttention._split_heads>  sE     KKM#2&)^)DD	i(~~aAq))r{   c                     |j                  dddd      j                         }|j                         dd ||z  fz   }|j                  |      S )zS
        Merges attn_head_size dim and num_attn_heads dim into hidden_size
        r   r=   r   r   Nr   )r   
contiguousr   r   r   s        r'   _merge_headszImageGPTAttention._merge_headsF  sO     1a+668KKM#2&)n*D)FF	{{9%%r{   hidden_states
layer_pastr   r   encoder_hidden_statesencoder_attention_mask	use_cacheoutput_attentionsr}   c	                    |Zt        | d      st        d      | j                  |      }	| j                  |      j	                  | j
                  d      \  }
}|}n0| j                  |      j	                  | j
                  d      \  }	}
}| j                  |	| j                  | j                        }	| j                  |
| j                  | j                        }
| j                  || j                  | j                        }|7|\  }}t        j                  ||
fd      }
t        j                  ||fd      }|du r|
|f}nd }| j                  r| j                  |	|
|||      \  }}n| j                  |	|
|||      \  }}| j                  || j                  | j                        }| j                  |      }| j!                  |      }||f}|r||fz  }|S )Nr   zIf class is used as cross attention, the weights `q_attn` have to be defined. Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`.r=   r   r   T)hasattrr   r   r8   rN   r   r   r   r   rX   r   r   r   r   r   r:   r   )ry   r   r   r   r   r   r   r   r   r   r   r   past_key
past_valuepresentr   r   outputss                     r'   r   zImageGPTAttention.forwardN  s    !,4* t 
 KK.E%:;AA$//WXAYJC3N $M : @ @VW @ XE3!!%GT^^T]]C!!%G!#- Hj))XsO4CIIz51r:EElGG''(,(G(GsTY[ikt(u%K(,

5#unV_(`%K''T^^T]]Skk+.((5(&Gr{   )FN)NNNNNNNFF)r   r   r   r   r   rT   rv   r   r   r   r   r   rX   rx   tupler   r   r   s   @r'   r   r      s    )"8D> )"V^_bVc )"V;$)L2)h*& &*15,08<9=$),13||3 TN3 !.	3
 ELL)3  (53 !) 63 D>3 $D>3 
3r{   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ImageGPTMLPc                     t         |           |j                  }t        ||      | _        t        ||      | _        t        |j                     | _        t        j                  |j                        | _        y rt   )ru   rv   rq   r   c_fcr:   r   activation_functionactr   r   r   dropout)ry   intermediate_sizer`   r   rz   s       r'   rv   zImageGPTMLP.__init__  s_    &&	,i8	Y(9:&445zz&"4"45r{   r   r}   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rt   )r   r   r:   r   )ry   r   s     r'   r   zImageGPTMLP.forward  s@    		-0/M2]3r{   )r   r   r   rv   rX   rx   r   r   r   s   @r'   r   r     s#    6U\\ ell r{   r   c                        e Zd Zd fd	Z	 	 	 	 	 	 	 ddej
                  dee   deej
                     deej
                     deej
                     deej
                     dee   d	ee   d
efdZ	 xZ
S )ImageGPTBlockc                    t         |           |j                  }|j                  |j                  nd|z  }t	        ||j
                        | _        t        ||      | _        t	        ||j
                        | _	        |j                  r/t        |d|      | _        t	        ||j
                        | _        t        ||      | _        y )N   rr   r   T)r   r   )ru   rv   rq   n_innerrp   layer_norm_epsilonln_1r   r9   ln_2add_cross_attentioncrossattentionln_cross_attnr   mlp)ry   r`   r   rq   	inner_dimrz   s        r'   rv   zImageGPTBlock.__init__  s    ((&,nn&@FNNa+o	%kv7P7PQ	%f	B	%kv7P7PQ	%%"3Ft_h"iD!2;FD]D]!^Dy&1r{   r   r   r   r   r   r   r   r   r}   c	                    |}	| j                  |      }| j                  ||||||      }
|
d   }|
dd  }||	z   }|Wt        | d      st        d|  d      |}	| j	                  |      }| j                  ||||||      }|d   }|	|z   }||dd  z   }|}	| j                  |      }| j                  |      }|	|z   }|f|r|z   }|S |dd  z   }|S )	N)r   r   r   r   r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   r   r   r   r   r=   )r  r9   r   r   r  r  r	  r  )ry   r   r   r   r   r   r   r   r   residualattn_outputsr   r   cross_attn_outputsfeed_forward_hidden_statess                  r'   r   zImageGPTBlock.forward  sN    !		-0yy!)/ ! 
 #1oqr"#h. ,4!12 =dV DZ Z  %H ..}=M!%!4!4-#&;'="3 "5 " -Q/K${2M 212 66G 		-0%)XXm%<" #== "gL AHLr{   rt   r   )r   r   r   rv   rX   rx   r   r   r   r   r   r   s   @r'   r  r    s    2$ &*15,08<9=$),18||8 TN8 !.	8
 ELL)8  (58 !) 68 D>8 $D>8 
8r{   r  c                   >     e Zd ZeZeZdZdZdZ	dgZ
 fdZd Z xZS )ImageGPTPreTrainedModelr-   	input_idsTr  c                 $    t        |   |i | y rt   )ru   rv   )ry   inputskwargsrz   s      r'   rv   z ImageGPTPreTrainedModel.__init__  s    &+F+r{   c           	         t        |t        j                  t        f      rl|j                  j
                  j                  d| j                  j                         |j                  |j                  j
                  j                          nt        |t        j                        ry|j                  j
                  j                  d| j                  j                         |j                  g|j                  j
                  |j                     j                          n5t        |t              r%|j                  j
                  j                  d       |j                         D ]m  \  }}d|v sd|v s|j
                  j                  d| j                  j                  t!        j"                  d| j                  j$                  z        z         o y)zInitialize the weights.g        )r   stdNr   r:   r0   r=   )
isinstancer   Linearr   r0   r]   normal_r`   initializer_ranger2   zero_	Embeddingpadding_idxrp   fill_named_parametersmathr   n_layer)ry   modulerg   ps       r'   _init_weightsz%ImageGPTPreTrainedModel._init_weights  sQ   fryy&12 MM&&CT[[5R5R&S{{&  &&(-MM&&CT[[5R5R&S!!-""6#5#56<<> 12MM$$S) ..0 	sGD!4H$4Cdkk.K.KdiiXY\`\g\g\o\oXoNp.pr	sr{   )r   r   r   r   config_classrn   load_tf_weightsbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesrv   r)  r   r   s   @r'   r  r    s2    !L1O%!O&*#(),sr{   r  c            "           e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	eee
j                           de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e   de	e   de	e   de	e   dedeeef   fd       Z xZS )ImageGPTModelr`   c           	      v   t         |   |       |j                  | _        t	        j
                  |j                  | j                        | _        t	        j
                  |j                  | j                        | _	        t	        j                  |j                        | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t%        | j                  |j&                        | _        d| _        d | _        d| _        | j1                          y c c}w )Nr  r  F)ru   rv   rq   r   r   r!  r^   r4   r   r3   r   
embd_pdropdrop
ModuleListrangenum_hidden_layersr  hrp   r  ln_fmodel_parallel
device_mapgradient_checkpointing	post_init)ry   r`   irz   s      r'   rv   zImageGPTModel.__init__	  s     ++<< 1 14>>B<< > >OJJv001	ERXRjRjLklqf Blm%dnn&:S:ST	 $&+#  ms   
D6c                     | j                   S rt   r4   ry   s    r'   get_input_embeddingsz"ImageGPTModel.get_input_embeddings  s    xxr{   c                     || _         y rt   r@  ry   new_embeddingss     r'   set_input_embeddingsz"ImageGPTModel.set_input_embeddings  s	    !r{   c                     |j                         D ]-  \  }}| j                  |   j                  j                  |       / y)zv
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
        N)itemsr8  r9   r   )ry   heads_to_prunelayerr   s       r'   _prune_headszImageGPTModel._prune_heads"  s<     +002 	2LE5FF5M**51	2r{   r  past_key_valuesr   token_type_idsposition_idsr   inputs_embedsr   r   r   r   output_hidden_statesreturn_dictr  r}   c                   $ d|v r8t        j                  dt               |t        d      |j	                  d      }||n| j
                  j                  }||n| j
                  j                  }|
|
n| j
                  j                  }
||n| j
                  j                  }||t        d      |G| j                  ||       |j                         }|j                  d|d         }|j                  d   }n0|#|j                         dd }|j                  d   }nt        d      ||j                  n|j                  }||j                  d|d         }|%d}t        dgt!        | j"                        z        }n|d   d   j                  d	      }|>t%        j&                  ||d   |z   t$        j(                  |
      }|j+                  d      }|z|dk  rt        d      |j                  |d      }|ddddddf   }|j-                  | j.                        }d|z
  t%        j0                  | j.                        j2                  z  }| j
                  j4                  rE|C|j                         \  }}}||f}|	t%        j6                  ||      }	| j9                  |	      }	nd}	| j;                  || j
                  j<                        }|| j?                  |      }| jA                  |      }||j-                  |j                        z   $|| j?                  |      }$|z   $| jC                  $      $|$j                  d      fz   }| jD                  r%| jF                  r|
rtH        jK                  d       d}
|
rdnd}|rdnd}|r| j
                  j4                  rdnd}|rdnd}tM        tO        | j"                  |            D ]  \  }\  }} | jP                  rt$        jR                  jU                  $j                         | t        $fd| D              } ||j-                  $j                        }tW        |t$        jX                        r|j-                  $j                        }|r|$fz   }| jD                  r3| jF                  r'| j[                  |j\                  $d|||   ||	|
|	      }!n |$| |||   ||	|
|      }!|!d   $|
du r	||!d   fz   }|r0||!|
rdnd   fz   }| j
                  j4                  r||!|
rdnd   fz   }| jP                  sS| j^                  ja                         D ]J  \  }"}#||#d   k(  sdtc        |"      z   | jd                  k7  s+$j-                  dtc        |"dz         z         $L  | jg                  $      $ $j                  | $|r|$fz   }|st        d $||||fD              S ti        $||||      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTModel
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> last_hidden_states = outputs.last_hidden_state
        ```pixel_values`The `pixel_values` argument is deprecated and will be removed in v4.47, use `input_ids` instead.N_You cannot pass both `pixel_values` and `input_ids`. Please make sure to only pass `input_ids`.zDYou cannot specify both input_ids and inputs_embeds at the same timer*   r   z5You have to specify either input_ids or inputs_embedsr   r   z$batch_size has to be defined and > 0r   r   )r   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr$   c              3   T   K   | ]  }|j                  j                         ! y wrt   )tor   )r%   
past_stater   s     r'   r(   z(ImageGPTModel.forward.<locals>.<genexpr>  s      &hzz}}]5I5I'J&hs   %()r   r   r   r   r   r   r   Tr   r=   r   zcuda:c              3   $   K   | ]  }|| 
 y wrt   r$   )r%   vs     r'   r(   z(ImageGPTModel.forward.<locals>.<genexpr>  s      = r)   )last_hidden_staterL  r   
attentionscross_attentions)5warningswarnFutureWarningr   popr`   r   rP  r   use_return_dict%warn_if_padding_and_no_attention_maskr   r   rU   r   r   rS   r8  rX   arangelong	unsqueezerW  r   r   r   r
  r   invert_attention_maskget_head_maskr&  r4   r3   r4  r<  trainingrA   warning_once	enumeraterM   r:  cuda
set_devicer  rx   _gradient_checkpointing_func__call__r;  rH  strlast_devicer9  r   )%ry   r  rL  r   rM  rN  r   rO  r   r   r   r   rP  rQ  r  input_shape
batch_sizer   past_lengthencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeposition_embedstoken_type_embedsoutput_shapepresentsall_self_attentionsall_cross_attentionsall_hidden_statesr>  blockr   r   r   rZ  r   s%                                       @r'   r   zImageGPTModel.forward)  s8   ^ V#MMr
 $ u  

>2I1B1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"66y.Q#..*K!r;r?;I"+J&',,.s3K&,,Q/JTUU%.%:!!@T@T%+00[_EN"K#TFS[$89O)!,Q/44R8K <<[_{5RZ_ZdZdmstL'11!4L %Q !GHH+00R@N ,AtT1,<=N ,..TZZ.@N!N2ekk$**6M6Q6QQN ;;**/D/P=R=W=W=Y: 7$68O#P %-).4HQW)X&%)%?%?@V%W"%)" &&y$++2E2EF	  HHY/M((<0%(:(:=;O;O(PP% $ 8),==M		-0"m&8&8&<%>>&&4==##p "	"2$5b4%64;;;Z;Zr`d"6BD&/DFFO0L&M 4	O"A"z""

%%m&:&:;)!&&h]g&h!hJ!-%3%6%6}7K7K%LNi6 )]-A-A BI#$58H$H!**t}};;NN!"aL)*%
  !)#1'l*?+A'&7	 $AJMD #wqzm3 &9W)QYZ=[<]&]#;;22+?7PY1_`CaBc+c( "" OO113 ODAqAbEzgA&6$:J:J&J(5(8(83q1u:9M(NOe4	Ol 		-0***L9 1]4D D '3DFY[op   9+$+*1
 	
r{   )NNNNNNNNNNNNN)r   r   r   r   rv   rB  rF  rK  r   r   rX   rx   r   r   r   r   r   r   r   r   s   @r'   r1  r1    sv   ~ &"2  -1@D1515/3,0048<9=$(,0/3&*d
ELL)d
 "%ell(;"<=d
 !.	d

 !.d
 u||,d
 ELL)d
  -d
  (5d
 !) 6d
 D>d
 $D>d
 'tnd
 d^d
 d
  
u??	@!d
 d
r{   r1  z
    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    )custom_introc            $       J    e Zd ZdgZdef fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	eee
j                           de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e   de	e   de	e   de	e   dedeeef   f d       Zedeee
j                        de
j                  deee
j                        fd       Z xZS )ImageGPTForCausalImageModelingzlm_head.weightr`   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  dz
  d      | _        d| _	        d | _
        | j                          y )Nr   Fr2   )ru   rv   r1  r-   r   r  r[   r^   r;   r:  r;  r=  ry   r`   rz   s     r'   rv   z'ImageGPTForCausalImageModeling.__init__  s[     (0yy0A0AA0EER $r{   c                     | j                   S rt   r;   rA  s    r'   get_output_embeddingsz4ImageGPTForCausalImageModeling.get_output_embeddings%  s    ||r{   c                     || _         y rt   r  rD  s     r'   set_output_embeddingsz4ImageGPTForCausalImageModeling.set_output_embeddings(  s	    %r{   r  rL  r   rM  rN  r   rO  r   r   labelsr   r   rP  rQ  r  r}   c                    d|v r8t        j                  dt               |t        d      |j	                  d      }||n| j
                  j                  }| j                  |||||||||	||||      }|d   }| j                  |      }d}|
r|dddddf   j                         }|
dd	df   j                         }t               } ||j                  d|j                  d            |j                  d            }|s|f|d	d z   }||f|z   S |S t        |||j                  |j                  |j                   |j"                  
      S )a%
  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
        >>> import torch
        >>> import matplotlib.pyplot as plt
        >>> import numpy as np

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        >>> model.to(device)  # doctest: +IGNORE_RESULT

        >>> # unconditional generation of 8 images
        >>> batch_size = 4
        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
        >>> context = context.to(device)
        >>> output = model.generate(
        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
        ... )

        >>> clusters = image_processor.clusters
        >>> height = image_processor.size["height"]
        >>> width = image_processor.size["width"]

        >>> samples = output[:, 1:].detach().cpu().numpy()
        >>> samples_img = [
        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
        ... ]  # convert color cluster tokens back to pixels
        >>> f, axes = plt.subplots(1, batch_size, dpi=300)

        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
        ...     ax.axis("off")
        ...     ax.imshow(img)
        ```rS  rT  NrU  )rL  r   rM  rN  r   rO  r   r   r   r   rP  rQ  r   .r*   r   )losslogitsrL  r   r\  r]  )r^  r_  r`  r   ra  r`   rb  r-   r;   r   r
   r   r   r   rL  r   r\  r]  )ry   r  rL  r   rM  rN  r   rO  r   r   r  r   r   rP  rQ  r  transformer_outputsr   	lm_logitsr  shift_logitsshift_labelsloss_fctoutputs                           r'   r   z&ImageGPTForCausalImageModeling.forward+  s   L V#MMr
 $ u  

>2I%0%<k$++B]B]"..+))%'"7#9/!5# / 
 ,A.LL/	$S#2#q[1<<>L!#qr'?557L')HL--b,2C2CB2GH,J[J[\^J_`D\$7$;;F)-)9TGf$EvE0/??-;;*550AA
 	
r{   beam_idxc                 ,    t        fd| D              S )a  
        This function is used to re-order the `past_key_values` cache if [`~PreTrainedModel.beam_search`] or
        [`~PreTrainedModel.beam_sample`] is called. This is required to match `past_key_values` with the correct
        beam_idx at every generation step.
        c              3   F   K   | ]  }t        fd |D                yw)c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectrW  r   )r%   rX  r  s     r'   r(   zJImageGPTForCausalImageModeling._reorder_cache.<locals>.<genexpr>.<genexpr>  s.     jQ[*))!X[[9J9J-KLjs   58Nr   )r%   r   r  s     r'   r(   z@ImageGPTForCausalImageModeling._reorder_cache.<locals>.<genexpr>  s%      
 j_ijj
s   !r  )rL  r  s    `r'   _reorder_cachez-ImageGPTForCausalImageModeling._reorder_cache  s      
-
 
 	
r{   )NNNNNNNNNNNNNN)r   r   r   _tied_weights_keysr   rv   r  r  r   r   rX   rx   r   r   r   r   r   r   staticmethodr  r   r   s   @r'   r  r    s    ++	~ 	&  -1@D1515/3,0048<9=)-$(,0/3&*{
ELL){
 "%ell(;"<={
 !.	{

 !.{
 u||,{
 ELL){
  -{
  (5{
 !) 6{
 &{
 D>{
 $D>{
 'tn{
 d^{
  !{
" 
u77	8#{
 {
z 
uU\\23
?D||
	uU\\"	#
 
r{   r  z
    The ImageGPT Model transformer with an image classification head on top (linear layer).
    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
    c                        e Zd Zdef fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     dee	e	ej                           deej                     deej                     deej                     deej                     d	eej                     d
eej                     dee
   dee
   dee
   dee
   dedee	ef   fd       Z xZS )ImageGPTForImageClassificationr`   c                     t         |   |       |j                  | _        t        |      | _        t        j                  |j                  | j                  d      | _        | j                          y )NFr  )
ru   rv   
num_labelsr1  r-   r   r  r[   scorer=  r  s     r'   rv   z'ImageGPTForImageClassification.__init__  sR      ++(0YYv}}dooEJ
 	r{   r  rL  r   rM  rN  r   rO  r  r   r   rP  rQ  r  r}   c                    d|v r8t        j                  dt               |t        d      |j	                  d      }||n| j
                  j                  }| j                  ||||||||	|
||      }|d   }|j                  d      }| j                  |      }d}|| j
                  j                  | j                  dk(  rd	| j
                  _
        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd
| j
                  _
        nd| j
                  _
        | j
                  j                  d	k(  rIt!               }| j                  dk(  r& ||j#                         |j#                               }n |||      }n| j
                  j                  d
k(  r=t%               } ||j'                  d| j                        |j'                  d            }n,| j
                  j                  dk(  rt)               } |||      }|s|f|dd z   }||f|z   S |S t+        |||j,                  |j.                  |j0                        S )ax  
        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
            `past_key_values[0][0].shape[-2]` (`sequence_length` of input past key value states). Indices of input
            sequence tokens in the vocabulary.

            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
            `input_ids`.

            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits = outputs.logits
        ```rS  rT  NrU  )
rL  r   rM  rN  r   rO  r   r   rP  rQ  r   r   r   
regressionsingle_label_classificationmulti_label_classificationr*   )r  r  rL  r   r\  )r^  r_  r`  r   ra  r`   rb  r-   r   r  problem_typer  r   rX   re  rT   r   rL   r
   r   r	   r   rL  r   r\  )ry   r  rL  r   rM  rN  r   rO  r  r   r   rP  rQ  r  r  r   pooled_hidden_statesr  r  r  r  s                        r'   r   z&ImageGPTForImageClassification.forward  sG   d V#MMr
 $ u  

>2I%0%<k$++B]B]"..+))%'/!5# / 
 ,A.,11a1801{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y!4QR!88F)-)9TGf$EvE//??-;;*55
 	
r{   )NNNNNNNNNNNN)r   r   r   r   rv   r   r   rX   rx   r   r   r   r   r   r   r   r   s   @r'   r  r    sP   ~   -1@D1515/3,004)-$(,0/3&*s
ELL)s
 "%ell(;"<=s
 !.	s

 !.s
 u||,s
 ELL)s
  -s
 &s
 D>s
 $D>s
 'tns
 d^s
 s
 
u66	7s
 s
r{   r  )r  r  r1  r  rn   )4__doc__r%  rC   r^  typingr   r   r   r   rX   torch.utils.checkpointr   torch.cuda.ampr   torch.nnr	   r
   r   activationsr   
generationr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_imagegptr   
get_loggerr   rA   rn   Modulerp   r   r   r  r  r1  r  r  __all__r$   r{   r'   <module>r     sR   %  	  . .    # A A ! ) 
 . Y Y 
 3 
		H	%iX
		 
X		 Xv")) "HBII HV #so #s #sL F
+ F
 F
R ^
%<o ^
^
B ~
%< ~
~
Br{   