
    Uh                    4   d Z ddlZddlmZ ddlmZmZmZmZm	Z	m
Z
 ddlZddlmZ ddlZddlmZ ddlmZmZmZmZmZ ddlmZmZ dd	lmZmZmZ dd
lmZm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&  e!jN                  e(      Z)dejT                  dejT                  fdZ+dejT                  dejT                  fdZ,e G d de             Z- G d dej\                        Z/ G d dej\                        Z0 G d dej\                        Z1de0iZ2 G d dej\                        Z3 G d dej\                        Z4 G d d ej\                        Z5 G d! d"ej\                        Z6 G d# d$ej\                        Z7 G d% d&ej\                        Z8	 dId'ej\                  d(ejT                  d)ejT                  d*ejT                  d+eejT                     d,e9d-e9fd.Z: G d/ d0ej\                        Z; G d1 d2ej\                        Z< G d3 d4ej\                        Z= G d5 d6ej\                        Z> G d7 d8ej\                        Z?e  G d9 d:e             Z@ G d; d<ej\                        ZA G d= d>e@      ZB e d?@       G dA dBe@             ZC G dC dDe@      ZD G dE dFe@      ZEdJdGZFg dHZGy)KzPyTorch AltCLIP model.    N)	dataclass)AnyCallableListOptionalTupleUnion   )ACT2FN)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions'BaseModelOutputWithPoolingAndProjection)ALL_ATTENTION_FUNCTIONSPreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )AltCLIPConfigAltCLIPTextConfigAltCLIPVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)nn
functionalcross_entropytorcharangelenr"   )r   s    ~/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/altclip/modeling_altclip.pycontrastive_lossr*   ,   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r*   t)r,   caption_loss
image_losss      r)   	clip_lossr1   0   s,    #J/L!*,,.1J:%,,r+   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)AltCLIPOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPTextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of [`AltCLIPVisionModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`AltCLIPTextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`AltCLIPVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r9   r:   N)getattrto_tuple).0kselfs     r)   	<genexpr>z)AltCLIPOutput.to_tuple.<locals>.<genexpr>V   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysrA   s   `r)   r>   zAltCLIPOutput.to_tupleU   s#     
YY[
 
 	
r+   )__name__
__module____qualname____doc__r4   r   r&   FloatTensor__annotations__r5   r6   r7   r8   r9   r   r:   r   r   r>    r+   r)   r3   r3   6   s    ( )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r+   r3   c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )AltRobertaEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)padding_idxepsposition_embedding_typeabsoluteposition_idsr   F
persistenttoken_type_idsdtype)super__init__r#   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutr=   rS   register_bufferr&   r'   expandzerosrU   sizelongrP   rA   config	__class__s     r)   r^   zAltRobertaEmbeddings.__init__c   si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r+   c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )NrW   r   rZ   r   r\   r"   rT   )"create_position_ids_from_input_idsrP   &create_position_ids_from_inputs_embedsrp   hasattrrZ   rn   r&   ro   rq   rU   r"   rc   rg   rS   re   rh   rl   )rA   	input_idsrZ   rU   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedrg   
embeddingsre   s                r)   forwardzAltRobertaEmbeddings.forward|   sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r+   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        NrW   r   rv   r   )rp   r&   r'   rP   rq   r"   	unsqueezern   )rA   r{   r}   sequence_lengthrU   s        r)   rx   z;AltRobertaEmbeddings.create_position_ids_from_inputs_embeds   s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r+   )NNNNr   )rF   rG   rH   rI   r^   r   rx   __classcell__rt   s   @r)   rN   rN   ]   s    

4 rs&P=r+   rN   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )AltRobertaSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()rS   rT   relative_keyrelative_key_query   r   )r]   r^   ra   num_attention_headsry   
ValueErrorintattention_head_sizeall_head_sizer#   Linearquerykeyvaluerj   attention_probs_dropout_probrl   r=   rS   rd   r_   distance_embedding
is_decoderrA   rs   rS   rt   s      r)   r^   z AltRobertaSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r+   xr   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )NrW   r   r   r   r
   )rp   r   r   viewpermute)rA   r   new_x_shapes      r)   transpose_for_scoresz,AltRobertaSelfAttention.transpose_for_scores   sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r+   hidden_statesattention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r   r   dimrW   r   r   rv   r[   zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   ) r   r   r   r   r&   catr   matmul	transposerS   shapetensorrq   r"   r   r'   r   rd   tor\   einsummathsqrtr   r#   r$   softmaxrl   r   
contiguousrp   r   )rA   r   r   r   r   r   r   r   mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r)   r   zAltRobertaSelfAttention.forward   s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr+   NNNNNNF)rF   rG   rH   r^   r&   Tensorr   r   rJ   r   boolr   r   r   s   @r)   r   r      s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr+   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrQ   )r]   r^   r#   r   ra   denserh   ri   rj   rk   rl   rr   s     r)   r^   zAltRobertaSelfOutput.__init__?  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r+   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   rl   rh   rA   r   r   s      r)   r   zAltRobertaSelfOutput.forwardE  7    

=1]3}|'CDr+   rF   rG   rH   r^   r&   r   r   r   r   s   @r)   r   r   >  1    >U\\  RWR^R^ r+   r   eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )AltRobertaAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )NrS   )	r]   r^   "ALT_ROBERTA_SELF_ATTENTION_CLASSES_attn_implementationrA   r   outputsetpruned_headsr   s      r)   r^   zAltRobertaAttention.__init__S  sC    6v7R7RS,C
	 +62Er+   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )r(   r   rA   r   r   r   r   r   r   r   r   r   r   union)rA   headsindexs      r)   prune_headszAltRobertaAttention.prune_heads[  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r+   r   r   r   r   r   r   r   r   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r   )rA   r   )rA   r   r   r   r   r   r   r   self_outputsattention_outputr   s              r)   r   zAltRobertaAttention.forwardm  sW     yy!"
  ;;|AF#%QR(88r+   r   r   )rF   rG   rH   r^   r   r&   r   r   rJ   r   r   r   r   r   s   @r)   r   r   R  s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r+   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )r]   r^   r#   r   ra   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrr   s     r)   r^   zAltRobertaIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r+   r   r   c                 J    | j                  |      }| j                  |      }|S r   )r   r   rA   r   s     r)   r   zAltRobertaIntermediate.forward  s&    

=100?r+   r   r   s   @r)   r   r     s#    9U\\ ell r+   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AltRobertaOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )r]   r^   r#   r   r   ra   r   rh   ri   rj   rk   rl   rr   s     r)   r^   zAltRobertaOutput.__init__  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r+   r   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r)   r   zAltRobertaOutput.forward  r   r+   r   r   s   @r)   r   r     r   r+   r   c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )AltRobertaLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is addedrT   r   )r]   r^   chunk_size_feed_forwardseq_len_dimr   	attentionr   add_cross_attentionr   crossattentionr   intermediater   r   rr   s     r)   r^   zAltRobertaLayer.__init__  s    '-'E'E$,V4 ++#)#=#= ##?? D6)g!hii"5fV`"aD26:&v.r+   r   r   r   r   r   r   r   r   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr   )r   r   r   r   rW   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	r  r   ry   r   r  r   feed_forward_chunkr   r  )rA   r   r   r   r   r   r   r   self_attn_past_key_valueself_attention_outputsr   r   present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r)   r   zAltRobertaLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr+   c                 L    | j                  |      }| j                  ||      }|S r   )r  r   )rA   r   intermediate_outputr  s       r)   r  z"AltRobertaLayer.feed_forward_chunk  s,    "//0@A{{#68HIr+   r   )rF   rG   rH   r^   r&   r   r   rJ   r   r   r   r  r   r   s   @r)   r   r     s    /" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br+   r   c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )AltRobertaEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r]   r^   rs   r#   
ModuleListrangenum_hidden_layersr   layergradient_checkpointingrA   rs   _rt   s      r)   r^   zAltRobertaEncoder.__init__  sN    ]]U6KcKcEd#eOF$;#ef
&+# $f   A#r   r   r   r   r   past_key_valuesr   r   output_hidden_statesreturn_dictr   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
NrL   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   rW   r   r   c              3   $   K   | ]  }|| 
 y wr   rL   r?   vs     r)   rB   z,AltRobertaEncoder.forward.<locals>.<genexpr>E  s      
 = 
s   )last_hidden_stater  r   
attentionscross_attentions)rs   r  r  trainingloggerwarning_once	enumerater  _gradient_checkpointing_func__call__rC   r   )rA   r   r   r   r   r   r  r   r   r  r  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr   layer_outputss                       r)   r   zAltRobertaEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r+   )	NNNNNNFFT)rF   rG   rH   r^   r&   r   r   rJ   r   r   r	   r   r   r   r   s   @r)   r  r    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r+   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AltRobertaPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r   )r]   r^   r#   r   ra   r   Tanh
activationrr   s     r)   r^   zAltRobertaPooler.__init__[  s9    YYv1163E3EF
'')r+   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r9  )rA   r   first_token_tensorpooled_outputs       r)   r   zAltRobertaPooler.forward`  s6     +1a40

#566r+   r   r   s   @r)   r6  r6  Z  s#    $
U\\ ell r+   r6  moduler   r   r   r   scalingrl   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )NrW   r   )r   r\   )pr'  r   r   )r&   r   r   r#   r$   r   float32r   r\   rl   r'  r   )
r=  r   r   r   r   r>  rl   kwargsattn_weightsattn_outputs
             r)   eager_attention_forwardrE  j  s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r+   c                        e Zd ZdZ fdZ	 	 	 d	dej                  deej                     deej                     dee   de	ej                  eej                     f   f
dZ
 xZS )
AltCLIPAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).      F)r]   r^   rs   ra   	embed_dimr   	num_headshead_dimr   scaleattention_dropoutrl   	is_causalr#   r   k_projv_projq_projout_projrr   s     r)   r^   zAltCLIPAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar+   r   r   causal_attention_maskr   r   c           
         |j                   \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
| j                  j                  dk7  r||||z   }n||}n	|du| _
        t        }| j                  j                  dk7  rN| j                  j                  dk(  r|rt        j                  d       nt        | j                  j                     } || ||	|
|| j                  | j                  | j                   sdn| j"                  	      \  }}|j%                  |||      j'                         }| j)                  |      }|sd}||fS )
z#Input shape: Batch x Time x Channelr   r   flash_attention_2Nr   sdpaz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.        )rP  r>  rl   )r   rS  rQ  rR  r   rL  rM  r   rs   r   rP  rE  r(  r)  r   rN  r'  rl   reshaper   rT  )rA   r   r   rU  r   
batch_sizer~   rK  queriesrD   valuesattention_interfacerD  rC  s                 r)   r   zAltCLIPAttention.forward  s    -:,?,?)
J	++m,{{=)]+,,z:t~~t}}U__`acdeyyZOYYZ[]^_ZT^^T]]S]]^_abc ;;++/BB).C.O!/2G!G&2!62$>DN(?;;++w6{{//69>O##L
 '>dkk>^>^&_#$7nnJJ#}}C$,,	%
!\ "))*j)LWWYmmK0 LL((r+   )NNF)rF   rG   rH   rI   r^   r&   r   r   r   r   r   r   r   s   @r)   rG  rG    s}    GB. 268<,15)||5) !.5)  (5	5)
 $D>5) 
u||Xell33	45)r+   rG  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
AltCLIPMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r]   r^   rs   r   r   activation_fnr#   r   ra   r   fc1fc2rr   s     r)   r^   zAltCLIPMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr+   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rc  rb  rd  r   s     r)   r   zAltCLIPMLP.forward  s4    /**=9/r+   r   r   s   @r)   r`  r`    s$    KU\\ ell r+   r`  c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
AltCLIPEncoderLayerrs   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )r]   r^   ra   rK  rG  	self_attnr#   rh   ri   layer_norm1r`  mlplayer_norm2rr   s     r)   r^   zAltCLIPEncoderLayer.__init__  sm    ++)&1<<F<Q<QRf%<<F<Q<QRr+   r   r   rU  r   r   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   rU  r   )rj  ri  rl  rk  )rA   r   r   rU  r   residualrC  r   s           r)   r   zAltCLIPEncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr+   F)rF   rG   rH   r   r^   r&   r   r   r   r   rJ   r   r   r   s   @r)   rg  rg    sf    S} S -2&||& &  %||	&
 $D>& 
u  	!&r+   rg  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                     deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )AltCLIPEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`AltCLIPEncoderLayer`].

    Args:
        config: AltCLIPConfig
    rs   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r  )
r]   r^   rs   r#   r  r  r  rg  layersr  r  s      r)   r^   zAltCLIPEncoder.__init__  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %kr  r   rU  r   r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	t	        | j
                        D ]b  \  }
}|r||	fz   }| j                  r,| j                  r | j                  |j                  |	|||      }n ||	|||      }|d   }	|sZ||d   fz   }d |r||	fz   }|st        d |	||fD              S t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrL   )r   r   r   c              3   &   K   | ]	  }||  y wr   rL   r"  s     r)   rB   z)AltCLIPEncoder.forward.<locals>.<genexpr>l  s     eqWXWdes   )r$  r   r%  )rs   r   r  use_return_dictr*  rs  r  r'  r+  r,  rC   r   )rA   r{   r   rU  r   r  r  encoder_statesall_attentionsr   idxencoder_layerr4  s                r)   r   zAltCLIPEncoder.forward   sH   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%"+DKK"8 	FC#!/=2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+>Vd
 	
r+   )NNNNN)rF   rG   rH   rI   r   r^   r   r&   r   r   r	   r   r   r   r   r   s   @r)   rq  rq    s    ,} , 268<,0/3&*O
 !.O
  (5	O

 $D>O
 'tnO
 d^O
 
uo%	&O
r+   rq  c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )AltCLIPVisionEmbeddingsrs   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr   r   rU   rV   rX   )r]   r^   rs   ra   rK  
image_size
patch_sizer#   	Parameterr&   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positionsr_   position_embeddingrm   r'   rn   rr   s     r)   r^   z AltCLIPVisionEmbeddings.__init__t  s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr+   r   heightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   NrW   g      ?r
   r   bicubicF)rp   modealign_cornersr   )r   r  weightr   r&   jit
is_tracingrU   r  r   rZ  r   r#   r$   interpolater   r   )rA   r   r  r  r  r  r  class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r)   interpolate_pos_encodingz0AltCLIPVisionEmbeddings.interpolate_pos_encoding  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr+   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (rI  r[   r   r   rW   r   )r   r  r   r  r  r\   r   flattenr   r  rn   r&   r   r  r  rU   )rA   r  r  r[  r  r  r  target_dtypepatch_embedsclass_embedsr   s              r)   r   zAltCLIPVisionEmbeddings.forward  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr+   ro  )rF   rG   rH   r   r^   r&   r   r   r  rJ   r   r   r   s   @r)   r|  r|  s  se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r+   r|  c                   "    e Zd ZeZdZdZg Zd Zy)AltCLIPPreTrainedModelaltclipTc                 :   | j                   j                  }t        |t              r| j                   j                  }t        j
                  j                  |j                  d|j                  dz  |z         t        j
                  j                  |j                  j                  |j                   j                  |z         t        j
                  j                  |j                  j                  |j                   j                  |z         yt        |t              r,| j                   j                  }|j                  dz  d|j                   j                  z  dz  z  |z  }|j                  dz  |z  }t        j
                  j                  |j                  j                  |       t        j
                  j                  |j                   j                  |       t        j
                  j                  |j"                  j                  |       t        j
                  j                  |j$                  j                  |       yt        |t&              r| j                   j                  }|j                   j(                  dz  d|j                   j                  z  dz  z  |z  }d|j                   j(                  z  dz  |z  }t        j
                  j                  |j*                  j                  |       t        j
                  j                  |j,                  j                  |       yt        |t.              rt        j
                  j                  |j0                  j                  |j2                  dz  | j                   j                  z         d|j0                  _        t        j
                  j                  |j6                  j                  |j8                  dz  | j                   j                  z         d|j6                  _        yt        |t        j:                        rJ|j<                  j>                  jA                          |j                  j>                  jC                  d       yt        |t        jD                        rm|j                  j>                  j                  d| j                   j                         |j<                  %|j<                  j>                  jA                          yyt        |t        jF                        rz|j                  j>                  j                  d| j                   j                         |jH                  2|j                  j>                  |jH                     jA                          yyy)	zInitialize the weightsrY  rJ  )meanstd)r  r   Tg      ?N)%rs   initializer_factorr   r|  r#   initnormal_r  rK  r  r  initializer_ranger  rG  r  rS  rQ  rR  rT  r`  ra   rc  rd  AltCLIPModeltext_projectiontext_embed_dim_is_hf_initializedvisual_projectionvision_embed_dimrh   r  datazero_fill_r   r_   rP   )rA   r=  factorin_proj_stdout_proj_stdfc_stds         r)   _init_weightsz$AltCLIPPreTrainedModel._init_weights  s   //f56[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?-GGOO&&--))4/$++2P2PP   9=F""5GGOO((//++T1DKK4R4RR   ;?F$$7-KK""$MM$$S)		*MM&&CT[[5S5S&T{{&  &&( '-MM&&CT[[5S5S&T!!-""6#5#56<<> . .r+   N)	rF   rG   rH   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_moduler  rL   r+   r)   r  r    s     L!&*#+?r+   r  c                        e Zd Zdef fdZe	 	 	 	 	 d
deej                     dee	   dee	   dee	   dee	   de
eef   fd	       Z xZS )AltCLIPVisionTransformerrs   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )r]   r^   rs   ra   r|  r   r#   rh   ri   pre_layrnormrq  encoderpost_layernorm)rA   rs   rK  rt   s      r)   r^   z!AltCLIPVisionTransformer.__init__  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr+   r  r   r  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r  )r{   r   r  r  r   r   r$  pooler_outputr   r%  )rs   r   r  rv  r   r   r  r  r  r   r   r%  )
rA   r  r   r  r  r  r   encoder_outputsr$  r<  s
             r)   r   z AltCLIPVisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r+   )NNNNF)rF   rG   rH   r   r^   r   r   r&   rJ   r   r	   r   r   r   r   r   s   @r)   r  r    s    Q2 Q  59,0/3&*38'
u001'
 $D>'
 'tn	'

 d^'
 #+4.'
 
u00	1'
 '
r+   r  c                        e Zd ZeZdZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   ded	ee   deeef   fd
       Z xZS )AltCLIPVisionModelr  rs   c                 d    t         |   |       t        |      | _        | j	                          y r   )r]   r^   r  vision_model	post_initrr   s     r)   r^   zAltCLIPVisionModel.__init__5  s'     4V<r+   r   c                 B    | j                   j                  j                  S r   )r  r   r  rE   s    r)   get_input_embeddingsz'AltCLIPVisionModel.get_input_embeddings;  s      ++;;;r+   r   r  r  r  c                 b    ||n| j                   j                  }| j                  |||||      S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPVisionModel

        >>> model = AltCLIPVisionModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r  r   r  r  r  )rs   rv  r  )rA   r  r   r  r  r  s         r)   r   zAltCLIPVisionModel.forward>  sA    : &1%<k$++B]B]  %/!5%=# ! 
 	
r+   NNNFN)rF   rG   rH   r   r  main_input_namer^   r#   Moduler  r   r   r&   rJ   r   r	   r   r   r   r   r   s   @r)   r  r  1  s    &L$O2 <bii <  59,0/3).&*$
u001$
 $D>$
 'tn	$

 #'$
 d^$
 
u00	1$
 $
r+   r  a(  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc                        e Zd ZeZd fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
ej                     de
ej                     de
eej                        de
e   de
e   de
e   de
e   deeej                     ef   fd       Z xZS )AltRobertaModelc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
r]   r^   rs   rN   r   r  r  r6  poolerr  )rA   rs   add_pooling_layerrt   s      r)   r^   zAltRobertaModel.__init__x  sN    
 	 .v6(02C&v. 	r+   c                 .    | j                   j                  S r   r   rc   rE   s    r)   r  z$AltRobertaModel.get_input_embeddings  s    ...r+   c                 &    || j                   _        y r   r  rA   r   s     r)   set_input_embeddingsz$AltRobertaModel.set_input_embeddings  s    */'r+   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)itemsr  r  r  r   )rA   heads_to_pruner  r   s       r)   _prune_headszAltRobertaModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr+   rz   r   rZ   rU   r   r{   r   r   r  r   r   r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|t        j                  |||z   f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j!                  ||      }|}n&t        j"                  |t        j$                  |	      }| j'                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j)                  |      }nd }| j+                  || j                   j,                        }| j                  |||||
      }| j/                  ||||||	|
|||
      }|d   }| j0                  | j1                  |      nd }|s
||f|dd  z   S t3        |||j4                  |j6                  |j8                  |j:                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timerW   z5You have to specify either input_ids or inputs_embedsr   r   r!   rZ   rv   )rz   rU   rZ   r{   r|   )	r   r   r   r   r  r   r   r  r  r   )r$  r  r  r   r%  r&  )rs   r   r  rv  r   r   r   %warn_if_padding_and_no_attention_maskrp   r"   r   r&   onesry   r   rZ   rn   ro   rq   get_extended_attention_maskinvert_attention_maskget_head_maskr  r  r  r   r  r   r%  r&  )rA   rz   r   rZ   rU   r   r{   r   r   r  r   r   r  r  r}   r[  r~   r"   r|   r   r   extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr  encoder_hidden_shapeencoder_extended_attention_maskembedding_outputr  sequence_outputr<  s                                  r)   r   zAltRobertaModel.forward  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r+   )T)NNNNNNNNNNNNN)rF   rG   rH   r   r  r^   r  r  r  r   r   r&   r   r   rJ   r   r	   r   r   r   r   r   s   @r)   r  r  f  sr    %L /0C  -11515/3,0048<9==A$(,0/3&*l
ELL)l
 !.l
 !.	l

 u||,l
 ELL)l
  -l
  (5l
 !) 6l
 "$u'8'8"9:l
 D>l
 $D>l
 'tnl
 d^l
 
uU\\"$PP	Ql
 l
r+   r  c                       e Zd ZeZ fdZdej                  fdZdej                  ddfdZ
ddee   dej                  f fdZe	 	 	 	 	 	 	 	 	 	 	 dd	eej                      d
eej                      deej                      deej                      deej                      deej                      deej                      deej                      dee   dee   dee   deeef   fd       Z xZS )AltCLIPTextModelc                 &   t         |   |       t        |d      | _        t	        j
                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        | j                          y )NF)r  rQ   )r]   r^   r  robertar#   r   ra   project_dimtransformationrh   ri   pre_LNr  rr   s     r)   r^   zAltCLIPTextModel.__init__
  se     &vG ii(:(:F<N<NOll6#5#56;P;PQr+   r   c                 B    | j                   j                  j                  S r   r  r   rc   rE   s    r)   r  z%AltCLIPTextModel.get_input_embeddings  s    ||&&666r+   r   Nc                 :    || j                   j                  _        y r   r  r  s     r)   r  z%AltCLIPTextModel.set_input_embeddings  s    27/r+   new_num_tokensc                 "    t         |   |      S r   )r]   resize_token_embeddings)rA   r  rt   s     r)   r  z(AltCLIPTextModel.resize_token_embeddings  s    w.~>>r+   rz   r   rZ   rU   r   r{   r   r   r   r  r  c                 ,   |
|
n| j                   j                  }
| j                  |||||||||	||
      }|d   }| j                  |      }| j	                  |      }|dddf   }|
s
||f|dd z   S t        |||j                  |j                        S )a+  
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPTextModel

        >>> model = AltCLIPTextModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")

        >>> texts = ["it's a cat", "it's a dog"]

        >>> inputs = processor(text=texts, padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```N)rz   r   rZ   rU   r   r{   r   r   r   r  r  r   r      r  )rs   rv  r  r  r  r   r   r%  )rA   rz   r   rZ   rU   r   r{   r   r   r   r  r  r   r  projection_stater  s                   r)   r   zAltCLIPTextModel.forward  s    B &1%<k$++B]B],,))%'"7#9/!5#  
 "!* ++o6  ..?(A.$m4wq|CC6.'!//))	
 	
r+   r   )NNNNNNNNNNN)rF   rG   rH   r   r  r^   r#   r  r  r_   r  r   r   r  r   r&   r   r   r	   r   r   r   r   r   s   @r)   r  r    si   $L7bii 78",, 84 8?hsm ?r|| ?  -11515/3,0048<9=,0&*/3B
ELL)B
 !.B
 !.	B

 u||,B
 ELL)B
  -B
  (5B
 !) 6B
 $D>B
 d^B
 'tnB
 
u==	>B
 B
r+   r  c                   H    e Zd ZeZdef fdZe	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   dee
   dee
   d	ej                  fd
       Ze	 	 	 	 	 ddeej                     dee
   dee
   de
dee
   d	ej                  fd       Ze	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     dee
   dee
   dee
   de
dee
   d	eeef   fd       Z xZS )r  rs   c                 P   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        |      | _        t!        |      | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzRconfig.vision_config is expected to be of type AltCLIPVisionConfig but is of type .zNconfig.text_config is expected to be of type AltCLIPTextConfig but is of type F)r  )r]   r^   r   vision_configr   	TypeErrortypetext_configr   projection_dimr  r  ra   r  r  
text_modelr  r  r#   r   r  r  r  r&   r   rs   logit_scale_init_valuelogit_scaler  )rA   rs   r	  r  rt   s       r)   r^   zAltCLIPModel.__init__c  sW    &..0CD--./q2  &,,.?@++,-Q0 
 ((,,$33)55 - 9 9*;74]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r+   rz   r   rU   r   r  r  r   c           	          ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||||      }|d   }	| j                  |	      }
|
S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```)rz   r   rU   rZ   r   r  r  r   )rs   r   r  rv  r  r  )rA   rz   r   rU   rZ   r   r  r  text_outputsr<  text_featuress              r)   get_text_featureszAltCLIPModel.get_text_features  s    6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])%)/!5# ' 
 %Q,,];r+   r  r  c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      }|d   }| j                  |      }|S )a*  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AltCLIPVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```r  r   )rs   r   r  rv  r  r  )	rA   r  r   r  r  r  vision_outputsr<  image_featuress	            r)   get_image_featureszAltCLIPModel.get_image_features  s    : 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 'q)//>r+   rZ   return_lossc           	         ||n| j                   j                  }||n| j                   j                  }|
|
n| j                   j                  }
| j	                  |||||||
      }| j                  ||||	|
      }|d   }| j                  |      }|d   }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                  }d}|rt        |      }|
s||||||f}||f|z   S |S t!        |||||||	      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AltCLIPModel

        >>> model = AltCLIPModel.from_pretrained("BAAI/AltCLIP")
        >>> processor = AutoProcessor.from_pretrained("BAAI/AltCLIP")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```N)rz   r   rZ   rU   r   r  r  r  r   r   rW   T)r@  r   keepdim)r4   r5   r6   r7   r8   r9   r:   )rs   r   r  rv  r  r  r  r  normr  expr&   r   r.   Tr1   r3   )rA   rz   r  r   rU   rZ   r  r   r  r  r  r  r  r8   r7   r  r6   r5   r4   r   s                       r)   r   zAltCLIPModel.forward  s   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%/!5# ' 
 **%/!5%=# + 
 &a(--l;"1o**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,_-D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r+   )NNNNNNNr  )
NNNNNNNNFN)rF   rG   rH   r   r  r^   r   r   r&   r   r   rJ   r  r  
LongTensorr	   r   r3   r   r   r   s   @r)   r  r  `  s(    L} >  -115/3,0/3&*,ELL), !., u||,	, $D>, 'tn, d^, 
		, ,\  59,0/3).&*-u001- $D>- 'tn	-
 #'- d^- 
		- -^  1548153715&*,0/3).&*[
E,,-[
 u001[
 !.	[

 u//0[
 !.[
 d^[
 $D>[
 'tn[
 #'[
 d^[
 
um#	$[
 [
r+   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )ner   r&   cumsumtype_asrq   )rz   rP   r|   maskincremental_indicess        r)   rw   rw   A  sW     <<$((*D <<!4<<TBE[[_cc##%33r+   )r  r  r  r  )rY  )r   )HrI   r   dataclassesr   typingr   r   r   r   r   r	   r&   torch.nnr#   torch.utils.checkpointactivationsr   modeling_outputsr   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   r   utilsr   r   r   r   configuration_altclipr   r   r   
get_loggerrF   r(  r   r*   r1   r3   r  rN   r   r   r   r   r   r   r   r  r6  floatrE  rG  r`  rg  rq  r|  r  r  r  r  r  r  rw   __all__rL   r+   r)   <module>r0     s     ! > >    !  G l l D D X X 
		H	%
`U\\ `ell `-%,, -5<< - !
K !
 !
JV=299 V=tCbii CN299  $& "0")) 0hRYY  ryy Sbii SnZ
		 Z
|ryy . %II%<<% 
% <<	%
 U\\*% % %,L)ryy L)` /")) /d^
RYY ^
DPbii Pf 1?_ 1? 1?h3
ryy 3
l2
/ 2
j P
, P
P
fV
- V
r]
) ]
B4  _r+   