
    Uh                     R   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z	ddl
Z	ddl	mZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZ ddlmZmZm Z   ejB                  e"      Z#e G d de             Z$e G d de             Z%e G d de             Z&de	jN                  de	jN                  fdZ(de	jN                  de	jN                  fdZ)de de*fdZ+dNdee*ef   de,fdZ- G d d ej\                        Z/ G d! d"ej`                        Z1 G d# d$ej\                        Z2 G d% d&ej\                        Z3 G d' d(ej\                        Z4 G d) d*ej\                        Z5 G d+ d,ej\                        Z6 G d- d.ej\                        Z7 G d/ d0ej\                        Z8 G d1 d2ej\                        Z9 G d3 d4ej\                        Z:d5e9iZ; G d6 d7ej\                        Z< G d8 d9ej\                        Z= G d: d;ej\                        Z> G d< d=ej\                        Z? G d> d?ej\                        Z@ G d@ dAej\                        ZAe G dB dCe             ZB edDE       G dF dGeB             ZC edHE       G dI dJeB             ZDe G dK dLeB             ZEg dMZFy)OzPyTorch ALIGN model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN)BaseModelOutputWithNoAttention)BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentions(BaseModelOutputWithPoolingAndNoAttention)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesprune_linear_layer)ModelOutputauto_docstringlogging   )AlignConfigAlignTextConfigAlignVisionConfigc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   y)AlignVisionModelOutputa+  
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Nimage_embedslast_hidden_statehidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r        z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/align/modeling_align.pyr   r   )   sN     15L(5,,-459x 1 1298<M8E%"3"345<r'   r   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)AlignTextModelOutputa  
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Ntext_embedsr   r   
attentions)r   r    r!   r"   r+   r   r#   r$   r%   r   r   r   r,   r&   r'   r(   r*   r*   ?   sh    * 04K%++,359x 1 1298<M8E%"3"345<59Ju00129r'   r*   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)AlignOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image:(`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text:(`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`AlignTextModel`].
        image_embeds(`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The output of [`AlignVisionModel`].
        text_model_output(`BaseModelOutputWithPoolingAndCrossAttentions`):
            The output of the [`AlignTextModel`].
        vision_model_output(`BaseModelOutputWithPoolingAndNoAttention`):
            The output of the [`AlignVisionModel`].
    Nlosslogits_per_imagelogits_per_textr+   r   text_model_outputvision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r2   r3   N)getattrto_tuple).0kselfs     r(   	<genexpr>z'AlignOutput.to_tuple.<locals>.<genexpr>{   s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysr;   s   `r(   r8   zAlignOutput.to_tuplez   s#     
YY[
 
 	
r'   )r   r    r!   r"   r/   r   r#   r$   r%   r0   r1   r+   r   r2   r   r3   r   r   r   r8   r&   r'   r(   r.   r.   \   s    ( )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-4FJCJDHAH
%* 
r'   r.   logitsr4   c                     t         j                  j                  | t        j                  t        |       | j                        d      S )Ndeviceg?)label_smoothing)r   
functionalcross_entropyr#   arangelenrC   )r@   s    r(   contrastive_lossrI      s5    ==&&vu||CKPVP]P]/^ps&ttr'   
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)rI   t)rJ   caption_loss
image_losss      r(   
align_lossrO      s,    #J/L!*,,.1J:%,,r'   confignum_channelsc                     | j                   }|| j                  z  }t        |t        ||dz  z         |z  |z        }|d|z  k  r||z  }t        |      S )z<
    Round number of filters based on depth multiplier.
       g?)depth_divisorwidth_coefficientmaxint)rP   rQ   divisornew_dims       r(   round_filtersrZ      sf     ""GF,,,L'3|gk9:gEOPG |##7w<r'   kernel_sizeadjustc                     t        | t              r| | f} | d   dz  | d   dz  f}|r|d   dz
  |d   |d   dz
  |d   fS |d   |d   |d   |d   fS )aJ  
    Utility function to get the tuple padding value for the depthwise convolution.

    Args:
        kernel_size (`int` or `tuple`):
            Kernel size of the convolution layers.
        adjust (`bool`, *optional*, defaults to `True`):
            Adjusts padding value to apply to right and bottom sides of the input.
    r   rS   r   )
isinstancerW   )r[   r\   corrects      r(   correct_padr`      s}     +s#"K01~"KNa$78G
Q
GAJNGAJGG
GAJ
GAJ??r'   c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ xZ	S )AlignVisionEmbeddingszL
    A module that corresponds to the stem module of the original work.
    rP   c                    t         |           t        |d      | _        t	        j
                  d      | _        t	        j                  |j                  | j                  dddd      | _	        t	        j                  | j                  |j                  |j                  	      | _        t        |j                     | _        y )
N    )r   r   r   r   paddingr	   rS   validFr[   striderf   bias)epsmomentum)super__init__rZ   out_dimr   	ZeroPad2drf   Conv2drQ   convolutionBatchNorm2dbatch_norm_epsbatch_norm_momentum	batchnormr
   
hidden_act
activationr;   rP   	__class__s     r(   rn   zAlignVisionEmbeddings.__init__   s    $VR0||L9991QPW^c
 &:O:OZ`ZtZtu !2!23r'   pixel_valuesr4   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)rf   rr   rv   rx   )r;   r{   featuress      r(   forwardzAlignVisionEmbeddings.forward   sA    <<-##H->>(+??8,r'   )
r   r    r!   r"   r   rn   r#   Tensorr   __classcell__rz   s   @r(   rb   rb      s0    	40 	4ELL U\\ r'   rb   c                   .     e Zd Z	 	 	 	 	 	 	 d fd	Z xZS )AlignVisionDepthwiseConv2dc	                 @    ||z  }	t         
|   ||	|||||||	       y )N)	in_channelsout_channelsr[   ri   rf   dilationgroupsrj   padding_mode)rm   rn   )r;   r   depth_multiplierr[   ri   rf   r   rj   r   r   rz   s             r(   rn   z#AlignVisionDepthwiseConv2d.__init__   s=     #%55#%#% 	 
	
r'   )r   r	   r   r   r   Tzeros)r   r    r!   rn   r   r   s   @r(   r   r      s$     
 
r'   r   c                   l     e Zd ZdZdedededef fdZdej                  dej                  fd	Z
 xZS )
AlignVisionExpansionLayerz_
    This corresponds to the expansion phase of each block in the original implementation.
    rP   in_dimro   ri   c                     t         |           t        j                  ||ddd      | _        t        j
                  ||j                        | _        t        |j                     | _
        y )Nr   sameFr   r   r[   rf   rj   )num_featuresrk   )rm   rn   r   rq   expand_convrs   rt   	expand_bnr
   rw   
expand_act)r;   rP   r   ro   ri   rz   s        r(   rn   z"AlignVisionExpansionLayer.__init__   sZ    99 
 W&BWBWX !2!23r'   r   r4   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r}   )r   r   r   r;   r   s     r(   r   z!AlignVisionExpansionLayer.forward   s4    ((7}56r'   )r   r    r!   r"   r   rW   rn   r#   r$   r   r   r   r   s   @r(   r   r      sH    
40 
4# 
4 
4UX 
4U%6%6 5<< r'   r   c            
       p     e Zd ZdZdededededef
 fdZdej                  d	ej                  fd
Z xZS )AlignVisionDepthwiseLayerzk
    This corresponds to the depthwise convolution phase of each block in the original implementation.
    rP   r   ri   r[   adjust_paddingc                 b   t         |           || _        | j                  dk(  rdnd}t        ||      }t	        j
                  |      | _        t        ||||d      | _        t	        j                  ||j                  |j                        | _        t        |j                     | _        y )	NrS   rg   r   )r\   re   Frh   r   rk   rl   )rm   rn   ri   r`   r   rp   depthwise_conv_padr   depthwise_convrs   rt   ru   depthwise_normr
   rw   depthwise_act)	r;   rP   r   ri   r[   r   conv_padrf   rz   s	           r(   rn   z"AlignVisionDepthwiseLayer.__init__  s     	"kkQ.7Fk.A"$,,w"?8FHSX
 !nnV%:%:VE_E_
 $F$5$56r'   r   r4   c                     | j                   dk(  r| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S )NrS   )ri   r   r   r   r   r   s     r(   r   z!AlignVisionDepthwiseLayer.forward  sT    ;;! 33MBM++M:++M:**=9r'   r   r    r!   r"   r   rW   boolrn   r#   r$   r   r   r   r   s   @r(   r   r     sZ    7!7 7 	7
 7 7,	U%6%6 	5<< 	r'   r   c            	       n     e Zd ZdZd
dedededef fdZdej                  dej                  fd	Z xZS )AlignVisionSqueezeExciteLayerzl
    This corresponds to the Squeeze and Excitement phase of each block in the original implementation.
    rP   r   
expand_dimexpandc                    t         |           |r|n|| _        t        dt	        ||j
                  z              | _        t        j                  d      | _	        t        j                  | j                  | j                  dd      | _        t        j                  | j                  | j                  dd      | _        t        |j                     | _        t        j                          | _        y )Nr   )output_sizer   )r   r   r[   rf   )rm   rn   dimrV   rW   squeeze_expansion_ratiodim_ser   AdaptiveAvgPool2dsqueezerq   reducer   r
   rw   
act_reduceSigmoid
act_expand)r;   rP   r   r   r   rz   s        r(   rn   z&AlignVisionSqueezeExciteLayer.__init__0  s    !':V!S&*H*H!HIJ++:ii	
 ii	
 !!2!23**,r'   r   r4   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }t        j                  ||      }|S r}   )r   r   r   r   r   r#   mul)r;   r   inputss      r(   r   z%AlignVisionSqueezeExciteLayer.forwardE  sc    ]3M26M26		&-8r'   )Fr   r   s   @r(   r   r   +  sH    '0 '# '3 'X\ '*
U%6%6 
5<< 
r'   r   c                        e Zd ZdZdedededededef fdZd	e	j                  d
e	j                  de	j                  fdZ xZS )AlignVisionFinalBlockLayerz[
    This corresponds to the final phase of each block in the original implementation.
    rP   r   ro   ri   	drop_rateid_skipc                    t         |           |dk(  xr | | _        t        j                  ||ddd      | _        t        j                  ||j                  |j                        | _	        t        j                  |      | _        y )Nr   r   Fr   r   )p)rm   rn   apply_dropoutr   rq   project_convrs   rt   ru   
project_bnDropoutdropout)r;   rP   r   ro   ri   r   r   rz   s          r(   rn   z#AlignVisionFinalBlockLayer.__init__W  sz     	#q[8[II 
 .. f&;&;fF`F`
 zzI.r'   
embeddingsr   r4   c                     | j                  |      }| j                  |      }| j                  r| j                  |      }||z   }|S r}   )r   r   r   r   )r;   r   r   s      r(   r   z"AlignVisionFinalBlockLayer.forwardh  sG    ))-86 LL7M)J6Mr'   r   r    r!   r"   r   rW   floatr   rn   r#   r$   r   r   r   r   s   @r(   r   r   R  sj    /'/14/?B/LO/\a/lp/"%"3"3 EDUDU Z_ZfZf r'   r   c                        e Zd ZdZdededededededed	ed
ef fdZde	j                  de	j                  fdZ xZS )AlignVisionBlocka  
    This corresponds to the block module of original the EfficientNet vision encoder implementation.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
        in_dim (`int`):
            Number of input channels.
        out_dim (`int`):
            Number of output channels.
        stride (`int`):
            Stride size to be used in convolution layers.
        expand_ratio (`int`):
            Expand ratio to set the output dimensions for the expansion and squeeze-excite layers.
        kernel_size (`int`):
            Kernel size for the depthwise convolution layer.
        drop_rate (`float`):
            Dropout rate to be used in the final phase of each block.
        id_skip (`bool`):
            Whether to apply dropout and sum the final hidden states with the input embeddings during the final phase
            of each block. Set to `True` for the first block of each stage.
        adjust_padding (`bool`):
            Whether to apply padding to only right and bottom side of the input kernel before the depthwise convolution
            operation, set to `True` for inputs with odd input sizes.
    rP   r   ro   ri   expand_ratior[   r   r   r   c
                 t   t         |           || _        | j                  dk7  rdnd| _        ||z  }
| j                  rt	        |||
|      | _        t        || j                  r|
n||||	      | _        t        |||
| j                        | _	        t        || j                  r|
n|||||      | _        y )Nr   TF)rP   r   ro   ri   )rP   r   ri   r[   r   )rP   r   r   r   )rP   r   ro   ri   r   r   )rm   rn   r   r   r   	expansionr   r   r   squeeze_exciter   
projection)r;   rP   r   ro   ri   r   r[   r   r   r   expand_in_dimrz   s              r(   rn   zAlignVisionBlock.__init__  s     	("//14d%-;;6fmFDN 8$(KK=V#)
 <&]4;;
 5$(KK=V
r'   r   r4   c                     |}| j                   dk7  r| j                  |      }| j                  |      }| j                  |      }| j	                  ||      }|S )Nr   )r   r   r   r   r   )r;   r   r   s      r(   r   zAlignVisionBlock.forward  s[    "
! NN=9M++M: ++M:
MBr'   r   r   s   @r(   r   r   s  s    4'
!'
 '
 	'

 '
 '
 '
 '
 '
 '
R
U%6%6 
5<< 
r'   r   c            	       f     e Zd ZdZdef fdZ	 	 d	dej                  dee	   dee	   de
fdZ xZS )
AlignVisionEncoderz
    Forward propagates the embeddings through each vision encoder (EfficientNet) block.

    Args:
        config ([`AlignVisionConfig`]):
            Model configuration class.
    rP   c                     t                    |j                   _         fdt        |j                        }t        fd|j                  D              }d}g }t        |      D ]  }t        ||j                  |         }t        ||j                  |         }|j                  |   }	|j                  |   }
|j                  |   }t         |j                  |               D ]k  }|dk(  rdnd}|dkD  rdn|	}	|dkD  r|n|}||j                  v rdnd}|j                  |z  |z  }t        ||||	|
||||	      }|j!                  |       |dz  }m  t#        j$                  |       _        y )Nc                 Z    t        t        j                  j                  | z              S r}   )rW   mathceildepth_coefficient)repeatsr;   s    r(   round_repeatsz2AlignVisionEncoder.__init__.<locals>.round_repeats  s"    tyy!7!7'!ABCCr'   c              3   .   K   | ]  } |        y wr}   r&   )r9   nr   s     r(   r<   z.AlignVisionEncoder.__init__.<locals>.<genexpr>  s     Laq)Ls   r   TFr   )	rP   r   ro   ri   r[   r   r   r   r   )rm   rn   r   rH   r   sumnum_block_repeatsrangerZ   r   strideskernel_sizesexpand_ratiosdepthwise_paddingdrop_connect_rater   appendr   
ModuleListblocks)r;   rP   num_base_blocks
num_blockscurr_block_numr   ir   ro   ri   r[   r   jr   r   r   blockr   rz   s   `                @r(   rn   zAlignVisionEncoder.__init__  s   !'!9!9	D f001L63K3KLL
' 	$A"66+=+=a+@AF#FF,?,?,BCG^^A&F --a0K!//2L=)A)A!)DEF $"#q&$e!e$%Ev*8F<T<T*TZ^"44~E
R	(!!#! +!-'##1
 e$!#'$	$8 mmF+r'   r   output_hidden_statesreturn_dictr4   c                     |r|fnd }| j                   D ]  } ||      }|s||fz  } |st        d ||fD              S t        ||      S )Nc              3   &   K   | ]	  }||  y wr}   r&   r9   vs     r(   r<   z-AlignVisionEncoder.forward.<locals>.<genexpr>  s     Xq!-Xs   )r   r   )r   r=   r   )r;   r   r   r   all_hidden_statesr   s         r(   r   zAlignVisionEncoder.forward  so     1E],$[[ 	6E!-0M#!m%55!	6
 X]4E$FXXX-++
 	
r'   )FT)r   r    r!   r"   r   rn   r#   r$   r   r   r   r   r   r   s   @r(   r   r     sW    ),0 ),\ 05&*	
((
 'tn
 d^	

 
2
r'   r   c                        e Zd ZdZ fdZ	 	 	 	 	 d
deej                     deej                     deej                     deej                     de	dej                  fd	Z xZS )AlignTextEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                 >   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       y )N)padding_idxrk   position_embedding_typeabsoluteposition_ids)r   F)
persistenttoken_type_idsdtype)rm   rn   r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddings	LayerNormlayer_norm_epsr   hidden_dropout_probr   r7   r   register_bufferr#   rG   r   r   r   sizelongry   s     r(   rn   zAlignTextEmbeddings.__init__  s/   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	
r'   	input_idsr   r   inputs_embedspast_key_values_lengthr4   c                 Z   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|st        | d      r-| j                  d d d |f   }|j	                  |d   |      }	|	}n:t        j                  |t
        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j                  |      }| j                  |      }|S )Nr   r   r   r   r   rC   r   )r  r   hasattrr   r   r#   r   r  rC   r  r  r   r  r	  r   )r;   r  r   r   r  r  input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  r   r  s                r(   r   zAlignTextEmbeddings.forward%  sH     #..*K',,.s3K ^
,,Q0FVlIl0l-lmL
 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r'   )NNNNr   )r   r    r!   r"   rn   r   r#   
LongTensorr$   rW   r   r   r   r   s   @r(   r   r     s    Q
* 15593759&''E,,-' !!1!12' u//0	'
   1 12' !$' 
'r'   r   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )AlignTextSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()r   r   relative_keyrelative_key_queryrS   r   )rm   rn   r  num_attention_headsr  
ValueErrorrW   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probr   r7   r   r  r   distance_embedding
is_decoderr;   rP   r   rz   s      r(   rn   zAlignTextSelfAttention.__init__Q  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r'   xr4   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr   r   rS   r   r	   )r  r!  r#  viewpermute)r;   r-  new_x_shapes      r(   transpose_for_scoresz+AlignTextSelfAttention.transpose_for_scoresk  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r'   r   attention_mask	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r   rS   r   r   r  r   r  r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr	   ) r&  r2  r'  r(  r#   catr+  matmul	transposer   shapetensorr  rC   r/  rG   r*  r  tor   einsumr   sqrtr#  r   rE   softmaxr   r0  
contiguousr  r$  )r;   r   r3  r4  r5  r6  r7  r8  mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r(   r   zAlignTextSelfAttention.forwardp  s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr'   r}   NNNNNF)r   r    r!   rn   r#   r   r2  r   r$   r   r   r   r   r   s   @r(   r  r  P  s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr'   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nr   )rm   rn   r   r%  r  denser	  r
  r   r  r   ry   s     r(   rn   zAlignTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r'   r   input_tensorr4   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r}   r_  r   r	  r;   r   r`  s      r(   r   zAlignTextSelfOutput.forward  7    

=1]3}|'CDr'   r   r    r!   rn   r#   r   r   r   r   s   @r(   r\  r\    1    >U\\  RWR^R^ r'   r\  eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )AlignTextAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nr   )	rm   rn   !ALIGN_TEXT_SELF_ATTENTION_CLASSES_attn_implementationr;   r\  outputsetpruned_headsr,  s      r(   rn   zAlignTextAttention.__init__  sC    5f6Q6QR,C
	 *&1Er'   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r:  )rH   r   r;   r!  r#  rp  r   r&  r'  r(  rn  r_  r$  union)r;   headsindexs      r(   prune_headszAlignTextAttention.prune_heads  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r'   r   r3  r4  r5  r6  r7  r8  r4   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r   )r;   rn  )r;   r   r3  r4  r5  r6  r7  r8  self_outputsattention_outputrY  s              r(   r   zAlignTextAttention.forward  sW     yy!"
  ;;|AF#%QR(88r'   r}   rZ  )r   r    r!   rn   ru  r#   r   r   r$   r   r   r   r   r   s   @r(   ri  ri    s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r'   ri  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r}   )rm   rn   r   r%  r  intermediate_sizer_  r^   rw   strr
   intermediate_act_fnry   s     r(   rn   zAlignTextIntermediate.__init__   s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r'   r   r4   c                 J    | j                  |      }| j                  |      }|S r}   )r_  r~  r   s     r(   r   zAlignTextIntermediate.forward(  s&    

=100?r'   re  r   s   @r(   rz  rz    s#    9U\\ ell r'   rz  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )AlignTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r^  )rm   rn   r   r%  r|  r  r_  r	  r
  r   r  r   ry   s     r(   rn   zAlignTextOutput.__init__0  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r'   r   r`  r4   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r}   rb  rc  s      r(   r   zAlignTextOutput.forward6  rd  r'   re  r   s   @r(   r  r  /  rf  r'   r  c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )AlignTextLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is addedr   rk  )rm   rn   chunk_size_feed_forwardseq_len_dimri  	attentionr+  add_cross_attentionr"  crossattentionrz  intermediater  rn  ry   s     r(   rn   zAlignTextLayer.__init__?  s    '-'E'E$+F3 ++#)#=#= ##?? D6)g!hii"4VU_"`D1&9%f-r'   r   r3  r4  r5  r6  r7  r8  r4   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
NrS   )r8  r7  r   r   r   r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r;  )	r  r+  r  r"  r  r   feed_forward_chunkr  r  )r;   r   r3  r4  r5  r6  r7  r8  self_attn_past_key_valueself_attention_outputsrx  rY  present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputslayer_outputs                    r(   r   zAlignTextLayer.forwardM  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr'   c                 L    | j                  |      }| j                  ||      }|S r}   )r  rn  )r;   rx  intermediate_outputr  s       r(   r  z!AlignTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr'   rZ  )r   r    r!   rn   r#   r   r   r$   r   r   r   r  r   r   s   @r(   r  r  >  s    ." 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br'   r  c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )AlignTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rm   rn   rP   r   r   r   num_hidden_layersr  layergradient_checkpointing)r;   rP   _rz   s      r(   rn   zAlignTextEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#r   r3  r4  r5  r6  past_key_valuesrK  r8  r   r   r4   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
Nr&   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   rS   c              3   $   K   | ]  }|| 
 y wr}   r&   r   s     r(   r<   z+AlignTextEncoder.forward.<locals>.<genexpr>  s      
 = 
s   )r   r  r   r,   cross_attentions)rP   r  r  trainingloggerwarning_once	enumerater  _gradient_checkpointing_func__call__r=   r   )r;   r   r3  r4  r5  r6  r  rK  r8  r   r   r   all_self_attentionsall_cross_attentionsnext_decoder_cacher   layer_modulelayer_head_maskr7  layer_outputss                       r(   r   zAlignTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r'   )	NNNNNNFFT)r   r    r!   rn   r#   r   r   r$   r   r   r   r   r   r   r   s   @r(   r  r    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r'   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )AlignTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r}   )rm   rn   r   r%  r  r_  Tanhrx   ry   s     r(   rn   zAlignTextPooler.__init__  s9    YYv1163E3EF
'')r'   r   r4   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r_  rx   )r;   r   first_token_tensorpooled_outputs       r(   r   zAlignTextPooler.forward  s6     +1a40

#566r'   re  r   s   @r(   r  r    s#    $
U\\ ell r'   r  c                       e Zd ZeZdZdZd Zy)AlignPreTrainedModelalignTc                 &   t        |t        j                  t        j                  f      rn|j                  j
                  j                  d| j                  j                         |j                  ;|j                  j
                  j                          nt        |t              rst        j                  j                  |j                  j                         |j                  j                  j
                  j                          d|j                  _        nt        |t        j                         rx|j                  j
                  j                  d| j                  j                         |j"                  1|j                  j
                  |j"                     j                          t        |t        j$                        rJ|j                  j
                  j                          |j                  j
                  j'                  d       yy)zInitialize the weightsg        )meanstdNTg      ?)r^   r   r%  rq   weightdatanormal_rP   initializer_rangerj   zero_
AlignModelinitxavier_uniform_text_projection_is_hf_initializedr   r   r	  fill_)r;   modules     r(   _init_weightsz"AlignPreTrainedModel._init_weights  sW   fryy"))45MM&&CT[[5R5R&S{{&  &&(
+GG##F$:$:$A$AB""'',,2248<F""5-MM&&CT[[5R5R&S!!-""6#5#56<<>fbll+KK""$MM$$S) ,r'   N)r   r    r!   r   config_classbase_model_prefixsupports_gradient_checkpointingr  r&   r'   r(   r  r    s    L&*#*r'   r  zJ
    The text model from ALIGN without any head or projection on top.
    )custom_introc                   F    e Zd ZeZdgZddedef fdZd Zd Z	e
	 	 	 	 	 	 	 	 	 ddeej                     deej                     d	eej                     d
eej                     deej                     deej                     dee   dee   dee   deeef   fd       Z xZS )AlignTextModelr   rP   add_pooling_layerc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rm   rn   rP   r   r   r  encoderr  pooler	post_init)r;   rP   r  rz   s      r(   rn   zAlignTextModel.__init__$  sM    
 	 -f5'/1Bof- 	r'   c                 .    | j                   j                  S r}   r   r  r?   s    r(   get_input_embeddingsz#AlignTextModel.get_input_embeddings4  s    ...r'   c                 &    || j                   _        y r}   r  )r;   r(  s     r(   set_input_embeddingsz#AlignTextModel.set_input_embeddings7  s    */'r'   r  r3  r   r   r4  r  r8  r   r   r4   c
                 $   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	||t	        d      |#| j                  ||       |j                         }
n!||j                         dd }
nt	        d      |
\  }}||j                  n|j                  }|t        j                  ||f|      }|pt        | j                  d      r4| j                  j                  ddd|f   }|j                  ||      }|}n&t        j                  |
t        j                  |      }| j!                  ||
      }| j#                  || j                   j$                        }| j                  ||||      }| j'                  ||||||		      }|d
   }| j(                  | j)                  |      nd}|	s
||f|dd z   S t+        |||j,                  |j.                  |j0                        S )a-  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignTextModel

        >>> model = AlignTextModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```NzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsrB   r   r  )r  r   r   r  )r3  r4  r8  r   r   r   r   )r   pooler_outputr   r,   r  )rP   r8  r   use_return_dictr"  %warn_if_padding_and_no_attention_maskr  rC   r#   onesr  r   r   r   r   r  get_extended_attention_maskget_head_maskr  r  r  r   r   r,   r  )r;   r  r3  r   r   r4  r  r8  r   r   r  
batch_sizer  rC   r  r  extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                        r(   r   zAlignTextModel.forward:  s<   8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T!"ZZ*j)A6RN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m &&y$++2O2OP	??%)'	 + 
 ,,2/!5# ' 
 *!,8<8OO4UY#]3oab6III;-')77&11,==
 	
r'   T	NNNNNNNNN)r   r    r!   r   r  _no_split_modulesr   rn   r  r  r   r   r#   r   r   r   r   r   r   r   s   @r(   r  r    s    #L./ 4  /0  -11515/3,004,0/3&*^
ELL)^
 !.^
 !.	^

 u||,^
 ELL)^
  -^
 $D>^
 'tn^
 d^^
 
uBB	C^
 ^
r'   r  zL
    The vision model from ALIGN without any head or projection on top.
    c                        e Zd ZeZdZdZdef fdZdej                  fdZ
e	 	 	 d
deej                     dee   dee   deeef   fd	       Z xZS )AlignVisionModelr{   FrP   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dk(  r't        j                  |j                  d      | _        nN|j                  dk(  r't        j                  |j                  d      | _        nt        d|j                         | j                          y )Nr  T)	ceil_moderV   z2config.pooling must be one of ['mean', 'max'] got )rm   rn   rP   rb   r   r   r  pooling_typer   	AvgPool2d
hidden_dimr  	MaxPool2dr"  poolingr  ry   s     r(   rn   zAlignVisionModel.__init__  s     /7)&1 &(,,v'8'8DIDK  E),,v'8'8DIDKQRXR`R`Qabcc 	r'   r4   c                 B    | j                   j                  j                  S r}   )vision_modelr   rr   r?   s    r(   r  z%AlignVisionModel.get_input_embeddings  s      ++777r'   r   r   c                 ~   ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }|d   }| j                  |      }|j                  |j                  dd       }|s
||f|dd z   S t        |||j                        S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignVisionModel

        >>> model = AlignVisionModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```Nz You have to specify pixel_values)r   r   r   rS   r   )r   r  r   )rP   r   r  r"  r   r  r  reshaper?  r   r   )r;   r{   r   r   r  r  r   r  s           r(   r   zAlignVisionModel.forward  s    8 %9$D $++JjJj 	 &1%<k$++B]B]?@@??<8,,!5# ' 
 ,A.$56%--m.A.A"1.EF%}58KKK7/')77
 	
r'   NNN)r   r    r!   r   r  main_input_namer  rn   r   Moduler  r   r   r#   r$   r   r   r   r   r   r   r   s   @r(   r  r    s     %L$O&+#0 "8bii 8  59/3&*	5
u0015
 'tn5
 d^	5

 
u>>	?5
 5
r'   r  c                       e Zd ZeZdef fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     d	ee
   d
ee
   dee
   dej                  fd       Ze	 	 	 ddeej                     d
ee
   dee
   dej                  fd       Ze	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     dee
   d	ee
   d
ee
   dee
   deeef   fd       Z xZS )r  rP   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        t        |      | _        t        |      | _        t!        j"                  | j                  | j                        | _        t!        j&                  t)        j*                  | j,                  j.                              | _        | j3                          y )NzLconfig.text_config is expected to be of type AlignTextConfig but is of type .zPconfig.vision_config is expected to be of type AlignVisionConfig but is of type )rm   rn   r^   text_configr   	TypeErrortypevision_configr   projection_dimr  text_embed_dimr  
text_modelr  r  r   r%  r  	Parameterr#   r@  rP   temperature_init_valuetemperaturer  )r;   rP   r  r  rz   s       r(   rn   zAlignModel.__init__  s#    &,,o>++,-Q0 
 &..0AB--./q2 
 ((,,$33)55(5,];!yy)<)<d>Q>QR<<T[[5W5W(XY 	r'   r  r3  r   r   r4  r  r8  r   r   r4   c
                    ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||||||		      }
|
d   dddddf   }| j                  |      }|S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`AlignTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```N	r  r3  r   r   r4  r  r8  r   r   r   )rP   r8  r   r  r  r  )r;   r  r3  r   r   r4  r  r8  r   r   text_outputsr   text_featuress                r(   get_text_featureszAlignModel.get_text_features  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]))%'/!5# ' 

 )OAq!G4,,->?r'   r{   c                     ||n| j                   j                  }||n| j                   j                  }| j                  |||      }|d   }|S )a9  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`AlignVisionModel`].

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> image_features = model.get_image_features(**inputs)
        ```r{   r   r   r   )rP   r   r  r  )r;   r{   r   r   vision_outputsimage_featuress         r(   get_image_featureszAlignModel.get_image_featuresJ  sf    > %9$D $++JjJj 	 &1%<k$++B]B]**%!5# + 
 (*r'   return_lossc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }| j	                  ||
|      }| j                  |||||||	|
|	      }|d   }|d   dddddf   }| j                  |      }||j                  ddd	      z  }||j                  ddd	      z  }t        j                  ||j                               | j                  z  }|j                         }d}|rt        |      }|s||||||f}||f|z   S |S t        |||||||
      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, AlignModel

        >>> model = AlignModel.from_pretrained("kakaobrain/align-base")
        >>> processor = AutoProcessor.from_pretrained("kakaobrain/align-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(
        ...     images=image, text=["a photo of a cat", "a photo of a dog"], return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr  r  r   r   rS   r   T)r   r   keepdim)r/   r0   r1   r+   r   r2   r3   )rP   r8  r   r  r  r  r  normr#   r=  rL   r  rO   r.   )r;   r  r{   r3  r   r   r4  r  r  r8  r   r   r  r  r   r+   r1   r0   r/   rn  s                       r(   r   zAlignModel.forwardw  s   R 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%!5# + 
 ))%'/!5# ' 

 &a("1oaAg.**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  ,,{LNN4DEHXHXX*,,.o.D&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r'   r  r  )NNNNNNNNNNN)r   r    r!   r   r  rn   r   r   r#   r   r   r$   r	  r  r  r   r   r.   r   r   r   s   @r(   r  r    sg   L{ <  -11515/3,004,0/3&*2ELL)2 !.2 !.	2
 u||,2 ELL)2  -2 $D>2 'tn2 d^2 
		2 2h  59/3&*	*u001* 'tn* d^	*
 
		* *X  15481515/3,004&*,0/3&*\
E,,-\
 u001\
 !.	\

 !.\
 u||,\
 ELL)\
  -\
 d^\
 $D>\
 'tn\
 d^\
 
uk!	"\
 \
r'   r  )r  r  r  r  r  )Gr"   r   dataclassesr   typingr   r   r   r   r#   torch.utils.checkpointr   activationsr
   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   configuration_alignr   r   r   
get_loggerr   r  r   r*   r.   r   rI   rO   rW   rZ   r   r`   r  rb   rq   r   r   r   r   r   r   r   r   r  r\  rl  ri  rz  r  r  r  r  r  r  r  r  __all__r&   r'   r(   <module>r     s     ! . .    !  . l l 9 9 P P 
		H	% =[ = =* :; : :8 !
+ !
 !
LuU\\ uell u-5<< -ELL -+ 3  @U3:. @ @*BII 4
 
6		 6$		 $P$BII $N BNryy NbG
 G
V=")) =BCRYY CN"))  #% !0 0hBII  bii SRYY SnZ
ryy Z
|bii  *? * *0 
y
) y

y
x 
O
+ O

O
d `
% `
 `
F Wr'   