
    UhNQ                       d Z ddlZddlmZ ddlmZ ddlmZmZm	Z	m
Z
 ddlZddlZddlmZ ddlmZ dd	lmZmZ dd
lmZmZmZmZmZ ddlmZmZ ddlmZmZ ddlm Z m!Z!m"Z" ddl#m$Z$m%Z%m&Z&  e!jN                  e(      Z)dZ*e G d de             Z+e G d de             Z, G d dejZ                        Z. G d dejZ                        Z/ G d dejZ                        Z0 G d dejZ                        Z1 G d dejZ                        Z2 G d d ejZ                        Z3 G d! d"ejZ                        Z4 G d# d$ejZ                        Z5 G d% d&ejZ                        Z6 G d' d(ejZ                        Z7d)e7iZ8 G d* d+ejZ                        Z9 G d, d-ejZ                        Z: G d. d/ejZ                        Z; G d0 d1ejZ                        Z< G d2 d3ejZ                        Z=dRd4Z>e  G d5 d6e             Z? G d7 d8e?      Z@ e d9:       G d; d<e?             ZA e d=:       G d> d?e?             ZB G d@ dAejZ                        ZC G dB dCejZ                        ZD G dD dEejZ                        ZE e dF:       G dG dHe?             ZF e dI:       G dJ dKe?             ZG G dL dMejZ                        ZH e dN:       G dO dPe?             ZIg dQZJy)SzPyTorch BridgeTower Model    N)OrderedDict)	dataclass)ListOptionalTupleUnion)nn)CrossEntropyLoss   )ACT2FNQuickGELUActivation))BaseModelOutputWithPastAndCrossAttentions,BaseModelOutputWithPoolingAndCrossAttentionsMaskedLMOutputModelOutputSequenceClassifierOutput)PreTrainedModelapply_chunking_to_forward) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )BridgeTowerConfigBridgeTowerTextConfigBridgeTowerVisionConfigRobertaTokenizerc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)BridgeTowerModelOutputa  
    Output type of [`BridgeTowerModel`].

    Args:
        text_features (`torch.FloatTensor` of shape `(batch_size, text_sequence_length, hidden_size)`):
            Sequence of hidden-states at the text output of the last layer of the model.
        image_features (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, hidden_size)`):
            Sequence of hidden-states at the image output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size x 2)`):
            Concatenation of last layer hidden-state of the first token of the text and image sequence (classification
            token), respectively, after further processing through layers used for auxiliary pretraining tasks.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Ntext_featuresimage_featurespooler_outputhidden_states
attentions)__name__
__module____qualname____doc__r!   r   torchFloatTensor__annotations__r"   r#   r$   r   r%        /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/bridgetower/modeling_bridgetower.pyr    r    .   s|    . 26M8E--.526NHU../615M8E--.58<M8E%"3"345<59Ju00129r.   r    c                   H   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed	<   y)
BridgeTowerContrastiveOutputaZ  
    Output type of ['BridgeTowerForContrastiveLearning']

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`:
            Image-text contrastive loss.
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        text_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        image_embeds (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        cross_embeds  (`torch.FloatTensor)`, *optional*, returned when model is initialized with `with_projection=True`):
            The text-image cross-modal embeddings obtained by applying the projection layer to the pooler_output.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
    Nlosslogitstext_embedsimage_embedscross_embedsr$   r%   )r&   r'   r(   r)   r2   r   r*   r+   r,   r3   r4   r   r5   r6   r$   r%   r-   r.   r/   r1   r1   N   s    . )-D(5$$
%,*.FHU&&'.6:K% 1 123:7;L(5!2!234;7;L(5!2!234;8<M8E%"3"345<59Ju00129r.   r1   c                        e Zd Z fdZdej
                  dej
                  fdZddej
                  deej
                     fdZ xZ	S )BridgeTowerResidualAttentionc                 h   t         |           t        j                  |j                  |j                  dz        | _        t        j                  |j                  |j                        | _        t        j                  t        dt        j                  |j                  |j                  dz        fdt               fdt        j                  |j                  dz  |j                        fg            | _        t        j                  |j                  |j                        | _        d | _        y )N@   epsc_fc   geluc_proj)super__init__r	   MultiheadAttentionhidden_sizeattn	LayerNormlayer_norm_epsln_1
ModuleDictr   Linearr   mlpln_2	attn_maskselfconfig	__class__s     r/   rB   z%BridgeTowerResidualAttention.__init__q   s    ))&*<*<f>P>PTV>VW	LL!3!39N9NO	==RYYv'9'96;M;MPQ;QRS023ryy););a)?ASASTU
 LL!3!39N9NO	r.   hidden_stateattention_maskc                 ,   |+|j                  t        j                  |j                        }| j                  1| j                  j                  |j
                  |j                        nd | _        | j                  |||d| j                  |      d   S )NdtypedeviceF)need_weightsrM   key_padding_maskr   )tor*   boolrW   rM   rV   rE   )rO   rR   rS   s      r/   	attentionz&BridgeTowerResidualAttention.attention   s    %+..UZZH[H[.\N ~~) NNL$6$6|?R?RS 	
 yynn+  
  	r.   c                     || j                  | j                  |      |      z   }| j                  |      }| j                  j	                         D ]  \  }} ||      } ||z   }|S N)r\   rH   rL   rK   items)rO   rR   rS   residual_state_layers         r/   forwardz$BridgeTowerResidualAttention.forward   sg    %tyy7NP^(__yy0( 	/HAu .L	/%4r.   r^   )
r&   r'   r(   rB   r*   Tensorr\   r   rc   __classcell__rQ   s   @r/   r8   r8   p   sC    "ell ELL "ELL (5<<BX r.   r8   c                   ^     e Zd Z fdZddej
                  deej
                     fdZ xZS )BridgeTowerTransformerc                    t         |           |j                  | _        |j                  | _        |j                  rHt        j                  t        | j                  dz
        D cg c]  }t        |       c}      | _	        nDt        j                  t        | j                        D cg c]  }t        |       c}      | _	        |j                  | _
        y c c}w c c}w Nr   )rA   rB   rD   num_hidden_layersremove_last_layerr	   
ModuleListranger8   	resblocksstop_gradientrO   rP   ra   rQ   s      r/   rB   zBridgeTowerTransformer.__init__   s    !--!'!9!9##]]?DTE[E[^_E_?`a!-f5aDN  ]]?DTE[E[?\]!-f5]DN $11 b ^s   'C,C!rR   rS   c                     g }| j                   D ]H  } |||      }| j                  r |j                  |j                                8|j                  |       J |S r^   )ro   rp   appenddetach)rO   rR   rS   r$   blocks        r/   rc   zBridgeTowerTransformer.forward   s\    ^^ 	3E ~>L!!$$\%8%8%:;$$\2	3 r.   r^   )	r&   r'   r(   rB   r*   rd   r   rc   re   rf   s   @r/   rh   rh      s(    2ELL (5<<BX r.   rh   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )BridgeTowerVisionEmbeddingsrP   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)rA   rB   rP   rD   	embed_dim
image_size
patch_sizer	   	Parameterr*   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandrN   s     r/   rB   z$BridgeTowerVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr.   
embeddingsheightwidthreturnc                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr         ?r   r~   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer*   jit
is_tracingr   r   r   reshapepermuter	   
functionalinterpolateviewcat)rO   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r/   interpolate_pos_encodingz4BridgeTowerVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr.   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (z).rV   r~   r   r   r   )r   r   
ValueErrorr   r   rV   rZ   flatten	transposer   r   r*   r   r   r   r   )rO   r   r   
batch_sizera   r   r   target_dtypepatch_embedsclass_embedsr   s              r/   rc   z#BridgeTowerVisionEmbeddings.forward   s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr.   F)r&   r'   r(   r   rB   r*   rd   intr   r+   rc   re   rf   s   @r/   rw   rw      se    q6 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r.   rw   c                        e Zd Z fdZ	 ddej
                  defdZ	 ddej
                  defdZdej
                  fdZ	 xZ
S )	BridgeTowerVisionTransformerc           	      0   t         |           t        |      | _        t	        j
                  |j                  |j                        | _        t        |      | _
        t	        j
                  |j                  |j                        | _        |j                  | _        |j                  set	        j                  t        |j                        D cg c]-  }t	        j
                  |j                  |j                        / c}      | _        y y c c}w Nr;   )rA   rB   rw   r   r	   rF   rD   rG   ln_prerh   transformerln_postshare_layernormrm   rn   rk   ln_separaterq   s      r/   rB   z%BridgeTowerVisionTransformer.__init__  s    5f=ll6#5#56;P;PQ1&9||F$6$6F<Q<QR%55%%!}}V[\b\t\tVuvQRf00f6K6KLv D &vs   2Dr   r   c                    | j                  ||      }| j                  |      }|j                  ddd      }| j                  ||      }t	        j
                  |d      }|j                  dddd      }| j                  r| j                  |      }|S g }t        || j                        D ]  \  }} ||      }|j                  |         t	        j
                  |d      }|S )Nr   r   r~   r   r   )r   r   r   r   r*   stackr   r   zipr   rs   )rO   r   rS   r   r$   hidden_states_stacklns          r/   rc   z$BridgeTowerVisionTransformer.forward  s     6NOM2%--aA6((GMq9%--aAq9 LL7M  #%%(8H8H%I :!r "= 1#**=9: "KK(;CMr.   c                 t    | j                  ||      }| j                  |      }|j                  ddd      }|S )Nr   r   r   r~   )r   r   r   )rO   r   r   r$   s       r/   forward_prez(BridgeTowerVisionTransformer.forward_pre3  s?    
 OghM2%--aA6r.   rR   c                 N    |j                  ddd      }| j                  |      }|S )Nr   r   r~   )r   r   )rO   rR   visual_output_posts      r/   forward_postz)BridgeTowerVisionTransformer.forward_post>  s-    )11!Q:!\\*<=!!r.   r   )r&   r'   r(   rB   r*   rd   r[   rc   r   r   re   rf   s   @r/   r   r   
  sX    " */	ll #'	< */	ll	 #'	" "r.   r   c                   $     e Zd Z fdZd Z xZS )BridgeTowerLinkTowerc                    t         |           |j                  | _        |j                  | _        |j                  dv r|j                  dk(  r.t	        j
                  t        j                  d            | _        n<|j                  dk(  r-t	        j
                  t        j                  d            | _	        t	        j                  | j                  |j                        | _
        y t        d|j                   d      )	N)add
scaled_addr   r         ?r   r   r;   link_tower_type  is not implemented)rA   rB   link_tower_typerD   r	   r   r*   tensorscaled_factorbetarF   rG   NotImplementedErrorrN   s     r/   rB   zBridgeTowerLinkTower.__init__E  s    %55!--!!%II%%5%'\\%,,s2C%D"''=8LLc):;	\\$*:*:@U@UVDN%(89O9O8PPc&deer.   c                 Z   | j                   dk(  r| j                  ||z         S | j                   dk(  r!| j                  || j                  z  |z         S | j                   dk(  r1| j                  |d| j                  z
  z  || j                  z  z         S t	        d| j                    d      )Nr   r   r   r   r   r   )r   rF   r   r   r   )rO   r$   cross_modal_hidden_statesrS   s       r/   rc   zBridgeTowerLinkTower.forwardR  s    5(>>-2K"KLL!!\1>>-$2D2D"DG`"`aa!!]2>>-1tyy="AD]`d`i`iDi"ijj%(89M9M8NNa&bccr.   r&   r'   r(   rB   rc   re   rf   s   @r/   r   r   D  s    fdr.   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BridgeTowerSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y r   )rA   rB   r	   rJ   rD   denserF   rG   Dropouthidden_dropout_probdropoutrN   s     r/   rB   zBridgeTowerSelfOutput.__init___  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r.   r$   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r^   r   r   rF   rO   r$   r   s      r/   rc   zBridgeTowerSelfOutput.forwarde  7    

=1]3}|'CDr.   r&   r'   r(   rB   r*   rd   rc   re   rf   s   @r/   r   r   ^  1    >U\\  RWR^R^ r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BridgeTowerIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r^   )rA   rB   r	   rJ   rD   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnrN   s     r/   rB   z BridgeTowerIntermediate.__init__n  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r.   r$   r   c                 J    | j                  |      }| j                  |      }|S r^   )r   r   rO   r$   s     r/   rc   zBridgeTowerIntermediate.forwardv  s&    

=100?r.   r   rf   s   @r/   r   r   m  s#    9U\\ ell r.   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )BridgeTowerOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rA   rB   r	   rJ   r   rD   r   rF   rG   r   r   r   rN   s     r/   rB   zBridgeTowerOutput.__init__~  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r.   r$   r   r   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r^   r   r   s      r/   rc   zBridgeTowerOutput.forward  r   r.   r   rf   s   @r/   r   r   }  r   r.   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )BridgeTowerPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r^   )rA   rB   r	   rJ   rD   r   Tanh
activationrN   s     r/   rB   zBridgeTowerPooler.__init__  s9    YYv1163E3EF
'')r.   r$   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r  )rO   r$   first_token_tensorpooled_outputs       r/   rc   zBridgeTowerPooler.forward  s6     +1a40

#566r.   r   rf   s   @r/   r   r     s#    $
U\\ ell r.   r   c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )BridgeTowerSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()position_embedding_typeabsoluterelative_keyrelative_key_queryr~   r   )rA   rB   rD   num_attention_headshasattrr   r   attention_head_sizeall_head_sizer	   rJ   querykeyvaluer   attention_probs_dropout_probr   getattrr  max_position_embeddingsr   distance_embedding
is_decoderrO   rP   r  rQ   s      r/   rB   z!BridgeTowerSelfAttention.__init__  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r.   xr   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S )Nr   r   r~   r   r   )r   r  r  r   r   )rO   r  new_x_shapes      r/   transpose_for_scoresz-BridgeTowerSelfAttention.transpose_for_scores  sL    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r.   r$   rS   	head_maskencoder_hidden_statesencoder_attention_maskpast_key_valueoutput_attentionsc                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r   r~   r   r   r  r  rU   r   zbhld,lrd->bhlrzbhrd,lrd->bhlrr   ) r  r  r  r  r*   r   r  matmulr   r  r   r   longrW   r   r   r  r  rZ   rV   einsummathsqrtr  r	   r   softmaxr   r   
contiguousr   r  )rO   r$   rS   r   r!  r"  r#  r$  mixed_query_layeris_cross_attention	key_layervalue_layerquery_layer	use_cacheattention_scoresquery_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyattention_probscontext_layernew_context_layer_shapeoutputss                               r/   rc   z BridgeTowerSelfAttention.forward  s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr.   r^   NNNNNF)r&   r'   r(   rB   r*   rd   r  r   r+   r   r[   rc   re   rf   s   @r/   r  r    s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr.   r  eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )BridgeTowerAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nr  )	rA   rB   #BRIDGE_TOWER_SELF_ATTENTION_CLASSES_attn_implementationrO   r   outputsetpruned_headsr  s      r/   rB   zBridgeTowerAttention.__init__)  sC    78S8ST,C
	 ,F3Er.   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   rO   r  r  rL  r   r  r  r  rJ  r   r  union)rO   headsindexs      r/   prune_headsz BridgeTowerAttention.prune_heads1  s   u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r.   r$   rS   r   r!  r"  r#  r$  r   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S )Nr   r   )rO   rJ  )rO   r$   rS   r   r!  r"  r#  r$  self_outputsattention_outputrA  s              r/   rc   zBridgeTowerAttention.forwardC  sW     yy!"
  ;;|AF#%QR(88r.   r^   rB  )r&   r'   r(   rB   rR  r*   rd   r   r+   r   r[   rc   re   rf   s   @r/   rE  rE  (  s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r.   rE  c                   6     e Zd Z fdZ	 	 	 	 	 ddZd Z xZS )BridgeTowerBertCrossLayerc                    t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        t	        |      | _        t        |      | _
        t        |      | _        y rj   )rA   rB   chunk_size_feed_forwardseq_len_dimrE  r\   r  add_cross_attentioncrossattentionr   intermediater   rJ  rN   s     r/   rB   z"BridgeTowerBertCrossLayer.__init__\  sq    '-'E'E$-f5 ++#)#=#= 26:3F;'/r.   c           	          | j                  ||d |d       }|d   }	|dd  }
| j                  |	||||||      }|d   }	|
|dd z   }
t        | j                  | j                  | j
                  |	      }|f|
z   }
|
S )N)rS   r   r$  r#  r   r   )rS   r   r!  r"  r#  r$  r   )r\   r\  r   feed_forward_chunkrY  rZ  )rO   r$   r!  rS   r   r"  r#  r$  self_attention_outputsrU  rA  cross_attention_outputslayer_outputs                r/   rc   z!BridgeTowerBertCrossLayer.forwardg  s     "&)/ "0 "
 2!4 ),"&"5"5)"7#9)/ #6 #
 3153Ab990##T%A%A4CSCSUe
  /G+r.   c                 L    | j                  |      }| j                  ||      }|S r^   r]  rJ  rO   rU  intermediate_outputrb  s       r/   r_  z,BridgeTowerBertCrossLayer.feed_forward_chunk  ,    "//0@A{{#68HIr.   )NNNNF)r&   r'   r(   rB   rc   r_  re   rf   s   @r/   rW  rW  [  s$    	0 #*Xr.   rW  c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )BridgeTowerTextLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is addedr  rG  )rA   rB   rY  rZ  rE  r\   r  r[  r   r\  r   r]  r   rJ  rN   s     r/   rB   zBridgeTowerTextLayer.__init__  s    '-'E'E$-f5 ++#)#=#= ##?? D6)g!hii"6vWa"bD3F;'/r.   r$   rS   r   r!  r"  r#  r$  r   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr~   )r$  r#  r   r   r   r\  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r&  )	r\   r  r  r   r\  r   r_  rY  rZ  )rO   r$   rS   r   r!  r"  r#  r$  self_attn_past_key_valuer`  rU  rA  present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuera  rb  s                    r/   rc   zBridgeTowerTextLayer.forward  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr.   c                 L    | j                  |      }| j                  ||      }|S r^   rd  re  s       r/   r_  z'BridgeTowerTextLayer.feed_forward_chunk  rg  r.   rB  )r&   r'   r(   rB   r*   rd   r   r+   r   r[   rc   r_  re   rf   s   @r/   ri  ri    s    0" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br.   ri  c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )BridgeTowerTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
rA   rB   rP   r	   rm   rn   rk   ri  rb   gradient_checkpointingrq   s      r/   rB   zBridgeTowerTextEncoder.__init__  sP    ]]%PVPhPhJi#jQ$8$@#jk
&+# $ks   A#r$   rS   r   r!  r"  past_key_valuesr3  r$  output_hidden_statesreturn_dictr   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
Nr-   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r   r   r~   c              3   $   K   | ]  }|| 
 y wr^   r-   .0vs     r/   	<genexpr>z1BridgeTowerTextEncoder.forward.<locals>.<genexpr>9  s      
 = 
   )last_hidden_staterv  r$   r%   cross_attentions)rP   r[  ru  trainingloggerwarning_once	enumeraterb   _gradient_checkpointing_func__call__tupler   )rO   r$   rS   r   r!  r"  rv  r3  r$  rw  rx  all_hidden_statesall_self_attentionsall_cross_attentionsnext_decoder_cacheilayer_modulelayer_head_maskr#  layer_outputss                       r/   rc   zBridgeTowerTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r.   )	NNNNNNFFT)r&   r'   r(   rB   r*   rd   r   r+   r   r[   r   r   rc   re   rf   s   @r/   rr  rr    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r.   rr  c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )BridgeTowerTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)padding_idxr;   r  r  r   r   Fr   token_type_idsr   )rA   rB   r	   r   
vocab_sizerD   pad_token_idword_embeddingsr  position_embeddingstype_vocab_sizetoken_type_embeddingsrF   rG   r   r   r   r  r  r   r*   r   r   zerosr   r   r(  r  rN   s     r/   rB   z"BridgeTowerTextEmbeddings.__init__T  si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXej 	 	
 	ekk$*;*;*@*@*B%**Ubg 	 	

 "..#%<<**F,>,>DL\L\$
 r.   c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )Nr   r   r  r   rU   r  )"create_position_ids_from_input_idsr  &create_position_ids_from_inputs_embedsr   r  r  r   r*   r  r(  r   rW   r  r  r  r  rF   r   )rO   	input_idsr  r   inputs_embedspast_key_values_lengthinput_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  r   r  s                r/   rc   z!BridgeTowerTextEmbeddings.forwardm  sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r.   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr   r   rU   r   )r   r*   r   r  r(  rW   r   r   )rO   r  r  sequence_lengthr   s        r/   r  z@BridgeTowerTextEmbeddings.create_position_ids_from_inputs_embeds  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r.   )NNNNr   )r&   r'   r(   r)   rB   rc   r  re   rf   s   @r/   r  r  N  s    

4 rs&P=r.   r  c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   r   )ner   r*   cumsumtype_asr(  )r  r  r  maskincremental_indicess        r/   r  r    sW     <<$((*D <<!4<<TBE[[_cc##%33r.   c                   *    e Zd ZeZdZdZddgZdZd Z	y)BridgeTowerPreTrainedModelbridgetowerFr  r8   rv  c                    t        |t              r|j                  j                  j                  dz  d|j                  j                  j
                  z  dz  z  }|j                  j                  j                  dz  }d|j                  j                  j                  z  dz  }|j                  j                  j                  D ]Q  }t        j                  j                  |j                  j                  || j                  j                  z         t        j                  j                  |j                  j                  j                  || j                  j                  z         t        j                  j                  |j                   j"                  j                  || j                  j                  z         t        j                  j                  |j                   j$                  j                  || j                  j                  z         T t        j                  j                  |j                  j&                  j(                  || j                  j                  z         t        j                  j                  |j                  j&                  j*                  j                  || j                  j                  z         nt        |t        j,                  t        j.                  t        j0                  f      r?|j                  j2                  j                  dd| j                  j                  z         nct        |t        j4                        rI|j6                  j2                  j9                          |j                  j2                  j;                  d       t        |t        j,                        r2|j6                  %|j6                  j2                  j9                          y y y )Ng      r~   )stdg        g?)meanr  r   )r   BridgeTowerVisionModelvisualr   rD   rk   ro   r	   initnormal_rE   in_proj_weightrP   initializer_factorout_projr   rK   r=   r@   r   r   r   rJ   r   r   datarF   r}   zero_fill_)rO   moduleproj_stdattn_stdfc_stdru   s         r/   _init_weightsz(BridgeTowerPreTrainedModel._init_weights  s   f4511==tCV]]..@@@TIH }}00<<dBH&--33???DHF22<< h

 9 9x$++JhJh?hi

 3 3 : :4;;KiKi@ij		 5 56DKKDbDb;bc		 0 0 7 7XHfHf=fg	h GGOOFMM44DD(UYU`U`UsUsJsOtGGOO((;;BBSWS^S^SqSqHq   BIIr|| DEMM&&CTDKK<Z<Z5Z&[-KK""$MM$$S)fbii(V[[-DKK""$ .E(r.   N)
r&   r'   r(   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placementr  r-   r.   r/   r  r    s*    $L%&+#35ST"3%r.   r  c                   :     e Zd ZeZ fdZed        ZddZ xZ	S )r  c                 D    t         |   |       t        |      | _        y r^   )rA   rB   r   r  rN   s     r/   rB   zBridgeTowerVisionModel.__init__  s     26:r.   c                 j    | j                   j                  j                  j                  j                  S r^   )r  r   r   r   rV   rO   s    r/   rV   zBridgeTowerVisionModel.dtype  s$    {{%%55<<BBBr.   c                 Z    | j                  |j                  | j                        ||      S r^   )r  typerV   )rO   image
image_maskr   s       r/   rc   zBridgeTowerVisionModel.forward  s#    {{5::djj1:?WXXr.   rt  )
r&   r'   r(   r   r  rB   propertyrV   rc   re   rf   s   @r/   r  r    s)    *L; C CYr.   r  a(  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc                        e Zd ZeZd fd	Zd Zd Zd Ze		 	 	 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
ej                     d	e
ej                     d
e
ej                     de
ej                     de
ej                     de
eej                        de
e   de
e   de
e   de
e   deeej                     ef   fd       Z xZS )BridgeTowerTextModelc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
rA   rB   rP   r  r   rr  encoderr   pooler	post_init)rO   rP   add_pooling_layerrQ   s      r/   rB   zBridgeTowerTextModel.__init__  sN    
 	 3F;-f53D'/$ 	r.   c                 .    | j                   j                  S r^   r   r  r  s    r/   get_input_embeddingsz)BridgeTowerTextModel.get_input_embeddings  s    ...r.   c                 &    || j                   _        y r^   r  rO   r  s     r/   set_input_embeddingsz)BridgeTowerTextModel.set_input_embeddings  s    */'r.   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)z
        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
        class PreTrainedModel
        N)r_   r  rb   r\   rR  )rO   heads_to_prunerb   rP  s       r/   _prune_headsz!BridgeTowerTextModel._prune_heads  sE    
 +002 	CLE5LLu%//;;EB	Cr.   r  rS   r  r   r   r  r!  r"  rv  r3  r$  rw  rx  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|t        j                  |||z   f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j!                  ||      }|}n&t        j"                  |t        j$                  |	      }| j'                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j)                  |      }nd }| j+                  || j                   j,                        }| j                  |||||
      }| j/                  ||||||	|
|||
      }|d   }| j0                  | j1                  |      nd }|s
||f|dd  z   S t3        |||j4                  |j6                  |j8                  |j:                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer   z5You have to specify either input_ids or inputs_embedsr   r~   rW   r  rU   )r  r   r  r  r  )	rS   r   r!  r"  rv  r3  r$  rw  rx  r   )r  r#   rv  r$   r%   r  )rP   r$  rw  use_return_dictr  r3  r   %warn_if_padding_and_no_attention_maskr   rW   r   r*   onesr  r   r  r   r  r(  get_extended_attention_maskinvert_attention_maskget_head_maskrk   r  r  r   rv  r$   r%   r  )rO   r  rS   r  r   r   r  r!  r"  rv  r3  r$  rw  rx  r  r   r  rW   r  r  r  extended_attention_maskencoder_batch_sizeencoder_sequence_lengthra   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputr  s                                  r/   rc   zBridgeTowerTextModel.forward  s   $ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r.   )T)NNNNNNNNNNNNN)r&   r'   r(   r   r  rB   r  r  r  r   r   r*   rd   r   r+   r[   r   r   r   rc   re   rf   s   @r/   r  r    sr    )L /0C  -11515/3,0048<9==A$(,0/3&*l
ELL)l
 !.l
 !.	l

 u||,l
 ELL)l
  -l
  (5l
 !) 6l
 "$u'8'8"9:l
 D>l
 $D>l
 'tnl
 d^l
 
uU\\"$PP	Ql
 l
r.   r  zv
    The bare BridgeTower Model transformer outputting BridgeTowerModelOutput object without any specific head on
    c            "           e Zd Z fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     d	eej                     d
eej                     deej                     dee   dee   dee   dee   deej                     dedeeej                     ef   fd       Zd Z xZS )BridgeTowerModelc           	         t         |   |       || _        |j                  }|j                  }|j
                  r_t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        nt        j                  t        |j                        D cg c],  }t        j                  |j                  |j                        . c}      | _	        t        j                  t        |j                        D cg c],  }t        j                  |j                  |j                        . c}      | _
        t        j                  d|j                        | _        t!        |      | _        t%        |      | _        |j(                  s|j*                  r| j"                  j,                  j.                  D ]  }| j"                  j,                  j0                  j2                  j4                  |j2                  _        | j"                  j,                  j0                  j6                  j4                  |j6                  _         t        j                  t        |j                        D cg c]  }t9        |       c}      | _        t        j                  t        |j                        D cg c]  }t9        |       c}      | _        t?        |      | _         t?        |      | _!        t        jD                  |j                  |jF                        | _$        t        jD                  |j                  |jF                        | _%        |jL                  r!tO        |      | _(        tO        |      | _)        nt        j                  t        |j                  dz
        D cg c]  }tO        |       c}      | _(        t        j                  t        |j                  dz
        D cg c]  }tO        |       c}      | _)        | jU                          y c c}w c c}w c c}w c c}w c c}w c c}w )Nr~   r;   r   )+rA   rB   rP   vision_configtext_config$share_cross_modal_transformer_layersr	   rJ   rD   cross_modal_text_transformcross_modal_image_transformrm   rn   rk   r   r  r  vision_modelr  
text_modelr   "init_layernorm_from_vision_encoderr  cross_modal_ln_separater   r   r  r}   rW  cross_modal_image_layerscross_modal_text_layersr   cross_modal_image_poolercross_modal_text_poolerrF   rG   cross_modal_text_layernormcross_modal_image_layernormshare_link_tower_layersr   cross_modal_text_link_towercross_modal_image_link_towerr  )rO   rP   r  r  ra   r   rQ   s         r/   rB   zBridgeTowerModel.__init__  s9    ,,((66.0ii8O8OQWQcQc.dD+/1yy9R9RTZTfTf/gD,.0mmQVW]WoWoQpqA;22F4F4FGq/D+ 02}}SXY_YqYqSrsa=44f6H6HIs0D, &(\\!V5G5G%H"2=A.{;,,1Z1Z''..FF J!%!2!2!9!9!A!A!H!H!M!M		#0077??DDIIJ )+=B6C[C[=\]&{3])
% (*}}=B6C[C[=\]&{3](
$
 ):&(A%'8'@$ +-,,v7I7IvOdOd*e'+-<<8J8JPVPePe+f())/CF/KD,0DV0LD-/1}}7<V=U=UXY=Y7Z[!%f-[0D, 137<V=U=UXY=Y7Z[!%f-[1D- 	W r t ^ ^  \ \s$   1P=$1QQQQQc                 6    | j                   j                         S r^   )r  r  r  s    r/   r  z%BridgeTowerModel.get_input_embeddings  s    3355r.   c                 :    | j                   j                  |       y r^   )r  r  r  s     r/   r  z%BridgeTowerModel.set_input_embeddings  s    ,,U3r.   r  rS   r  r   
pixel_maskr   r  r5   image_token_type_idxr$  rw  rx  labelsr   r   c                    |
|
n| j                   j                  }
||n| j                   j                  }|rdnd}|rdnd}|rdnd}|rdnd}|
rdnd}||t        d      ||n| j                   j                  }|	r|	nd}	|j                         }| j                  j                  |      }|r||fz  }|0t        j                  |t        j                  |j                        }| j                  j                  ||      j                  |j                        }t        | j                  j                  j                         | j                   j"                  z
  dz   }| j                  j                  j                   d| D ]  } |||      d   }|s||fz  } |K| j$                  j&                  j)                  |j+                  | j$                  j,                        |      }n|j/                  ddd	      }|r||fz  }| j$                  j&                  j0                  j2                  d| D ]  } ||      }|s||fz  } | j$                  j&                  j5                  |j+                  | j$                  j,                              }| j7                  |      }| j9                  t        j:                  dt        j                  |j                              j=                  |      }| j?                  ||z         }| jA                  |      }| j9                  t        jB                  d
|	t        j                  |j                              j=                  |      }||z   }| jE                  |      }t        j                  |j                  d      |j                  d      ft        j                  |j                        }| j                  j                  ||j                               j                  |j                        } | jF                  d   |||||
      } | d   }! | jH                  d   |||||
      }"|"d   }#|r||!|#ffz  }|
r|| d   |"d   ffz  }d}$tK        |t        | j                  j                  j                               D ]r  }% | j                  j                  j                   |%   ||      d   } | j$                  j&                  j0                  j2                  |%   |      j+                  | j$                  j,                        }| jA                  | j$                  j&                  j5                  |            |z   }| jL                  |$   }&| jN                  |$   }' |&| j7                  |      |z   |!|      }( |'||#|      }) | jF                  |$dz      |(|)|||
      } | d   }! | jH                  |$dz      |)|(|||
      }"|"d   }#|$dz  }$|r||fz  }||fz  }||!|#ffz  }|
se|| d   |"d   ffz  }u |!|#}+}*| jQ                  |*|+      },|r|||f}|stS        d |*|+|,||fD              S tU        |*|+|,||      S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        image_token_type_idx (`int`, *optional*):
            - The token type ids for images.
        output_hidden_states (`bool`, *optional*):
            If set to `True`, hidden states are returned as a list containing the hidden states of text, image, and
            cross-modal components respectively. i.e. `(hidden_states_text, hidden_states_image,
            hidden_states_cross_modal)` where each element is a list of the hidden states of the corresponding
            modality. `hidden_states_txt/img` are a list of tensors corresponding to unimodal hidden states and
            `hidden_states_cross_modal` is a list of tuples containing `cross_modal_text_hidden_states` and
            `cross_modal_image_hidden_states` of each brdige layer.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels are currently not supported.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerModel
        >>> from PIL import Image
        >>> import requests

        >>> # prepare image and text
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text = "hello world"
        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base")
        >>> model = BridgeTowerModel.from_pretrained("BridgeTower/bridgetower-base")

        >>> inputs = processor(image, text, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> outputs.keys()
        odict_keys(['text_features', 'image_features', 'pooler_output'])
        ```Nr-   zYBridgeTowerModel does not use `inputs_embeds`.  Make sure to pass in `input_ids` instead.r   )r  rU   r   r   r~   r   )rS   r"  r$  c              3   $   K   | ]  }|| 
 y wr^   r-   r{  s     r/   r~  z+BridgeTowerModel.forward.<locals>.<genexpr>  s      = r  )r!   r"   r#   r$   r%   )+rP   r$  rw  r   r  r   r  r   r*   r  r(  rW   r  rZ   rN  r  rb   rk   r  r  r   r  rV   r   r   ro   r   r  r  r  	expand_asr  r  fullr  r   r  rn   r  r  get_cls_featuresr  r    )-rO   r  rS   r  r   r
  r   r  r5   r  r$  rw  rx  r  r   all_hidden_states_textall_hidden_states_imageall_hidden_states_crossr  r  r  r4   extend_text_maskssplit_indexrb   ru   image_embeds_with_lncross_modal_texttext_token_type_embeddingsimage_token_type_embeddingscross_modal_imageextend_image_maskslayer_outputs_textcross_text_featureslayer_outputs_imagecross_image_featureslink_layer_indexr  text_link_towerimage_link_towercross_text_features_cross_image_features_r!   r"   cls_featuress-                                                r/   rc   zBridgeTowerModel.forward  s   j 2C1N-TXT_T_TqTq$8$D $++JjJj 	 (<(<"$(<"$"6BD$5b4$):%k  &1%<k$++B]B]7K3QRnn&oo0090E"{n4"!"ZZ5::iN^N^_N OOGGXcdgg

 $//117784;;;X;XX[\\ __,,22<K@ 	9E->?BK#&;.8&		9 ,,33??!!$"3"3"9"9:Um @ L
 (//1a8L#6# &&--99CCL[Q 	;E .L#'L?:'	;
  $0077DD\EVEVW[WhWhWnWnEop  ::;G%)%?%?KKI4D4DE&

)$
% 	#  ::;KNh;hi#??@TU&*&@&@JJt1IL\L\]'

)(
) 	$  46QQ <<=QRZZ##A&(9(>(>q(AB**##


 "__HHU_UdUdUfgjj
 =T99!<,#5/
 13>d;;A>-#4/
  315#)<>R(S'UU#%7%:<OPQ<R$S#UU {C(?(?(E(E$FG 0	ZA:$//1177:;HYZ[\]KL4,,33??II!L\Z__!!''L 001B1B1I1I1V1VWc1de-. !
 #>>?OPO#@@AQR $3//<?YY#!$ 
 %55IK_as$t! "T!=!=>NQR>R!S$%0'9"3" #5Q"7"U$"?"?@PST@T"U%$1'8"3# $7q#9 !#&;.8&'L?:''-@BV,W+YY' #);A)>@STU@V(W'YY#a0	Zf )<=Q~,,]NK!79PRi j 'GXZmn   &')&+*
 	
r.   c                 x    | j                  |      }| j                  |      }t        j                  ||gd      S )Nr   r   )r  r  r*   r   )rO   r!   r"   cls_features_textcls_features_images        r/   r  z!BridgeTowerModel.get_cls_features  s<     88G!::>Jyy+-?@bIIr.   )NNNNNNNNNNNNNF)r&   r'   r(   rB   r  r  r   r   r*   
LongTensorr+   r   r[   r   r   rd   r    rc   r  re   rf   s   @r/   r  r    s   6p64  156:594815155948.2,0/3&*-1).j
E,,-j
 !!2!23j
 !!1!12	j

 u001j
 U--.j
 E--.j
   1 12j
 u001j
 'smj
 $D>j
 'tnj
 d^j
 ))*j
 #'j
  
uU\\"$::	;!j
 j
XJr.   r  c                   $     e Zd Z fdZd Z xZS )"BridgeTowerPredictionHeadTransformc                 h   t         |           t        j                  |j                  |j                        | _        t        |j                  t              rt        |j                     | _
        n|j                  | _
        t        j                  |j                  |j                        | _        y r   )rA   rB   r	   rJ   rD   r   r   r   r   r   transform_act_fnrF   rG   rN   s     r/   rB   z+BridgeTowerPredictionHeadTransform.__init__  s{    YYv1163E3EF
f''-$*6+<+<$=D!$*$5$5D!f&8&8f>S>STr.   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r^   )r   r/  rF   r   s     r/   rc   z*BridgeTowerPredictionHeadTransform.forward  s4    

=1--m<}5r.   r   rf   s   @r/   r-  r-    s    Ur.   r-  c                   &     e Zd Zd fd	Zd Z xZS )BridgeTowerMLMHeadc                 p   t         |           || _        t        |      | _        t        j                  |j                  |j                  j                  d      | _
        t        j                  t        j                  |j                  j                              | _        ||| j                  _        y y )NF)r}   )rA   rB   rP   r-  	transformr	   rJ   rD   r  r  decoderr   r*   r  r}   r   )rO   rP   r   rQ   s      r/   rB   zBridgeTowerMLMHead.__init__  s    ;FCyy!3!3V5G5G5R5RY^_LLV-?-?-J-J!KL	"(DLL r.   c                 d    | j                  |      }| j                  |      | j                  z   }|S r^   )r4  r5  r}   )rO   r  	mlm_scores      r/   rc   zBridgeTowerMLMHead.forward  s-    NN1%	LL+dii7	r.   r^   r   rf   s   @r/   r2  r2    s    )r.   r2  c                   $     e Zd Z fdZd Z xZS )BridgeTowerITMHeadc                 X    t         |           t        j                  |d      | _        y Nr~   rA   rB   r	   rJ   fc)rO   rD   rQ   s     r/   rB   zBridgeTowerITMHead.__init__  s     ))K+r.   c                 (    | j                  |      }|S r^   r=  )rO   r  	itm_scores      r/   rc   zBridgeTowerITMHead.forward  s    GGAJ	r.   r   rf   s   @r/   r9  r9    s    ,r.   r9  z\
    BridgeTower Model with a language modeling head on top as done during pretraining.
    c                       e Zd ZdgZ fdZd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee	j                     d
ee	j                     dee	j                     dee	j                     dee   dee   dee   dee	j                     deeee	j                     f   fd       Z xZS )BridgeTowerForMaskedLMzmlm_score.decoder.weightc                     t         |   |       t        |      | _        t	        |      | _        | j                          y r^   )rA   rB   r  r  r2  r7  r  rN   s     r/   rB   zBridgeTowerForMaskedLM.__init__  s5     +F3+F3 	r.   c                 .    | j                   j                  S r^   r7  r5  r  s    r/   get_output_embeddingsz,BridgeTowerForMaskedLM.get_output_embeddings  s    ~~%%%r.   c                 &    || j                   _        y r^   rE  )rO   new_embeddingss     r/   set_output_embeddingsz,BridgeTowerForMaskedLM.set_output_embeddings  s    !/r.   r  rS   r  r   r
  r   r  r5   r$  rw  rx  r  r   c                    ||n| j                   j                  }| j                  |||||||||	|
|      }| j                  |r|j                  n|d         }d}|kt               }|j                  |j                        } ||j                  d| j                   j                  j                        |j                  d            }|st        |      }||f|z   S |S t        |||j                  |j                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForMaskedLM
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000360943.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
        >>> text = "a <mask> looking out of the window"

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
        >>> model = BridgeTowerForMaskedLM.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

        >>> # prepare inputs
        >>> encoding = processor(image, text, return_tensors="pt")

        >>> # forward pass
        >>> outputs = model(**encoding)

        >>> results = processor.decode(outputs.logits.argmax(dim=-1).squeeze(0).tolist())

        >>> print(results)
        .a cat looking out of the window.
        ```N
rS   r  r   r
  r   r  r5   r$  rw  rx  r   r   r2   r3   r$   r%   )rP   r  r  r7  r!   r
   rZ   rW   r   r  r  r  r   r$   r%   )rO   r  rS   r  r   r
  r   r  r5   r$  rw  rx  r  rA  
mlm_logitsmasked_lm_lossloss_fctrJ  s                     r/   rc   zBridgeTowerForMaskedLM.forward  s   d &1%<k$++B]B]""))%!'%/!5# # 
 ^^[G$9$9gVWjY
')HYYz001F%joob$++:Q:Q:\:\&]_e_j_jkm_noN:&F3A3M^%.YSYY!//))	
 	
r.   NNNNNNNNNNNN)r&   r'   r(   _tied_weights_keysrB   rF  rI  r   r   r*   r+  r+   r[   r   r   r   rc   re   rf   s   @r/   rB  rB    sj    55&0  156:594815155948,0/3&*-1Q
E,,-Q
 !!2!23Q
 !!1!12	Q

 u001Q
 U--.Q
 E--.Q
   1 12Q
 u001Q
 $D>Q
 'tnQ
 d^Q
 ))*Q
 
~uU%6%677	8Q
 Q
r.   rB  z
    BridgeTower Model transformer with a classifier head on top (a linear layer on top of the final hidden state of the
    [CLS] token) for image-to-text matching.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   deej                     de
eeej                     f   fd       Z xZS )#BridgeTowerForImageAndTextRetrievalc                     t         |   |       t        |      | _        t	        |j
                  dz        | _        | j                          y r;  )rA   rB   r  r  r9  rD   r@  r  rN   s     r/   rB   z,BridgeTowerForImageAndTextRetrieval.__init__`  s@     +F3+F,>,>,BC 	r.   r  rS   r  r   r
  r   r  r5   r$  rw  rx  r  r   c                    ||n| j                   j                  }| j                  |||||||||	|
|      }|r|j                  n|d   }| j	                  |      }d}|.t               }|j                  |j                        } |||      }|st        |      }||f|z   S |S t        |||j                  |j                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        labels (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
            Labels for computing the image-text matching loss. 0 means the pairs don't match and 1 means they match.
            The pairs with 0 will be skipped for calculation.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForImageAndTextRetrieval
        >>> import requests
        >>> from PIL import Image

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> texts = ["An image of two cats chilling on a couch", "A football player scoring a goal"]

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")
        >>> model = BridgeTowerForImageAndTextRetrieval.from_pretrained("BridgeTower/bridgetower-base-itm-mlm")

        >>> # forward pass
        >>> scores = dict()
        >>> for text in texts:
        ...     # prepare inputs
        ...     encoding = processor(image, text, return_tensors="pt")
        ...     outputs = model(**encoding)
        ...     scores[text] = outputs.logits[0, 1].item()
        ```NrK  r~   rL  )rP   r  r  r#   r@  r
   rZ   rW   r  r   r$   r%   )rO   r  rS   r  r   r
  r   r  r5   r$  rw  rx  r  rA  r#   r3   itm_lossrO  rJ  s                      r/   rc   z+BridgeTowerForImageAndTextRetrieval.forwardj  s    \ &1%<k$++B]B]""))%!'%/!5# # 
 2=--'!*.')HYYv}}-F/H6]F-5-AXK&(MvM'!//))	
 	
r.   rP  )r&   r'   r(   rB   r   r   r*   r+  r+   r[   r   r   r   rc   re   rf   s   @r/   rS  rS  Y  sV     156:594815155948,0/3&*-1Q
E,,-Q
 !!2!23Q
 !!1!12	Q

 u001Q
 U--.Q
 E--.Q
   1 12Q
 u001Q
 $D>Q
 'tnQ
 d^Q
 ))*Q
 
'u/@/@)AA	BQ
 Q
r.   rS  c                   $     e Zd Z fdZd Z xZS )BridgeTowerContrastiveHeadc                 X    t         |           t        j                  ||      | _        y r^   r<  )rO   rD   
embed_sizerQ   s      r/   rB   z#BridgeTowerContrastiveHead.__init__  s     ))K4r.   c                 (    | j                  |      }|S r^   r?  )rO   r  s     r/   rc   z"BridgeTowerContrastiveHead.forward  s    GGAJr.   r   rf   s   @r/   rX  rX    s    5r.   rX  zl
    BridgeTower Model with a image-text contrastive head on top computing image-text contrastive loss.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     deej                     deej                     d	eej                     d
ee	   dee	   dee	   dee	   de
eeej                     f   fd       Z xZS )!BridgeTowerForContrastiveLearningc                    t         |   |       t        |      | _        t	        |j
                  |j                        | _        t	        |j
                  |j                        | _        t	        |j
                  dz  |j                        | _	        t        j                  t        j                  | j                  j                              | _        | j#                          y r;  )rA   rB   r  r  rX  rD   contrastive_hidden_sizeitc_text_headitc_image_headitc_cross_modal_headr	   r   r*   r   rP   logit_scale_init_valuelogit_scaler  rN   s     r/   rB   z*BridgeTowerForContrastiveLearning.__init__  s     +F378J8JFLjLjk89K9KVMkMkl$>v?Q?QTU?UW]WuWu$v!<<T[[5W5W(XYr.   r  rS   r  r   r
  r   r  r5   r$  rw  rx  return_lossr   c                 H   ||n| j                   j                  }| j                  |||||||||	d|      }|r|j                  n|d   }|r|j                  n|d   \  }}}|d   }|d   }| j                  j
                  j                  j                  |      }| j                  j                  t        j                  ddt        j                  | j                  j                  j                  j                  	            j                  |      }| j                  j                  |      |z   }t         j"                  j%                  | j'                  |ddd
ddf         dd      }t         j"                  j%                  | j)                  |ddd
ddf         dd      j+                  |j                        }t         j"                  j%                  | j-                  |      dd      j+                  |j                        }t        j.                  |||gd      }| j0                  j3                         j+                  |j                        }t        j4                  ||j7                               |z  }t        j4                  ||j7                               |z  }t        j4                  ||j7                               |z  }d}|rt        j8                  t;        |      |j                        }t         j"                  j=                  ||      }t         j"                  j=                  ||      }t         j"                  j=                  ||      }||z   |z   dz  }|s||||f|dd z   } ||f| z   S | S t?        ||||||j                  |j@                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`, *optional*):
            Optionally, instead of passing `pixel_values`, you can choose to directly pass an embedded representation.
            This is useful if you want more control over how to convert `pixel_values` into patch embeddings.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> from transformers import BridgeTowerProcessor, BridgeTowerForContrastiveLearning
        >>> import requests
        >>> from PIL import Image
        >>> import torch

        >>> image_urls = [
        ...     "https://farm4.staticflickr.com/3395/3428278415_81c3e27f15_z.jpg",
        ...     "http://images.cocodataset.org/val2017/000000039769.jpg",
        ... ]
        >>> texts = ["two dogs in a car", "two cats sleeping on a couch"]
        >>> images = [Image.open(requests.get(url, stream=True).raw) for url in image_urls]

        >>> processor = BridgeTowerProcessor.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")
        >>> model = BridgeTowerForContrastiveLearning.from_pretrained("BridgeTower/bridgetower-large-itm-mlm-itc")

        >>> inputs = processor(images, texts, padding=True, return_tensors="pt")
        >>> loss = model(**inputs, return_loss=True).loss

        >>> inputs = processor(images, texts[::-1], padding=True, return_tensors="pt")
        >>> loss_swapped = model(**inputs, return_loss=True).loss

        >>> print("Loss", round(loss.item(), 4))
        Loss 0.0019

        >>> print("Loss with swapped images", round(loss_swapped.item(), 4))
        Loss with swapped images 2.126
        ```NTrK  r~   r   r   r  r   rU   r   )r   pr  r&  r   g      @)r2   r3   r4   r5   r6   r$   r%   )!rP   r  r  r#   r$   r  r  r   r  r*   r  r(  r   rW   r  r  r	   r   	normalizer`  ra  rZ   rb  r   rd  expr'  tr   rN  cross_entropyr1   r%   )!rO   r  rS   r  r   r
  r   r  r5   r$  rw  rx  re  rA  r#   hidden_states_txthidden_states_imghidden_states_cross_modalr4   r  r  r6   r3   rd  logits_text_to_imagelogits_text_to_crosslogits_image_to_crossitc_lossr  text_to_image_losstext_to_cross_lossimage_to_cross_lossrJ  s!                                    r/   rc   z)BridgeTowerForContrastiveLearning.forward  sz   j &1%<k$++B]B]""))%!'%/!%# # 
 2=--'!*%0G!!gaj 	H,.G (+(,#//<<CCPPQ]^&*&6&6&L&LJJtQejj9I9I9_9_9f9f9m9mn'

)(
) 	$ ''CCDXY\ww mm--d.@.@QPQSTWAU.V\^bc-d}}..t/B/B<PQSTVWPWCX/Y_aef.gjj%% k 
 }}..t/H/H/W]_cd.ehh%% i 
 k<FBO&&**,//{7I7I/J$||K9IJ[X$||K9IJ[X %\<>>;K L{ Z\\#f+fmmDF!#!<!<=QSY!Z!#!<!<=QSY!Z"$--"="=>SU["\*-??BUUY\\Hk<FQRQSTF-5-AXK&(MvM+#%%!//))
 	
r.   )NNNNNNNNNTNN)r&   r'   r(   rB   r   r   r*   r+  r+   r[   r   r1   r   rc   re   rf   s   @r/   r]  r]    sO     156:594815155948,0/3&*&*x
E,,-x
 !!2!23x
 !!1!12	x

 u001x
 U--.x
 E--.x
   1 12x
 u001x
 $D>x
 'tnx
 d^x
 d^x
 
+U53D3D-EE	Fx
 x
r.   r]  )r]  rS  rB  r  r  )r   )Kr)   r*  collectionsr   dataclassesr   typingr   r   r   r   r*   torch.utils.checkpointr	   torch.nnr
   activationsr   r   modeling_outputsr   r   r   r   r   modeling_utilsr   r   pytorch_utilsr   r   utilsr   r   r   configuration_bridgetowerr   r   r   
get_loggerr&   r  _TOKENIZER_FOR_DOCr    r1   Moduler8   rh   rw   r   r   r   r   r   r   r  rH  rE  rW  ri  rr  r  r  r  r  r  r  r-  r2  r9  rB  rS  rX  r]  __all__r-   r.   r/   <module>r     s      # ! / /    % 6  I Q 7 7 h h 
		H	%'  :[ : :> :; : :B)299 )XRYY 6P")) Pf7"299 7"td299 d4BII bii  		 		  Cryy CN %' #0299 0f;		 ;|S299 SnZ
RYY Z
|V=		 V=t4  % % %DY7 Y O
5 O
O
d 
oJ1 oJ
oJf	 "    
d
7 d

d
N ]
*D ]
]
@  
G
(B G

G
Tr.   