
    Uh                       d Z ddlZddlZddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlmc mZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ dd
lmZmZmZmZ ddlmZmZm Z m!Z! ddl"m#Z#m$Z$m%Z%  e jL                  e'      Z(d Z)d Z*d Z+dWdZ,dejZ                  dejZ                  fdZ.e G d de             Z/e G d de             Z0e G d de             Z1 G d dejd                        Z3 G d dejd                        Z4 G d d ejd                        Z5 G d! d"ejd                        Z6 G d# d$ejd                        Z7 G d% d&ejd                        Z8 G d' d(ejd                        Z9 G d) d*ejd                        Z: G d+ d,ejd                        Z; G d- d.ejd                        Z< G d/ d0ejd                        Z= G d1 d2ejd                        Z> G d3 d4ejd                        Z? G d5 d6ejd                        Z@ G d7 d8ejd                        ZA G d9 d:ejd                        ZBd;eAiZC G d< d=ejd                        ZD G d> d?ejd                        ZE G d@ dAejd                        ZF G dB dCejd                        ZG G dD dEejd                        ZH G dF dGejd                        ZIe G dH dIe             ZJ G dJ dKeJ      ZK edLM       G dN dOeJ             ZLe G dP dQeJ             ZMe G dR dSeJ             ZNe G dT dUeJ             ZOg dVZPy)XzPyTorch CLAP model.    N)	dataclass)AnyListOptionalTupleUnion)nn   )ACT2FN))BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling,BaseModelOutputWithPoolingAndCrossAttentions)PreTrainedModel)apply_chunking_to_forward find_pruneable_heads_and_indicesmeshgridprune_linear_layer)ModelOutputauto_docstringlogging	torch_int   )ClapAudioConfig
ClapConfigClapTextConfigc                     | j                   \  }}}| dddddddf   j                  dd|d      }|j                  |||z  |      }|S )ae  
    Interpolate data in time domain. This is used to compensate the resolution reduction in downsampling of a CNN.

    Args:
        hidden_states (`torch.FloatTensor` of shape (batch_size, time_length, classes_num)):
            Input hidden states
        ratio (`int`):
            The ratio of the length of the output to the length of the input.
    Nr   )shaperepeatreshape)hidden_statesratio
batch_sizetime_lengthclasses_num	upsampleds         x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/clap/modeling_clap.pyinterpolater'   *   sX     .;-@-@*ZkaD!m,33Aq%CI!!*kE.A;OI    c                     | j                   \  }}}}| j                  |||z  |||z  ||      } | j                  dddddd      j                         j                  d|||      }|S )aR  
    Returns the resized hidden states. The output shape should be `(batch_size * num_windows, window_size, window_size,
    num_channels)`

    Args:
        hidden_states (`torch.FloatTensor` of shape `(batch_size, height, width, num_channels)`):
            Input hidden states
        window_size (`int`):
            Window size
    r   r   r
            r   viewpermute
contiguous)r    window_sizer"   heightwidthnum_channelswindowss          r&   window_partitionr7   ;   s}     /<.A.A+J|!&&Fk);8Lk[gM ##Aq!Q15@@BGGKYdfrsGNr(   c                     | j                   d   }| j                  d||z  ||z  |||      } | j                  dddddd      j                         j                  d|||      } | S )a  
    Merges windows to produce higher resolution features.
    Args:
        windows (`torch.FloatTensor` of shape `(num_windows * batch_size, window_size, window_size, num_channels)`):
            Input windows
        window_size (`int`):
            Window size
        height (`int`):
            Height of the resized audio
        width (`int`):
            Width of the resized audio
    r-   r   r   r
   r*   r+   r,   r.   )r6   r2   r3   r4   r5   s        r&   window_reverser9   P   sn     ==$Lll2v4e{6JKYdfrsGooaAq!Q/::<AA"feUabGNr(   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
    are ignored. This is modified from fairseq's `utils.make_positions`.

    Args:
        x: torch.Tensor x:

    Returns: torch.Tensor
    r   dim)neinttorchcumsumtype_aslong)	input_idspadding_idxpast_key_values_lengthmaskincremental_indicess        r&   "create_position_ids_from_input_idsrH   d   sW     <<$((*D <<!4<<TBE[[_cc##%33r(   logitsreturnc                     t        j                  t        |       | j                        }t        j
                  j                  | |      S )Ndevice)r?   arangelenrM   r	   
functionalcross_entropy)rI   labelss     r&   contrastive_lossrS   v   s1    \\#f+fmm<F==&&vv66r(   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)ClapTextModelOutputa  
    Base class for text model's outputs that also contains a pooling of the last hidden states.

    Args:
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The text embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Ntext_embedslast_hidden_state.r    
attentions)__name__
__module____qualname____doc__rV   r   r?   FloatTensor__annotations__rW   r    r   rX    r(   r&   rU   rU   {   sr    * 04K%++,359x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r(   rU   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)ClapAudioModelOutputak  
    ClapAudio model output to mimic the output of the original implementation.

    Args:
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, hidden_size)`):
            The Audio embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
    Naudio_embedsrW   .r    rX   )rY   rZ   r[   r\   rb   r   r?   r]   r^   rW   r    r   rX   r_   r(   r&   ra   ra      sr    * 15L(5,,-459x 1 129=AM8E%"3"3S"89:A:>Ju00#567>r(   ra   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)
ClapOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for audio-text similarity.
        logits_per_audio (`torch.FloatTensor` of shape `(audio_batch_size, text_batch_size)`):
            The scaled dot product scores between `audio_embeds` and `text_embeds`. This represents the audio-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, audio_batch_size)`):
            The scaled dot product scores between `text_embeds` and `audio_embeds`. This represents the text-audio
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`ClapTextModel`].
        audio_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The audio embeddings obtained by applying the projection layer to the pooled output of [`ClapAudioModel`].
        text_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapTextModel`].
        audio_model_output (`BaseModelOutputWithPooling`):
            The output of the [`ClapAudioModel`].
    Nlosslogits_per_audiologits_per_textrV   rb   text_model_outputaudio_model_outputrJ   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))rh   ri   N)getattrto_tuple).0kselfs     r&   	<genexpr>z&ClapOutput.to_tuple.<locals>.<genexpr>   s=      
  KKDGQXY]_`QaQjQjQll
s   -0)tuplekeysrp   s   `r&   rm   zClapOutput.to_tuple   s#     
YY[
 
 	
r(   )rY   rZ   r[   r\   re   r   r?   r]   r^   rf   rg   rV   rb   rh   r   ri   r   r   rm   r_   r(   r&   rd   rd      s    ( )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448185929
%* 
r(   rd   c                   *     e Zd ZdZd fd	Zd Z xZS )ClapDropPathz
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). This is a slightly
    refactored version of the `SwinDropPath` implementation.
    c                 0    t         |           || _        y N)super__init__	drop_prob)rp   r{   	__class__s     r&   rz   zClapDropPath.__init__   s    "r(   c                 J   | j                   dk(  s| j                  s|S d| j                   z
  }|j                  d   fd|j                  dz
  z  z   }|t	        j
                  ||j                  |j                        z   }|j                          |j                  |      |z  }|S )N        r   r   )r   dtyperM   )
r{   trainingr   ndimr?   randr   rM   floor_div)rp   r    	keep_probr   random_tensoroutputs         r&   forwardzClapDropPath.forward   s    >>S   &	$$Q')DM4F4F4J,KK!EJJuM<O<OXeXlXl$mm""9-=r(   rx   )rY   rZ   r[   r\   rz   r   __classcell__r|   s   @r&   rv   rv      s    
#r(   rv   c                   .     e Zd ZdZdef fdZd Z xZS )ClapAudioAFFBlockz
    ATTENTIONAL FEATURE FUSION Block from CLAP, since in CLAP we are always in 2D mode, it is not needed to implement
    the 1D version.
    configc                    t         |           |j                  }|j                  }t	        ||z        }t        j                  t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _
        t        j                  t        j                  d      t        j                  ||ddd      t        j                  |      t        j                  d      t        j                  ||ddd      t        j                  |            | _        t        j                         | _        y )Nr   r   kernel_sizestridepaddingT)inplace)ry   rz   patch_embeds_hidden_sizeaff_block_rr>   r	   
SequentialConv2dBatchNorm2dReLU	local_attAdaptiveAvgPool2d
global_attSigmoidsigmoid)rp   r   channelsdownsize_ratiointer_channelsr|   s        r&   rz   zClapAudioAFFBlock.__init__   s   22++X78IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 --  #IIhAaQRSNN>*GGD!IInhAaQRSNN8$
 zz|r(   c                     ||z   }| j                  |      | j                  |      z   }| j                  |      }d|z  |z  d|z  d|z
  z  z   }|S )Nr*   r   )r   r   r   )rp   r    residualattention_inputfused_layer_outputr   s         r&   r   zClapAudioAFFBlock.forward  sb    '(2!^^O<t?__!\\*<=]"%77!h,!N`J`:aar(   rY   rZ   r[   r\   r   rz   r   r   r   s   @r&   r   r      s    
$ $0r(   r   c                   0     e Zd ZdZdef fdZddZ xZS )ClapAudioPatchEmbedz
    This module converts the hidden states reshaped as an image to patch embeddings ready to be passed to the
    Transformer block.
    r   c                    t         |           t        |j                  t              r|j                  |j                  fn|j                  }t        |j
                  t              r|j
                  |j
                  fn|j
                  }t        |j                  t              r|j                  |j                  fn|j                  }|| _        || _        |d   |d   z  |d   |d   z  f| _        | j                  d   | j                  d   z  | _	        |j                  | _        |j                  | _        |d   |d   z
  dz  |d   |d   z
  dz  f}| j                  r|j                  dk(  rdnd}t        j                  |j                   |z  |j"                  |||      | _        |j&                  rt        j(                  |j"                        nt        j*                         | _        | j                  rZt/        |      | _        t        j                  |j                   |j"                  |d   |d   dz  f|d   |d   dz  f|      | _        y y )Nr   r   r*   channel_mapr+   r   r
   )ry   rz   
isinstance	spec_sizer>   
patch_sizepatch_strideimg_size	grid_sizenum_patchesflatten_patch_embedsflattenenable_fusionfusion_typer	   r   patch_embed_input_channelsr   projenable_patch_layer_norm	LayerNormIdentitynormr   fusion_model
mel_conv2d)rp   r   r   r   r   r   scale_factorr|   s          r&   rz   zClapAudioPatchEmbed.__init__$  s+   ;EfFVFVX[;\F$$f&6&67bhbrbr6@ARARTW6XV 1 12^d^o^o 	 ;EVEXEXZ]:^V  &"5"56djdwdw 	 !("1+a8(1+VW:XY>>!,t~~a/@@22#11qMLO39JqMLYZO<[`a;ab!//f6H6HM6Yq`aII--<++"
	 FLEcEcBLL!@!@Aikititiv	 1& 9D ii11//']JqMA,=>$Qa1)<=DO r(   c                    | j                   r|d d ddd d d d f   }|j                  \  }}}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }|j                  d      }t        |      dkD  r||dd d d d d f   j                         }	|	j                  \  }}}}|	j                  ||z  d||      }	| j                  |	      }	|	j                  \  }
}}}|	j                  |||||      }	|	j                  d      j                         j                  d	      }	|	j                  d      }t        j                  j                  j                  |	d||z
  fd
d      }	| j!                  ||   |	      ||<   |}nx|j                  \  }
}
}}|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j	                  |      }| j                  r!|j                  d      j#                  dd      }| j%                  |      }|S )Nr   r   zInput audio size (*z) doesn't match model (z).r-   )r   r*   r
   r   r+   r
   constantr*   )r   r   r   
ValueErrorr   sizerO   r1   r/   r   r0   r   r?   r	   rP   padr   	transposer   )rp   r    is_longer_idxglobal_hidden_statesr"   r5   r3   r4   output_widthlocal_hidden_states_featureslocal_widths                r&   r   zClapAudioPatchEmbed.forwardN  s   #0AaCA#>  7K6P6P3Jfeq))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  $(99-A#B /44R8L=!A%&3M12q!4K&L&W&W&Y#:M:S:S7
L&%&9&>&>zL?XZ[]cej&k#&*oo6I&J#-@-F-F*8VU&9&>&>z<Yacikp&q#&9&A&A/&R&]&]&_&g&ghi&j#166r:&+hh&9&9&=&='!\K-G)H*VW'# 7;6G6G(79L7$]3 1M"/"5"5Aq&%q))UdmmA6F-F (%8OPTP]P]^_P`Oaabcgcpcpqrcsbttvw  !IIm4M<<)11!4>>q!DM		-0r(   rx   r   r   s   @r&   r   r     s    
( (T/r(   r   c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
ClapAudioSelfAttentionc                    t         |           ||z  dk7  rt        d| d| d      || _        t	        ||z        | _        | j                  | j
                  z  | _        t        |t        j                  j                        r|n||f| _        t        j                  t        j                  d| j                  d   z  dz
  d| j                  d   z  dz
  z  |            | _        t        j"                  | j                  d         }t        j"                  | j                  d         }t        j$                  t'        ||gd            }t        j(                  |d      }|d d d d d f   |d d d d d f   z
  }	|	j+                  ddd      j-                         }	|	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   | j                  d   dz
  z  cc<   |	d d d d dfxx   d| j                  d   z  dz
  z  cc<   |	j/                  d	      }
| j1                  d
|
       t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j2                  | j                  | j                  |j4                        | _        t        j<                  |j>                        | _         y )Nr   The hidden size (6) is not a multiple of the number of attention heads ()r*   r   ij)indexingr-   relative_position_indexbias)!ry   rz   r   num_attention_headsr>   attention_head_sizeall_head_sizer   collectionsabcIterabler2   r	   	Parameterr?   zerosrelative_position_bias_tablerN   stackr   r   r0   r1   sumregister_bufferLinearqkv_biasquerykeyvalueDropoutattention_probs_dropout_probdropout)rp   r   r<   	num_headsr2   coords_hcoords_wcoordscoords_flattenrelative_coordsr   r|   s              r&   rz   zClapAudioSelfAttention.__init__  s   ?a#C5(^_h^iijk  $- #&sY#7 !558P8PP%k;??3K3KLKS^`kRl 	 -/LLKKT--a0014T=M=Ma=P9PST9TUW`a-
)
 << 0 0 34<< 0 0 34Xx&:TJKvq1(At4~aqj7QQ)11!Q:EEG1a D$4$4Q$7!$;; 1a D$4$4Q$7!$;; 1a A(8(8(;$;a$?? "1"5"5b"968OPYYt1143E3EFOO\
99T//1C1C&//ZYYt1143E3EFOO\
zz&"E"EFr(   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S Nr-   r   r*   r   r
   r   r   r   r/   r0   rp   xnew_x_shapes      r&   transpose_for_scoresz+ClapAudioSelfAttention.transpose_for_scores  L    ffhsmt'?'?AYAY&ZZFF;yyAq!$$r(   r    attention_mask	head_maskoutput_attentionsrJ   c                    |j                   \  }}}| j                  |      }| j                  | j                  |            }	| j                  | j	                  |            }
| j                  |      }t        j                  ||	j                  dd            }|t        j                  | j                        z  }| j                  | j                  j                  d         }|j                  | j                  d   | j                  d   z  | j                  d   | j                  d   z  d      }|j                  ddd      j!                         }||j#                  d      z   }|r|j                   d   }|j                  ||z  || j$                  ||      }||j#                  d      j#                  d      z   }|j                  d| j$                  ||      }t&        j(                  j+                  |d      }| j-                  |      }|||z  }t        j                  ||
      }|j                  dddd      j!                         }|j/                         d d | j0                  fz   }|j                  |      }|r||f}|S |f}|S )Nr-   r   r   r*   r;   r
   )r   r   r   r   r   r?   matmulr   mathsqrtr   r   r   r/   r2   r0   r1   	unsqueezer   r	   rP   softmaxr   r   r   )rp   r    r   r   r   r"   r<   r5   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresrelative_position_bias
mask_shapeattention_probscontext_layernew_context_layer_shapeoutputss                      r&   r   zClapAudioSelfAttention.forward  s    )6(;(;%
C JJ}5--dhh}.EF	//

=0IJ//0AB !<<Y5H5HR5PQ+dii8P8P.QQ!%!B!B4C_C_CdCdegCh!i!7!<!<Q$"2"21"55t7G7G7JTM]M]^_M`7`bd"
 "8!?!?1a!H!S!S!U+.D.N.Nq.QQ%'--a0J/44j(*d6N6NPSUX   0.2J2J12M2W2WXY2ZZ/44R9Q9QSVX[\ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2 O\M]r(   NNF)rY   rZ   r[   rz   r   r?   Tensorr   r]   boolr   r   r   r   s   @r&   r   r     sv    #GJ% 7;15,16||6 !!2!236 E--.	6
 $D>6 
u||	6r(   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapAudioSelfOutputc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rx   )ry   rz   r	   r   denser   r   r   rp   r   r<   r|   s      r&   rz   zClapAudioSelfOutput.__init__  s6    YYsC(
zz&"E"EFr(   r    input_tensorrJ   c                 J    | j                  |      }| j                  |      }|S rx   r  r   rp   r    r  s      r&   r   zClapAudioSelfOutput.forward  s$    

=1]3r(   rY   rZ   r[   rz   r?   r  r   r   r   s   @r&   r  r    s2    G
U\\  RWR^R^ r(   r  c                        e Zd Z fdZd Z	 	 	 d	dej                  deej                     deej                     dee	   de
ej                     f
dZ xZS )
ClapAudioAttentionc                     t         |           t        ||||      | _        t	        ||      | _        t               | _        y rx   )ry   rz   r   rp   r  r   setpruned_heads)rp   r   r<   r   r2   r|   s        r&   rz   zClapAudioAttention.__init__  s8    *63	;O	)&#6Er(   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y Nr   r   r;   rO   r   rp   r   r   r!  r   r   r   r   r   r  r   unionrp   headsindexs      r&   prune_headszClapAudioAttention.prune_heads     u:?749900$))2O2OQUQbQb
u
 -TYY__eD		*499==%@		,TYY__eD		.t{{/@/@%QO )-		(E(EE
(R		%"&))"?"?$))B_B_"_		 --33E:r(   r    r   r   r   rJ   c                 j    | j                  ||||      }| j                  |d   |      }|f|dd  z   }|S Nr   r   rp   r   )rp   r    r   r   r   self_outputsattention_outputr  s           r&   r   zClapAudioAttention.forward  sG     yy	K\];;|AF#%QR(88r(   r  )rY   rZ   r[   rz   r)  r?   r  r   r]   r  r   r   r   r   s   @r&   r  r    st    ";* 7;15,1
||
 !!2!23
 E--.	

 $D>
 
u||	
r(   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioIntermediatec                    t         |           t        j                  |t	        |j
                  |z              | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rx   )ry   rz   r	   r   r>   	mlp_ratior  r   
hidden_actstrr   intermediate_act_fnr  s      r&   rz   zClapAudioIntermediate.__init__  sa    YYsC(8(83(>$?@
f''-'-f.?.?'@D$'-'8'8D$r(   r    rJ   c                 J    | j                  |      }| j                  |      }|S rx   r  r6  rp   r    s     r&   r   zClapAudioIntermediate.forward$  &    

=100?r(   r  r   s   @r&   r1  r1    #    9U\\ ell r(   r1  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapAudioOutputc                     t         |           t        j                  t	        |j
                  |z        |      | _        t        j                  |j                        | _	        y rx   )
ry   rz   r	   r   r>   r3  r  r   hidden_dropout_probr   r  s      r&   rz   zClapAudioOutput.__init__,  sF    YYs6#3#3c#9:C@
zz&"<"<=r(   r    rJ   c                 J    | j                  |      }| j                  |      }|S rx   r  r9  s     r&   r   zClapAudioOutput.forward1  s$    

=1]3r(   r  r   s   @r&   r=  r=  +  s#    >
U\\ ell r(   r=  c                        e Zd Zd fd	Zd Zd Zd Z	 	 	 ddej                  de	e
e
f   deej                     dee   d	ee   d
e	ej                  ej                  f   fdZ xZS )ClapAudioLayerc                    t         |           |j                  | _        || _        |j                  | _        || _        t        j                  ||j                        | _	        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t!        ||      | _        t%        ||      | _        y )Neps)r2   r~   )ry   rz   chunk_size_feed_forward
shift_sizer2   input_resolutionr	   r   layer_norm_epslayernorm_beforer  	attentionrv   r   	drop_pathlayernorm_afterr1  intermediater=  r   )rp   r   r<   rH  r   drop_path_raterG  r|   s          r&   rz   zClapAudioLayer.__init__9  s    '-'E'E$$!-- 0 "Sf6K6K L+FCPTP`P`a9G#9Mn5SUS^S^S`!||CV5J5JK1&#>%fc2r(   c                    t        |      | j                  k  rgt        d      | _        t        j
                  j                         r(t	        j                   t	        j                  |            n
t        |      | _        y y Nr   )minr2   r   rG  r?   jit
is_tracingtensor)rp   rH  s     r&   set_shift_and_window_sizez(ClapAudioLayer.set_shift_and_window_sizeF  s\     D$4$44'lDO=BYY=Q=Q=S		%,,'789Y\]mYn  5r(   c           	         | j                   dkD  rzt        j                  d||df||      }t        d| j                         t        | j                   | j                          t        | j                    d       f}t        d| j                         t        | j                   | j                          t        | j                    d       f}d}|D ]  }	|D ]  }
||d d |	|
d d f<   |dz  }  t        || j                        }|j                  d| j                  | j                  z        }|j                  d      |j                  d      z
  }|j                  |dk7  t        d            j                  |dk(  t        d            }|S d }|S )Nr   r   r   r-   r*   g      Yr~   )
rG  r?   r   slicer2   r7   r/   r  masked_fillfloat)rp   r3   r4   r   rM   img_maskheight_sliceswidth_slicescountheight_slicewidth_slicemask_windows	attn_masks                r&   get_attn_maskzClapAudioLayer.get_attn_maskN  s   ??Q{{Avua#8fUHa$***+t'''$//)9:t&-M a$***+t'''$//)9:t&-L
 E - #/ K@EHQk1<=QJE
 ,Hd6F6FGL',,R1A1ADDTDT1TUL$..q1L4J4J14MMI!--i1neFmLXXYbfgYginoristI  Ir(   c                     | j                   || j                   z  z
  | j                   z  }| j                   || j                   z  z
  | j                   z  }ddd|d|f}t        j                  j                  ||      }||fS rQ  )r2   r	   rP   r   )rp   r    r3   r4   	pad_right
pad_bottom
pad_valuess          r&   	maybe_padzClapAudioLayer.maybe_padj  s    %%0@0@(@@DDTDTT	&&$2B2B)BBdFVFVV
Ay!Z8
))-Dj((r(   r    input_dimensionsr   r   always_partitionrJ   c                    |s| j                  |       n	 |\  }}|j                         \  }}	}
|}| j                  |      }|j                  ||||
      }| j	                  |||      \  }}|j
                  \  }	}}}	| j                  dkD  r1t        j                  || j                   | j                   fd      }n|}t        || j                        }|j                  d| j                  | j                  z  |
      }| j                  |||j                  |j                        }| j                  ||||      }|d   }|j                  d| j                  | j                  |
      }t        || j                  ||      }| j                  dkD  r/t        j                  || j                  | j                  fd      }n|}|d   dkD  xs |d   dkD  }|r|d d d |d |d d f   j!                         }|j                  |||z  |
      }|| j#                  |      z   }| j%                  |      }| j'                  |      }|| j)                  |      z   }|r	||d	   f}|S |f}|S )
Nr   )r   r*   )shiftsdimsr-   r   )r   r
   r,   r   )rV  r   rJ  r/   rh  r   rG  r?   rollr7   r2   rc  r   rM   rK  r9   r1   rL  rM  rN  r   )rp   r    ri  r   r   rj  r3   r4   r"   r   r   shortcutrg  
height_pad	width_padshifted_hidden_stateshidden_states_windowsrb  attention_outputsr/  attention_windowsshifted_windows
was_paddedlayer_outputlayer_outputss                            r&   r   zClapAudioLayer.forwardq  s     **+;<("/"4"4"6
Ax --m<%**:vuhO %)NN=&%$P!z&3&9&9#:y!??Q$)JJ}tFVY]YhYhXhEipv$w!$1! !11FHXHX Y 5 : :2t?O?ORVRbRb?bdl m&&	)<)<EZEaEa ' 
	 !NN!9iK\ + 
 -Q/,11"d6F6FHXHXZbc():D<L<LjZcd ??Q %

?DOOUYUdUdCelr s /]Q&;*Q-!*;
 1!WfWfufa2G H S S U-22:v~xX 4>>2C#DD++M:((6$t{{<'@@@Q'8';< YeWfr(   )r~   r   NFF)rY   rZ   r[   rz   rV  rc  rh  r?   r  r   r>   r   r]   r  r   r   r   s   @r&   rB  rB  8  s    38) 26,1+0A||A  S/A E--.	A
 $D>A #4.A 
u||U\\)	*Ar(   rB  c                        e Zd Z fdZ	 	 	 d	dej
                  deeef   deej                     dee
   dee
   deej
                     fdZ xZS )
ClapAudioStagec                 h   t         	|           || _        || _        t	        j
                  t        |      D cg c]-  }t        ||||||   |dz  dk(  rdn|j                  dz        / c}      | _	        |& |||t        j                        | _        d| _        y d | _        d| _        y c c}w )Nr*   r   )r   r<   rH  r   rO  rG  )r<   
norm_layerF)ry   rz   r   r<   r	   
ModuleListrangerB  r2   blocksr   
downsamplepointing)
rp   r   r<   rH  depthr   rL  r  ir|   s
            r&   rz   zClapAudioStage.__init__  s    mm u
  !%5'#,Q<%&UaZqf6H6HA6M

 !()9sr||\DO  #DO'
s   2B/r    ri  r   r   rj  rJ   c                    |\  }}t        | j                        D ]  \  }}	|||   nd }
 |	|||
||      }|d   }! |}| j                  )|dz   dz  |dz   dz  }}||||f}| j                  ||      }n||||f}|||f}|r|dd  z  }|S )Nr   r   r*   )	enumerater  r  )rp   r    ri  r   r   rj  r3   r4   r  layer_modulelayer_head_maskry  !hidden_states_before_downsamplingheight_downsampledwidth_downsampledoutput_dimensionsstage_outputss                    r&   r   zClapAudioStage.forward  s     )(5 	-OA|.7.CilO(/BSUeM *!,M	- -:)??&5;aZA4EPQ	VWGW 1!'0BDU V OO,MO_`M!' >&(IK\]]12..Mr(   rz  )rY   rZ   r[   rz   r?   r  r   r>   r   r]   r  r   r   r   s   @r&   r|  r|    sz    < 26,1+0||  S/ E--.	
 $D> #4. 
u||	r(   r|  c            	            e Zd ZdZej
                  fdee   dedej                  ddf fdZ	d Z
d	ej                  d
eeef   dej                  fdZ xZS )ClapAudioPatchMerginga'  
    Patch Merging Layer.

    Args:
        input_resolution (`Tuple[int]`):
            Resolution of input feature.
        dim (`int`):
            Number of input channels.
        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
            Normalization layer class.
    rH  r<   r~  rJ   Nc                     t         |           || _        || _        t	        j
                  d|z  d|z  d      | _         |d|z        | _        y )Nr+   r*   Fr   )ry   rz   rH  r<   r	   r   	reductionr   )rp   rH  r<   r~  r|   s       r&   rz   zClapAudioPatchMerging.__init__   sI     01s7AG%@q3w'	r(   c                     |dz  dk(  xs |dz  dk(  }|r.ddd|dz  d|dz  f}t         j                  j                  ||      }|S )Nr*   r   r   )r	   rP   r   )rp   input_featurer3   r4   
should_padrg  s         r&   rh  zClapAudioPatchMerging.maybe_pad  sU    qjAo:519>
Q519a!<JMM--mZHMr(   r  ri  c                    |\  }}|j                   \  }}}|j                  ||||      }| j                  |||      }|d d dd ddd dd d f   }|d d dd ddd dd d f   }	|d d dd ddd dd d f   }
|d d dd ddd dd d f   }t        j                  ||	|
|gd      }|j                  |dd|z        }| j                  |      }| j                  |      }|S )Nr   r*   r   r-   r+   )r   r/   rh  r?   catr   r  )rp   r  ri  r3   r4   r"   r<   r5   input_feature_0input_feature_1input_feature_2input_feature_3s               r&   r   zClapAudioPatchMerging.forward  s   ((5(;(;%
C%**:vulS}feD'14a4Aq(89'14a4Aq(89'14a4Aq(89'14a4Aq(89		?O_Ve"fhjk%**:r1|;KL		-0}5r(   )rY   rZ   r[   r\   r	   r   r   r>   Modulerz   rh  r?   r  r   r   r   s   @r&   r  r    sr    
 XZWcWc (s (# (299 (hl (U\\ U3PS8_ Y^YeYe r(   r  c                        e Zd Z fdZd Z	 	 	 	 	 	 	 ddeej                     deej                     dee   dee   dee   dee   d	ee   d
e	e
ef   fdZ xZS )ClapAudioEncoderc                    t         |           t        |j                        | _        || _        t        |      | _        |j                  | _        | j                  j                  | _	        |j                  | _
        |j                  |j                  z  | _        t        |j                  d| j                  dz
  z  z        | _        t!        j"                  d|j$                  t'        |j                        d      D cg c]  }|j)                          }}| j                  j*                  }t-        | j                        D cg c]  }|d   d|z  z  |d   d|z  z  f c}| _        t1        j2                  t-        | j                        D cg c]  }t5        |t        |j                  d|z  z        | j.                  |   |j                  |   |j6                  |   |t'        |j                  d |       t'        |j                  d |dz           || j                  dz
  k  rt8        nd        c}      | _        d| _        t1        j>                  |j                        | _         t1        jB                  | j                        | _"        |j                  | _        t1        jF                  d      | _$        y c c}w c c}w c c}w )Nr*   r   r   cpurL   )r   r<   rH  r  r   rL  r  F)%ry   rz   rO   depths
num_layersr   r   patch_embedr   r   r   num_mel_bins
freq_ratior>   r   num_featuresr?   linspacerO  r   itemr   r  input_resolutionsr	   r  r|  r   r  layersgradient_checkpointingr   
batch_normr   r   AdaptiveAvgPool1davgpool)rp   r   r   rO  r   r  i_layerr|   s          r&   rz   zClapAudioEncoder.__init__*  sW   fmm,.v6#11 ,,99)) **f.A.AA ? ?!Z[H[B\ \],1NN1f>S>SUXY_YfYfUgpu,vwq!&&(ww$$..	\abfbqbq\r!sWX9Q<AqD#99Q<AqD;Q"R!smm  %T__5  !F;;ajHI%)%;%;G%D --0$88A,Sx1H-ICPVP]P]^k`gjk`kPlLmn9@4??UVCV9V4]a
 ',#..)<)<=LL!2!23	mm++A.3 x "ts   J<KB#Kc                    |j                   \  }}}}t        | j                  | j                  z        }| j                  | j                  z  }||kD  s||kD  rt	        d      ||k  r%t
        j                  j                  |||fdd      }||k  r%t
        j                  j                  |||fdd      }|j                   \  }}}	}
|j                  ||| j                  z  |	| j                  z  |
      }|j                  dddd      j                         }|j                  |||
| j                  z  |	| j                  z        }|S )	z
        The input is 4 normalized log mel spectrograms. It is reshape to the common shape of images. Each channel
        should represent 1 of the 4 crops of the spectrogram. For more details, refer to the [`ClapFeatureExtractor`].
        z@the wav size should be less than or equal to the swin input sizebicubicT)modealign_cornersr   r   r
   r*   )r   r>   r   r  r   r	   rP   r'   r   r0   r1   )rp   normalized_input_featuresr   r#   freq_length
spec_widthspec_heightbatchr   timefreqs              r&   reshape_mel2imgz ClapAudioEncoder.reshape_mel2imgR  s`   
 *C)H)H&1k;$//9:
nn7#{['@_`` #(*(A(A)J+D9dh )B )% $(*(A(A)K+EIei )B )% '@&E&E#xt %>$E$E8doo-tt/F%
! %>$E$EaAq$Q$\$\$^!$=$E$E8TDOO3TT__5L%
! )(r(   	is_longerr   r   output_hidden_states(output_hidden_states_before_downsamplingrj  return_dictrJ   c	                    |j                  dd      }| j                  |      }	|	j                  dd      }	d }
| j                  r6|j                  |j                        }t        j                  |dk(        d   }
| j                  |	      }|j                  d   }| j                  ||
      }|rdnd }|rdnd }|rdnd }| j                  d   }|rE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }t        | j                        D ]"  \  }}|||   nd }| j                  |   }| j                  r,| j                   r | j#                  |j$                  ||||      }n ||||||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                  \  }}} |j                  |g|d   |d   f| }|j                  dddd      }||fz  }||fz  }nI|rG|sE|j                  \  }}} |j                  |g|| }|j                  dddd      }||fz  }||fz  }|s||dd  z  }% | j'                  |      }|j                  \  }}}|dt)        | j*                        dz
  z  z  | j,                  d   z  }|dt)        | j*                        dz
  z  z  | j,                  d   z  }|j                  ddd      j/                         j1                  ||||      }|j                  \  }}} }!| | j2                  z  }"|j1                  ||| |"z  |"|!      }|j                  ddddd      j/                         j1                  |||"d      }| j5                  t        j6                  |d            }#t        j6                  |#d      }#|st9        d	 ||#||fD              S t;        ||#||
      S )Nr   r
   r   r*   r_   r   r-   r+   c              3   $   K   | ]  }|| 
 y wrx   r_   rn   vs     r&   rq   z+ClapAudioEncoder.forward.<locals>.<genexpr>  s      	 = 	   )rW   pooler_outputr    rX   )r   r  r   torM   r?   wherer  r   r  r  r/   r0   r  r  r  r   _gradient_checkpointing_func__call__r   rO   r  r   r1   r   r  r  r   rr   r   )$rp   input_featuresr  r   r   r  r  rj  r  r  is_longer_list_idxis_longer_listr    
frames_numall_hidden_statesall_reshaped_hidden_statesall_self_attentionsri  r"   r   hidden_sizereshaped_hidden_stater  r  r  ry  r  r  rW   
n_channels
freq_shapetemporal_shapen_frequenciesn_temp
c_freq_binlatent_outputs$                                       r&   r   zClapAudioEncoder.forwardv  s    (11!Q7$(OON$C!$=$G$G1$M!!&\\.*?*?@N!&^q-@!A!!D,,-FG"((+
((8JK"6BD+?RT"$5b411!4)6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5 (	9OA|.7.CilO#55a8**t}} $ A A ))=:JO]n! !-!#3_FWYi! *!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF* #}QR'88#Q(	9T !IIm4$5$;$;!
AzA#dkk*:Q*>$?@DDUDUVWDXX
#c$++.>.B(CDHYHYZ[H\\ %%aA.99;CCJPZ\fhvw 	 9J8O8O5
Jv"doo5
-55
MZ$?V
 %%aAq!4??AII*V`blnpq 	 U]]3Da%HImQ7 	 &!.'		 	 	 */'4*	
 	
r(   )NNFFFFT)rY   rZ   r[   rz   r  r   r?   r]   r  r   r   ra   r   r   r   s   @r&   r  r  )  s    &/P")N 2615,1/4CH+0&*z
 E--.z
 E--.	z

 $D>z
 'tnz
 3;4.z
 #4.z
 d^z
 
u**	+z
r(   r  c                   4     e Zd Zdeeef   f fdZd Z xZS )ClapProjectionLayerr   c                     t         |           || _        |j                  }|j                  }t        j                  ||      | _        t        |j                     | _
        t        j                  ||      | _        y rx   )ry   rz   r   r  projection_dimr	   r   linear1r   projection_hidden_act
activationlinear2)rp   r   r  r  r|   s       r&   rz   zClapProjectionLayer.__init__  sa    ((..yyn= !=!=>yy@r(   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rx   )r  r  r  r9  s     r&   r   zClapProjectionLayer.forward  s2    ]36]3r(   )	rY   rZ   r[   r   r   r   rz   r   r   r   s   @r&   r  r    s     Au_n%DE Ar(   r  c                   2     e Zd ZdZ fdZ	 ddZd Z xZS )ClapTextEmbeddingszV
    Same as BertEmbeddings with a tiny tweak for positional embeddings indexing.
    c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        t#        |dd      | _        | j'                  dt)        j*                  |j                        j-                  d      d       | j'                  d	t)        j.                  | j0                  j3                         t(        j4                  
      d       |j                  | _        t        j                  |j                  |j
                  | j6                        | _	        y )N)rD   rD  position_embedding_typeabsoluteposition_ids)r   r-   T)
persistenttoken_type_idsr   )ry   rz   r	   	Embedding
vocab_sizer  pad_token_idword_embeddingsmax_position_embeddingsposition_embeddingstype_vocab_sizetoken_type_embeddingsr   rI  r   r?  r   rl   r  r   r?   rN   expandr   r  r   rB   rD   rp   r   r|   s     r&   rz   zClapTextEmbeddings.__init__  si   !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]" f&8&8f>S>STzz&"<"<='.v7PR\']$ELL)G)GHOOPWXei 	 	
 	ekk$*;*;*@*@*B%**Ubf 	 	

 "..#%<<**F,>,>DL\L\$
 r(   c                    |+|t        || j                  |      }n| j                  |      }||j                         }n|j                         d d }|d   }|st	        | d      r-| j
                  d d d |f   }|j                  |d   |      }	|	}n:t        j                  |t        j                  | j                  j                        }|| j                  |      }| j                  |      }
||
z   }| j                  dk(  r| j                  |      }||z  }| j!                  |      }| j#                  |      }|S )Nr-   r   r  r   r   r  )rH   rD   &create_position_ids_from_inputs_embedsr   hasattrr  r  r?   r   rB   r  rM   r  r  r  r  r   r   )rp   rC   r  r  inputs_embedsrE   input_shape
seq_lengthbuffered_token_type_ids buffered_token_type_ids_expandedr  
embeddingsr  s                r&   r   zClapTextEmbeddings.forward%  sR    $A)TM]M]_uv#JJ=Y #..*K',,.s3K ^

 !t-.*.*=*=a*n*M'3J3Q3QR]^_R`bl3m0!A!&[

SWSdSdSkSk!l  00;M $ : :> J"%::
'':5"&":":<"H--J^^J/
\\*-
r(   c                    |j                         dd }|d   }t        j                  | j                  dz   || j                  z   dz   t        j                  |j
                        }|j                  d      j                  |      S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr-   r   r   r   )r   r?   rN   rD   rB   rM   r  r  )rp   r  r  sequence_lengthr  s        r&   r  z9ClapTextEmbeddings.create_position_ids_from_inputs_embedsM  s     $((*3B/%a.||q /D4D4D"Dq"HPUPZPZcpcwcw
 %%a(//<<r(   )NNNNr   )rY   rZ   r[   r\   rz   r   r  r   r   s   @r&   r  r    s    

4 rs&P=r(   r  c                   P    e Zd Zd fd	Zdej
                  dej
                  fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     d	eej                     d
ee	e	ej                           dee
   de	ej
                     fdZ xZS )ClapTextSelfAttentionc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        |xs t#        |dd      | _        | j$                  dk(  s| j$                  d	k(  rF|j&                  | _        t        j(                  d
|j&                  z  dz
  | j                        | _        |j,                  | _        y )Nr   embedding_sizer   r   r   r  r  relative_keyrelative_key_queryr*   r   )ry   rz   r  r   r  r   r>   r   r   r	   r   r   r   r   r   r   r   rl   r  r  r  distance_embedding
is_decoderrp   r   r  r|   s      r&   rz   zClapTextSelfAttention.__init__a  s    : ::a?PVXhHi#F$6$6#7 8 445Q8 
 $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EF'> (
'-zC
$ ''>9T=Y=Y]q=q+1+I+ID(&(ll1v7U7U3UXY3Y[_[s[s&tD# ++r(   r   rJ   c                     |j                         d d | j                  | j                  fz   }|j                  |      }|j	                  dddd      S r   r   r   s      r&   r   z*ClapTextSelfAttention.transpose_for_scores{  r   r(   r    r   r   encoder_hidden_statesencoder_attention_maskpast_key_valuer   c                 $   | j                  |      }|d u}	|	r||d   }
|d   }|}n |	rC| j                  | j                  |            }
| j                  | j                  |            }|}n|y| j                  | j                  |            }
| j                  | j                  |            }t	        j
                  |d   |
gd      }
t	        j
                  |d   |gd      }n@| j                  | j                  |            }
| j                  | j                  |            }| j                  |      }|d u}| j                  r|
|f}t	        j                  ||
j                  dd            }| j                  dk(  s| j                  dk(  r|j                  d   |
j                  d   }}|rDt	        j                  |dz
  t        j                  |j                  	      j                  dd      }n@t	        j                  |t        j                  |j                  	      j                  dd      }t	        j                  |t        j                  |j                  	      j                  dd      }||z
  }| j!                  || j"                  z   dz
        }|j%                  |j&                  
      }| j                  dk(  rt	        j(                  d||      }||z   }nE| j                  dk(  r6t	        j(                  d||      }t	        j(                  d|
|      }||z   |z   }|t+        j,                  | j.                        z  }|||z   }t0        j2                  j5                  |d      }| j7                  |      }|||z  }t	        j                  ||      }|j9                  dddd      j;                         }|j=                         d d | j>                  fz   }|j                  |      }|r||fn|f}| j                  r||fz   }|S )Nr   r   r*   r;   r-   r   r  r  r   r  zbhld,lrd->bhlrzbhrd,lrd->bhlrr
   ) r   r   r   r   r?   r  r  r   r   r  r   rU  rB   rM   r/   rN   r  r  r  r   einsumr  r  r   r	   rP   r  r   r0   r1   r   r   )rp   r    r   r   r  r  r  r   r  is_cross_attentionr  r  r  	use_cacher	  query_length
key_lengthposition_ids_lposition_ids_rdistancepositional_embeddingrelative_position_scoresrelative_position_scores_queryrelative_position_scores_keyr  r  r  r  s                               r&   r   zClapTextSelfAttention.forward  s    !JJ}5
 3$>."<&q)I(+K3N11$((;P2QRI33DJJ?T4UVK3N'11$((=2IJI33DJJ}4MNK		>!#4i"@aHI))^A%6$D!LK11$((=2IJI33DJJ}4MNK//0AB"$.	?? (5N !<<Y5H5HR5PQ''>9T=Y=Y]q=q'2'8'8';Y__Q=O*L!&j1nEJJWdWkWk!l!q!q" "'l%**UbUiUi!j!o!oprtu!v"\\*EJJ}OcOcdiijkmopN%6H#'#:#:8dFbFb;bef;f#g #7#:#:ARAR#:#S ++~=+0<<8H+Wk+l(#36N#N --1EE16>NP[]q1r./4||<LiYm/n,#36T#TWs#s +dii8P8P.QQ%/.@ --//0@b/I ,,7  -	9O_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BC6G=/2mM]?? 11Gr(   rx   NNNNNF)rY   rZ   r[   rz   r?   r  r   r   r]   r   r  r   r   r   s   @r&   r  r  `  s    ,4%ell %u|| % 7;15=A>BDH,1c||c !!2!23c E--.	c
  ((9(9:c !)):): ;c !uU->->'?!@Ac $D>c 
u||	cr(   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y NrD  )ry   rz   r	   r   r  r  r   rI  r   r?  r   r  s     r&   rz   zClapTextSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r(   r    r  rJ   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rx   r  r   r   r  s      r&   r   zClapTextSelfOutput.forward  7    

=1]3}|'CDr(   r  r   s   @r&   r  r    1    >U\\  RWR^R^ r(   r  eagerc                       e Zd Zd fd	Zd Z	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     dee	e	ej                           d	ee
   d
e	ej                     fdZ xZS )ClapTextAttentionc                     t         |           t        |j                     ||      | _        t        |      | _        t               | _        y )Nr  )	ry   rz    CLAP_TEXT_SELF_ATTENTION_CLASSES_attn_implementationrp   r  r   r   r!  r	  s      r&   rz   zClapTextAttention.__init__  sC    4V5P5PQ,C
	 )0Er(   c                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y r#  r$  r&  s      r&   r)  zClapTextAttention.prune_heads  r*  r(   r    r   r   r  r  r  r   rJ   c           	      p    | j                  |||||||      }| j                  |d   |      }	|	f|dd  z   }
|
S r,  r-  )rp   r    r   r   r  r  r  r   r.  r/  r  s              r&   r   zClapTextAttention.forward  sW     yy!"
  ;;|AF#%QR(88r(   rx   r  )rY   rZ   r[   rz   r)  r?   r  r   r]   r   r  r   r   r   s   @r&   r&  r&    s    ";* 7;15=A>BDH,1|| !!2!23 E--.	
  ((9(9: !)):): ; !uU->->'?!@A $D> 
u||	r(   r&  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y rx   )ry   rz   r	   r   r  intermediate_sizer  r   r4  r5  r   r6  r  s     r&   rz   zClapTextIntermediate.__init__0  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r(   r    rJ   c                 J    | j                  |      }| j                  |      }|S rx   r8  r9  s     r&   r   zClapTextIntermediate.forward8  r:  r(   r  r   s   @r&   r.  r.  /  r;  r(   r.  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )ClapTextOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r  )ry   rz   r	   r   r0  r  r  r   rI  r   r?  r   r  s     r&   rz   zClapTextOutput.__init__@  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r(   r    r  rJ   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S rx   r!  r  s      r&   r   zClapTextOutput.forwardF  r"  r(   r  r   s   @r&   r3  r3  ?  r#  r(   r3  c                       e Zd Z fdZ	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	eej
                     fd
Z
d Z xZS )ClapTextLayerc                 f   t         |           |j                  | _        d| _        t	        |      | _        |j                  | _        |j                  | _        | j                  r,| j                  st        |  d      t	        |d      | _	        t        |      | _        t        |      | _        y )Nr   z> should be used as a decoder model if cross attention is addedr  r(  )ry   rz   rF  seq_len_dimr&  rK  r  add_cross_attentionr   crossattentionr.  rN  r3  r   r  s     r&   rz   zClapTextLayer.__init__O  s    '-'E'E$*62 ++#)#=#= ##?? D6)g!hii"3FT^"_D08$V,r(   r    r   r   r  r  r  r   rJ   c           	         ||d d nd }| j                  |||||      }	|	d   }
| j                  r|	dd }|	d   }n|	dd  }d }| j                  rT|Rt        | d      st        d|  d      ||d	d  nd }| j	                  |
||||||      }|d   }
||dd z   }|d   }|z   }t        | j                  | j                  | j                  |
      }|f|z   }| j                  r|fz   }|S )
Nr*   )r   r  r   r   r-   r;  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`r   )	rK  r  r  r   r;  r   feed_forward_chunkrF  r9  )rp   r    r   r   r  r  r  r   self_attn_past_key_valueself_attention_outputsr/  r  present_key_valuecross_attn_present_key_valuecross_attn_past_key_valuecross_attention_outputsrx  s                    r&   r   zClapTextLayer.forward]  s}    :H9S>"1#5Y] !%/3 "0 "
 2!4 ??,Qr2G 6r :,QR0G'+$??4@4!12 =dV DD D  @N?Yrs(;_c%&*&9&9 %&)!'#  7q9 7" ==G ,C2+F( 14P P0##T%A%A4CSCSUe
  /G+ ??!2 44Gr(   c                 L    | j                  |      }| j                  ||      }|S rx   )rN  r   )rp   r/  intermediate_outputrx  s       r&   r=  z ClapTextLayer.feed_forward_chunk  s,    "//0@A{{#68HIr(   r  )rY   rZ   r[   rz   r?   r  r   r]   r   r  r   r=  r   r   s   @r&   r7  r7  N  s    -" 7;15=A>BDH,1?||? !!2!23? E--.	?
  ((9(9:? !)):): ;? !uU->->'?!@A? $D>? 
u||	?Br(   r7  c                   D    e Zd Z fdZ	 	 	 	 	 	 	 	 	 ddej
                  deej                     deej                     deej                     deej                     deeeej                           dee	   d	ee	   d
ee	   dee	   de
eej
                     ef   fdZ xZS )ClapTextEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
ry   rz   r   r	   r  r  num_hidden_layersr7  layerr  )rp   r   r   r|   s      r&   rz   zClapTextEncoder.__init__  sN    ]]5IaIaCb#caM&$9#cd
&+# $ds   A#r    r   r   r  r  past_key_valuesr  r   r  r  rJ   c                    |	rdnd }|rdnd }|r| j                   j                  rdnd }| j                  r%| j                  r|rt        j                  d       d}|rdnd }t        | j                        D ]  \  }}|	r||fz   }|||   nd }|||   nd }| j                  r/| j                  r#| j                  |j                  |||||||      }n ||||||||      }|d   }|r	||d   fz  }|s|||d   fz   }| j                   j                  s||d   fz   } |	r||fz   }|
st        d |||||fD              S t        |||||	      S )
Nr_   zZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fr   r-   r   r*   c              3   $   K   | ]  }|| 
 y wrx   r_   r  s     r&   rq   z*ClapTextEncoder.forward.<locals>.<genexpr>  s      
 = 
r  )rW   rK  r    rX   cross_attentions)r   r:  r  r   loggerwarning_oncer  rJ  r  r  rr   r   )rp   r    r   r   r  r  rK  r  r   r  r  r  r  all_cross_attentionsnext_decoder_cacher  r  r  r  ry  s                       r&   r   zClapTextEncoder.forward  s    #7BD$5b4%64;;;Z;Zr`d&&4==##p "	#,R$(4 #	VOA|#$58H$H!.7.CilO3B3N_Q/TXN**t}} $ A A ))!"#)*"%	! !-!"#)*"%! *!,M"}R'8&::" &9]1=M<O&O#;;22+?=QRCSBU+U(G#	VJ   1]4D D 
 "&%'(
 
 
 9+.+*1
 	
r(   )	NNNNNNFFT)rY   rZ   r[   rz   r?   r  r   r]   r   r  r   r   r   r   r   s   @r&   rG  rG    s   , 7;15=A>BEI$(,1/4&*S
||S
 !!2!23S
 E--.	S

  ((9(9:S
 !)):): ;S
 "%e.?.?(@"ABS
 D>S
 $D>S
 'tnS
 d^S
 
uU\\"$MM	NS
r(   rG  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )ClapTextPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y rx   )ry   rz   r	   r   r  r  Tanhr  r  s     r&   rz   zClapTextPooler.__init__  s9    YYv1163E3EF
'')r(   r    rJ   c                 \    |d d df   }| j                  |      }| j                  |      }|S rQ  )r  r  )rp   r    first_token_tensorpooled_outputs       r&   r   zClapTextPooler.forward	  s6     +1a40

#566r(   r  r   s   @r&   rT  rT    s#    $
U\\ ell r(   rT  c                       e Zd ZeZdZdZd Zy)ClapPreTrainedModelclapFc                    | j                   j                  }t        |t              ri|j                  j
                  j                  j                  d|dz         |j                  j
                  j                  j                  d|dz         yt        |t              r]t        j                  j                  |j                  |dz         t        j                  j                  |j                  |dz         yt        |t        j                        r+|j
                  j                  j                  d|dz         yt        |t        j                        rJ|j                   j                  j#                          |j
                  j                  j%                  d       yt        |t        j&                  t        j(                  f      r| j                   j*                  dz  d| j                   j,                  z  dz  z  |z  }t        j                  j                  |j
                  |       |j                   %|j                   j                  j#                          yyy)	zInitialize the weightsr~   g{Gz?)meanstd)r_  g      ?g      r*   N)r   initializer_factorr   r  r  weightdatanormal_r  	ClapModelr	   initlogit_scale_alogit_scale_tr  r   r   zero_fill_r   r   r  rI  )rp   modulefactorin_proj_stds       r&   _init_weightsz!ClapPreTrainedModel._init_weights  s   //f01&&--22::RV:W((//44<<#6TX=<Y	*GGOOF00ftmODGGOOF00ftmOD-MM&&CVd]&C-KK""$MM$$S)BII 67;;22D8a$++B_B_>_dh=hilrrKGGOOFMM{O;{{&  &&( ' 8r(   N)rY   rZ   r[   r   config_classbase_model_prefixsupports_gradient_checkpointingrm  r_   r(   r&   r[  r[    s    L&+#)r(   r[  c                        e Zd ZeZdZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     deej                     dee   dee   d	ee   deeef   fd
       Z xZS )ClapAudioModelr  r   c                 d    t         |   |       t        |      | _        | j	                          y rx   )ry   rz   r  audio_encoder	post_initr  s     r&   rz   zClapAudioModel.__init__3  s'     -f5r(   rJ   c                 B    | j                   j                  j                  S rx   )rt  r  r   rt   s    r&   get_input_embeddingsz#ClapAudioModel.get_input_embeddings9  s    !!--222r(   r  r   r  r  c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      S )a  
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapAudioModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapAudioModel.from_pretrained("laion/clap-htsat-fused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```r  r  r   r  r  )r   use_return_dictr   r  rt  )rp   r  r  r   r  r  s         r&   r   zClapAudioModel.forward<  sy    D &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 !!)/!5# " 
 	
r(   NNNNN)rY   rZ   r[   r   rn  main_input_namerz   r	   r  rw  r   r   r?   r]   
BoolTensorr  r   r   r   r   r   r   s   @r&   rr  rr  /  s    "L&O 3bii 3  7;04,0/3&*-
 !2!23-
 E,,--
 $D>	-

 'tn-
 d^-
 
u00	1-
 -
r(   rr  a(  
    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
    cross-attention is added between the self-attention layers, following the architecture described in *Attention is
    all you need*_ by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz
    Kaiser and Illia Polosukhin.

    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.

    .. _*Attention is all you need*: https://arxiv.org/abs/1706.03762
    )custom_introc                        e Zd ZeZd fd	Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	ee
j                        de	e   de	e   de	e   de	e   deee
j                     ef   fd       Z xZS )ClapTextModelc                     t         |   |       || _        t        |      | _        t        |      | _        |rt        |      nd| _        | j                          y)zv
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        N)
ry   rz   r   r  r  rG  encoderrT  poolerru  )rp   r   add_pooling_layerr|   s      r&   rz   zClapTextModel.__init__~  sM    
 	 ,V4&v.0AnV,t 	r(   c                 .    | j                   j                  S rx   r  r  rt   s    r&   rw  z"ClapTextModel.get_input_embeddings  s    ...r(   c                 &    || j                   _        y rx   r  rp   r   s     r&   set_input_embeddingsz"ClapTextModel.set_input_embeddings  s    */'r(   rC   r   r  r  r   r  r  r  rK  r  r   r  r  rJ   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                   j                  r|
|
n| j                   j
                  }
nd}
||t        d      |#| j                  ||       |j                         }n!||j                         d d }nt        d      |\  }}||j                  n|j                  }|	|	d   d   j                  d   nd}|t        j                  |||z   f|      }|pt        | j                  d      r4| j                  j                  d d d |f   }|j!                  ||      }|}n&t        j"                  |t        j$                  |	      }| j'                  ||      }| j                   j                  rE|C|j                         \  }}}||f}|t        j                  ||      }| j)                  |      }nd }| j+                  || j                   j,                        }| j                  |||||
      }| j/                  ||||||	|
|||
      }|d   }| j0                  | j1                  |      nd }|s
||f|dd  z   S t3        |||j4                  |j6                  |j8                  |j:                        S )NFzDYou cannot specify both input_ids and inputs_embeds at the same timer-   z5You have to specify either input_ids or inputs_embedsr   r*   rL   r  r   )rC   r  r  r  rE   )	r   r   r  r  rK  r  r   r  r  r   )rW   r  rK  r    rX   rN  )r   r   r  rz  r  r  r   %warn_if_padding_and_no_attention_maskr   rM   r   r?   onesr  r  r  r  r   rB   get_extended_attention_maskinvert_attention_maskget_head_maskrI  r  r  r   rK  r    rX   rN  )rp   rC   r   r  r  r   r  r  r  rK  r  r   r  r  r  r"   r  rM   rE   r  r  extended_attention_maskencoder_batch_sizeencoder_sequence_lengthr   encoder_hidden_shapeencoder_extended_attention_maskembedding_outputencoder_outputssequence_outputrY  s                                  r&   r   zClapTextModel.forward  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B];;!!%.%:	@U@UII ]%>cdd"66y.Q#..*K&',,.s3KTUU!,
J%.%:!!@T@T DSC^!3A!6!<!<Q!?de!"ZZ*jCY6Y)ZdjkN!t(89*.//*H*HKZK*X'3J3Q3QR\^h3i0!A!&[

SY!Z 150P0PQ_al0m ;;!!&;&G=R=W=W=Y: 7$68O#P %-).4HQW)X&.2.H.HI_.`+.2+ &&y$++2O2OP	??%)'#9 + 
 ,,2"7#B+/!5# ' 
 *!,8<8OO4UY#]3oab6III;-'+;;)77&11,==
 	
r(   )T)NNNNNNNNNNNNN)rY   rZ   r[   r   rn  rz   rw  r  r   r   r?   r  r   r]   r  r   r   r   r   r   r   s   @r&   r  r  m  sl    "L /0  -11515/3,0048<9==A$(,0/3&*l
ELL)l
 !.l
 !.	l

 u||,l
 ELL)l
  -l
  (5l
 !) 6l
 "$u'8'8"9:l
 D>l
 $D>l
 'tnl
 d^l
 
uU\\"$PP	Ql
 l
r(   r  c                   z    e Zd ZeZdef fdZe	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   dee
   dee
   d	ej                  fd
       Ze	 	 	 	 	 	 ddeej                     deej                     deej                     dee
   dee
   dee
   d	ej                  fd       Ze	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     deej                     deej                     dee
   dee
   dee
   dee
   d	eeef   fd       Z xZS )rd  r   c                 .   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }t        j                  t        j                  t        j                  |j                                    | _        t        j                  t        j                  t        j                  |j                                    | _        |j$                  | _        t'        |      | _        t+        |      | _        t/        |      | _        t+        |      | _        | j5                          y )NzKconfig.text_config is expected to be of type ClapTextConfig but is of type .zMconfig.audio_config is expected to be of type ClapAudioConfig but is of type )ry   rz   r   text_configr   	TypeErrortypeaudio_configr   r	   r   r?   rU  r  loglogit_scale_init_valuerf  rg  r  r  
text_modelr  text_projectionrr  audio_modelaudio_projectionru  )rp   r   r  r  r|   s       r&   rz   zClapModel.__init__  s=    &,,n=++,-Q0 
 &--?,,-.a1 
 ((**\\%,,txx@]@]7^*_`\\%,,txx@]@]7^*_`$33'42;?),7 3L A 	r(   rC   r   r  r   r  r  rJ   c                 F   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||||      }||d   n|j
                  }| j                  |      }	t        j                  |	d      }	|	S )a  
        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`ClapTextModel`].

        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapModel

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["the sound of a cat", "the sound of a dog"], padding=True, return_tensors="pt")
        >>> text_features = model.get_text_features(**inputs)
        ```rC   r   r  r   r  r  r   r-   r;   )	r   r   r  rz  r  r  r  F	normalize)
rp   rC   r   r  r   r  r  text_outputsrY  text_featuress
             r&   get_text_featureszClapModel.get_text_features(  s    6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B])%/!5# ' 
 ,7+BQHbHb,,];Mr:r(   r  r  c                 @   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||      }|s|d   n|j
                  }| j                  |      }	t        j                  |	d      }	|	S )a  
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Returns:
            audio_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The audio embeddings obtained by
            applying the projection layer to the pooled output of [`ClapAudioModel`].

        Examples:

        ```python
        >>> from transformers import AutoFeatureExtractor, ClapModel
        >>> import torch

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("laion/clap-htsat-unfused")
        >>> random_audio = torch.rand((16_000))
        >>> inputs = feature_extractor(random_audio, return_tensors="pt")
        >>> audio_features = model.get_audio_features(**inputs)
        ```)r  r  r  r   r-   r;   )	r   r   r  rz  r  r  r  r  r  )
rp   r  r  r   r   r  r  audio_outputsrY  audio_featuress
             r&   get_audio_featureszClapModel.get_audio_featuresX  s    D 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B](()# ) 
 1<a(A\A\..}=^<r(   return_lossc
           	         ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }
| j                  ||||||	      }|	s|
d   n|
j                  }| j                  |      }|	s|d   n|j                  }| j                  |      }||j                  ddd      z  }||j                  ddd      z  }| j                  j                         }| j                  j                         }t        j                  ||j                               |z  }t        j                  ||j                               |z  }d}|r,t!        |      }t!        |j                               }||z   d	z  }|	s||||||
f}||f|z   S |S t#        |||||||

      S )a  
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import AutoProcessor, ClapModel

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> model = ClapModel.from_pretrained("laion/clap-htsat-unfused")
        >>> processor = AutoProcessor.from_pretrained("laion/clap-htsat-unfused")

        >>> input_text = ["Sound of a dog", "Sound of vaccum cleaner"]

        >>> inputs = processor(text=input_text, audios=audio_sample, return_tensors="pt", padding=True)

        >>> outputs = model(**inputs)
        >>> logits_per_audio = outputs.logits_per_audio  # this is the audio-text similarity score
        >>> probs = logits_per_audio.softmax(dim=-1)  # we can take the softmax to get the label probabilities
        ```Nry  r  r   r*   r-   T)pr<   keepdimg       @)re   rf   rg   rV   rb   rh   ri   )r   r   r  rz  r  r  r  r  r  r   rg  exprf  r?   r   trS   rd   )rp   rC   r  r  r   r  r  r   r  r  r  r  rb   rV   logit_scale_textlogit_scale_audiorg   rf   re   caption_loss
audio_lossr   s                         r&   r   zClapModel.forward  s   X 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B](()/!5# ) 
 )%/!5# ' 
 0;}Q'@[@[,,\:-8l1ol>X>X**;7 $l&7&7!T&7&RR!K$4$4qb$$4$OO  --113 ..224,,{LNN4DEHXX <<kmmoFIZZ+O<L)*:*<*<*>?J :-4D&lT`bopF)-)9TGf$EvE-+#%*,
 	
r(   NNNNNN)	NNNNNNNNN)rY   rZ   r[   r   rn  rz   r   r   r?   r  r  r]   r  r  
LongTensorr}  r   r   rd   r   r   r   s   @r&   rd  rd    s7   Lz @  -115/3,0/3&*-ELL)- !.- u||,	-
 $D>- 'tn- d^- 
		- -^  26,015,0/3&*2 .2 ELL)2 !.	2
 $D>2 'tn2 d^2 
		2 2h  156:041537&*,0/3&*d
E,,-d
 !!2!23d
 E,,-	d

 !.d
 u//0d
 d^d
 $D>d
 'tnd
 d^d
 
uj 	!d
 d
r(   rd  c                        e Zd ZeZdef fdZdej                  fdZd Z	e
	 	 	 	 	 	 ddeej                     deej                     deej                     d	ee   d
ee   dee   deeef   fd       Z xZS )ClapTextModelWithProjectionr   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rx   )ry   rz   r  r  r  r  ru  r  s     r&   rz   z$ClapTextModelWithProjection.__init__  s3     '/26:r(   rJ   c                 B    | j                   j                  j                  S rx   r  r  r  rt   s    r&   rw  z0ClapTextModelWithProjection.get_input_embeddings   s    ))999r(   c                 :    || j                   j                  _        y rx   r  r  s     r&   r  z0ClapTextModelWithProjection.set_input_embeddings  s    5:""2r(   rC   r   r  r   r  r  c                 H   ||n| j                   j                  }| j                  ||||||      }|s|d   n|j                  }| j	                  |      }	|s|	|d   f|dd z   }
t        d |
D              S t        |	|j                  |j                  |j                        S )a  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, ClapTextModelWithProjection

        >>> model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused")
        >>> tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused")

        >>> inputs = tokenizer(["a sound of a cat", "a sound of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> text_embeds = outputs.text_embeds
        ```Nr  r   r   r*   c              3   &   K   | ]	  }||  y wrx   r_   rn   r   s     r&   rq   z6ClapTextModelWithProjection.forward.<locals>.<genexpr>/       LF9KL   )rV   rW   r    rX   )
r   rz  r  r  r  rr   rU   rW   r    rX   )rp   rC   r   r  r   r  r  r  rY  rV   r  s              r&   r   z#ClapTextModelWithProjection.forward  s    0 &1%<k$++B]B])%/!5# ' 
 0;Q@Z@Z**=9"LO4|AB7GGGLgLLL"#*<<&44#..	
 	
r(   r  )rY   rZ   r[   r   rn  rz   r	   r  rw  r  r   r   r?   r  r  r   r   rU   r   r   r   s   @r&   r  r    s    !L~ :bii :;  -115/3,0/3&*/
ELL)/
 !./
 u||,	/

 $D>/
 'tn/
 d^/
 
u))	*/
 /
r(   r  c                        e Zd ZeZdZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     deej                     dee   dee   d	ee   deeef   fd
       Z xZS )ClapAudioModelWithProjectionr  r   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y rx   )ry   rz   rr  r  r  r  ru  r  s     r&   rz   z%ClapAudioModelWithProjection.__init__>  s4     )&1 3F ;r(   rJ   c                 V    | j                   j                  j                  j                  S rx   )r  rt  r  r   rt   s    r&   rw  z1ClapAudioModelWithProjection.get_input_embeddingsE  s     --99>>>r(   r  r   r  r  c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      }|s|d   n|j
                  }| j                  |      }|s||d   f|dd z   }	t        d |	D              S t        ||j                  |j                  |j                        S )a  
        input_features (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Input audio features. This should be returned by the [`ClapFeatureExtractor`] class that you can also
            retrieve from [`AutoFeatureExtractor`]. See [`ClapFeatureExtractor.__call__`] for details.
        is_longer (`torch.FloatTensor`, of shape `(batch_size, 1)`, *optional*):
            Whether the audio clip is longer than `max_length`. If `True`, a feature fusion will be enabled to enhance
            the features.

        Examples:

        ```python
        >>> from datasets import load_dataset
        >>> from transformers import ClapAudioModelWithProjection, ClapProcessor

        >>> model = ClapAudioModelWithProjection.from_pretrained("laion/clap-htsat-fused")
        >>> processor = ClapProcessor.from_pretrained("laion/clap-htsat-fused")

        >>> dataset = load_dataset("hf-internal-testing/ashraq-esc50-1-dog-example")
        >>> audio_sample = dataset["train"]["audio"][0]["array"]

        >>> inputs = processor(audios=audio_sample, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> audio_embeds = outputs.audio_embeds
        ```Nry  r   r   r*   c              3   &   K   | ]	  }||  y wrx   r_   r  s     r&   rq   z7ClapAudioModelWithProjection.forward.<locals>.<genexpr>}  r  r  )rb   rW   rX   r    )r   rz  r   r  r  r  r  rr   ra   rW   rX   r    )
rp   r  r  r   r  r  r  rY  rb   r  s
             r&   r   z$ClapAudioModelWithProjection.forwardH  s    B &1%<k$++B]B]1B1N-TXT_T_TqTq$8$D $++JjJj 	 (()/!5# ) 
 1<a(A\A\,,];#]1%56qr9JJGLgLLL#%+==$//'55	
 	
r(   r{  )rY   rZ   r[   r   rn  r|  rz   r	   r  rw  r   r   r?   r]   r}  r  r   r   ra   r   r   r   s   @r&   r  r  9  s    "L&O ?bii ?  7;04,0/3&*;
 !2!23;
 E,,-;
 $D>	;

 'tn;
 d^;
 
u**	+;
 ;
r(   r  )rd  r[  r  r  rr  r  )r   )Qr\   r   r  dataclassesr   typingr   r   r   r   r   r?   torch.nn.functionalr	   rP   r  activationsr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   r   utilsr   r   r   r   configuration_clapr   r   r   
get_loggerrY   rO  r'   r7   r9   rH   r  rS   rU   ra   rd   r  rv   r   r   r   r  r  r1  r=  rB  r|  r  r  r  r  r  r  r)  r&  r.  r3  r7  rG  rT  r[  rr  r  rd  r  r  __all__r_   r(   r&   <module>r     s%      ! 4 4     ! 
 . v v D D K K 
		H	%"*(4$7U\\ 7ell 7
 ?+ ? ?8 ?; ? ?8 !
 !
 !
J299 2%		 %P_")) _FaRYY aJ
")) 
# #NBII  	bii 	zRYY z|9RYY 9z3BII 3lG
ryy G
T")) &V= V=tCBII CN  "$  0		 0h299  RYY SBII SnZ
bii Z
|RYY  )/ ) )8;
( ;
| F
' F
F
R m
# m
 m
` @
"5 @
 @
F J
#6 J
 J
Zr(   