
    Uh                       d dl Z d dlmZmZmZ d dlZd dlZd dlm	Z	 d dlm
Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZmZmZ dd
lmZ ddlmZmZ ddlmZ  e       rddlmZ  ej@                  e!      Z" G d de	jF                        Z$ G d de	jF                        Z% G d de	jF                        Z& G d de	jF                        Z' G d de	jF                        Z( G d de	jF                        Z) G d de	jF                        Z* G d de	jF                        Z+ G d d e+      Z, G d! d"e+      Z- G d# d$e	jF                        Z.e+e-e,d%Z/ G d& d'e	jF                        Z0 G d( d)e	jF                        Z1 G d* d+e	jF                        Z2 G d, d-e	jF                        Z3 G d. d/e	jF                        Z4e G d0 d1e             Z5	 	 dCd2ee6e6f   d3e7d4e6d5eejp                     d6e6d7ejr                  fd8Z:e G d9 d:e5             Z;dZ< ed;<       G d= d>e5             Z= ed?<       G d@ dAe5             Z>g dBZ?y)D    N)OptionalTupleUnion)CrossEntropyLoss   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)!flash_attn_supports_top_left_maskis_flash_attn_available)BaseModelOutputCausalLMOutputSequenceClassifierOutput)PreTrainedModel)auto_docstringlogging   )HubertConfig)_flash_attention_forwardc                   $     e Zd Z fdZd Z xZS )HubertPositionalConvEmbeddingc                    t         |           t        j                  |j                  |j                  |j
                  |j
                  dz  |j                        | _        d | _        |j                  r&t        j                  |j                        | _        nt        j                  j                  }t        t        j                  j                  d      r$t        j                  j                  j                  }t               r(dd l}|j"                  j%                  | j                  j&                  d      5   || j                  dd      | _        d d d        t        | j                  d      rU| j                  j                  j&                  j(                  }| j                  j                  j&                  j*                  }n,| j                  j,                  }| j                  j.                  }|j"                  j1                  | |       |j"                  j1                  | |       n || j                  dd      | _        t3        |j
                        | _        t6        |j8                     | _        y # 1 sw Y   'xY w)	N   )kernel_sizepaddinggroupsweight_normr   modifier_rankweight)namedimparametrizations)super__init__nnConv1dhidden_sizenum_conv_pos_embeddingsnum_conv_pos_embedding_groupsconv
batch_normconv_pos_batch_normBatchNorm1dutilsr   hasattrr#   r	   	deepspeedzeroGatheredParametersr    	original0	original1weight_gweight_vregister_external_parameterHubertSamePadLayerr   r   feat_extract_activation
activation)selfconfigr   r1   r6   r7   	__class__s         |/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/hubert/modeling_hubert.pyr%   z&HubertPositionalConvEmbedding.__init__!   s   II6622a777
	 %% nnV-?-?@DO((..Krxx00-@ hh77CC)+ ^^66tyy7G7GWX6Y M +DIIH! LDIM499&89#yy99@@JJH#yy99@@JJH#yy11H#yy11H::4J::4J'		aH	)&*H*HI !?!?@M Ms   ?I??J	c                     |j                  dd      }| j                  | j                  |      }| j                  |      }| j                  |      }| j	                  |      }|j                  dd      }|S Nr   r   )	transposer,   r+   r   r;   r<   hidden_statess     r?   forwardz%HubertPositionalConvEmbedding.forwardF   sn    %//15??& OOM:M		-0]36%//15    __name__
__module____qualname__r%   rE   __classcell__r>   s   @r?   r   r       s    #AJ	rF   r   c                   $     e Zd Z fdZd Z xZS )r9   c                 P    t         |           |dz  dk(  rd| _        y d| _        y )Nr   r   r   )r$   r%   num_pad_remove)r<   r)   r>   s     r?   r%   zHubertSamePadLayer.__init__S   s)    #:Q#>!#CarF   c                 V    | j                   dkD  r|d d d d d | j                    f   }|S Nr   )rO   rC   s     r?   rE   zHubertSamePadLayer.forwardW   s6    ")!Q0F43F3F2F0F*FGMrF   rG   rL   s   @r?   r9   r9   R   s    KrF   r9   c                   &     e Zd Zd fd	Zd Z xZS )HubertNoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   r   stridebias)r$   r%   conv_dimin_conv_dimout_conv_dimr&   r'   conv_kernelconv_stride	conv_biasr+   r   r:   r;   r<   r=   layer_idr>   s      r?   r%   z#HubertNoLayerNormConvLayer.__init__^   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@rF   c                 J    | j                  |      }| j                  |      }|S N)r+   r;   rC   s     r?   rE   z"HubertNoLayerNormConvLayer.forwardl   s$    		-06rF   r   rG   rL   s   @r?   rS   rS   ]   s    ArF   rS   c                   &     e Zd Zd fd	Zd Z xZS )HubertLayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rU   T)elementwise_affine)r$   r%   rX   rY   rZ   r&   r'   r[   r\   r]   r+   	LayerNorm
layer_normr   r:   r;   r^   s      r?   r%   z!HubertLayerNormConvLayer.__init__s   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@rF   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )N)r+   rB   rh   r;   rC   s     r?   rE   z HubertLayerNormConvLayer.forward   sV    		-0%//B76%//B76rF   rb   rG   rL   s   @r?   rd   rd   r   s    ArF   rd   c                   &     e Zd Zd fd	Zd Z xZS )HubertGroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rU   T)
num_groupsnum_channelsaffine)r$   r%   rX   rY   rZ   r&   r'   r[   r\   r]   r+   r   r:   r;   	GroupNormrh   r^   s      r?   r%   z!HubertGroupNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqrF   c                 l    | j                  |      }| j                  |      }| j                  |      }|S ra   )r+   rh   r;   rC   s     r?   rE   z HubertGroupNormConvLayer.forward   s2    		-066rF   rb   rG   rL   s   @r?   rm   rm      s    r rF   rm   c                   .     e Zd ZdZ fdZd Zd Z xZS )HubertFeatureEncoderz.Construct the features from raw audio waveformc           	         t         |           |j                  dk(  rDt        |d      gt	        |j
                  dz
        D cg c]  }t        ||dz          c}z   }nV|j                  dk(  r.t	        |j
                        D cg c]  }t        ||       }}nt        d|j                   d      t        j                  |      | _        d| _        d	| _        y c c}w c c}w )
Ngroupr   )r_   r   layerz`config.feat_extract_norm` is z), but has to be one of ['group', 'layer']FT)r$   r%   feat_extract_normrm   rangenum_feat_extract_layersrS   rd   
ValueErrorr&   
ModuleListconv_layersgradient_checkpointing_requires_grad)r<   r=   ir~   r>   s       r?   r%   zHubertFeatureEncoder.__init__   s    ##w.3FQGHLQRXRpRpstRtLuLGH*6AEBL K %%0QVW]WuWuQvwA3FQGwKw01I1I0JJst  ==5&+#"L xs   C"	C'c                 J    | j                         D ]	  }d|_         d| _        y NF)
parametersrequires_gradr   r<   params     r?   _freeze_parametersz'HubertFeatureEncoder._freeze_parameters   s(    __& 	(E"'E	(#rF   c                 
   |d d d f   }| j                   r| j                  rd|_        | j                  D ]K  }| j                   r5| j                  r)| j                  r| j                  |j                  |      }D ||      }M |S )NT)r   trainingr   r~   r   _gradient_checkpointing_func__call__)r<   input_valuesrD   
conv_layers       r?   rE   zHubertFeatureEncoder.forward   s    $QW- 4==*.M'** 	:J""t'B'Bt}} $ A A''!!
 !+= 9	: rF   )rH   rI   rJ   __doc__r%   r   rE   rK   rL   s   @r?   ru   ru      s    8#"$
rF   ru   c                   $     e Zd Z fdZd Z xZS )HubertFeatureProjectionc                 n   t         |           |j                  | _        | j                  r3t        j                  |j
                  d   |j                        | _        t        j                  |j
                  d   |j                        | _
        t        j                  |j                        | _        y )Nrk   eps)r$   r%   feat_proj_layer_normr&   rg   rX   layer_norm_epsrh   Linearr(   
projectionDropoutfeat_proj_dropoutdropoutr<   r=   r>   s     r?   r%   z HubertFeatureProjection.__init__   s}    $*$?$?!$$ ll6??2+>FDYDYZDO))FOOB$79K9KLzz&":":;rF   c                     | j                   r| j                  |      }| j                  |      }| j                  |      }|S ra   )r   rh   r   r   rC   s     r?   rE   zHubertFeatureProjection.forward   s;    $$ OOM:M6]3rF   rG   rL   s   @r?   r   r      s    <rF   r   c                       e Zd ZdZ	 	 	 	 	 ddededededededee   f fd	Z	d
e
j                  dedefdZ	 	 	 	 	 dde
j                  dee
j                     deee
j                        dee
j                     dee
j                     dedee
j                  ee
j                     eee
j                        f   fdZ xZS )HubertAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsr   
is_decoderrW   	is_causalr=   c                 
   t         |           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      )rW   )r$   r%   r   r   r   head_dimr=   r|   scalingr   r   r&   r   k_projv_projq_projout_proj)	r<   r   r   r   r   rW   r   r=   r>   s	           r?   r%   zHubertAttention.__init__   s     	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTBrF   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S rA   )viewr   r   rB   
contiguousr<   r   r   r   s       r?   _shapezHubertAttention._shape  s7    {{3GQQRSUVWbbddrF   rD   key_value_statespast_key_valueattention_masklayer_head_maskoutput_attentionsreturnc                 
   |du}|j                         \  }}	}
| j                  |      | j                  z  }|r0|.|d   j                  d   |j                  d   k(  r|d   }|d   }n
|rE| j	                  | j                  |      d|      }| j	                  | j                  |      d|      }n|}| j	                  | j                  |      d|      }| j	                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j	                  | j                  |      d|      }| j	                  | j                  |      d|      }| j                  r||f}|| j                  z  d| j                  f} | j	                  ||	|      j                  | } |j                  | } |j                  | }|j                  d      }t        j                  ||j                  dd            }|j                         || j                  z  |	|fk7  r/t!        d|| j                  z  |	|f d|j                                |{|j                         |d|	|fk7  r#t!        d	|d|	|f d|j                                |j                  || j                  |	|      |z   }|j                  || j                  z  |	|      }t"        j$                  j'                  |d      }||j                         | j                  fk7  r*t!        d
| j                  f d|j                                |j                  dddd      |j                  || j                  |	|      z  }|j                  || j                  z  |	|      }|r?|j                  || j                  |	|      }|j                  || j                  z  |	|      }nd}t"        j$                  j)                  || j(                  | j*                        }t        j                  ||      }|j                         || j                  z  |	| j                  fk7  r9t!        d|| j                  z  |	| j                  f d|j                                |j                  || j                  |	| j                        }|j                  dd      }|j                  ||	| j,                        }| j/                  |      }|||fS )#Input shape: Batch x Time x ChannelNr   r   r   rk   r"   z$Attention weights should be of size 	, but is z!Attention mask should be of size z/Head mask for a single layer should be of size )pr    `attn_output` should be of size )sizer   r   shaper   r   r   torchcatr   r   r   r   reshapebmmrB   r|   r&   
functionalsoftmaxr   r   r   r   )r<   rD   r   r   r   r   r   is_cross_attentionr   tgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r?   rE   zHubertAttention.forward  s    .T9',,.Wa {{=1DLL@ *q!''*.>.D.DQ.GG (*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? ),7NDNN*B>
Ct{{<#>CCZP'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2C$..4H'SWS`S`3a2b c$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK01>AArF   )        FTFNNNNNF)rH   rI   rJ   r   intfloatboolr   r   r%   r   Tensorr   r   rE   rK   rL   s   @r?   r   r      sM   G  )-CC C 	C
 C C C &C>eU\\ eC ec e 488<1526"'vB||vB #5<<0vB !u||!45	vB
 !.vB "%,,/vB  vB 
u||Xell3XeELL>Q5RR	SvBrF   r   c                   V    e Zd ZdZ fdZdej                  dedefdZ	 	 	 	 	 ddej                  de	ej                     d	e	e
ej                        d
e	ej                     de	ej                     dede
ej                  e	ej                     e	e
ej                        f   fdZ xZS )HubertFlashAttention2aH  
    Hubert flash attention module. This module inherits from `HubertAttention` as the weights of the module stays
    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
    flash attention and deal with padding tokens in case the input contains any of them.
    c                 B    t        |   |i | t               | _        y ra   )r$   r%   r   _flash_attn_uses_top_left_mask)r<   argskwargsr>   s      r?   r%   zHubertFlashAttention2.__init__  s#    $)&)
 /P.Q+rF   r   r   r   c                 R    |j                  ||| j                  | j                        S ra   )r   r   r   r   s       r?   _reshapezHubertFlashAttention2._reshape  s    {{3GGrF   rD   r   r   r   r   r   r   c           
         |d u}|j                         \  }}	}
| j                  | j                  |      d|      }|rP|N|d   j                  d   |j                  d   k(  r,|d   j	                  dd      }|d   j	                  dd      }n*|rE| j                  | j                  |      d|      }| j                  | j                  |      d|      }n|| j                  | j                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   j	                  dd      |gd      }t        j                  |d   j	                  dd      |gd      }nD| j                  | j                  |      d|      }| j                  | j                  |      d|      }| j                  r$|j	                  dd      |j	                  dd      f}|j                  d   }|||d   j                  d   z  }|j                  }|t        j                  k(  rt        j                         rt        j                         }nMt        | j                  d      r| j                  j                   }n | j                  j"                  j                  }t$        j'                  d| d	       |j)                  |      }|j)                  |      }|j)                  |      }t+        |||||	| j,                  r| j.                  nd
| j0                  | j2                        }|j5                  ||	d      }| j7                  |      }|sd }||fS )Nrk   r   r   r   r   rj   _pre_quantization_dtypezThe input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in .r   )r   r   use_top_left_mask)r   r   r   r   rB   r   r   r   r   r   dtypefloat32is_autocast_enabledget_autocast_gpu_dtyper0   r=   r   r    loggerwarning_oncetor   r   r   r   r   r   r   )r<   rD   r   r   r   r   r   r   r   q_lenr   r   r   r   
kv_seq_leninput_dtypetarget_dtyper   r   s                      r?   rE   zHubertFlashAttention2.forward  s    .T9%**,UA }}T[[%?SI *q!''*.>.D.DQ.GG (*44Q:J)!,66q!<Lt{{3C'Db#NJ==5E)FCPL't{{='A2sKJ==])CRMLN1$5$?$?1$Ez#RXYZJ 99nQ&7&A&A!Q&G%V\]^L t{{='A2sKJ==])CRML?? )221a8,:P:PQRTU:VWN%%b)
%.+11"55J #((%--'((*$;;=&?@#{{BB#{{1177 >$ (??<8L#|4J'??<8L.$(MMDLLsnn"AA	
 "))#ub9mmK0 LL.88rF   r   )rH   rI   rJ   r   r%   r   r   r   r   r   r   r   rE   rK   rL   s   @r?   r   r     s    RHu|| Hc H H 488<1526"'e9||e9 #5<<0e9 !u||!45	e9
 !.e9 "%,,/e9  e9 
u||Xell3XeELL>Q5RR	Se9rF   r   c                   $    e Zd Z	 	 	 	 	 d	dej                  deej                     deeej                        deej                     deej                     dedeej                  eej                     eeej                        f   f fdZ xZ	S )
HubertSdpaAttentionrD   r   r   r   r   r   r   c                 t   |r)t         j                  d       t        |   |||||      S |du}|j	                         \  }}	}
| j                  |      }|r0|.|d   j                  d   |j                  d   k(  r|d   }|d   }n
|rE| j                  | j                  |      d|      }| j                  | j                  |      d|      }n|}| j                  | j                  |      d|      }| j                  | j                  |      d|      }t        j                  |d   |gd      }t        j                  |d   |gd      }nD| j                  | j                  |      d|      }| j                  | j                  |      d|      }| j                  r||f}| j                  ||	|      }| j                  r	||	dkD  rd	nd
}t        j                  j                  j!                  ||||| j"                  r| j$                  nd|      }|j	                         || j&                  |	| j(                  fk7  r7t+        d|| j&                  |	| j(                  f d|j	                                |j-                  dd      }|j/                  ||	| j0                        }| j3                  |      }|d|fS )r   a  HubertModel is using HubertSdpaAttention, but `torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True` . Falling back to the manual attention implementation, but specifying the manual implementation will be required from Transformers version v5.0.0 onwards. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.)r   r   r   r   Nr   r   r   rk   r   TFr   )	attn_mask	dropout_pr   r   r   )r   r   r$   rE   r   r   r   r   r   r   r   r   r   r   r&   r   scaled_dot_product_attentionr   r   r   r   r|   rB   r   r   r   )r<   rD   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r>   s                   r?   rE   zHubertSdpaAttention.forward  s    l 7?!1--"3 #   .T9',,.Wa {{=1 *q!''*.>.D.DQ.GG (*J)!,LT[[1A%BBLJ;;t{{3C'Db#NL'T[[%?SIJ;;t{{='A2sKLN1$5z#BJJ 99nQ&7%FANL T[[%?SIJ;;t{{='A2sKL?? ),7N{{<#>
 !NN~/E'TU+D[`	 hh))FF$&*mmdll G 
 #t~~w!NN2CRVR_R_3`2a b$$&') 
 "++Aq1 "))#wGmmK0D.00rF   r   )
rH   rI   rJ   r   r   r   r   r   rE   rK   rL   s   @r?   r   r     s     488<1526"'e1||e1 #5<<0e1 !u||!45	e1
 !.e1 "%,,/e1  e1 
u||Xell3XeELL>Q5RR	Se1 e1rF   r   c                   $     e Zd Z fdZd Z xZS )HubertFeedForwardc                    t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        t        |j                  t              rt        |j                     | _        n|j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                         | _        y ra   )r$   r%   r&   r   activation_dropoutintermediate_dropoutr   r(   intermediate_sizeintermediate_dense
isinstance
hidden_actstrr   intermediate_act_fnoutput_densehidden_dropoutoutput_dropoutr   s     r?   r%   zHubertFeedForward.__init__e  s    $&JJv/H/H$I!"$))F,>,>@X@X"Yf''-'-f.?.?'@D$'-'8'8D$IIf&>&>@R@RS jj)>)>?rF   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S ra   )r   r   r   r  r  rC   s     r?   rE   zHubertFeedForward.forwardr  sX    //>00?11-@))-8++M:rF   rG   rL   s   @r?   r   r   d  s    @rF   r   )eagersdpaflash_attention_2c                   &     e Zd Z fdZddZ xZS )HubertEncoderLayerc                    t         |           t        |j                     |j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        y )NFr   r   r   r   r   )r$   r%   HUBERT_ATTENTION_CLASSES_attn_implementationr(   num_attention_headsattention_dropout	attentionr&   r   r  r   rg   r   rh   r   feed_forwardfinal_layer_normr   s     r?   r%   zHubertEncoderLayer.__init__  s    1&2M2MN((00,,	
 zz&"7"78,,v'9'9v?T?TU-f5 "V-?-?VEZEZ [rF   c                     |}| j                  |||      \  }}}| j                  |      }||z   }| j                  |      }|| j                  |      z   }| j	                  |      }|f}|r||fz  }|S Nr   r   )r  r   rh   r  r  r<   rD   r   r   attn_residualr   r   outputss           r?   rE   zHubertEncoderLayer.forward  s    %)-.L] *8 *
&|Q ]3%56%(9(9-(HH--m< "&GrF   r   rG   rL   s   @r?   r	  r	    s    \rF   r	  c                   r     e Zd Z fdZ	 	 	 	 ddej
                  deej                     dededef
dZ	 xZ
S )	HubertEncoderc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        d| _        |j&                  dk(  | _        y c c}w Nr   Fr  )r$   r%   r=   r   pos_conv_embedr&   rg   r(   r   rh   r   r  r   r}   rz   num_hidden_layersr	  layersr   r  _use_flash_attention_2r<   r=   r   r>   s      r?   r%   zHubertEncoder.__init__  s    ;FC,,v'9'9v?T?TUzz&"7"78mmvOgOgIh$iA%7%?$ij&+#&,&A&AEX&X# %j   !CrD   r   r   output_hidden_statesreturn_dictc                 4   |rdnd }|rdnd }||j                  d      j                  dd|j                  d         }d|| <   | j                  r|d|v r|nd }nd|d d d d d d f   j	                  |j
                        z
  }|t        j                  |j
                        j                  z  }|j                  |j                  d   d|j                  d   |j                  d         }| j                  |      }	||	z   }| j                  |      }| j                  |      }t               xs t        |       }
| j                  D ]  }|r||fz   }t        j                   g       }| j"                  r|| j$                  j&                  k  rdnd	}|r|
rG| j(                  r+| j"                  r| j+                  |j,                  |||      }n ||||
      }|d   }|rd}|s|d   fz   } |r||fz   }|st/        d |||fD              S t1        |||      S )N rk   r   r   r         ?r   TFr  NNc              3   &   K   | ]	  }||  y wra   r&  .0vs     r?   	<genexpr>z(HubertEncoder.forward.<locals>.<genexpr>       mq_`_lm   last_hidden_staterD   
attentions)	unsqueezerepeatr   r   r   r   r   finfominexpandr  rh   r   r	   r
   r  randr   r=   	layerdropr   r   r   tupler   r<   rD   r   r   r#  r$  all_hidden_statesall_self_attentionsexpand_attention_maskposition_embeddingssynced_gpusrx   dropout_probabilityskip_the_layerlayer_outputss                  r?   rE   zHubertEncoder.forward  s[    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!45M001**4B4NSTXfSfmq "%~atQ6F'G'J'JQ^QdQd'J'e!e!/%++m>Q>Q2R2V2V!V!/!6!6"((+Q0D0DR0H.J^J^_aJb" #11-@%(;;6]302R6LT6R[[ 	PE#$58H$H! #(**R.%)]]8KdkkNcNc8cTjoN![..4==$($E$E%&)	%M %*%nXi%M !.a 0 , &9]1=M<O&O#7	P:   1]4D Dm]4EGZ$[mmm++*
 	
rF   NFFT)rH   rI   rJ   r%   r   r   r   r   r   rE   rK   rL   s   @r?   r  r    s_    Y 26"'%* G
||G
 !.G
  	G

 #G
 G
rF   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )HubertAttnAdapterLayerc                    t         |           |j                  | _        |j                  | _        t        j                  | j
                        | _        t        j                  | j
                  | j                        | _
        t        j                         | _        t        j                  | j                  | j
                        | _        y)z
        Implements adapter modules directly with 3D tensor weight as parameters and without using ModuleList to speed
        up training throughput.
        N)r$   r%   adapter_attn_dim	input_dimr(   
hidden_dimr&   rg   normr   linear_1ReLUact_fnlinear_2r   s     r?   r%   zHubertAttnAdapterLayer.__init__  s    
 	00 ,,LL1			$//4>>Bggi		$..$//BrF   rD   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S ra   )rL  rM  rO  rP  rC   s     r?   rE   zHubertAttnAdapterLayer.forward
  s@    		-0m4M2m4rF   )rH   rI   rJ   r%   r   FloatTensorrE   rK   rL   s   @r?   rG  rG    s    CU%6%6 rF   rG  c                   f     e Zd Z fdZ	 	 ddej
                  deej
                     defdZ xZ	S )!HubertEncoderLayerStableLayerNormc                    t         |           t        |j                     |j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t        |      | _        t        j                  |j                  |j                        | _        t%        |dd       t'        |      | _        y d | _        y )NFr  r   rI  )r$   r%   r  r  r(   r  r  r  r&   r   r  r   rg   r   rh   r   r  r  getattrrG  adapter_layerr   s     r?   r%   z*HubertEncoderLayerStableLayerNorm.__init__  s    1&2M2MN((00,,	
 zz&"7"78,,v'9'9v?T?TU-f5 "V-?-?VEZEZ [6-t4@!7!?D!%DrF   rD   r   r   c                 $   |}| j                  |      }| j                  |||      \  }}}| j                  |      }||z   }|| j                  | j	                  |            z   }| j
                  || j                  |      z   }|f}|r||fz  }|S r  )rh   r  r   r  r  rW  r  s           r?   rE   z)HubertEncoderLayerStableLayerNorm.forward'  s     &6)-.L] *8 *
&|Q ]3%5%(9(9$:O:OP]:^(__))D,>,>},MMM "&GrF   r   )
rH   rI   rJ   r%   r   r   r   r   rE   rK   rL   s   @r?   rT  rT    s>    &* 26"'	|| !.  	rF   rT  c                   .     e Zd Z fdZ	 	 	 	 ddZ xZS )HubertEncoderStableLayerNormc                    t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        j                  |j                        | _        t        j                  t        |j                        D cg c]  }t!        |       c}      | _        d| _        |j&                  dk(  | _        y c c}w r  )r$   r%   r=   r   r  r&   rg   r(   r   rh   r   r  r   r}   rz   r  rT  r  r   r  r   r!  s      r?   r%   z%HubertEncoderStableLayerNorm.__init__B  s    ;FC,,v'9'9v?T?TUzz&"7"78mm@EfF^F^@_`1.v6`
 ',#&,&A&AEX&X# ar"  c                 f   |rdnd }|rdnd }||j                  d      j                  dd|j                  d         }||j                  |j                        z  }| j
                  r|d|v r|nd }nd|d d d d d d f   j                  |j                        z
  }|t        j                  |j                        j                  z  }|j                  |j                  d   d|j                  d   |j                  d         }| j                  |      }	||	z   }| j                  |      }t               xs t        |       }
| j                  D ]  }|r||fz   }t        j                  g       }| j                   r|| j"                  j$                  k  rdnd	}|r|
rG| j&                  r+| j                   r| j)                  |j*                  |||      }n ||||
      }|d   }|rd}|s|d   fz   } | j-                  |      }|r||fz   }|st/        d |||fD              S t1        |||      S )Nr&  rk   r   r   r(  r   r'  TFr  r)  c              3   &   K   | ]	  }||  y wra   r&  r+  s     r?   r.  z7HubertEncoderStableLayerNorm.forward.<locals>.<genexpr>  r/  r0  r1  )r4  r5  r   r   r   r   r   r6  r7  r8  r  r   r	   r
   r  r9  r   r=   r:  r   r   r   rh   r;  r   r<  s                  r?   rE   z$HubertEncoderStableLayerNorm.forwardN  sn    #7BD$5b4%$2$<$<R$@$G$G1mNaNabcNd$e!),A,D,D=K^K^,D,__M**4B4NSTXfSfmq "%~atQ6F'G'J'JQ^QdQd'J'e!e!/%++m>Q>Q2R2V2V!V!/!6!6"((+Q0D0DR0H.J^J^_aJb" #11-@%(;;]302R6LT6R[[ 	PE#$58H$H! #(**R.%)]]8KdkkNcNc8cTjoN![ ..4==$($E$E%&)	%M %*%nXi%M !.a 0 , &9]1=M<O&O#9	P< 6 1]4D Dm]4EGZ$[mmm++*
 	
rF   rE  rG   rL   s   @r?   rZ  rZ  A  s    
Y "I
rF   rZ  c                   x    e Zd ZeZdZdZdZdZdZ	d Z
deej                  ef   fdZdedej                  fd	Zy
)HubertPreTrainedModelhubertr   Tc                 z   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                  t        j                  t        j                  f      rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t        j                        r_t               rddl}t#        |d      r|t#        |d      rp|j$                  j'                  |j(                  |j*                  gd      5  t        j,                  j/                  |j                  j                         ddd       n|j$                  j'                  |j                  d      5  t        j,                  j/                  |j                  j                         ddd       n3t        j,                  j/                  |j                  j                         |j                  %|j                  j                  j                          yyt        |t0              r2t#        |d	      r%|j2                  j                  j5                          yyt        |t6              rMt#        |d
      r@|j8                  j                  j                  d| j                  j:                  dz   z         yyy# 1 sw Y   xY w# 1 sw Y   xY w)zInitialize the weightsr   )meanstdNr'  r   r7   r6   r   masked_spec_embedlayer_weightsr   )r   r&   r   r    datanormal_r=   initializer_rangerW   zero_rg   rr   r.   fill_r'   r	   r1   r0   r2   r3   r7   r6   initkaiming_normal_HubertModelrd  uniform_HubertForSequenceClassificationre  r  )r<   moduler1   s      r?   _init_weightsz#HubertPreTrainedModel._init_weights  sP   fbii( MM&&CT[[5R5R&S{{&  &&( 'r||R^^ LMKK""$MM$$S)		*)+ 6:.76:3N"::FOOV__;]mn:o D//0B0BCD D #::6==XY:Z D//0B0BCD D ''(:(:;{{&  &&( ',v23((--668 4 ?@v/$$))//t{{7T7TWX7X0YZ 0 AD DD Ds   ?4L%#4L1%L.1L:input_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r   div)input_lengthr   rV   s      r?   _conv_out_lengthzPHubertPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length  s"     99\K7wWZ[[[rF   )zipr=   r[   r\   )r<   rr  ry  r   rV   s        r?    _get_feat_extract_output_lengthsz6HubertPreTrainedModel._get_feat_extract_output_lengths  sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q rF   feature_vector_lengthr   c                    | j                  |j                  d            j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nrk   r   )r   devicer   )r~  )r{  sumr   r   longr   zerosr   r~  arangeflipcumsumr   )r<   r|  r   output_lengths
batch_sizes        r?   "_get_feature_vector_attention_maskz8HubertPreTrainedModel._get_feature_vector_attention_mask  s    >>~?Q?QRT?UVYYZ_ZdZde#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOrF   N)rH   rI   rJ   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_2_supports_sdparq  r   r   
LongTensorr   r{  r  r&  rF   r?   r_  r_    sa    L $O&*#!N[BeEDTDTVYDY>Z 
 
]b]m]m 
rF   r_  r   	mask_probmask_lengthr   	min_masksr   c                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )af  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )r   max)rx  num_masked_spanepsilonr  r  r  sequence_lengths     r?   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOrF   Nrk   r(  r   F)replace)r|   nprandomr9  itemdetachr  tolistrz   r  r   choicer  lenconcatenateonesint32appendarraybroadcast_tor   r  put_along_axis)r   r  r  r   r  r  r  r   rr  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanrx  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  r  s    `` `            @@r?   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   *    e Zd Zdef fdZ	 	 ddej                  deej                     deej                     fdZ	e
	 	 	 	 	 ddeej                     deej                     deej                     dee   d	ee   d
ee   deeef   fd       Z xZS )rm  r=   c                    t         |   |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          y )Nr   )r$   r%   r=   ru   feature_extractorr   feature_projectionmask_time_probmask_feature_probr&   	Parameterr   r   r(   rn  rd  do_stable_layer_normrZ  encoderr  	post_initr   s     r?   r%   zHubertModel.__init__Y  s     !5f!="9&"A   3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&7?DL(0DL 	rF   rD   mask_time_indicesr   c                    t        | j                  dd      s|S |j                         \  }}}|)| j                  j	                  |j
                        ||<   n| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }| j                  j	                  |j
                        ||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                   | j                  j"                        }t        j                  ||j                  t        j                        }|dddf   j%                  d|d      }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        apply_spec_augmentTNr   )r  r  r   r  )r~  r   )r  r  r  rk   )rV  r=   r   rd  r   r   r  r   r  mask_time_lengthmask_time_min_masksr   r   r~  r   r  mask_feature_lengthmask_feature_min_masksr8  )r<   rD   r  r   r  r  r(   mask_feature_indicess           r?   _mask_hidden_stateszHubertModel._mask_hidden_statesk  s    t{{$8$?   4A3E3E3G0
O[(/3/E/E/H/HI\I\/]M+,[[''!+ 5_-++44 KK88-++99! !&->}G[G[chcmcm n/3/E/E/H/HI\I\/]M+,;;((1,#8[)++77 KK;;++<<	$  $)<<0D]MaMainisis#t #74#@#G#GO]_#` 23M./rF   r   r   r#  r$  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      }| j                  ||      }| j                  |||||      }	|	d   }|s	|f|	dd z   S t        ||	j                  |	j                        S )an  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.

        Example:

        ```python
        >>> from transformers import AutoProcessor, HubertModel
        >>> from datasets import load_dataset
        >>> import soundfile as sf

        >>> processor = AutoProcessor.from_pretrained("facebook/hubert-large-ls960-ft")
        >>> model = HubertModel.from_pretrained("facebook/hubert-large-ls960-ft")


        >>> def map_to_array(batch):
        ...     speech, _ = sf.read(batch["file"])
        ...     batch["speech"] = speech
        ...     return batch


        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> ds = ds.map(map_to_array)

        >>> input_values = processor(ds["speech"][0], return_tensors="pt").input_values  # Batch size 1
        >>> hidden_states = model(input_values).last_hidden_state
        ```Nr   r   )r  r   r   r#  r$  r   r1  )r=   r   r#  use_return_dictr  rB   r  r   r  r  r  r   rD   r3  )
r<   r   r   r  r   r#  r$  extract_featuresrD   encoder_outputss
             r?   rE   zHubertModel.forward  s,   L 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN//0@A00Rc0d,,)/!5# ' 
 (*!#oab&999+)77&11
 	
rF   r)  NNNNN)rH   rI   rJ   r   r%   r   rR  r   r  r  r   r   r   r   r   r   rE   rK   rL   s   @r?   rm  rm  W  s    | * :>59	,((, $E$5$56, !!1!12	,\  269=,0/3&*F
u||,F
 !.F
 $E$5$56	F

 $D>F
 'tnF
 d^F
 
uo%	&F
 F
rF   rm  zn
    Hubert Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
    )custom_introc                        e Zd Zddee   f fdZd Zd Zd Zd Z	e
	 	 	 	 	 ddeej                     deej                     d	ee   d
ee   dee   deej                     deeef   fd       Z xZS )HubertForCTCtarget_langc                    t         |   |       t        |      | _        t	        j
                  |j                        | _        || _        |j                  t        d| j                   d      t        |d      r|j                  r|j                  n|j                  }t	        j                   ||j                        | _        | j%                          y)a0  
        target_lang (`str`, *optional*):
            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
            adapter.<lang>.bin. Only relevant when using an instance of [`HubertForCTC`] with adapters. Uses 'eng' by
            default.
        NzYou are trying to instantiate z with a configuration that does not define the vocabulary size of the language model head. Please instantiate the model as follows: `HubertForCTC.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of your model's configuration.add_adapter)r$   r%   rm  r`  r&   r   final_dropoutr   r  
vocab_sizer|   r>   r0   r  output_hidden_sizer(   r   lm_headr  )r<   r=   r  r  r>   s       r?   r%   zHubertForCTC.__init__  s     	 !&)zz&"6"67&$00@ AH H  *1)GFL^L^F%%djdvdv 	 yy!3V5F5FG 	rF   c                     | j                   }|&t        | j                  dd      t        d| d      |-t        | j                  dd      t        j                  d       y|| j                  |d       yy)a'  
        This method overwrites [`~PreTrainedModel.tie_weights`] so that adapter weights can be correctly loaded when
        passing `target_lang=...` to `from_pretrained(...)`.

        This method is **not** supposed to be called by the user and is prone to be changed in the future.
        NrI  zCannot pass `target_lang`: z- if `config.adapter_attn_dim` is not defined.z)By default `target_lang` is set to 'eng'.T)
force_load)r  rV  r=   r|   r   infoload_adapter)r<   r  s     r?   tie_weightszHubertForCTC.tie_weights	  s     &&"wt{{<NPT'U']:;-Gtuvv WT[[:Ld%S%_KKCD$kd; %rF   c                 X    t        j                  dt               | j                          y)
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        The method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.NwarningswarnFutureWarningfreeze_feature_encoderr<   s    r?   freeze_feature_extractorz%HubertForCTC.freeze_feature_extractor  '    
 	Q	

 	##%rF   c                 L    | j                   j                  j                          yr  Nr`  r  r   r  s    r?   r  z#HubertForCTC.freeze_feature_encoder*      
 	%%88:rF   c                 P    | j                   j                         D ]	  }d|_         yz
        Calling this function will disable the gradient computation for the base model so that its parameters will not
        be updated during training. Only the classification head will be updated.
        FNr`  r   r   r   s     r?   freeze_base_modelzHubertForCTC.freeze_base_model1  (    
 [[++- 	(E"'E	(rF   r   r   r   r#  r$  labelsr   c           
         ||n| j                   j                  }|I|j                         | j                   j                  k\  r"t	        d| j                   j                         | j                  |||||      }|d   }| j                  |      }| j                  |      }	d}
|b||n$t        j                  |t        j                        }| j                  |j                  d            j                  t        j                        }|dk\  }|j                  d      }|j                  |      }t        j                   j#                  |	dt        j$                        j'                  dd      }t        j(                  j*                  j-                  d	
      5  t        j                   j/                  ||||| j                   j0                  | j                   j2                  | j                   j4                        }
ddd       |s|	f|t6        d z   }|
|
f|z   S |S t9        |
|	|j:                  |j<                        S # 1 sw Y   ExY w)a  
        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
            config.vocab_size - 1]`.
        Nz$Label values must be <= vocab_size: r  r   r(  rk   )r"   r   r   F)enabled)blank	reductionzero_infinitylosslogitsrD   r3  )r=   r  r  r  r|   r`  r   r  r   	ones_liker  r{  r  r   masked_selectr&   r   log_softmaxr   rB   backendscudnnflagsctc_losspad_token_idctc_loss_reductionctc_zero_infinity_HIDDEN_STATES_START_POSITIONr   rD   r3  )r<   r   r   r   r#  r$  r  r  rD   r  r  rr  labels_masktarget_lengthsflattened_targets	log_probsoutputs                    r?   rE   zHubertForCTC.forward9  s'   " &1%<k$++B]B]&**,$++2H2H"HCDKKDZDZC[\]]++)/!5#  
  
]3m, #1"<%//R^fkfpfpBq  !AA.BTBTUWBXY\\]b]g]ghM !A+K(__R0N & 4 4[ A 11&b1V``abdefI%%++E+: 	}}--%!"++22"kk<<"&++"?"? . 	 Y)F)G!HHF)-)9TGf$EvEfG4I4IV]VhVh
 	
	 	s   A#IIra   r  )rH   rI   rJ   r   r   r%   r  r  r  r  r   r   r   r   r   r   r   rE   rK   rL   s   @r?   r  r    s    HSM :<*
&;(  26,0/3&*)-D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 &D
 
un$	%D
 D
rF   r  z
    Hubert Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
    SUPERB Keyword Spotting.
    c                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 ddee	j                     dee	j                     dee   dee   d	ee   d
ee	j                     deeef   fd       Z xZS )ro  c                    t         |   |       t        |d      r|j                  rt	        d      t        |      | _        |j                  dz   }|j                  r0t        j                  t        j                  |      |z        | _        t        j                  |j                  |j                         | _        t        j                  |j                   |j$                        | _        | j)                          y )Nr  z]Sequence classification does not support the use of Hubert adapters (config.add_adapter=True)r   )r$   r%   r0   r  r|   rm  r`  r  use_weighted_layer_sumr&   r  r   r  re  r   r(   classifier_proj_size	projector
num_labels
classifierr  )r<   r=   
num_layersr>   s      r?   r%   z(HubertForSequenceClassification.__init__  s     6=)f.@.@o  "&)--1
((!#ejj.Dz.Q!RD6#5#5v7R7RS))F$?$?ARARS 	rF   c                 X    t        j                  dt               | j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        r  Nr  r  s    r?   r  z8HubertForSequenceClassification.freeze_feature_extractor  r  rF   c                 L    | j                   j                  j                          yr  r  r  s    r?   r  z6HubertForSequenceClassification.freeze_feature_encoder  r  rF   c                 P    | j                   j                         D ]	  }d|_         yr  r  r   s     r?   r  z1HubertForSequenceClassification.freeze_base_model  r  rF   r   r   r   r#  r$  r  r   c                 <   ||n| j                   j                  }| j                   j                  rdn|}| j                  |||||      }| j                   j                  rr|t           }t        j                  |d      }t        j                  j                  | j                  d      }	||	j                  ddd      z  j                  d      }n|d   }| j                  |      }||j                  d      }
n| j                  |j                   d   |      }|j#                  d      j%                  dd|j                   d         }d	|| <   |j                  d      |j                  d      j                  dd      z  }
| j'                  |
      }d}|Ft)               } ||j                  d| j                   j*                        |j                  d            }|s|f|t        d z   }||f|z   S |S t-        |||j.                  |j0                  
      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
            soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
            conversion into a tensor of type `torch.FloatTensor`. See [`HubertProcessor.__call__`] for details.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NTr  r   r   rk   r   r   r   r  )r=   r  r  r`  r  r   stackr&   r   r   re  r   r  r	  rb  r  r   r4  r5  r  r   r
  r   rD   r3  )r<   r   r   r   r#  r$  r  r  rD   norm_weightspooled_outputpadding_maskexpand_padding_maskr  r  loss_fctr  s                    r?   rE   z'HubertForSequenceClassification.forward  s   , &1%<k$++B]B]'+{{'I'ItOc++)/!5#  
 ;;--#$ABM!KK1=M==001C1C0LL*\->->r1a-HHMMRSMTM#AJM}5!)..1.5MBB=CVCVWXCY[ijL"."8"8"<"C"CAq-J]J]^_J`"a25M../)--!-4|7G7GA7G7N7S7STVXY7ZZM/')HFKKDKK,B,BCV[[QS_UDY)F)G!HHF)-)9TGf$EvE'!//))	
 	
rF   r  )rH   rI   rJ   r%   r  r  r  r   r   r   r   r   r   r   r   rE   rK   rL   s   @r?   ro  ro    s    "
&;(  26,0/3&*)-A
u||,A
 !.A
 $D>	A

 'tnA
 d^A
 &A
 
u..	/A
 A
rF   ro  )r  ro  rm  r_  rQ   )@r  typingr   r   r   numpyr  r   torch.nnr&   r   activationsr   integrations.deepspeedr	   integrations.fsdpr
   modeling_flash_attention_utilsr   r   modeling_outputsr   r   r   modeling_utilsr   r/   r   r   configuration_hubertr   r   
get_loggerrH   r   Moduler   r9   rS   rd   rm   ru   r   r   r   r   r   r  r	  r  rG  rT  rZ  r_  r   r   r  ndarrayr  rm  r  r  ro  __all__r&  rF   r?   <module>r%     s}    ) )    % ! @ 7 h Y Y - , . J 
		H	%/BII /d  *ryy 6ryy 0)299 )Xbii $[Bbii [B|w9O w9tf1/ f1R		 2 .    FR
BII R
jRYY 2*		 *ZV
299 V
r BO B BR 26tc?tt t U--.	t
 t ZZtn H
' H
 H
V !"  
S
( S

S
l o
&; o
o
d frF   