
    Uh;G                        d dl Z d dlZd dlmZ d dlmZmZmZ d dlZd dl	m
Z
 ddlmZmZ ddlmZ ddlmZmZ dd	lmZmZmZmZmZmZmZmZmZ d
dlmZ  ej>                  e       Z!e G d de             Z" G d de      Z# G d de      Z$ G d de      Z% G d de      Z& G d de      Z' G d de      Z(e G d de             Z)eZ* G d de)e      Z+ ed       G d  d!e)             Z, G d" d#e      Z- G d$ d%e      Z.g d&Z/y)'    N)	dataclass)OptionalTupleUnion   )ModelOutputWav2Vec2BaseModelOutput)PreTrainedModel)auto_docstringlogging   )	Wav2Vec2EncoderWav2Vec2EncoderStableLayerNormWav2Vec2FeatureEncoderWav2Vec2FeatureProjectionWav2Vec2ForCTC!Wav2Vec2ForSequenceClassificationWav2Vec2GumbelVectorQuantizerWav2Vec2ModelWav2Vec2PositionalConvEmbedding   )UniSpeechConfigc                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)	UniSpeechForPreTrainingOutputaL  
    Output type of [`UniSpeechForPreTrainingOutput`], with potential hidden states and attentions.

    Args:
        loss (*optional*, returned when model is in train mode, `torch.FloatTensor` of shape `(1,)`):
            Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
            paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
        projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
            projected quantized states.
        projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
            Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
            target vectors for contrastive loss.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossprojected_statesprojected_quantized_statescodevector_perplexityhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r   r         /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/unispeech/modular_unispeech.pyr   r      s    4 )-D(5$$
%,48hu0018>B):): ;B9=8E$5$56=8<M8E%"3"345<59Ju00129r)   r   c                       e Zd Zy) UniSpeechPositionalConvEmbeddingNr!   r"   r#   r(   r)   r*   r,   r,   A       r)   r,   c                       e Zd Zy)UniSpeechFeatureEncoderNr-   r(   r)   r*   r0   r0   E   r.   r)   r0   c                       e Zd Zy)UniSpeechFeatureProjectionNr-   r(   r)   r*   r2   r2   I   r.   r)   r2   c                       e Zd Zy)UniSpeechEncoderNr-   r(   r)   r*   r4   r4   M   r.   r)   r4   c                       e Zd Zy)UniSpeechEncoderStableLayerNormNr-   r(   r)   r*   r6   r6   Q   r.   r)   r6   c                   "    e Zd Zed        Zd Zy)UniSpeechGumbelVectorQuantizerc           	          | j                  d      }t        j                  t        j                  |t        j                  |dz         z  d             j                         }|S )Nr   dimgHz>)meanr%   expsumlog)probsmarginal_probs
perplexitys      r*   _compute_perplexityz2UniSpeechGumbelVectorQuantizer._compute_perplexityV   sR    *YY		.599^VZEZ;[*[ac ddeiik
r)   c                    |j                   \  }}}| j                  |      }|j                  ||z  | j                  z  d      }| j                  rt
        j                  j                  |j                         | j                  d      j                  |      }t        j                  |j                  ||z  | j                  d      j                         d      }| j                  |      }n}|j                  d      } |j                  |j                    j!                  d|j                  dd      d      }|j                  ||z  | j                  d      }| j                  |      }|j                  ||z  d      }|j#                  d      | j$                  z  }	|	j                  ||z  | j                  | j&                  d      }
|
j)                  d      j                  ||d      }
|
|fS )Nr<   T)tauhardr:   r         ?)shapeweight_projview
num_groupstrainingnn
functionalgumbel_softmaxfloattemperaturetype_asr%   softmaxrD   argmax	new_zerosscatter_	unsqueezecodevectorsnum_varsr?   )selfr   
batch_sizesequence_lengthhidden_sizecodevector_probscodevector_soft_distrC   codevector_idxcodevectors_per_grouprZ   s              r*   forwardz&UniSpeechGumbelVectorQuantizer.forward\   s   3@3F3F0
O[ ((7%**:+G$//+Y[]^==!}};;##%4+;+;$  <  gm$ 
 $)=="":#?RTU[[]ce$  112FGJ +11b19N6}668K8KLUUN''A.   044Z/5QSWSbSbdfg112BCJ+00o1MrR 0 : :2 >AQAQ Q+00o1Mt`d`m`moqr!oob)..z?BOJ&&r)   N)r!   r"   r#   staticmethodrD   rd   r(   r)   r*   r8   r8   U   s     
#'r)   r8   c                   x    e Zd ZeZdZdZdZdZdZ	d Z
deej                  ef   fdZdedej                  fd	Zy
)UniSpeechPreTrainedModel	unispeechinput_valuesTc           
      z   t        |t              r|j                  j                  j                  j                  dd       |j                  j                  j                  j                          t        j                  j                  |j                         yt        |t              rt        j                  j                  |j                  j                  ddt        j                  d|j                  j                   d   |j                  j"                  z  z        z         t        j                  j%                  |j                  j                  d       yt        |t&              rt        j                  d|j(                  j*                  z        }t        j                  j                  |j(                  j                  | |       t        j                  j                  |j(                  j                  | |       yt        |t        j,                        rm|j                  j                  j                  d| j.                  j0                         |j                  %|j                  j                  j                          yyt        |t        j2                  t        j4                  f      rJ|j                  j                  j                          |j                  j                  j7                  d       yt        |t        j8                        rt        j                  j;                  |j                         |j                  jt        j                  |j<                  |j"                  |j                   d   z  z        }t        j                  j                  |j                  | |       yyy)	zInitialize the weights        r   )r=   stdr   r   )abNrH   )
isinstancer8   rK   weightdatanormal_biaszero_rO   inituniform_rZ   r,   convmathsqrtkernel_sizein_channels	constant_r2   
projectionin_featuresLinearconfiginitializer_range	LayerNorm	GroupNormfill_Conv1dkaiming_normal_groups)r\   moduleks      r*   _init_weightsz&UniSpeechPreTrainedModel._init_weights   s    f<=%%**222C##((..0GGV//0 @AGGOO""		!v{{'>'>q'AFKKD[D['["\]]  
 GGfkk..2 :;		!f//;;;<AGGV..55!qAGGV..33rQ?		*MM&&CT[[5R5R&S{{&  &&( 'r|| <=KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' +r)   input_lengthsc                     d }t        | j                  j                  | j                  j                        D ]  \  }} ||||      } |S )zH
        Computes the output length of the convolutional layers
        c                 >    t        j                  | |z
  |d      dz   S )Nfloor)rounding_moder   )r%   div)input_lengthrz   strides      r*   _conv_out_lengthzSUniSpeechPreTrainedModel._get_feat_extract_output_lengths.<locals>._conv_out_length   s"     99\K7wWZ[[[r)   )zipr   conv_kernelconv_stride)r\   r   r   rz   r   s        r*    _get_feat_extract_output_lengthsz9UniSpeechPreTrainedModel._get_feat_extract_output_lengths   sQ    
	\
 $'t{{'>'>@W@W#X 	QK,]KPM	Q r)   feature_vector_lengthattention_maskc                    |j                  d      d d df   }| j                  |      j                  t        j                        }|j
                  d   }t        j                  ||f|j                  |j                        }d|t        j                  |j
                  d   |j                        |dz
  f<   |j                  dg      j                  d      j                  dg      j                         }|S )Nr<   r:   r   )dtypedevicer   )r   )cumsumr   tor%   longrJ   zerosr   r   arangeflipbool)r\   r   r   non_padded_lengthsoutput_lengthsr]   s         r*   "_get_feature_vector_attention_maskz;UniSpeechPreTrainedModel._get_feature_vector_attention_mask   s     ,22r2:1b5A>>?QRUUV[V`V`a#))!,
./~7K7KTbTiTi
 uv^%9%9!%<^EZEZ[]kno]opq',,bT299"=BBB4HMMOr)   N)r!   r"   r#   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_supports_flash_attn_2_supports_sdpar   r   r%   
LongTensorintr   r   r(   r)   r*   rg   rg      s`    "L#$O&*#!N9BeEDTDTVYDY>Z  ]b]m]m r)   rg   c                       e Zd ZdefdZd Zd Z	 	 	 	 	 ddeej                     deej                     deej                     d	ee   d
ee   dee   deeef   fdZy)UniSpeechModelr   c                    t         j                  |       || _        t        |      | _        t        |      | _        |j                  dkD  s|j                  dkD  rEt        j                  t        j                  |j                        j                               | _        |j                   rt#        |      | _        nt'        |      | _        | j)                          y )Nrk   )rg   __init__r   r0   feature_extractorr2   feature_projectionmask_time_probmask_feature_probrO   	Parameterr%   Tensorr_   rv   masked_spec_embeddo_stable_layer_normr6   encoderr4   	post_init)r\   r   s     r*   r   zUniSpeechModel.__init__   s     ))&1!8!@"<V"D  3&&*B*BS*H%'\\%,,v?Q?Q2R2[2[2]%^D"&&:6BDL+F3DL 	r)   c                     t        d      NzNot needed for UniSpeechAttributeErrorr\   s    r*   freeze_feature_extractorz'UniSpeechModel.freeze_feature_extractor       788r)   c                     t        d      r   r   r   s    r*   freeze_feature_encoderz%UniSpeechModel.freeze_feature_encoder   r   r)   Nri   r   mask_time_indicesoutput_attentionsoutput_hidden_statesreturn_dictreturnc                 
   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |      }|j                  dd      }|| j                  |j                  d   |      }| j                  |      \  }}| j                  |||      }| j                  |||||      }	|	d   }|s
||f|	dd z   S t        |||	j                  |	j                        S )a/  
        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
            masked extracted features in *config.proj_codevector_dim* space.
        Nr   r   )r   r   r   r   r   r   r   )last_hidden_stateextract_featuresr   r    )r   r   r   use_return_dictr   	transposer   rJ   r   _mask_hidden_statesr   UniSpeechBaseModelOutputr   r    )
r\   ri   r   r   r   r   r   r   r   encoder_outputss
             r*   rd   zUniSpeechModel.forward   s@    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]11,?+55a;%!DDEUE[E[\]E^`noN*.*A*ABR*S''00->~ 1 
 ,,)/!5# ' 
 (*!#34qr7JJJ'+-)77&11	
 	
r)   )NNNNN)r!   r"   r#   r   r   r   r   r   r%   r   r&   r   r   r   r   rd   r(   r)   r*   r   r      s     "99 269=,0/3&*2
u||,2
 !.2
 $E$5$56	2

 $D>2
 'tn2
 d^2
 
u..	/2
r)   r   zZ
    UniSpeech Model with a vector-quantization module and ctc loss for pre-training.
    )custom_introc                   "    e Zd Zdef fdZdefdZd Zd Ze		 dde
j                  de
j                  d	e
j                  defd
       Ze	 	 	 	 ddee
j                     dee
j                     dee   dee   dee   deeef   fd       Z xZS )UniSpeechForPreTrainingr   c                 .   t         |   |       t        |      | _        t	        j
                  |j                        | _        t        |      | _	        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                        | _        t	        j                  |j                  |j                         | _        t	        j
                  |j$                        | _        | j)                          y )N)superr   r   rh   rO   Dropoutfeat_quantizer_dropoutdropout_featuresr8   	quantizerr   codevector_dimproj_codevector_dim	project_qr_   project_hidnum_ctc_classesctc_projfinal_dropoutdropoutr   )r\   r   	__class__s     r*   r   z UniSpeechForPreTraining.__init__!  s     '/ "

6+H+H I7?6#8#8&:T:TU99V%?%?ASAST		&"4"4f6L6LMzz&"6"67 	r)   rS   c                 &    || j                   _        y)zb
        Set the Gumbel softmax temperature to a given value. Only necessary for training
        N)r   rS   )r\   rS   s     r*   set_gumbel_temperaturez.UniSpeechForPreTraining.set_gumbel_temperature0  s     &1"r)   c                 X    t        j                  dt               | j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameters will
        not be updated during training.
        zThe method `freeze_feature_extractor` is deprecated and will be removed in Transformers v5. Please use the equivalent `freeze_feature_encoder` method instead.N)warningswarnFutureWarningr   r   s    r*   r   z0UniSpeechForPreTraining.freeze_feature_extractor6  s'    
 	Q	

 	##%r)   c                 L    | j                   j                  j                          y)z
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        N)rh   r   _freeze_parametersr   s    r*   r   z.UniSpeechForPreTraining.freeze_feature_encoderB  s    
 	((;;=r)   target_featuresnegative_featurespredicted_featuresc                     t        j                  | |gd      } t        j                  |j                         | j                         d      }|j	                  |       }||z  }|S )z
        Compute logits for contrastive loss based using cosine similarity as the distance measure between
        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
        r   r:   r<   )r%   catcosine_similarityrR   rT   )r   r   r   rS   logitss        r*   compute_contrastive_logitsz2UniSpeechForPreTraining.compute_contrastive_logitsI  sa      ))_6G$HaP(();)A)A)C_EZEZE\bde0 +%r)   ri   r   r   r   r   r   c                    ||n| j                   j                  }| j                  |||||      }|d   }| j                  |d         }| j	                  |      \  }	}
| j                  |	j                  | j
                  j                  j                              }	| j                  |	      }	t        j                  |j                  d      |j                  d            j                  | j                   j                        }|j                  dd      }t        j                   |      j#                         j                  |j$                        }|j                  dd      }|j'                  d      }|j)                  |d      |	j)                  | d      z   }| j+                  |      }| j-                  |      }d}|s||||	|
f|dd z   S ||	|
f|dd z   S t/        |||	|
|j0                  |j2                        S )	a  
        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, UniSpeechForPreTraining

        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> model = UniSpeechForPreTraining.from_pretrained("microsoft/unispeech-large-1500h-cv")
        >>> # TODO: Add full pretraining example
        ```Nr   r   r   r<   rk   r   )r   r   r   r   r   r    )r   r   rh   r   r   r   r   rp   r   r   r%   emptysizer   replace_probr   	bernoullir   r   rY   masked_fillr   r   r   r   r    )r\   ri   r   r   r   r   outputstransformer_featuresr   quantized_featuresr   prob_replace_matrixsampled_replace_matrixr   r   s                  r*   rd   zUniSpeechForPreTraining.forward]  s   * &1%<k$++B]B]..)/!5# ! 
  'qz  00<48NNCS4T11 "^^,>,A,A$..BWBWB]B],^_!--.@A#kk*>*C*CA*FH\HaHabcHdekkKK$$
 2;;AqA!&1D!E!J!J!L!O!OPdPkPk!l!7!A!A!Q!G!7!A!A"!E%112H#N**,B+BCH

 f%v& 24FH]^ahijikalll(*<>STW^_`_aWbbb,1'9"7!//))
 	
r)   )r   )NNNN)r!   r"   r#   r   r   r   r   r   r   re   r%   r&   r   r   r   r   r   r   r   r   rd   __classcell__)r   s   @r*   r   r     s    1# 1
&> 
 	** ,, "-- 	 &  26,0/3&*D
u||,D
 !.D
 $D>	D

 'tnD
 d^D
 
u33	4D
 D
r)   r   c                       e Zd Zy)UniSpeechForCTCNr-   r(   r)   r*   r  r    r.   r)   r  c                       e Zd Zy)"UniSpeechForSequenceClassificationNr-   r(   r)   r*   r  r    r.   r)   r  )r  r   r  r   rg   )0rx   r   dataclassesr   typingr   r   r   r%   torch.nnrO   modeling_outputsr   r	   modeling_utilsr
   utilsr   r   wav2vec2.modeling_wav2vec2r   r   r   r   r   r   r   r   r   configuration_unispeechr   
get_loggerr!   loggerr   r,   r0   r2   r4   r6   r8   rg   r   r   r   r  r  __all__r(   r)   r*   <module>r     s8     ! ) )   D - ,
 
 
 5 
		H	%  :K  :  :F	'F 		4 		!: 		 		&D 	*'%B *'Z E E EP 3 J
-} J
Z 
B
6 B

B
J	n 		)J 	r)   