"""PyTorch SpeechT5 model."""

import math
from typing import List, Optional, Tuple, Union

import numpy as np
import torch
import torch.utils.checkpoint
from torch import nn
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, L1Loss

from ...activations import ACT2FN
from ...generation import GenerationMixin
from ...integrations.deepspeed import is_deepspeed_zero3_enabled
from ...integrations.fsdp import is_fsdp_managed_module
from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
from ...modeling_outputs import (
    BaseModelOutput,
    BaseModelOutputWithPastAndCrossAttentions,
    Seq2SeqLMOutput,
    Seq2SeqModelOutput,
    Seq2SeqSpectrogramOutput,
)
from ...modeling_utils import PreTrainedModel
from ...utils import auto_docstring, logging
from .configuration_speecht5 import SpeechT5Config, SpeechT5HifiGanConfig


logger = logging.get_logger(__name__)


_HIDDEN_STATES_START_POSITION = 1
def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
    """
    Shift input ids one token to the right.
    """
    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
    shifted_input_ids[:, 0] = decoder_start_token_id

    if pad_token_id is None:
        raise ValueError("self.model.config.pad_token_id has to be defined.")
    # replace possible -100 values in labels by `pad_token_id`
    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)

    return shifted_input_ids
def shift_spectrograms_right(
    input_values: torch.Tensor, reduction_factor: int = 1, attention_mask: Optional[torch.Tensor] = None
):
    """
    Shift input spectrograms one timestep to the right. Also applies the reduction factor to the sequence length.
    """
    # thin out frames for reduction factor
    if reduction_factor > 1:
        input_values = input_values[:, reduction_factor - 1 :: reduction_factor]
        if attention_mask is not None:
            attention_mask = attention_mask[:, reduction_factor - 1 :: reduction_factor]

    shifted_input_values = input_values.new_zeros(input_values.shape)
    shifted_input_values[:, 1:] = input_values[:, :-1].clone()

    # replace possible -100 values in labels by zeros
    shifted_input_values.masked_fill_(shifted_input_values == -100.0, 0.0)

    return shifted_input_values, attention_mask


def _compute_mask_indices(
    shape: Tuple[int, int],
    mask_prob: float,
    mask_length: int,
    attention_mask: Optional[torch.LongTensor] = None,
    min_masks: int = 0,
) -> np.ndarray:
    """
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r   z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr   r   )intmax)input_lengthnum_masked_spanepsilonr4   r3   r5   sequence_lengths     r(   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span   so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr*   Nr!   dtyper   F)replace)r%   nprandomranditemdetachsumtolistrangezerosboolchoicearangelenconcatenateonesint32appendarraybroadcast_toreshaper;   put_along_axis)r#   r3   r4   r-   r5   
batch_sizer@   _input_lengthsspec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr<   r=   spec_aug_mask_idxdummy_mask_idxoffsetsr>   r?   s    `` `            @@r(   _compute_mask_indicesrb   Z   s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5NoLayerNormConvLayerc                 d   t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        y )Nr   r   kernel_sizestridebias)super__init__conv_dimin_conv_dimout_conv_dimr   Conv1dconv_kernelconv_stride	conv_biasconvr   feat_extract_activation
activationselfconfiglayer_id	__class__s      r(   rk   z%SpeechT5NoLayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@r*   c                 J    | j                  |      }| j                  |      }|S N)rs   ru   rw   hidden_statess     r(   forwardz$SpeechT5NoLayerNormConvLayer.forward   s$    		-06r*   r   __name__
__module____qualname__rk   r   __classcell__rz   s   @r(   rd   rd      s    Ar*   rd   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5LayerNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        j                  | j                  d      | _        t        |j                     | _        y )Nr   r   rf   T)elementwise_affine)rj   rk   rl   rm   rn   r   ro   rp   rq   rr   rs   	LayerNorm
layer_normr   rt   ru   rv   s      r(   rk   z#SpeechT5LayerNormConvLayer.__init__   s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 ,,t'8'8TR !?!?@r*   c                     | j                  |      }|j                  dd      }| j                  |      }|j                  dd      }| j                  |      }|S )Nr!   )rs   	transposer   ru   r}   s     r(   r   z"SpeechT5LayerNormConvLayer.forward   sV    		-0%//B76%//B76r*   r   r   r   s   @r(   r   r      s    Ar*   r   c                   &     e Zd Zd fd	Zd Z xZS )SpeechT5GroupNormConvLayerc                    t         |           |dkD  r|j                  |dz
     nd| _        |j                  |   | _        t        j                  | j                  | j                  |j                  |   |j                  |   |j                        | _
        t        |j                     | _        t        j                  | j                  | j                  d      | _        y )Nr   r   rf   T)
num_groupsnum_channelsaffine)rj   rk   rl   rm   rn   r   ro   rp   rq   rr   rs   r   rt   ru   	GroupNormr   rv   s      r(   rk   z#SpeechT5GroupNormConvLayer.__init__  s    <DqL6??8a<8a"OOH5II**84%%h/!!
	 !!?!?@,,$2C2CRVRcRclpqr*   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r|   )rs   r   ru   r}   s     r(   r   z"SpeechT5GroupNormConvLayer.forward  s2    		-066r*   r   r   r   s   @r(   r   r     s    r r*   r   c            	            e Zd ZdZddededee   f fdZddededee   fdZeddededee   fd       Z	 e
j                         dd	e
j                  d
efd       Z	 dd	e
j                  ded
ee   fdZ xZS )%SpeechT5SinusoidalPositionalEmbeddingzDThis module produces sinusoidal positional embeddings of any length.num_positionsembedding_dimpadding_idxc                     t         |           d| _        || _        || _        | j                  || j                  z   ||       y N   )rj   rk   offsetr   r   make_weights)rw   r   r   r   rz   s       r(   rk   z.SpeechT5SinusoidalPositionalEmbedding.__init__   s@    *&-$++5}kRr*   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )NweightsrB   deviceF
persistent)get_embeddinghasattrtor   rB   r   register_buffer)rw   r   r   r   emb_weightss        r(   r   z2SpeechT5SinusoidalPositionalEmbedding.make_weights'  s[    ((T4#%..t||/A/A$,,J]J].^KYFr*   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
        description in Section 3.5 of "Attention Is All You Need".
        """
        half_dim = embedding_dim // 2
        emb = math.log(10000) / (half_dim - 1)
        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
        if embedding_dim % 2 == 1:
            # zero pad
            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
        if padding_idx is not None:
            emb[padding_idx, :] = 0

        return emb.to(torch.get_default_dtype())

    @torch.no_grad()
    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
        bsz, seq_len = input_ids.size()
        # Create the position ids from the input token ids. Any padded tokens remain padded.
        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
            input_ids.device
        )

        # expand embeddings if needed
        max_pos = self.padding_idx + 1 + seq_len
        if max_pos > self.weights.size(0):
            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)

        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()

    def create_position_ids_from_input_ids(
        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
    ):
        """
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
        symbols are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:
        Returns: torch.Tensor
        """
        # The series of casts and type-conversions here are carefully balanced to work with ONNX export and XLA.
        mask = input_ids.ne(padding_idx).int()
        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
        return incremental_indices.long() + padding_idx


class SpeechT5PositionalConvEmbedding(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.conv = nn.Conv1d(
            config.hidden_size,
            config.hidden_size,
            kernel_size=config.num_conv_pos_embeddings,
            padding=config.num_conv_pos_embeddings // 2,
            groups=config.num_conv_pos_embedding_groups,
        )

        weight_norm = nn.utils.weight_norm
        if hasattr(nn.utils.parametrizations, "weight_norm"):
            weight_norm = nn.utils.parametrizations.weight_norm

        if is_deepspeed_zero3_enabled():
            import deepspeed

            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
                self.conv = weight_norm(self.conv, name="weight", dim=2)
            if hasattr(self.conv, "parametrizations"):
                weight_g = self.conv.parametrizations.weight.original0
                weight_v = self.conv.parametrizations.weight.original1
            else:
                weight_g = self.conv.weight_g
                weight_v = self.conv.weight_v
            deepspeed.zero.register_external_parameter(self, weight_v)
            deepspeed.zero.register_external_parameter(self, weight_g)
        else:
            self.conv = weight_norm(self.conv, name="weight", dim=2)

        self.padding = SpeechT5SamePadLayer(config.num_conv_pos_embeddings)
        self.activation = ACT2FN[config.feat_extract_activation]

    def forward(self, hidden_states):
        hidden_states = hidden_states.transpose(1, 2)

        hidden_states = self.conv(hidden_states)
        hidden_states = self.padding(hidden_states)
        hidden_states = self.activation(hidden_states)

        hidden_states = hidden_states.transpose(1, 2)
        return hidden_states
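# Illustrative sketch, not part of the upstream module: the table built by
# `SpeechT5SinusoidalPositionalEmbedding.get_embedding` concatenates sin and cos halves
# per position. The shapes follow from the code above; the sizes are arbitrary.
#
#     table = SpeechT5SinusoidalPositionalEmbedding.get_embedding(
#         num_embeddings=10, embedding_dim=6, padding_idx=0
#     )
#     table.shape  # torch.Size([10, 6])
#     table[0]     # all zeros: the row at padding_idx is zeroed out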
class SpeechT5ScaledPositionalEncoding(nn.Module):
    """
    Scaled positional encoding, see §3.2 in https://arxiv.org/abs/1809.08895
    """

    def __init__(self, dropout, dim, max_len=5000):
        pe = torch.zeros(max_len, dim)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, dim, 2, dtype=torch.int64).float() * -(math.log(10000.0) / dim))
        pe[:, 0::2] = torch.sin(position.float() * div_term)
        pe[:, 1::2] = torch.cos(position.float() * div_term)
        pe = pe.unsqueeze(0)
        super().__init__()
        self.register_buffer("pe", pe, persistent=False)
        self.dropout = nn.Dropout(p=dropout)
        self.dim = dim
        self.alpha = torch.nn.Parameter(torch.tensor(1.0))

    def forward(self, emb):
        emb = emb + self.alpha * self.pe[:, : emb.size(1)]
        emb = self.dropout(emb)
        return emb


class SpeechT5RelativePositionalEncoding(torch.nn.Module):
    def __init__(self, dim, max_length=1000):
        super().__init__()
        self.dim = dim
        self.max_length = max_length
        self.pe_k = torch.nn.Embedding(2 * max_length, dim)

    def forward(self, hidden_states):
        seq_len = hidden_states.shape[1]
        pos_seq = torch.arange(0, seq_len).to(device=hidden_states.device, dtype=torch.long)
        pos_seq = pos_seq[:, None] - pos_seq[None, :]

        pos_seq[pos_seq < -self.max_length] = -self.max_length
        pos_seq[pos_seq >= self.max_length] = self.max_length - 1
        pos_seq = pos_seq + self.max_length

        return self.pe_k(pos_seq)


class SpeechT5SamePadLayer(nn.Module):
    def __init__(self, num_conv_pos_embeddings):
        super().__init__()
        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0

    def forward(self, hidden_states):
        if self.num_pad_remove > 0:
            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
        return hidden_states


class SpeechT5FeatureEncoder(nn.Module):
    """Construct the features from raw audio waveform"""

    def __init__(self, config):
        super().__init__()

        if config.feat_extract_norm == "group":
            conv_layers = [SpeechT5GroupNormConvLayer(config, layer_id=0)] + [
                SpeechT5NoLayerNormConvLayer(config, layer_id=i + 1)
                for i in range(config.num_feat_extract_layers - 1)
            ]
        elif config.feat_extract_norm == "layer":
            conv_layers = [
                SpeechT5LayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
            ]
        else:
            raise ValueError(
                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
            )
        self.conv_layers = nn.ModuleList(conv_layers)
        self.gradient_checkpointing = False
        self._requires_grad = True

    def _freeze_parameters(self):
        for param in self.parameters():
            param.requires_grad = False
        self._requires_grad = False

    def forward(self, input_values):
        hidden_states = input_values[:, None]

        # make sure hidden_states require grad for gradient_checkpointing
        if self._requires_grad and self.training:
            hidden_states.requires_grad = True

        for conv_layer in self.conv_layers:
            if self._requires_grad and self.gradient_checkpointing and self.training:
                hidden_states = self._gradient_checkpointing_func(
                    conv_layer.__call__,
                    hidden_states,
                )
            else:
                hidden_states = conv_layer(hidden_states)

        return hidden_states


class SpeechT5FeatureProjection(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
        self.dropout = nn.Dropout(config.feat_proj_dropout)

    def forward(self, hidden_states):
        norm_hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.projection(norm_hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states, norm_hidden_states


class SpeechT5SpeechEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.feature_encoder = SpeechT5FeatureEncoder(config)
        self.feature_projection = SpeechT5FeatureProjection(config)

        # model only needs masking vector if mask prob is > 0.0
        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())

        self.pos_conv_embed = SpeechT5PositionalConvEmbedding(config)
        self.pos_sinusoidal_embed = SpeechT5SinusoidalPositionalEmbedding(
            config.max_speech_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def freeze_feature_encoder(self):
        self.feature_encoder._freeze_parameters()

    def forward(
        self,
        input_values: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        mask_time_indices: Optional[torch.FloatTensor] = None,
    ):
        extract_features = self.feature_encoder(input_values)
        extract_features = extract_features.transpose(1, 2)

        if attention_mask is not None:
            # compute reduced attention_mask corresponding to feature vectors
            attention_mask = self._get_feature_vector_attention_mask(
                extract_features.shape[1],
                attention_mask,
            )

        hidden_states, extract_features = self.feature_projection(extract_features)
        hidden_states = self._mask_hidden_states(
            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
        )

        positional_conv_embedding = self.pos_conv_embed(hidden_states)
        hidden_states = hidden_states + positional_conv_embedding

        if attention_mask is not None:
            padding_mask = attention_mask.ne(1).long()
        else:
            padding_mask = torch.zeros(hidden_states.shape[:2], dtype=torch.long, device=hidden_states.device)

        positional_sinusoidal_embeddings = self.pos_sinusoidal_embed(padding_mask)
        hidden_states = hidden_states + positional_sinusoidal_embeddings

        return hidden_states, attention_mask

    def _get_feature_vector_attention_mask(self, feature_vector_length: int, attention_mask: torch.LongTensor):
        # Effectively attention_mask.sum(-1), but not inplace to be able to run on inference mode.
        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths).to(torch.long)
        batch_size = attention_mask.shape[0]

        attention_mask = torch.zeros(
            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
        )
        # these two operations make sure that all values before the output lengths idxs are attended to
        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
        return attention_mask

    def _get_feat_extract_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
        """
        Computes the output length of the convolutional layers
        """

        def _conv_out_length(input_length, kernel_size, stride):
            # 1D convolutional layer output length formula taken
            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1

        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)

        return input_lengths

    def _mask_hidden_states(
        self,
        hidden_states: torch.FloatTensor,
        mask_time_indices: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
    ):
        """
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        """

        # `config.apply_spec_augment` can set masking to False
        if not getattr(self.config, "apply_spec_augment", True):
            return hidden_states

        # generate indices & apply SpecAugment along time axis
        batch_size, sequence_length, hidden_size = hidden_states.size()

        if mask_time_indices is not None:
            # apply SpecAugment along time axis with given mask_time_indices
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
        elif self.config.mask_time_prob > 0 and self.training:
            mask_time_indices = _compute_mask_indices(
                (batch_size, sequence_length),
                mask_prob=self.config.mask_time_prob,
                mask_length=self.config.mask_time_length,
                attention_mask=attention_mask,
                min_masks=self.config.mask_time_min_masks,
            )
            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)

        if self.config.mask_feature_prob > 0 and self.training:
            # generate indices & apply SpecAugment along feature axis
            mask_feature_indices = _compute_mask_indices(
                (batch_size, hidden_size),
                mask_prob=self.config.mask_feature_prob,
                mask_length=self.config.mask_feature_length,
                min_masks=self.config.mask_feature_min_masks,
            )
            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
            hidden_states[mask_feature_indices] = 0

        return hidden_states


class SpeechT5SpeechDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.layers = nn.ModuleList(
            [
                nn.Linear(
                    config.num_mel_bins if i == 0 else config.speech_decoder_prenet_units,
                    config.speech_decoder_prenet_units,
                )
                for i in range(config.speech_decoder_prenet_layers)
            ]
        )

        self.final_layer = nn.Linear(config.speech_decoder_prenet_units, config.hidden_size)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_speech_positions,
        )
        self.speaker_embeds_layer = nn.Linear(config.speaker_embedding_dim + config.hidden_size, config.hidden_size)

    def _consistent_dropout(self, inputs_embeds, p):
        mask = torch.bernoulli(inputs_embeds[0], p=p)
        all_masks = mask.unsqueeze(0).repeat(inputs_embeds.size(0), 1, 1)
        return torch.where(all_masks == 1, inputs_embeds, 0) * 1 / (1 - p)

    def forward(
        self,
        input_values: torch.Tensor,
        speaker_embeddings: Optional[torch.Tensor] = None,
    ):
        # Dropout is always applied, even when evaluating. See §2.2 in https://arxiv.org/abs/1712.05884.

        inputs_embeds = input_values
        for layer in self.layers:
            inputs_embeds = nn.functional.relu(layer(inputs_embeds))
            inputs_embeds = self._consistent_dropout(inputs_embeds, self.config.speech_decoder_prenet_dropout)

        inputs_embeds = self.final_layer(inputs_embeds)
        inputs_embeds = self.encode_positions(inputs_embeds)

        if speaker_embeddings is not None:
            speaker_embeddings = nn.functional.normalize(speaker_embeddings)
            speaker_embeddings = speaker_embeddings.unsqueeze(1).expand(-1, inputs_embeds.size(1), -1)
            inputs_embeds = torch.cat([inputs_embeds, speaker_embeddings], dim=-1)
            inputs_embeds = nn.functional.relu(self.speaker_embeds_layer(inputs_embeds))

        return inputs_embeds


class SpeechT5BatchNormConvLayer(nn.Module):
    def __init__(self, config, layer_id=0):
        super().__init__()

        if layer_id == 0:
            in_conv_dim = config.num_mel_bins
        else:
            in_conv_dim = config.speech_decoder_postnet_units

        if layer_id == config.speech_decoder_postnet_layers - 1:
            out_conv_dim = config.num_mel_bins
        else:
            out_conv_dim = config.speech_decoder_postnet_units

        self.conv = nn.Conv1d(
            in_conv_dim,
            out_conv_dim,
            kernel_size=config.speech_decoder_postnet_kernel,
            stride=1,
            padding=(config.speech_decoder_postnet_kernel - 1) // 2,
            bias=False,
        )
        self.batch_norm = nn.BatchNorm1d(out_conv_dim)

        if layer_id < config.speech_decoder_postnet_layers - 1:
            self.activation = nn.Tanh()
        else:
            self.activation = None

        self.dropout = nn.Dropout(config.speech_decoder_postnet_dropout)

    def forward(self, hidden_states):
        hidden_states = self.conv(hidden_states)
        hidden_states = self.batch_norm(hidden_states)
        if self.activation is not None:
            hidden_states = self.activation(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states


class SpeechT5SpeechDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config

        self.feat_out = nn.Linear(config.hidden_size, config.num_mel_bins * config.reduction_factor)
        self.prob_out = nn.Linear(config.hidden_size, config.reduction_factor)

        self.layers = nn.ModuleList(
            [SpeechT5BatchNormConvLayer(config, i) for i in range(config.speech_decoder_postnet_layers)]
        )

    def forward(self, hidden_states: torch.Tensor):
        outputs_before_postnet = self.feat_out(hidden_states).view(hidden_states.size(0), -1, self.config.num_mel_bins)
        outputs_after_postnet = self.postnet(outputs_before_postnet)
        logits = self.prob_out(hidden_states).view(hidden_states.size(0), -1)
        return outputs_before_postnet, outputs_after_postnet, logits

    def postnet(self, hidden_states: torch.Tensor):
        layer_output = hidden_states.transpose(1, 2)
        for layer in self.layers:
            layer_output = layer(layer_output)
        return hidden_states + layer_output.transpose(1, 2)


class SpeechT5TextEncoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
        self.encode_positions = SpeechT5ScaledPositionalEncoding(
            config.positional_dropout,
            config.hidden_size,
            config.max_text_positions,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(self, input_ids: torch.Tensor):
        inputs_embeds = self.embed_tokens(input_ids)
        inputs_embeds = self.encode_positions(inputs_embeds)
        return inputs_embeds


class SpeechT5TextDecoderPrenet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.dropout = nn.Dropout(config.positional_dropout)
        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0

        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)

        self.embed_positions = SpeechT5SinusoidalPositionalEmbedding(
            config.max_text_positions + config.pad_token_id + 1,
            config.hidden_size,
            config.pad_token_id,
        )

    def get_input_embeddings(self):
        return self.embed_tokens

    def set_input_embeddings(self, value):
        self.embed_tokens = value

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
    ):
        if input_ids is not None:
            input_shape = input_ids.size()
            input_ids = input_ids.view(-1, input_shape[-1])
        else:
            raise ValueError("You have to specify `decoder_input_ids`")

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
        positions = self.embed_positions(input_ids, past_key_values_length)

        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
        inputs_embeds += positions
        inputs_embeds = self.dropout(inputs_embeds)

        return inputs_embeds, attention_mask


class SpeechT5TextDecoderPostnet(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.config = config
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

    def forward(self, hidden_states: torch.Tensor):
        return self.lm_head(hidden_states)

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings
class SpeechT5Attention(nn.Module):
    """
    Multi-headed attention from 'Attention Is All You Need' paper with relative position bias (see
    https://aclanthology.org/N18-2074.pdf)
    """

    def __init__(
        self,
        embed_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        is_decoder: bool = False,
        bias: bool = True,
    ):
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.dropout = dropout
        self.head_dim = embed_dim // num_heads
        if self.head_dim * num_heads != self.embed_dim:
            raise ValueError(
                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
                f" {num_heads})."
            )
        self.scaling = self.head_dim**-0.5
        self.is_decoder = is_decoder

        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()

    def forward(
        self,
        hidden_states: torch.Tensor,
        key_value_states: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        """Input shape: Batch x Time x Channel"""

        # if key_value_states are provided this layer is used as a cross-attention layer for the decoder
        is_cross_attention = key_value_states is not None

        bsz, tgt_len, _ = hidden_states.size()

        # get query proj
        query_states = self.q_proj(hidden_states) * self.scaling
        # get key, value proj
        if is_cross_attention and past_key_value is not None:
            # reuse k, v, cross_attentions
            key_states = past_key_value[0]
            value_states = past_key_value[1]
        elif is_cross_attention:
            # cross_attentions
            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
        elif past_key_value is not None:
            # reuse k, v, self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
            key_states = torch.cat([past_key_value[0], key_states], dim=2)
            value_states = torch.cat([past_key_value[1], value_states], dim=2)
        else:
            # self_attention
            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)

        if self.is_decoder:
            past_key_value = (key_states, value_states)

        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
        key_states = key_states.view(*proj_shape)
        value_states = value_states.view(*proj_shape)

        src_len = key_states.size(1)
        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))

        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
                f" {attn_weights.size()}"
            )

        # relative attention bias
        if position_bias is not None:
            reshape_q = query_states.contiguous().view(bsz * self.num_heads, -1, self.head_dim).transpose(0, 1)
            rel_pos_bias = torch.matmul(reshape_q, position_bias.transpose(-2, -1))
            rel_pos_bias = rel_pos_bias.transpose(0, 1).view(
                bsz * self.num_heads, position_bias.size(0), position_bias.size(1)
            )
            attn_weights += rel_pos_bias

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        if layer_head_mask is not None:
            if layer_head_mask.size() != (self.num_heads,):
                raise ValueError(
                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
                    f" {layer_head_mask.size()}"
                )
            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)

        if output_attentions:
            # this operation is a bit awkward, but it's required to make sure that attn_weights
            # keeps its gradient. In order to do so, attn_weights have to be reshaped twice and
            # have to be reused in the following
            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
        else:
            attn_weights_reshaped = None

        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)

        attn_output = torch.bmm(attn_probs, value_states)

        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
        attn_output = attn_output.transpose(1, 2)
        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)

        attn_output = self.out_proj(attn_output)

        return attn_output, attn_weights_reshaped, past_key_value


class SpeechT5FeedForward(nn.Module):
    def __init__(self, config, intermediate_size):
        super().__init__()
        self.intermediate_dropout = nn.Dropout(config.activation_dropout)

        self.intermediate_dense = nn.Linear(config.hidden_size, intermediate_size)
        if isinstance(config.hidden_act, str):
            self.intermediate_act_fn = ACT2FN[config.hidden_act]
        else:
            self.intermediate_act_fn = config.hidden_act

        self.output_dense = nn.Linear(intermediate_size, config.hidden_size)
        self.output_dropout = nn.Dropout(config.hidden_dropout)

    def forward(self, hidden_states):
        hidden_states = self.intermediate_dense(hidden_states)
        hidden_states = self.intermediate_act_fn(hidden_states)
        hidden_states = self.intermediate_dropout(hidden_states)

        hidden_states = self.output_dense(hidden_states)
        hidden_states = self.output_dropout(hidden_states)
        return hidden_states
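# Illustrative shape bookkeeping, not part of the upstream module: the relative
# position bias branch in `SpeechT5Attention.forward` with batch=2, heads=12,
# head_dim=64, seq_len=100 (all sizes arbitrary):
#
#     query_states:  (2 * 12, 100, 64)   after the `proj_shape` view
#     position_bias: (100, 100, 64)      from SpeechT5RelativePositionalEncoding
#     reshape_q:     (100, 2 * 12, 64)   transposed so matmul broadcasts over positions
#     rel_pos_bias:  (100, 2 * 12, 100)  then viewed back to (2 * 12, 100, 100)
#
# The bias is added to `attn_weights` before the softmax.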
class SpeechT5EncoderLayer(nn.Module):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.attention = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=False,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.feed_forward = SpeechT5FeedForward(config, config.encoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        position_bias: Optional[torch.Tensor] = None,
        output_attentions: bool = False,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`):
                input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`):
                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
                large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(config.encoder_attention_heads,)`.
            position_bias (`torch.FloatTensor`):
                relative position embeddings of size `(seq_len, seq_len, hidden_size // encoder_attention_heads)`
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states
        hidden_states, attn_weights, _ = self.attention(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            position_bias=position_bias,
            output_attentions=output_attentions,
        )

        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (attn_weights,)

        return outputs
class SpeechT5DecoderLayer(nn.Module):
    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.self_attn = SpeechT5Attention(
            embed_dim=config.hidden_size,
            num_heads=config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.self_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.encoder_attn = SpeechT5Attention(
            config.hidden_size,
            config.decoder_attention_heads,
            dropout=config.attention_dropout,
            is_decoder=True,
        )
        self.encoder_attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

        self.feed_forward = SpeechT5FeedForward(config, config.decoder_ffn_dim)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        encoder_hidden_states: Optional[torch.Tensor] = None,
        encoder_attention_mask: Optional[torch.Tensor] = None,
        layer_head_mask: Optional[torch.Tensor] = None,
        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
        past_key_value: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = True,
    ):
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, hidden_size)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            encoder_hidden_states (`torch.FloatTensor`):
                cross attention input to the layer of shape `(batch, seq_len, hidden_size)`
            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
                `(encoder_attention_heads,)`.
            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
                size `(decoder_attention_heads,)`.
            past_key_value (`Tuple(torch.FloatTensor)`): cached past key and value projection states
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        """
        residual = hidden_states

        # Self Attention
        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
        # add present self-attn cache to positions 1,2 of present_key_value tuple
        hidden_states, self_attn_weights, present_key_value = self.self_attn(
            hidden_states=hidden_states,
            past_key_value=self_attn_past_key_value,
            attention_mask=attention_mask,
            layer_head_mask=layer_head_mask,
            output_attentions=output_attentions,
        )
        hidden_states = self.dropout(hidden_states)
        hidden_states = residual + hidden_states
        hidden_states = self.self_attn_layer_norm(hidden_states)

        # Cross-Attention Block
        cross_attn_present_key_value = None
        cross_attn_weights = None
        if encoder_hidden_states is not None:
            residual = hidden_states

            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
                hidden_states=hidden_states,
                key_value_states=encoder_hidden_states,
                attention_mask=encoder_attention_mask,
                layer_head_mask=cross_attn_layer_head_mask,
                past_key_value=cross_attn_past_key_value,
                output_attentions=output_attentions,
            )
            hidden_states = self.dropout(hidden_states)
            hidden_states = residual + hidden_states
            hidden_states = self.encoder_attn_layer_norm(hidden_states)

            # add cross-attn to positions 3,4 of present_key_value tuple
            present_key_value = present_key_value + cross_attn_present_key_value

        hidden_states = hidden_states + self.feed_forward(hidden_states)
        hidden_states = self.final_layer_norm(hidden_states)

        outputs = (hidden_states,)

        if output_attentions:
            outputs += (self_attn_weights, cross_attn_weights)

        if use_cache:
            outputs += (present_key_value,)

        return outputs


@auto_docstring
class SpeechT5PreTrainedModel(PreTrainedModel):
    config_class = SpeechT5Config
    base_model_prefix = "speecht5"
    main_input_name = "input_values"
    supports_gradient_checkpointing = True

    def _init_weights(self, module):
        """Initialize the weights"""
        if isinstance(module, SpeechT5PositionalConvEmbedding):
            nn.init.normal_(
                module.conv.weight,
                mean=0,
                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
            )
            nn.init.constant_(module.conv.bias, 0)
        elif isinstance(module, SpeechT5FeatureProjection):
            k = math.sqrt(1 / module.projection.in_features)
            nn.init.uniform_(module.projection.weight, a=-k, b=k)
            nn.init.uniform_(module.projection.bias, a=-k, b=k)
        elif isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        elif isinstance(module, nn.Conv1d):
            nn.init.kaiming_normal_(module.weight)
            if module.bias is not None:
                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
                nn.init.uniform_(module.bias, a=-k, b=k)
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
    Transformer encoder consisting of *config.encoder_layers* layers. Each layer is a [`SpeechT5EncoderLayer`].
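
    A minimal shape-level sketch (assumes a default `SpeechT5Config`; the encoder consumes
    already-embedded features, so random hidden states stand in for prenet output here):

    ```python
    >>> import torch

    >>> config = SpeechT5Config()
    >>> encoder = SpeechT5Encoder(config)
    >>> features = torch.randn(1, 50, config.hidden_size)
    >>> encoder(features).last_hidden_state.shape == features.shape
    True
    ```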
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.dropout = nn.Dropout(config.hidden_dropout)
        self.layerdrop = config.encoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5EncoderLayer(config) for _ in range(config.encoder_layers)])

        self.embed_positions = SpeechT5RelativePositionalEncoding(
            config.hidden_size // config.encoder_attention_heads, config.encoder_max_relative_position
        )

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        r"""
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the encoder prenet.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if attention_mask is not None:
            # extend attention_mask to 4D
            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)

        hidden_states = self.layer_norm(hidden_states)
        hidden_states = self.dropout(hidden_states)

        position_bias = self.embed_positions(hidden_states)

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None

        # check if head_mask has a correct number of layers specified if desired
        if head_mask is not None:
            if head_mask.size()[0] != len(self.layers):
                raise ValueError(
                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
                    f" {head_mask.size()[0]}."
                )

        for idx, encoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop

            if not skip_the_layer or synced_gpus:
                # under fsdp or deepspeed zero3 all gpus must run in sync
                if self.gradient_checkpointing and self.training:
                    layer_outputs = self._gradient_checkpointing_func(
                        encoder_layer.__call__,
                        hidden_states,
                        attention_mask,
                        head_mask[idx] if head_mask is not None else None,
                        position_bias,
                        output_attentions,
                    )
                else:
                    layer_outputs = encoder_layer(
                        hidden_states,
                        attention_mask=attention_mask,
                        position_bias=position_bias,
                        layer_head_mask=head_mask[idx] if head_mask is not None else None,
                        output_attentions=output_attentions,
                    )
                hidden_states = layer_outputs[0]

            if skip_the_layer:
                layer_outputs = (None, None)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)

        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        if not return_dict:
            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)

        return BaseModelOutput(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
        )


class SpeechT5EncoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5SpeechEncoderPrenet to convert the audio waveform data to
    hidden features.
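
    A rough sketch of the data flow (raw waveform in, downsampled hidden features out; the prenet's
    convolutional feature encoder shortens the time axis, so far fewer frames come out than samples
    went in):

    ```python
    >>> import torch

    >>> config = SpeechT5Config()
    >>> encoder = SpeechT5EncoderWithSpeechPrenet(config)
    >>> waveform = torch.randn(1, 16000)  # roughly one second of 16 kHz audio
    >>> hidden = encoder(waveform).last_hidden_state
    >>> hidden.shape[1] < waveform.shape[1]
    True
    ```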
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        hidden_states, attention_mask = self.prenet(input_values, attention_mask)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Encoder that applies SpeechT5TextEncoderPrenet to convert the input_ids to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextEncoderPrenet(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        hidden_states = self.prenet(input_values)

        outputs = self.wrapped_encoder(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5EncoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
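
    A sketch of how it composes with [`SpeechT5Model`] (hidden states are fed straight to the
    transformer, bypassing any prenet; `SpeechT5Model` defaults to this prenet-less encoder/decoder
    pair when none are passed):

    ```python
    >>> import torch

    >>> config = SpeechT5Config()
    >>> model = SpeechT5Model(config)
    >>> encoder_states = torch.randn(1, 20, config.hidden_size)
    >>> decoder_states = torch.randn(1, 10, config.hidden_size)
    >>> outputs = model(input_values=encoder_states, decoder_input_values=decoder_states)
    >>> outputs.last_hidden_state.shape == decoder_states.shape
    True
    ```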
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_encoder = SpeechT5Encoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: torch.FloatTensor,
        attention_mask: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutput]:
        return self.wrapped_encoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class SpeechT5Decoder(SpeechT5PreTrainedModel):
    """
    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SpeechT5DecoderLayer`]
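
    A shape-only sketch (inputs are pre-embedded hidden states; with no encoder states supplied the
    cross-attention blocks are skipped):

    ```python
    >>> import torch

    >>> config = SpeechT5Config()
    >>> decoder = SpeechT5Decoder(config)
    >>> hidden_states = torch.randn(1, 10, config.hidden_size)
    >>> decoder(hidden_states).last_hidden_state.shape == hidden_states.shape
    True
    ```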
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.layerdrop = config.decoder_layerdrop

        self.layers = nn.ModuleList([SpeechT5DecoderLayer(config) for _ in range(config.decoder_layers)])

        self.gradient_checkpointing = False

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        r"""
        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, feature_size)`):
                Features extracted from the speech or text input by the decoder prenet.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
                of the decoder.
            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
                selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
                Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
                cross-attention on hidden heads. Mask values selected in `[0, 1]`:

                - 1 indicates the head is **not masked**,
                - 0 indicates the head is **masked**.

            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.

                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.

                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        input_shape = hidden_states.size()[:-1]

        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0

        attention_mask = _prepare_4d_causal_attention_mask(
            attention_mask, input_shape, hidden_states, past_key_values_length
        )

        # expand encoder attention mask: [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
        if encoder_hidden_states is not None and encoder_attention_mask is not None:
            encoder_attention_mask = _prepare_4d_attention_mask(
                encoder_attention_mask, hidden_states.dtype, tgt_len=input_shape[-1]
            )

        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)

        if self.gradient_checkpointing and self.training:
            if use_cache:
                logger.warning_once(
                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
                )
                use_cache = False

        # decoder layers
        all_hidden_states = () if output_hidden_states else None
        all_self_attentions = () if output_attentions else None
        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
        next_decoder_cache = () if use_cache else None

        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
            if attn_mask is not None:
                if attn_mask.size()[0] != len(self.layers):
                    raise ValueError(
                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
                        f" {attn_mask.size()[0]}."
                    )

        for idx, decoder_layer in enumerate(self.layers):
            if output_hidden_states:
                all_hidden_states = all_hidden_states + (hidden_states,)

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            skip_the_layer = False
            if self.training:
                dropout_probability = torch.rand([])
                skip_the_layer = dropout_probability < self.layerdrop
            if skip_the_layer and not synced_gpus:
                continue

            past_key_value = past_key_values[idx] if past_key_values is not None else None

            if self.gradient_checkpointing and self.training:
                layer_outputs = self._gradient_checkpointing_func(
                    decoder_layer.__call__,
                    hidden_states,
                    attention_mask,
                    encoder_hidden_states,
                    encoder_attention_mask,
                    head_mask[idx] if head_mask is not None else None,
                    cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
                    None,
                    output_attentions,
                    use_cache,
                )
            else:
                layer_outputs = decoder_layer(
                    hidden_states,
                    attention_mask=attention_mask,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_attention_mask=encoder_attention_mask,
                    layer_head_mask=head_mask[idx] if head_mask is not None else None,
                    cross_attn_layer_head_mask=(
                        cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
                    ),
                    past_key_value=past_key_value,
                    output_attentions=output_attentions,
                    use_cache=use_cache,
                )
            hidden_states = layer_outputs[0]

            if use_cache:
                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)

            if output_attentions:
                all_self_attentions = all_self_attentions + (layer_outputs[1],)
                if encoder_hidden_states is not None:
                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)

        # add hidden states from the last decoder layer
        if output_hidden_states:
            all_hidden_states = all_hidden_states + (hidden_states,)

        next_cache = next_decoder_cache if use_cache else None
        if not return_dict:
            return tuple(
                v
                for v in [hidden_states, next_cache, all_hidden_states, all_self_attentions, all_cross_attentions]
                if v is not None
            )

        return BaseModelOutputWithPastAndCrossAttentions(
            last_hidden_state=hidden_states,
            past_key_values=next_cache,
            hidden_states=all_hidden_states,
            attentions=all_self_attentions,
            cross_attentions=all_cross_attentions,
        )


class SpeechT5DecoderWithSpeechPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5SpeechDecoderPrenet to convert log-mel filterbanks to hidden
    features.
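
    A sketch of the prenet hand-off (log-mel frames plus an optional x-vector style speaker embedding
    pass through the prenet before reaching the transformer decoder; encoder states are omitted here
    for brevity):

    ```python
    >>> import torch

    >>> config = SpeechT5Config()
    >>> decoder = SpeechT5DecoderWithSpeechPrenet(config)
    >>> mel = torch.randn(1, 10, config.num_mel_bins)
    >>> xvector = torch.randn(1, config.speaker_embedding_dim)
    >>> out = decoder(input_values=mel, speaker_embeddings=xvector)
    >>> out.last_hidden_state.shape == (1, 10, config.hidden_size)
    True
    ```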
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5SpeechDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        speaker_embeddings: Optional[torch.Tensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states = self.prenet(input_values, speaker_embeddings)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5DecoderWithTextPrenet(SpeechT5PreTrainedModel):
    """
    Wrapper around SpeechT5Decoder that applies SpeechT5TextDecoderPrenet to convert input tokens to hidden features.
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.prenet = SpeechT5TextDecoderPrenet(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        return self.prenet.get_input_embeddings()

    def set_input_embeddings(self, value):
        self.prenet.set_input_embeddings(value)

    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        decoder_hidden_states, attention_mask = self.prenet(input_values, attention_mask, past_key_values)

        outputs = self.wrapped_decoder(
            hidden_states=decoder_hidden_states,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        return outputs


class SpeechT5DecoderWithoutPrenet(SpeechT5PreTrainedModel):
    """
    This wrapper class is a helper class to correctly load pretrained checkpoints when used in combination with
    [`SpeechT5Model`].
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)
        self.wrapped_decoder = SpeechT5Decoder(config)

        # Initialize weights and apply final processing
        self.post_init()

    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        encoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.Tensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
        return self.wrapped_decoder(
            hidden_states=input_values,
            attention_mask=attention_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            head_mask=head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )


class SpeechT5GuidedMultiheadAttentionLoss(nn.Module):
    """
    Guided attention loss from the paper [Efficiently Trainable Text-to-Speech System Based on Deep Convolutional
    Networks with Guided Attention](https://arxiv.org/abs/1710.08969), adapted for multi-head attention.
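
    The penalty applied to each attention weight follows the diagonal prior from the paper; a
    standalone sketch of the mask value for output step `t` (of `T`) attending to input step `n`
    (of `N`), with width `sigma`:

    ```python
    >>> import math

    >>> def guided_attention_weight(t, T, n, N, sigma):
    ...     # the penalty grows as attention strays from the diagonal n / N == t / T
    ...     return 1.0 - math.exp(-((n / N - t / T) ** 2) / (2 * sigma**2))

    >>> guided_attention_weight(5, 10, 5, 10, sigma=0.4)  # on the diagonal: no penalty
    0.0
    ```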
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.sigma = config.guided_attention_loss_sigma
        self.scale = config.guided_attention_loss_scale

    def forward(
        self, attentions: torch.FloatTensor, input_masks: torch.BoolTensor, output_masks: torch.BoolTensor
    ) -> torch.Tensor:
        """
        Compute the attention loss.

        Args:
            attentions (`torch.FloatTensor` of shape `(batch_size, layers * heads, output_sequence_length, input_sequence_length)`):
                Batch of multi-head attention weights
            input_masks (`torch.BoolTensor` of shape `(batch_size, input_sequence_length)`):
                Input attention mask as booleans.
            output_masks (`torch.BoolTensor` of shape `(batch_size, output_sequence_length)`):
                Target attention mask as booleans.

        Returns:
            `torch.Tensor` with the loss value
        """
        guided_attn_masks = self._make_guided_attention_masks(input_masks, output_masks, attentions.device)
        masks = output_masks.unsqueeze(-1) & input_masks.unsqueeze(-2)
        masks = masks.to(attentions.device).unsqueeze(1)

        losses = attentions * guided_attn_masks
        loss = torch.mean(losses.masked_select(masks))
        return self.scale * loss

    def _make_guided_attention_masks(self, input_masks, output_masks, device):
        input_lengths = input_masks.sum(-1)
        output_lengths = output_masks.sum(-1)

        guided_attn_masks = torch.zeros(
            (len(input_masks), output_masks.shape[1], input_masks.shape[1]), device=device
        )

        for idx, (ilen, olen) in enumerate(zip(input_lengths, output_lengths)):
            guided_attn_masks[idx, :olen, :ilen] = self._make_guided_attention_mask(ilen, olen, self.sigma, device)

        return guided_attn_masks.unsqueeze(1)

    @staticmethod
    def _make_guided_attention_mask(input_length, output_length, sigma, device):
        grid_y, grid_x = torch.meshgrid(
            torch.arange(input_length, device=device),
            torch.arange(output_length, device=device),
            indexing="xy",
        )

        grid_y = grid_y.float() / input_length
        grid_x = grid_x.float() / output_length

        return 1.0 - torch.exp(-((grid_y - grid_x) ** 2) / (2 * (sigma**2)))


class SpeechT5SpectrogramLoss(nn.Module):
    """
    Loss computation used by SpeechT5ForTextToSpeech.
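
    The total loss sums an L1 term on the spectrograms predicted before and after the postnet, a
    binary cross-entropy term on the stop-token logits (`pos_weight=5.0` upweights the rare "stop"
    frames), and, when `config.use_guided_attention_loss` is set, a guided attention penalty on the
    cross-attention maps. A construction sketch:

    ```python
    >>> config = SpeechT5Config(use_guided_attention_loss=False)
    >>> criterion = SpeechT5SpectrogramLoss(config)
    ```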
    """

    def __init__(self, config: SpeechT5Config):
        super().__init__()
        self.use_guided_attention_loss = config.use_guided_attention_loss
        self.guided_attention_loss_num_heads = config.guided_attention_loss_num_heads
        self.reduction_factor = config.reduction_factor

        self.l1_criterion = L1Loss()
        self.bce_criterion = BCEWithLogitsLoss(pos_weight=torch.tensor(5.0))

        if self.use_guided_attention_loss:
            self.attn_criterion = SpeechT5GuidedMultiheadAttentionLoss(config)

    def forward(
        self,
        attention_mask: torch.LongTensor,
        outputs_before_postnet: torch.FloatTensor,
        outputs_after_postnet: torch.FloatTensor,
        logits: torch.FloatTensor,
        labels: torch.FloatTensor,
        cross_attentions: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        padding_mask = labels != -100.0

        # mask out the padded portions
        labels = labels.masked_select(padding_mask)
        outputs_before_postnet = outputs_before_postnet.masked_select(padding_mask)
        outputs_after_postnet = outputs_after_postnet.masked_select(padding_mask)

        # spectrogram loss
        l1_loss = self.l1_criterion(outputs_after_postnet, labels) + self.l1_criterion(outputs_before_postnet, labels)

        # construct stop labels from the padding mask
        masks = padding_mask[:, :, 0]
        stop_labels = torch.cat([~masks * 1.0, torch.ones(masks.size(0), 1).to(masks.device)], dim=1)
        stop_labels = stop_labels[:, 1:].masked_select(masks)
        logits = logits.masked_select(masks)

        # stop token loss
        bce_loss = self.bce_criterion(logits, stop_labels)

        # combined loss
        loss = l1_loss + bce_loss

        # guided attention loss
        if self.use_guided_attention_loss:
            attn = torch.cat([x[:, : self.guided_attention_loss_num_heads] for x in cross_attentions], dim=1)
            input_masks = attention_mask == 1
            output_masks = padding_mask[:, :, 0]
            if self.reduction_factor > 1:
                output_masks = output_masks[:, self.reduction_factor - 1 :: self.reduction_factor]
            attn_loss = self.attn_criterion(attn, input_masks, output_masks)
            loss += attn_loss

        return loss


@auto_docstring(
    custom_intro="""
    The bare SpeechT5 Encoder-Decoder Model outputting raw hidden-states without any specific pre- or post-nets.
    """
)
class SpeechT5Model(SpeechT5PreTrainedModel):
    def __init__(
        self,
        config: SpeechT5Config,
        encoder: Optional[nn.Module] = None,
        decoder: Optional[nn.Module] = None,
    ):
        r"""
        encoder (`PreTrainedModel`, *optional*):
            The encoder model to use.
        decoder (`PreTrainedModel`, *optional*):
            The decoder model to use.
        """
        super().__init__(config)
        self.config = config
        self.encoder = SpeechT5EncoderWithoutPrenet(config) if encoder is None else encoder
        self.decoder = SpeechT5DecoderWithoutPrenet(config) if decoder is None else decoder

        # Initialize weights and apply final processing
        self.post_init()

    def get_input_embeddings(self):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            return self.encoder.get_input_embeddings()
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            return self.decoder.get_input_embeddings()
        raise NotImplementedError

    def set_input_embeddings(self, value):
        if isinstance(self.encoder, SpeechT5EncoderWithTextPrenet):
            self.encoder.set_input_embeddings(value)
        if isinstance(self.decoder, SpeechT5DecoderWithTextPrenet):
            self.decoder.set_input_embeddings(value)

    def get_encoder(self):
        return self.encoder

    def get_decoder(self):
        return self.decoder

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        if isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            self.encoder.prenet.freeze_feature_encoder()

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.Tensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.Tensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.FloatTensor], Seq2SeqModelOutput]:
        r"""
        input_values (`torch.Tensor` of shape `(batch_size, sequence_length)`):
            Depending on which encoder is being used, the `input_values` are either: float values of the input raw
            speech waveform, or indices of input sequence tokens in the vocabulary, or hidden states.
        decoder_input_values (`torch.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Depending on which decoder is being used, the `decoder_input_values` are either: float values of log-mel
            filterbank features extracted from the raw speech waveform, or indices of decoder input sequence tokens in
            the vocabulary, or hidden states.
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        """
        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
        output_hidden_states = (
            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
        )
        use_cache = use_cache if use_cache is not None else self.config.use_cache
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if encoder_outputs is None:
            encoder_outputs = self.encoder(
                input_values=input_values,
                attention_mask=attention_mask,
                head_mask=head_mask,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )
        # If the user passed a tuple for encoder_outputs, wrap it in a BaseModelOutput when return_dict=True
        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
            encoder_outputs = BaseModelOutput(
                last_hidden_state=encoder_outputs[0],
                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
            )

        # downsample encoder attention mask (only for encoders with speech input)
        if attention_mask is not None and isinstance(self.encoder, SpeechT5EncoderWithSpeechPrenet):
            encoder_attention_mask = self.encoder.prenet._get_feature_vector_attention_mask(
                encoder_outputs[0].shape[1], attention_mask
            )
        else:
            encoder_attention_mask = attention_mask

        if isinstance(self.decoder, SpeechT5DecoderWithSpeechPrenet):
            decoder_args = {"speaker_embeddings": speaker_embeddings}
        else:
            decoder_args = {}

        decoder_outputs = self.decoder(
            input_values=decoder_input_values,
            attention_mask=decoder_attention_mask,
            encoder_hidden_states=encoder_outputs[0],
            encoder_attention_mask=encoder_attention_mask,
            head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            past_key_values=past_key_values,
            use_cache=use_cache,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
            **decoder_args,
        )

        if not return_dict:
            return decoder_outputs + encoder_outputs

        return Seq2SeqModelOutput(
            last_hidden_state=decoder_outputs.last_hidden_state,
            past_key_values=decoder_outputs.past_key_values,
            decoder_hidden_states=decoder_outputs.hidden_states,
            decoder_attentions=decoder_outputs.attentions,
            cross_attentions=decoder_outputs.cross_attentions,
            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
            encoder_hidden_states=encoder_outputs.hidden_states,
            encoder_attentions=encoder_outputs.attentions,
        )


@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a speech encoder and a text decoder.
    """
)
class SpeechT5ForSpeechToText(SpeechT5PreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["text_decoder_postnet.lm_head.weight"]

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForSpeechToText.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        speech_encoder = SpeechT5EncoderWithSpeechPrenet(config)
        text_decoder = SpeechT5DecoderWithTextPrenet(config)
        self.speecht5 = SpeechT5Model(config, speech_encoder, text_decoder)

        self.text_decoder_postnet = SpeechT5TextDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    def freeze_feature_encoder(self):
        """
        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
        not be updated during training.
        """
        self.get_encoder().prenet.freeze_feature_encoder()

    def get_output_embeddings(self):
        return self.text_decoder_postnet.get_output_embeddings()

    def set_output_embeddings(self, new_embeddings):
        self.text_decoder_postnet.set_output_embeddings(new_embeddings)

    @auto_docstring
    def forward(
        self,
        input_values: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_ids: Optional[torch.LongTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple, Seq2SeqLMOutput]:
        r"""
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
            soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are decoder input IDs?](../glossary#decoder-input-ids)

            SpeechT5 uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

            Label indices can be obtained using [`SpeechT5Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToText
        >>> from datasets import load_dataset

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_asr")
        >>> model = SpeechT5ForSpeechToText.from_pretrained("microsoft/speecht5_asr")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")
        >>> predicted_ids = model.generate(**inputs, max_length=100)

        >>> # transcribe speech
        >>> transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
        >>> transcription[0]
        'mister quilter is the apostle of the middle classes and we are glad to welcome his gospel'
        ```

        ```python
        >>> inputs["labels"] = processor(text_target=dataset[0]["text"], return_tensors="pt").input_ids

        >>> # compute loss
        >>> loss = model(**inputs).loss
        >>> round(loss.item(), 2)
        19.68
        ```
        NT)r+   r-   r  r  r4  r  ro  r  r  r  r  r5  r6  r   r!   r   )	r  r  r  r  r  rr  r  r  r  )rx   rB  r)   r   r   r  r  r	   r   r  r   r  r  r  rr  r  r  r  )rw   r+   r-   r  r  r4  r  ro  r  r  r  r  r5  r6  r  r  r  r  loss_fctoutputs                       r(   r   zSpeechT5ForSpeechToText.forwardy  sZ   r &1%<k$++B]B] ($6DKK44dkk6X6X%! --%)!2#9/!5++/!5   
  **71:6')HFKKDKK,B,BCV[[QS_UDY,F)-)9TGf$EvE#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr*  c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)r   r   r   )r<  
past_statebeam_idxs     r(   r>  z9SpeechT5ForSpeechToText._reorder_cache.<locals>.<genexpr>	  s.     nU_j--aZ=N=N1OPns   58)rD  )r  r  reordered_past
layer_pasts    `  r(   _reorder_cachez&SpeechT5ForSpeechToText._reorder_cache	  s=    ) 	Jncmnn N	 r*   r  )r   r   r   _tied_weights_keysr   rk   r  r  r<  r  r  r   r   r   r^  r]  r   r   rM   r   r   r   r   r  r   r   s   @r(   r  r  J  s    @@~ (++;AH  59598<=A159=7;EIEI$(,0/3&*-1E
u001E
 !!1!12E
 $E$4$45	E

 !))9)9 :E
 E--.E
 $E$5$56E
 'u||4E
 "%e.?.?(@"ABE
 "%e.?.?(@"ABE
 D>E
 $D>E
 'tnE
 d^E
 ))*E
  
uo%	&!E
 E
N  r*   r  modelrs  	thresholdminlenratiomaxlenratiovocoderoutput_cross_attentionsreturn_output_lengthsc
           
      j   |t        d      |+d|| j                  j                  k(  j                         z
  }
n|}
|j	                  d      }| j
                  j                  ||
d      }|j                  }t        | j
                  j                  t              r@| j
                  j                  j                  j                  |d   j                  d   |
      }
t        |j	                  d      |z  | j                  j                  z        }t        |j	                  d      |z  | j                  j                  z        }|j                  |d| j                  j                        }g }g }d }d}i }	 |dz  }| j
                  j                   j                  ||      }| j
                  j                   j#                  |d d dd f   d ||
|d|d      }|r0|j%                  t'        j(                  |j*                  d             |j                  j-                  d      }|j.                  }| j0                  j3                  |      }|j5                  || j                  j                  | j                  j                        }|j%                  |       |d d dd d f   j5                  |d| j                  j                        }t'        j(                  ||fd      }t'        j6                  | j0                  j9                  |            }||k  r||k  rAt'        j:                  |d      |k\  }t'        j<                  |      d   j?                         }ntA        tC        |            }|D cg c]	  }||vs| }}tC        |      dkD  rat'        jD                  |      }|jG                  dd      jI                  dd	      }| j0                  jK                  |      }|D ]
  } ||    || <    tC        |      |k\  rntA        tC        |            D cg c]  }||   	 }}|	s|dk(  r|d   n4t&        jL                  jN                  jP                  jS                  |d
      }|	 ||      }!n|}!|r`t'        j(                  |d	      }|dkD  r@ |j4                  |t        |j	                  d      |z        g|j	                         dd   }|!|f}!|!S g }"tA        |      D ]%  }|"j%                  ||   j	                  d             ' |:t&        jL                  jN                  jP                  jS                  |d
      }||"f}!nyg }#t&        jL                  jN                  jP                  jS                  |d
      } ||      }#|"D cg c]+  }t        |#j	                  d      tU        |"      z        |z  - }$}|#|$f}!|r^t'        j(                  |d	      } |j4                  |t        |j	                  d      |z        g|j	                         dd   }g |!|}!|!S c c}w c c}w c c}w )Na  `speaker_embeddings` must be specified. For example, you can use a speaker embeddings by following
                    the code snippet provided in this link:
                    https://huggingface.co/datasets/Matthijs/cmu-arctic-xvectors
                    r   r   T)r+   r-   r6  r!   )r~   r-   r  r  r  r  r  r6  r   r   )batch_first)+r%   rx   r   r:   r   r  r  r@  r  rP  rR  r?  r#   r,   r"   rc  r  r  rT   r   r   rr  squeezer  speech_decoder_postnetr  r   sigmoidr  rI   ro  rJ   rK   rP   stackr   flattenr  r   r   rnnpad_sequencer;   )%r  r+   rs  r-   r  r  r  r  r  r  r  r   encoder_outr  maxlenminlenoutput_sequencespectrogramrr  r  rH  result_spectrogramr  decoder_outlast_decoder_outputspectrumnew_spectrogramprobmeet_thresholdsmeet_indexesr  spectrograms
meet_indexr  spectrogram_lengths	waveformswaveform_lengthss%                                        r(   _generate_speechr  	  s    !
 	
 !"lell6O6O&O%T%T%V!V!/


A
C..((!- ) K !, = = %..((*IJ!&!7!7!>!>!a!aN  #%;"
 *//2[@5<<C`C``aF*//2[@5<<C`C``aF 099#q%,,B[B[\OKO
C
q !& 6 6 = =oOa bnn,,<</237";#9+5 = 	
 ###EIIk.J.JPQ$RS);;CCAF%55 //889LM==ell&C&CU\\E^E^_8$ #1b!8,11#q%,,:S:ST))_o$FAN}}U99BBCVWX< V|"'))Db"9Y"F${{?;A>EEG$SY/'3S!q@R7RASLS< 1$${{;7+55a;CCAqI$;;CCLQ". NJ5A*5M&z2N%&#-i j 49=O9P3QRa&q)RLR ),l1ouxx~~7I7I7V7VWcqu7V7vk*G!G"$yy)9qAQw#8#3#8#8-2215;<$?O?T?T?VWYWZ?[$   01G* N% !s 	@A&&|A';';A'>?	@? 88>>--::<UY:ZL#%89GI 88>>--::<UY:ZL-I_rsZ[INN1$5<O8P$P QTU Uss "23G"$yy)9qA4/44S)..q1C78 ;K;P;P;RSUSV;W  32!12GNW T S4  ts   4	X&>X&X+0X0zB
@auto_docstring(
    custom_intro="""
    SpeechT5 Model with a text encoder and a speech decoder.
    """
)
class SpeechT5ForTextToSpeech(SpeechT5PreTrainedModel):
    main_input_name = "input_ids"

    def __init__(self, config: SpeechT5Config):
        super().__init__(config)

        if config.vocab_size is None:
            raise ValueError(
                f"You are trying to instantiate {self.__class__} with a configuration that does not define the"
                " vocabulary size of the language model head. Please instantiate the model as follows:"
                " `SpeechT5ForTextToSpeech.from_pretrained(..., vocab_size=vocab_size)`. or define `vocab_size` of"
                " your model's configuration."
            )

        text_encoder = SpeechT5EncoderWithTextPrenet(config)
        speech_decoder = SpeechT5DecoderWithSpeechPrenet(config)
        self.speecht5 = SpeechT5Model(config, text_encoder, speech_decoder)

        self.speech_decoder_postnet = SpeechT5SpeechDecoderPostnet(config)

        # Initialize weights and apply final processing
        self.post_init()

    @classmethod
    def can_generate(cls) -> bool:
        # text-to-speech uses its own generation loop rather than `GenerationMixin`
        return True

    def get_encoder(self):
        return self.speecht5.get_encoder()

    def get_decoder(self):
        return self.speecht5.get_decoder()

    @auto_docstring
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        decoder_input_values: Optional[torch.FloatTensor] = None,
        decoder_attention_mask: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        decoder_head_mask: Optional[torch.FloatTensor] = None,
        cross_attn_head_mask: Optional[torch.Tensor] = None,
        encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
        use_cache: Optional[bool] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.FloatTensor] = None,
        stop_labels: Optional[torch.Tensor] = None,
    ) -> Union[Tuple, Seq2SeqSpectrogramOutput]:
        r"""
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary.

            Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
            [`~PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation. Spectrograms can be obtained using [`SpeechT5Processor`]. See [`SpeechT5Processor.__call__`]
            for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan, set_seed
        >>> import torch

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate(inputs["input_ids"], speaker_embeddings=speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([15872])
        ```
        NTr+   r-   r  r  r4  r  ro  r  r  r  rs  r  r5  r6  r   r   	r  r
  r  r  r  rr  r  r  r  )rx   rB  r2   r,   r  r  r   r  rr  r   r  r  r  r  r  r  )rw   r   r-   r  r  r4  r  ro  r  r  r  r  r5  r6  rs  r  r  r  r  r  r  r  	criterionr  s                           r(   r   zSpeechT5ForTextToSpeech.forward	  s}   X &1%<k$++B]B]#+?WDKK88:P@<$&< {{44$(!--")!5#9/!5++1/!5   
" AE@[@[\cde\f@g= 5v/<I&%((D +-;F)-)9TGf$EvE'-#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r*   r  r  r  r  r  r  c
                     |W|j                  d      }|j                  d      |k7  r2|j                  d      dk(  r|j                  |d      }nt        d      t        | |||||||||	
      S )aE  
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Attention mask from the tokenizer, required for batched inference to tell the model which tokens
                in `input_ids` are padding and should be ignored.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
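
        Example (an illustrative sketch rather than a tested doctest; it assumes the `microsoft/speecht5_tts` and
        `microsoft/speecht5_hifigan` checkpoints used elsewhere in this file):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # batched generation: pad both sentences and pass the attention mask
        >>> inputs = processor(text=["Hello world.", "A second, slightly longer sentence."], padding=True, return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))  # a single embedding is repeated across the batch

        >>> waveforms, waveform_lengths = model.generate(
        ...     inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     speaker_embeddings=speaker_embeddings,
        ...     vocoder=vocoder,
        ...     return_output_lengths=True,
        ... )
        >>> # waveforms is padded to the longest utterance; slice each row to its true length
        >>> first_waveform = waveforms[0, : waveform_lengths[0]]
        ```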
        """
        if speaker_embeddings is not None:
            batch_size = input_ids.size(0)
            if speaker_embeddings.size(0) != batch_size:
                if speaker_embeddings.size(0) == 1:
                    speaker_embeddings = speaker_embeddings.repeat(batch_size, 1)
                else:
                    raise ValueError(
                        "The first dimension of speaker_embeddings must be either 1 or the same as batch size."
                    )

        return _generate_speech(
            self,
            input_ids,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )

    @torch.no_grad()
    def generate_speech(
        self,
        input_ids: torch.LongTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> Union[torch.FloatTensor, Tuple[torch.FloatTensor, torch.FloatTensor]]:
        r"""
        Converts a sequence of input tokens into a sequence of mel spectrograms, which are subsequently turned into a
        speech waveform using a vocoder.

        Args:
            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Indices of input sequence tokens in the vocabulary.

                Indices can be obtained using [`SpeechT5Tokenizer`]. See [`~PreTrainedTokenizer.encode`] and
                [`~PreTrainedTokenizer.__call__`] for details.

                [What are input IDs?](../glossary#input-ids)
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
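
        Example (an illustrative sketch rather than a tested doctest; it assumes the `microsoft/speecht5_tts`
        checkpoint used elsewhere in this file):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
        >>> model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")

        >>> inputs = processor(text="Hello, my dog is cute", return_tensors="pt")
        >>> speaker_embeddings = torch.zeros((1, 512))

        >>> # without a vocoder, the raw log-mel spectrogram is returned
        >>> spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
        >>> spectrogram.shape[1] == model.config.num_mel_bins
        True
        ```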
        """
        if speaker_embeddings is not None:
            batch_size = input_ids.size(0)
            if speaker_embeddings.size(0) != batch_size:
                if speaker_embeddings.size(0) == 1:
                    speaker_embeddings = speaker_embeddings.repeat(batch_size, 1)
                else:
                    raise ValueError(
                        "The first dimension of speaker_embeddings must be either 1 or the same as batch size."
                    )

        return _generate_speech(
            self,
            input_ids,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )
E,,-B
 !!1!12B
 'u'8'89	B

 !))9)9 :B
 E--.B
 $E$5$56B
 'u||4B
 "%e.?.?(@"ABB
 "%e.?.?(@"ABB
 D>B
 $D>B
 'tnB
 d^B
 %U%6%67B
  **+!B
" ell+#B
$ 
u..	/%B
 B
H U]]_ 6::> !'+(-&+Y
##Y
 !!1!12Y
 %U%6%67	Y

 Y
 Y
 Y
 "))$Y
 "&Y
  $Y
 
u  %(9(95;L;L(L"MM	NY
 Y
v U]]_ ;?59 !'+(-&+]
##]
 %U%6%67]
 !!1!12	]

 ]
 ]
 ]
 "))$]
 "&]
  $]
 
u  %(9(95;L;L(L"MM	N]
 ]
r*   r  zD
    SpeechT5 Model with a speech encoder and a speech decoder.
    c            &            e Zd Zdef fdZd Zd Zd Ze	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 dde	e
j                     de	e
j                     de	e
j                     d	e	e
j                     d
e	e
j                     de	e
j                     de	e
j                     de	eee
j                           de	eee
j                           de	e   de	e   de	e   de	e   de	e
j                     de	e
j                     de	e
j                     deeef   f"d       Z e
j&                         	 	 	 	 	 	 	 	 d de
j                  de	e
j                     de	e
j                     dededede	ej,                     dedede
j                  fd       Z xZS )!SpeechT5ForSpeechToSpeechrx   c                     t         |   |       t        |      }t        |      }t	        |||      | _        t        |      | _        | j                          y r|   )	rj   rk   rP  r~  r  r  r  r   r2  )rw   rx   r  r  rz   s       r(   rk   z"SpeechT5ForSpeechToSpeech.__init__  sM     8@8@%fnnM&B6&J# 	r*   c                 6    | j                   j                         S r|   r  r;  s    r(   r  z%SpeechT5ForSpeechToSpeech.get_encoder  r  r*   c                 6    | j                   j                         S r|   r  r;  s    r(   r  z%SpeechT5ForSpeechToSpeech.get_decoder  r  r*   c                 T    | j                         j                  j                          yr  r  r;  s    r(   r<  z0SpeechT5ForSpeechToSpeech.freeze_feature_encoder  r  r*   r+   r-   r  r  r4  r  ro  r  r  r  r  r5  r6  rs  r  r  r6   c                    ||n| j                   j                  }|&|$t        || j                   j                  |      \  }}| j	                  |||||||||	|
|||d      }| j                  |d         \  }}}d}|s|f|dd z   }||f|z   S |S t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                  	      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            Float values of input raw speech waveform. Values can be obtained by loading a *.flac* or *.wav* audio file
            into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install
            soundfile*). To prepare the array into `input_values`, the [`SpeechT5Processor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
        decoder_input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`):
            Float values of input mel spectrogram.

            SpeechT5 uses an all-zero spectrum as the starting token for `decoder_input_values` generation. If
            `past_key_values` is used, optionally only the last `decoder_input_values` have to be input (see
            `past_key_values`).
        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_values`. Causal mask will
            also be used by default.

            If you want to change padding behavior, you should read [`SpeechT5Decoder._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.
        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
            Tensor containing the speaker embeddings.
        labels (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_mel_bins)`, *optional*):
            Float values of target mel spectrogram. Spectrograms can be obtained using [`SpeechT5Processor`]. See
            [`SpeechT5Processor.__call__`] for details.
        stop_labels (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Binary tensor indicating the position of the stop token in the sequence.

        Example:

        ```python
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech, SpeechT5HifiGan, set_seed
        >>> from datasets import load_dataset
        >>> import torch

        >>> dataset = load_dataset(
        ...     "hf-internal-testing/librispeech_asr_demo", "clean", split="validation", trust_remote_code=True
        ... )  # doctest: +IGNORE_RESULT
        >>> dataset = dataset.sort("id")
        >>> sampling_rate = dataset.features["audio"].sampling_rate

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")
        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # audio file is decoded on the fly
        >>> inputs = processor(audio=dataset[0]["audio"]["array"], sampling_rate=sampling_rate, return_tensors="pt")

        >>> speaker_embeddings = torch.zeros((1, 512))  # or load xvectors from a file

        >>> set_seed(555)  # make deterministic

        >>> # generate speech
        >>> speech = model.generate_speech(inputs["input_values"], speaker_embeddings, vocoder=vocoder)
        >>> speech.shape
        torch.Size([77824])
        ```
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        if labels is not None:
            if decoder_input_values is None:
                decoder_input_values, decoder_attention_mask = shift_spectrograms_right(
                    labels, self.config.reduction_factor, decoder_attention_mask
                )

        outputs = self.speecht5(
            input_values=input_values,
            attention_mask=attention_mask,
            decoder_input_values=decoder_input_values,
            decoder_attention_mask=decoder_attention_mask,
            head_mask=head_mask,
            decoder_head_mask=decoder_head_mask,
            cross_attn_head_mask=cross_attn_head_mask,
            encoder_outputs=encoder_outputs,
            past_key_values=past_key_values,
            use_cache=use_cache,
            speaker_embeddings=speaker_embeddings,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=True,
        )

        _, spectrogram, logits = self.speech_decoder_postnet(outputs[0])

        loss = None

        if not return_dict:
            output = (spectrogram,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return Seq2SeqSpectrogramOutput(
            loss=loss,
            spectrogram=spectrogram,
            past_key_values=outputs.past_key_values,
            decoder_hidden_states=outputs.decoder_hidden_states,
            decoder_attentions=outputs.decoder_attentions,
            cross_attentions=outputs.cross_attentions,
            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
            encoder_hidden_states=outputs.encoder_hidden_states,
            encoder_attentions=outputs.encoder_attentions,
        )

    @torch.no_grad()
    def generate_speech(
        self,
        input_values: torch.FloatTensor,
        speaker_embeddings: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.LongTensor] = None,
        threshold: float = 0.5,
        minlenratio: float = 0.0,
        maxlenratio: float = 20.0,
        vocoder: Optional[nn.Module] = None,
        output_cross_attentions: bool = False,
        return_output_lengths: bool = False,
    ) -> torch.FloatTensor:
        r"""
        Converts a raw speech waveform into a sequence of mel spectrograms, which are subsequently turned back into a
        speech waveform using a vocoder.

        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
                Float values of input raw speech waveform.

                Values can be obtained by loading a *.flac* or *.wav* audio file into an array of type `List[float]` or
                a `numpy.ndarray`, *e.g.* via the soundfile library (*pip install soundfile*). To prepare the array
                into `input_values`, the [`SpeechT5Processor`] should be used for padding and conversion into a tensor
                of type `torch.FloatTensor`. See [`SpeechT5Processor.__call__`] for details.
            speaker_embeddings (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_dim)`, *optional*):
                Tensor containing the speaker embeddings.
            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing convolution and attention on padding token indices. Mask values selected in
                `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            threshold (`float`, *optional*, defaults to 0.5):
                The generated sequence ends when the predicted stop token probability exceeds this value.
            minlenratio (`float`, *optional*, defaults to 0.0):
                Used to calculate the minimum required length for the output sequence.
            maxlenratio (`float`, *optional*, defaults to 20.0):
                Used to calculate the maximum allowed length for the output sequence.
            vocoder (`nn.Module`, *optional*, defaults to `None`):
                The vocoder that converts the mel spectrogram into a speech waveform. If `None`, the output is the mel
                spectrogram.
            output_cross_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of the decoder's cross-attention layers.
            return_output_lengths (`bool`, *optional*, defaults to `False`):
                Whether or not to return the concrete spectrogram/waveform lengths.

        Returns:
            `tuple(torch.FloatTensor)` comprising various elements depending on the inputs:
            - when `return_output_lengths` is False
                - **spectrogram** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrogram.
                - **waveform** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(num_frames,)` -- The predicted speech waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
            - when `return_output_lengths` is True
                - **spectrograms** (*optional*, returned when no `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, output_sequence_length, config.num_mel_bins)` -- The predicted log-mel spectrograms that
                are padded to the maximum length.
                - **spectrogram_lengths** (*optional*, returned when no `vocoder` is provided) `List[int]` -- A list of
                all the concrete lengths for each spectrogram.
                - **waveforms** (*optional*, returned when a `vocoder` is provided) `torch.FloatTensor` of shape
                `(batch_size, num_frames)` -- The predicted speech waveforms that are padded to the maximum length.
                - **waveform_lengths** (*optional*, returned when a `vocoder` is provided) `List[int]` -- A list of all
                the concrete lengths for each waveform.
                - **cross_attentions** (*optional*, returned when `output_cross_attentions` is `True`)
                `torch.FloatTensor` of shape `(batch_size, config.decoder_layers, config.decoder_attention_heads,
                output_sequence_length, input_sequence_length)` -- The outputs of the decoder's cross-attention layers.
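
        Example (an illustrative sketch rather than a tested doctest; it reuses the `microsoft/speecht5_vc`
        checkpoint from the `forward` example above):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5Processor, SpeechT5ForSpeechToSpeech

        >>> processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_vc")
        >>> model = SpeechT5ForSpeechToSpeech.from_pretrained("microsoft/speecht5_vc")

        >>> audio = torch.randn(16000)  # stand-in for one second of 16 kHz speech
        >>> inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")

        >>> # when speaker_embeddings is None, an all-zero (1, 512) embedding is substituted
        >>> spectrogram = model.generate_speech(inputs["input_values"])
        >>> spectrogram.shape[1] == model.config.num_mel_bins
        True
        ```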
        """
        if speaker_embeddings is None:
            speaker_embeddings = torch.zeros((1, 512), device=input_values.device)

        return _generate_speech(
            self,
            input_values,
            speaker_embeddings,
            attention_mask,
            threshold,
            minlenratio,
            maxlenratio,
            vocoder,
            output_cross_attentions,
            return_output_lengths,
        )
~ 
++;  5959<@=A159=7;EIEI$(,0/3&*:>.2.2#|
u001|
 !!1!12|
 'u'8'89	|

 !))9)9 :|
 E--.|
 $E$5$56|
 'u||4|
 "%e.?.?(@"AB|
 "%e.?.?(@"AB|
 D>|
 $D>|
 'tn|
 d^|
 %U%6%67|
  **+!|
" ell+#|
$ 
u..	/%|
 |
| U]]_ ;?59 !'+(-&+V
''V
 %U%6%67V
 !!1!12	V

 V
 V
 V
 "))$V
 "&V
  $V
 
		V
 V
r*   r1  c                   :     e Zd Zd fd	ZddZd Zd Zd Z xZS )HifiGanResidualBlockc                    t         |           || _        t        j                  t        t        |            D cg c]3  }t        j                  |||d||   | j                  |||               5 c}      | _	        t        j                  t        t        |            D cg c]-  }t        j                  |||dd| j                  |d            / c}      | _
        y c c}w c c}w )Nr   )rh   dilationr   )rj   rk   leaky_relu_sloper   r  rK   rP   ro   get_paddingconvs1convs2)rw   channelsrg   r;  r<  r  rZ   rz   s          r(   rk   zHifiGanResidualBlock.__init__  s     0mm s8}-
  		%a[ ,,[(1+F

 mm s8}-
  		 ,,[!<



s   8C$%2C)c                     ||z  |z
  dz  S r   r*  )rw   rg   r;  s      r(   r=  z HifiGanResidualBlock.get_padding  s    h&1a77r*   c                 ,   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]
  } ||        y Nr   )r   r   r   r   r   r>  r?  rw   r   r  s      r(   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm   sp    hh**288,,m<((33??K[[ 	E	[[ 	E	r*   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]!  }t        j                  j                  |       # y r|   )r>  r   r   remove_weight_normr?  rw   r  s     r(   rG  z'HifiGanResidualBlock.remove_weight_norm*  sL    [[ 	/EHH''.	/[[ 	/EHH''.	/r*   c                 ,   t        | j                  | j                        D ]p  \  }}|}t        j                  j                  || j                        } ||      }t        j                  j                  || j                        } ||      }||z   }r |S r|   )rR  r>  r?  r   ru  
leaky_relur<  )rw   r~   conv1conv2r  s        r(   r   zHifiGanResidualBlock.forward0  s    T[[9 	5LE5$HMM44]DDYDYZM!-0MMM44]DDYDYZM!-0M)H4M	5 r*   )r   )r   r      g?)r   )	r   r   r   rk   r=  rE  rG  r   r   r   s   @r(   r9  r9    s    
>8/r*   r9  z
    HiFi-GAN vocoder.
    c                        e Zd ZeZdZdef fdZd Zd Zd Z	 e
d      dej                  d	ej                  fd
       Z xZS )SpeechT5HifiGanr
  rx   c                    t         |   |       t        |j                        | _        t        |j
                        | _        t        j                  |j                  |j                  ddd      | _        t        j                         | _        t        t        |j
                  |j                               D ]d  \  }\  }}| j                  j#                  t        j$                  |j                  d|z  z  |j                  d|dz   z  z  ||||z
  dz               f t        j                         | _        t)        t        | j                              D ]p  }|j                  d|dz   z  z  }t        |j                  |j*                        D ]6  \  }}| j&                  j#                  t-        ||||j.                               8 r t        j                  dddd      | _        | j3                  dt5        j6                  |j                               | j3                  dt5        j8                  |j                               | j;                          y )N   r   r   )rg   rh   r   r   r  r  )rj   rk   rP   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   ro   model_in_dimupsample_initial_channelconv_prer  	upsamplerrC  rR  upsample_kernel_sizesrT   ConvTranspose1d	resblocksrK   resblock_dilation_sizesr9  r<  	conv_postr   r   rL   rR   r2  )rw   rx   r  upsample_raterg   r@  r;  rz   s          r(   rk   zSpeechT5HifiGan.__init__D  s    v;;< !6!67		++
 /8V=R=RTZTpTp9q/r 		+A+{NN!!""331=33a!eE +((=8Q>		 s4>>*+ 	vA661Q<HH),V-I-I6KiKi)j v%X%%&:8[RZ\b\s\s&tuv	v
 8QAaQRSVU[[1D1D%EFWejj1D1D&EF 	r*   c                 2   t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyy)zInitialize the weights.r0   r  N)r  r   r(  ro   r   r  r  rx   r  ri   r   )rw   r#  s     r(   r%  zSpeechT5HifiGan._init_weightsj  sj    fryy"))45MM&&CT[[5R5R&S{{&  &&( ' 6r*   c                    t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  } || j
                         | j                  D ]
  } ||        | j                  D ]  }|j                            || j                         y rC  )
r   r   r   r   r   rX  rY  r\  rE  r^  rD  s      r(   rE  z!SpeechT5HifiGan.apply_weight_normq  s    hh**288,,m<((33??KDMM"^^ 	E	^^ 	&E##%	&DNN#r*   c                 J   t         j                  j                  | j                         | j                  D ]!  }t         j                  j                  |       # | j
                  D ]  }|j                           t         j                  j                  | j                         y r|   )r   r   rG  rX  rY  r\  r^  rH  s     r(   rG  z"SpeechT5HifiGan.remove_weight_norm}  sr    
##DMM2^^ 	/EHH''.	/^^ 	'E$$&	'
##DNN3r*   a  
        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
        waveform.
        """
    )
    def forward(self, spectrogram: torch.FloatTensor) -> torch.FloatTensor:
        r"""
        spectrogram (`torch.FloatTensor`):
            Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
            config.model_in_dim)`, or un-batched and of shape `(sequence_length, config.model_in_dim)`.

        Returns:
            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
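
        Example (an illustrative sketch rather than a tested doctest; it assumes the `microsoft/speecht5_hifigan`
        checkpoint used elsewhere in this file):

        ```python
        >>> import torch
        >>> from transformers import SpeechT5HifiGan

        >>> vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

        >>> # un-batched input: (sequence_length, config.model_in_dim)
        >>> spectrogram = torch.randn(100, vocoder.config.model_in_dim)
        >>> waveform = vocoder(spectrogram)
        >>> waveform.dim()  # un-batched input yields an un-batched, 1-d waveform
        1
        ```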
        """
        if self.config.normalize_before:
            spectrogram = (spectrogram - self.mean) / self.scale

        is_batched = spectrogram.dim() == 3
        if not is_batched:
            # add a batch dimension so the convolutions below always see (batch, time, mel)
            spectrogram = spectrogram.unsqueeze(0)

        hidden_states = spectrogram.transpose(2, 1)

        hidden_states = self.conv_pre(hidden_states)
        for i in range(self.num_upsamples):
            hidden_states = nn.functional.leaky_relu(hidden_states, self.config.leaky_relu_slope)
            hidden_states = self.upsampler[i](hidden_states)

            res_state = self.resblocks[i * self.num_kernels](hidden_states)
            for j in range(1, self.num_kernels):
                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
            hidden_states = res_state / self.num_kernels

        hidden_states = nn.functional.leaky_relu(hidden_states)
        hidden_states = self.conv_post(hidden_states)
        hidden_states = torch.tanh(hidden_states)

        if not is_batched:
            # remove the batch dim and collapse the tensor to a 1-d audio waveform
            waveform = hidden_states.squeeze(0).transpose(1, 0).flatten()
        else:
            # remove the seq-len dim, which collapses to 1 after conv_post
            waveform = hidden_states.squeeze(1)

        return waveform
$4 (5#4#4 (9J9J ((r*   rO  )r  r1  r  r  r  rO  )r   Nr	  r.  )`r   r   typingr   r   r   r   numpyrD   r   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   
generationr   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   r   r   r   modeling_utilsr   r   r   r   configuration_speecht5r   r   
get_loggerr   rs  _HIDDEN_STATES_START_POSITIONr   r:   r)   r2   r   r]  ndarrayrb   r  rd   r   r   r   r   r   r   r   r  r#  r/  r`  rz  r  r  r  r  r  r  r  r  r  r,  rP  rZ  re  rk  r~  r  r  r  r  r  r  r^  rM   r  r  r1  r9  rO  __all__r*  r*   r(   <module>rz     s     / /     @ @ ! ) @ 7 e  . , I 
		H	% !" %,, c [^ " ei0,,0250KSTYT`T`Ka04 26tc?tt t U--.	t
 t ZZtp299 , 8 2A8BII A8J*bii *Zryy 0" "(299 +RYY +^1		 1D")) DN1")) 1h% %P<299 <2		 .(-		 (-V& & ]B		 ]B@")) 0:299 :zi299 iX "?o "? "?JF
- F
R"&= "J'$; 'T
#: 
@L
- L
^-&= -`1$; 1h(#: (V8M299 8Mv:bii :z 
Z
+ Z

Z
z 
y5 y
y~ 7;15#'$)"'L"L##L !!2!23L U--.	L
 L L L bii L "L  L 5eE$5$5u7H7H$HIIJL^ 
c
5 c

c
L 
p
 7 p

p
f;299 ;| 
to t
tnr*   