
    Uh                     ~   d Z ddlZddlmZ ddlmZmZmZmZ ddl	Z
ddlZddlZddlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZ ddlmZ  ej@                  e!      Z"e G d de             Z#e G d de             Z$ejJ                  jL                  d        Z'	 	 	 	 	 d=dZ(d Z) G d dej                  jT                        Z+ G d dejT                        Z, G d dejT                        Z- G d dejT                        Z. G d d ejT                        Z/ G d! d"ejT                        Z0 G d# d$ejT                        Z1 G d% d&ejT                        Z2 G d' d(ejT                        Z3 G d) d*ejT                        Z4 G d+ d,ejT                        Z5 G d- d.ejT                        Z6 G d/ d0ejT                        Z7 G d1 d2ejT                        Z8 G d3 d4ejT                        Z9 G d5 d6ejT                        Z:e G d7 d8e             Z; ed9:       G d; d<e;             Z<d<d8gZ=y)>zPyTorch VITS model.    N)	dataclass)AnyOptionalTupleUnion)nn   )ACT2FN)is_deepspeed_zero3_enabled)is_fsdp_managed_module)_prepare_4d_attention_mask)BaseModelOutputModelOutput)PreTrainedModel)auto_docstringlogging   )
VitsConfigc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)VitsModelOutputaC  
    Describes the outputs for the VITS model, with potential hidden states and attentions.

    Args:
        waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
            The final audio waveform predicted by the model.
        sequence_lengths  (`torch.FloatTensor` of shape `(batch_size,)`):
            The length in samples of each element in the `waveform` batch.
        spectrogram (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_bins)`):
            The log-mel spectrogram predicted at the output of the flow model. This spectrogram is passed to the Hi-Fi
            GAN decoder model to obtain the final audio waveform.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nwaveformsequence_lengthsspectrogramhidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r   r        x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/vits/modeling_vits.pyr   r   '   s    0 -1Hhu(()048hu00186:K% 1 123:8<M8E%"3"345<59Ju00129r$   r   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeeej                        ed<   y)VitsTextEncoderOutputaa  
    Describes the outputs for the VITS text encoder model, with potential hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        prior_means (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted mean values of the prior distribution for the latent text variables.
        prior_log_variances (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            The predicted log-variance values of the prior distribution for the latent text variables.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlast_hidden_stateprior_meansprior_log_variancesr   r   )r   r   r   r   r(   r   r    r!   r"   r)   r*   r   r   r   r#   r$   r%   r'   r'   H   s~    . 6:x 1 129/3K%++,37;%"3"34;8<M8E%"3"345<59Ju00129r$   r'   c                     | |z   }t        j                  |d d d |d d f         }t        j                  |d d |d d d f         }||z  }|S N)r    tanhsigmoid)input_ainput_bnum_channelsin_actt_acts_actactss          r%   fused_add_tanh_sigmoid_multiplyr6   h   sT    wFJJva,123EMM&LM1!456E5=DKr$   c	                    | | k\  | |k  z  }	|	 }
t        j                  |       }t        j                  |       }t        j                  t        j                  d|z
        dz
        }t
        j                  j                  |d      }||d<   ||d<   | |
   ||
<   d||
<   t        | |	   ||	ddf   ||	ddf   ||	ddf   |||||	      \  ||	<   ||	<   ||fS )	a	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Outside of the
    `tail_bound`, the transform behaves as an identity function.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`, *optional*, defaults to `False`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`, *optional* defaults to 5):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`, *optional*, defaults to 1e-3):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function with the `tail_bound` limits
            applied.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs` with the `tail_bound`
            limits applied.
    r   )r   r   )pad.r   .        N)	inputsunnormalized_widthsunnormalized_heightsunnormalized_derivativesreverse
tail_boundmin_bin_widthmin_bin_heightmin_derivative)	r    
zeros_likenplogexpr   
functionalr8   _rational_quadratic_spline)r=   r>   r?   r@   rA   rB   rC   rD   rE   inside_interval_maskoutside_interval_maskoutputslog_abs_detconstants                 r%   (_unconstrained_rational_quadratic_splinerQ   q   s   \ #zk1f
6JK11v&G""6*KvvbffQ/0145H!}}001Iv0V'/V$(0W%%+,A%BG!"),K%&Ga*+/0Da0GH12F2IJ!9:NPQ:Q!R#%%
HDG !;/C#D Kr$   c	                    |}	| }
t        j                  |       |
k  st        j                  |       |	kD  rt        d      |j                  d   }||z  dkD  rt        d| d|       ||z  dkD  rt        d| d|       t
        j                  j                  |d      }|d||z  z
  |z  z   }t        j                  |d      }t
        j                  j                  |d	d
d      }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf   |dddf   z
  }|t
        j                  j                  |      z   }t
        j                  j                  |d      }|d||z  z
  |z  z   }t        j                  |d      }t
        j                  j                  |d	d
d      }|	|
z
  |z  |
z   }|
|d<   |	|d<   |dddf   |dddf   z
  }|r|n|}|dxx   dz  cc<   t        j                  | d   |k\  d      dz
  }|d   }|j                  d|      d   }|j                  d|      d   }|j                  d|      d   }||z  }|j                  d|      d   }|j                  d|      d   }|dddf   j                  d|      d   }|j                  d|      d   }||z   d|z  z
  }|s| |z
  |z  }|d|z
  z  }|||j                  d      z  ||z  z   z  }|||z  z   }|||z  z   } |j                  d      ||j                  d      z  d|z  |z  z   |d|z
  j                  d      z  z   z  }!t        j                  |!      dt        j                  |      z  z
  }"| |"fS | |z
  }#|#|z  }$|||z
  z  |$z   }%||z  |$z
  }&| |#z  }'|&j                  d      d|%z  |'z  z
  }(|(dk\  j                         st!        d|(       d|'z  |& t        j"                  |(      z
  z  })|)|z  |z   } |)d|)z
  z  }|||z  z   }|j                  d      ||)j                  d      z  d|z  |z  z   |d|)z
  j                  d      z  z   z  }!t        j                  |!      dt        j                  |      z  z
  }"| |" fS )a(	  
    This transformation represents a monotonically increasing piecewise rational quadratic function. Unlike the
    function `_unconstrained_rational_quadratic_spline`, the function behaves the same across the `tail_bound`.

    Args:
        inputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Second half of the hidden-states input to the Vits convolutional flow module.
        unnormalized_widths (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            First `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_heights (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Second `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        unnormalized_derivatives (`torch.FloatTensor` of shape `(batch_size, channels, seq_len, duration_predictor_flow_bins)`):
            Third `duration_predictor_flow_bins` of the hidden-states from the output of the convolution projection
            layer in the convolutional flow module
        reverse (`bool`):
            Whether the model is being run in reverse mode.
        tail_bound (`float`):
            Upper and lower limit bound for the rational quadratic function. Outside of this `tail_bound`, the
            transform behaves as an identity function.
        min_bin_width (`float`):
            Minimum bin value across the width dimension for the piecewise rational quadratic function.
        min_bin_height (`float`):
            Minimum bin value across the height dimension for the piecewise rational quadratic function.
        min_derivative (`float`):
            Minimum bin value across the derivatives for the piecewise rational quadratic function.
    Returns:
        outputs (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Hidden-states as transformed by the piecewise rational quadratic function.
        log_abs_det (`torch.FloatTensor` of shape `(batch_size, channels, seq_len)`:
            Logarithm of the absolute value of the determinants corresponding to the `outputs`.
    z-Input to a transform is not within its domainr;         ?zMinimal bin width z" too large for the number of bins zMinimal bin height dimr   )r   r   rP   r<   )r8   modevaluer9   r:   .Ngư>).N      r   zinvalid discriminant )r    minmax
ValueErrorshaper   rJ   softmaxcumsumr8   softplussumgatherpowrH   allRuntimeErrorsqrt)*r=   r>   r?   r@   rA   rB   rC   rD   rE   upper_boundlower_boundnum_binswidths	cumwidthsderivativesheights
cumheightsbin_locationsbin_idxinput_cumwidthsinput_bin_widthsinput_cumheightsdeltainput_deltainput_derivativesinput_derivatives_plus_oneinput_heightsintermediate1thetatheta_one_minus_theta	numeratordenominatorrN   derivative_numeratorrO   intermediate2intermediate3abcdiscriminantroots*                                             r%   rK   rK      sR   X K+Kyy;&%))F*;k*IHII"((,Hx#%-m_<^_g^hijj 3&.~.>>`ai`jkll]]""#6B"?Fa-("::fDDFV,I!!)jPS!TI{*i7+EI#If$IgsABw)C"H"55F 2==#9#9:R#SSKmm##$8b#AGNX$= =HHGg2.J"":6
RU"VJ+z9KGJ$Jv%Jwab!JsCRCx$88G")JyM'd"iiy)]:CaGGi G&&r73F;O}}R1&9!((W5f=fE,,r7+F3K#**2w7?!,S!"W!5!<!<R!I&!QNN2w/7M%(BBQ_TM/)-== %U 3![599Q<%?BSVkBk%kl	!M4I$II"Y%<<*q1&15+o 5561u9//!"445 

 ii 45EIIk<R8RR## !11%5[+<<=M--=L=(uuQx!a%!)+!&&(!6|nEFFA1"uzz,778))O; $D 1!M4I$II*q1&!4+o 5561t8.."334 

 ii 45EIIk<R8RR$$r$   c                   6     e Zd Zdedef fdZddZd Z xZS )VitsWaveNetconfig
num_layersc                    t         |           |j                  | _        || _        t        j
                  j                         | _        t        j
                  j                         | _        t        j                  |j                        | _        t        t
        j                  j                  d      r%t
        j                  j                  j                  }nt
        j                  j                  }|j                   dk7  rJt        j
                  j#                  |j                   d|j                  z  |z  d      } ||d      | _        t'        |      D ]  }|j(                  |z  }|j*                  |z  |z
  dz  }t        j
                  j#                  |j                  d|j                  z  |j*                  ||      } ||d      }| j                  j-                  |       ||dz
  k  rd|j                  z  }	n|j                  }	t        j
                  j#                  |j                  |	d      }
 ||
d      }
| j                  j-                  |
        y )Nweight_normr   rX   r   weight)name)in_channelsout_channelskernel_sizedilationpadding)super__init__hidden_sizer   r    r   
ModuleList	in_layersres_skip_layersDropoutwavenet_dropoutdropouthasattrutilsparametrizationsr   speaker_embedding_sizeConv1d
cond_layerrangewavenet_dilation_ratewavenet_kernel_sizeappend)selfr   r   r   r   ir   r   in_layerres_skip_channelsres_skip_layer	__class__s              r%   r   zVitsWaveNet.__init__D  s   !--$,,.$xx224zz&"8"89288,,m<((33??K((..K((A-)F)FFL^L^H^akHkmnoJ)*8DDOz" 	8A33Q6H11H<xGAMGxx"..!3!33"66! ' H #8(;HNN!!(+ :>!$%(:(:$:!$*$6$6!"XX__V-?-?ARTUVN(hGN  ''7+	8r$   c                    t        j                  |      }t        j                  | j                  g      }|| j	                  |      }t        | j                        D ]  } | j                  |   |      }|1|dz  | j                  z  }|d d ||d| j                  z  z   d d f   }	nt        j                  |      }	t        ||	|d         }
| j                  |
      }
 | j                  |   |
      }|| j                  dz
  k  r<|d d d | j                  d d f   }||z   |z  }||d d | j                  d d d f   z   }||z   } ||z  S )NrX   r   r   )r    rF   	IntTensorr   r   r   r   r   r6   r   r   )r   r=   padding_maskglobal_conditioningrN   num_channels_tensorr   r   cond_offsetglobal_statesr5   res_skip_actsres_actss                r%   forwardzVitsWaveNet.forwardm  sm   ""6*#oot/?/?.@A*"&//2E"Ft' 	2A-DNN1-f5M".!ed&6&66 3A{[STW[WgWgSgEg7gij4j k % 0 0 ?2=-QdefQghD<<%D3D003D9M4??Q&&(,>d.>.>,>)AB 8+|;!M!T5E5E5G2J$KK!M1%	2( %%r$   c                 p   | j                   dk7  r3t        j                  j                  j	                  | j
                         | j                  D ]+  }t        j                  j                  j	                  |       - | j                  D ]+  }t        j                  j                  j	                  |       - y )Nr   )r   r    r   r   remove_weight_normr   r   r   r   layers     r%   r   zVitsWaveNet.remove_weight_norm  s~    &&!+HHNN--doo>^^ 	5EHHNN--e4	5)) 	5EHHNN--e4	5r$   r,   )	r   r   r   r   intr   r   r   __classcell__r   s   @r%   r   r   C  s!    '8z '8s '8R&:5r$   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsPosteriorEncoderr   c                 B   t         |           |j                  | _        t	        j
                  |j                  |j                  d      | _        t        ||j                        | _        t	        j
                  |j                  | j                  dz  d      | _        y )Nr   r   rX   )r   r   	flow_sizer   r   r   spectrogram_binsr   conv_prer   $posterior_encoder_num_wavenet_layerswavenet	conv_projr   r   r   s     r%   r   zVitsPosteriorEncoder.__init__  su    ",,		&"9"96;M;MqQ"6f6a6ab6#5#5t7H7H17LaPr$   c                 .   | j                  |      |z  }| j                  |||      }| j                  |      |z  }t        j                  || j
                  d      \  }}|t        j                  |      t        j                  |      z  z   |z  }|||fS )Nr   rT   )r   r   r   r    splitr   
randn_likerI   )r   r=   r   r   statsmean
log_stddevsampleds           r%   r   zVitsPosteriorEncoder.forward  s    v&5fl4GHv&5 ;;ud.?.?QGj%**40599Z3HHHLXj((r$   r,   r   r   r   r   r   r   r   r   s   @r%   r   r     s    Qz Q)r$   r   c                   :     e Zd Zd fd	ZddZd Zd Zd Z xZS )HifiGanResidualBlockc                    t         |           || _        t        j                  t        t        |            D cg c]3  }t        j                  |||d||   | j                  |||               5 c}      | _	        t        j                  t        t        |            D cg c]-  }t        j                  |||dd| j                  |d            / c}      | _
        y c c}w c c}w )Nr   )strider   r   )r   r   leaky_relu_sloper   r   r   lenr   get_paddingconvs1convs2)r   channelsr   r   r   r   _r   s          r%   r   zHifiGanResidualBlock.__init__  s     0mm s8}-
  		%a[ ,,[(1+F

 mm s8}-
  		 ,,[!<



s   8C$%2C)c                     ||z  |z
  dz  S )NrX   r#   )r   r   r   s      r%   r   z HifiGanResidualBlock.get_padding  s    h&1a77r$   c                 ,   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]
  } ||        y Nr   )r   r   r   r   r   r   r   r   r   r   s      r%   apply_weight_normz&HifiGanResidualBlock.apply_weight_norm  sp    hh**288,,m<((33??K[[ 	E	[[ 	E	r$   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]!  }t        j                  j                  |       # y r,   )r   r   r   r   r   r   s     r%   r   z'HifiGanResidualBlock.remove_weight_norm  sL    [[ 	/EHH''.	/[[ 	/EHH''.	/r$   c                 ,   t        | j                  | j                        D ]p  \  }}|}t        j                  j                  || j                        } ||      }t        j                  j                  || j                        } ||      }||z   }r |S r,   )zipr   r   r   rJ   
leaky_relur   )r   r   conv1conv2residuals        r%   r   zHifiGanResidualBlock.forward  s    T[[9 	5LE5$HMM44]DDYDYZM!-0MMM44]DDYDYZM!-0M)H4M	5 r$   )r	   )r   r	      g?r   )	r   r   r   r   r   r   r   r   r   r   s   @r%   r   r     s    
>8/r$   r   c                        e Zd Zdef fdZd Zd Z	 d	dej                  de	ej                     dej                  fdZ
 xZS )
VitsHifiGanr   c                 d   t         |           || _        t        |j                        | _        t        |j                        | _        t        j                  |j                  |j                  ddd      | _        t        j                         | _        t        t!        |j                  |j"                              D ]d  \  }\  }}| j                  j%                  t        j&                  |j                  d|z  z  |j                  d|dz   z  z  ||||z
  dz               f t        j                         | _        t+        t        | j                              D ]p  }|j                  d|dz   z  z  }t!        |j                  |j,                        D ]6  \  }}| j(                  j%                  t/        ||||j0                               8 r t        j                  ddddd      | _        |j4                  dk7  r1t        j                  |j4                  |j                  d      | _        y y )	N   r   r	   )r   r   r   rX   F)r   r   r   biasr   )r   r   r   r   resblock_kernel_sizesnum_kernelsupsample_ratesnum_upsamplesr   r   r   upsample_initial_channelr   r   	upsampler	enumerater   upsample_kernel_sizesr   ConvTranspose1d	resblocksr   resblock_dilation_sizesr   r   	conv_postr   cond)r   r   r   upsample_rater   r   r   r   s          r%   r   zVitsHifiGan.__init__  s   v;;< !6!67		++
 /8V=R=RTZTpTp9q/r 		+A+{NN!!""331=33a!eE +((=8Q>		 s4>>*+ 	vA661Q<HH),V-I-I6KiKi)j v%X%%&:8[RZ\b\s\s&tuv	v
 8QAaQRY^_((A-		&"?"?A`A`bcdDI .r$   c                 <   t         j                  j                  }t        t         j                  j                  d      r$t         j                  j                  j                  }| j
                  D ]
  } ||        | j                  D ]  }|j                           y r   )r   r   r   r   r   r   r   r   r   s      r%   r   zVitsHifiGan.apply_weight_norm	  st    hh**288,,m<((33??K^^ 	E	^^ 	&E##%	&r$   c                     | j                   D ]!  }t        j                  j                  |       # | j                  D ]  }|j                           y r,   )r   r   r   r   r   r   s     r%   r   zVitsHifiGan.remove_weight_norm  sF    ^^ 	/EHH''.	/^^ 	'E$$&	'r$   r   r   returnc                    | j                  |      }||| j                  |      z   }t        | j                        D ]  }t        j
                  j                  || j                  j                        } | j                  |   |      } | j                  || j                  z     |      }t        d| j                        D ]*  }| | j                  || j                  z  |z      |      z  }, || j                  z  } t        j
                  j                  |      }| j                  |      }t        j                  |      }|S )aG  
        Converts a spectrogram into a speech waveform.

        Args:
            spectrogram (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`):
                Tensor containing the spectrograms.
            global_conditioning (`torch.FloatTensor` of shape `(batch_size, config.speaker_embedding_size, 1)`, *optional*):
                Tensor containing speaker embeddings, for multispeaker models.

        Returns:
            `torch.FloatTensor`: Tensor of shape shape `(batch_size, 1, num_frames)` containing the speech waveform.
        r   )r   r   r   r   r   rJ   r   r   r   r   r   r   r   r    r-   )r   r   r   r   r   	res_statejr   s           r%   r   zVitsHifiGan.forward  s+    k2*)DII6I,JJMt))* 	9AMM44]DKKD`D`aM-DNN1-m<M<q4+;+;';<]KI1d../ UET^^A0@0@,@1,DEmTT	U%(8(88M	9 00?}5::m,r$   r,   )r   r   r   r   r   r   r   r    r!   r   r   r   r   s   @r%   r   r     sW    "ez "eH&' bf  ,, CKEL]L]C^ 			 r$   r   c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingLayerr   c                 B   t         |           |j                  dz  | _        t	        j
                  | j                  |j                  d      | _        t        ||j                        | _
        t	        j
                  |j                  | j                  d      | _        y )NrX   r   r   )r   r   r   half_channelsr   r   r   r   r    prior_encoder_num_wavenet_layersr   r   r   s     r%   r   z"VitsResidualCouplingLayer.__init__=  su    #--2		$"4"4f6H6H!L"6f6]6]^6#5#5t7I7I1Mr$   c                    t        j                  || j                  gdz  d      \  }}| j                  |      |z  }| j	                  |||      }| j                  |      |z  }t        j                  |      }	|sS||t        j                  |	      z  |z  z   }t        j                  ||gd      }
t        j                  |	ddg      }|
|fS ||z
  t        j                  |	       z  |z  }t        j                  ||gd      }
|
d fS )NrX   r   rT   )
r    r   r  r   r   r   rF   rI   catra   )r   r=   r   r   rA   
first_halfsecond_halfr   r   r   rN   log_determinants               r%   r   z!VitsResidualCouplingLayer.forwardE  s   "'++ft7I7I6JQ6NTU"V
Kj1L@]LBUV~~m,|;%%d+
uyy/D!D|!SSKii[ 9qAG#ii
QF;OO++&-J;1GG,VKii[ 9qAGD= r$   NFr   r   s   @r%   r  r  <  s    Nz N!r$   r  c                   ,     e Zd Zdef fdZddZ xZS )VitsResidualCouplingBlockr   c                     t         |           t        j                         | _        t        |j                        D ]&  }| j                  j                  t        |             ( y r,   )	r   r   r   r   flowsr   prior_encoder_num_flowsr   r  r   r   r   r   s      r%   r   z"VitsResidualCouplingBlock.__init__X  sO    ]]_
v556 	AAJJ7?@	Ar$   c                     |s7| j                   D ]&  } ||||      \  }}t        j                  |dg      }( |S t        | j                         D ](  }t        j                  |dg      } ||||d      \  }}* |S )Nr   TrA   )r  r    flipreversed)r   r=   r   r   rA   flowr   s          r%   r   z!VitsResidualCouplingBlock.forward^  s    

 1 7JK	FQC01  !, ZFQC0 7JTXY	Z r$   r  r   r   s   @r%   r  r  W  s    Az A	r$   r  c                   .     e Zd Zddef fdZddZ xZS )VitsDilatedDepthSeparableConvr   c                 D   t         |           |j                  }|j                  }|j                  | _        t        j                  |      | _        t        j                         | _
        t        j                         | _        t        j                         | _        t        j                         | _        t        | j
                        D ]  }||z  }||z  |z
  dz  }| j                  j                  t        j                   ||||||             | j                  j                  t        j                   ||d             | j                  j                  t        j"                  |             | j                  j                  t        j"                  |              y )NrX   )r   r   r   groupsr   r   r   )r   r   duration_predictor_kernel_sizer   depth_separable_num_layersr   r   r   r   r   convs_dilatedconvs_pointwisenorms_1norms_2r   r   r   	LayerNorm)	r   r   dropout_rater   r   r   r   r   r   s	           r%   r   z&VitsDilatedDepthSeparableConv.__init__k  s7   ;;%% ;;zz,/]]_!}}}}}}t' 	8A"A~H"X-8Q>G%%		 (!) +#%#	   ''		(Ha(HILLX 67LLX 67	8r$   c                 $   |||z   }t        | j                        D ]  } | j                  |   ||z        } | j                  |   |j	                  dd            j	                  dd      }t
        j                  j                  |      } | j                  |   |      } | j                  |   |j	                  dd            j	                  dd      }t
        j                  j                  |      }| j                  |      }||z   } ||z  S Nr   r;   )r   r   r  r   	transposer   rJ   gelur  r!  r   )r   r=   r   r   r   r   s         r%   r   z%VitsDilatedDepthSeparableConv.forward  s   *11Ft' 	,A1D..q1&<2GHM+DLLOM,C,CAr,JKUUVWY[\MMM..}=M3D003MBM+DLLOM,C,CAr,JKUUVWY[\MMM..}=M LL7Mm+F	, $$r$   )r<   r,   r   r   s   @r%   r  r  j  s    8z 88%r$   r  c                   ,     e Zd Zdef fdZddZ xZS )VitsConvFlowr   c                    t         |           |j                  | _        |j                  dz  | _        |j                  | _        |j                  | _	        t        j                  | j
                  | j                  d      | _        t        |      | _        t        j                  | j                  | j
                  | j                  dz  dz
  z  d      | _        y )NrX   r   r	   )r   r   r   filter_channelsdepth_separable_channelsr  duration_predictor_flow_binsri   duration_predictor_tail_boundrB   r   r   r   r  conv_ddsr   r   s     r%   r   zVitsConvFlow.__init__  s    %11#<<A;; >>		$"4"4d6J6JAN5f=4#7#79K9Kt}}_`O`cdOd9eghir$   c                     t        j                  || j                  gdz  d      \  }}| j                  |      }| j	                  |||      }| j                  |      |z  }|j                  \  }}	}
|j                  ||	d|
      j                  dddd      }|dd | j                  f   t        j                  | j                        z  }|d| j                  d| j                  z  f   t        j                  | j                        z  }|dd| j                  z  d f   }t        |||||| j                        \  }}t        j                  ||gd      |z  }|st        j                   ||z  ddg      }||fS |d fS )	NrX   r   rT   r;   r   r	   .)rA   rB   )r    r   r  r   r/  r   r]   reshapepermuteri   mathrf   r+  rQ   rB   r  ra   )r   r=   r   r   rA   r	  r
  r   
batch_sizer   lengthr>   r?   r@   rO   rN   r  s                    r%   r   zVitsConvFlow.forward  s   "'++ft7I7I6JQ6NTU"V
Kj1m\CVW}5D'1'7'7$
Hf%--j(BOWWXY[\^_abc+C4==,@ADIIdNbNbDcc,S$--!dmmBS2S-STW[W`W`aeauauWvv#0a$--6G6I1I#J #K $$
 [ ))Z51=L#iil(BQFKOO++D= r$   r  r   r   s   @r%   r)  r)    s    	jz 	j!r$   r)  c                   ,     e Zd Zdef fdZddZ xZS )VitsElementwiseAffiner   c                 $   t         |           |j                  | _        t	        j
                  t        j                  | j                  d            | _        t	        j
                  t        j                  | j                  d            | _	        y Nr   )
r   r   r,  r   r   	Parameterr    zeros	translate	log_scaler   s     r%   r   zVitsElementwiseAffine.__init__  sY    77ekk$--&CDekk$--&CDr$   c                 .   |s]| j                   t        j                  | j                        |z  z   }||z  }t        j                  | j                  |z  ddg      }||fS || j                   z
  t        j                  | j                         z  |z  }|d fS Nr   rX   )r<  r    rI   r=  ra   )r   r=   r   r   rA   rN   r  s          r%   r   zVitsElementwiseAffine.forward  s    nnuyy'@6'IIG,G#ii(E1vNOO++.%))T^^O2LL|[GD= r$   r  r   r   s   @r%   r7  r7    s    Ez E!r$   r7  c                   &     e Zd Z fdZddZ xZS )VitsStochasticDurationPredictorc                    t         |           |j                  }|j                  }t	        j
                  ||d      | _        t	        j
                  ||d      | _        t        ||j                        | _
        |dk7  rt	        j
                  ||d      | _        t	        j                         | _        | j                  j                  t        |             t!        |j"                        D ]&  }| j                  j                  t%        |             ( t	        j
                  d|d      | _        t	        j
                  ||d      | _        t        ||j                        | _        t	        j                         | _        | j,                  j                  t        |             t!        |j"                        D ]&  }| j,                  j                  t%        |             ( y )Nr   )r#  r   )r   r   r   r   r   r   r   r   r  duration_predictor_dropoutr/  r   r   r  r   r7  r   duration_predictor_num_flowsr)  post_conv_prepost_conv_projpost_conv_dds
post_flows)r   r   	embed_dimr+  r   r   s        r%   r   z(VitsStochasticDurationPredictor.__init__  st   11	 ,,		/?AF?OQG5::

 >		)_a@DI]]_


/78v::; 	4AJJl623	4  YYq/1= ii!L:::

 --/4V<=v::; 	9AOO""<#78	9r$   c                    t        j                  |      }| j                  |      }|)t        j                  |      }|| j                  |      z   }| j	                  ||      }| j                  |      |z  }|s| j                  |      }| j                  ||      }| j                  |      |z  }t        j                  |j                  d      d|j                  d            j                  |j                  |j                        |z  }d}	|}
| j                  D ]/  } ||
|||z         \  }
}t        j                  |
dg      }
|	|z  }	1 t        j                   |
ddgd      \  }}|	t        j"                  t$        j&                  j)                  |      t$        j&                  j)                  |       z   |z  ddg      z  }	t        j"                  dt+        j,                  dt*        j.                  z        |dz  z   z  |z  ddg      |	z
  }|t        j0                  |      z
  |z  }t        j,                  t        j2                  |d            |z  }t        j"                  | ddg      }t        j4                  ||gd      }| j6                  D ],  } ||||      \  }}t        j                  |dg      }||z  }. t        j"                  d	t+        j,                  dt*        j.                  z        |dz  z   z  |z  ddg      |z
  }||z   S t9        t;        | j6                              }|d d
 |d   gz   }t        j                  |j                  d      d|j                  d            j                  |j                  |j                        |z  }|D ](  }t        j                  |dg      } ||||d      \  }}* t        j                   |ddgd      \  }}|S )Nr   rX   )devicedtype)r   r   rT         gh㈵>g      ?r;   T)r   rA   )r    detachr   r   r/  r   rE  rG  rF  randnsizetorK  rL  rH  r  r   ra   r   rJ   
logsigmoidr3  rH   pir.   	clamp_minr  r  listr  )r   r=   r   r   	durationsrA   noise_scaler   random_posteriorlog_determinant_posterior_sumlatents_posteriorr  r  r	  r
  logqlog_determinant_sumlatentsnllr  r   log_durations                         r%   r   z'VitsStochasticDurationPredictor.forward  s   f%v&*"',,/B"Cdii(;<<Fv|4',6 ..y9M ..}lKM //>MM INN1-q)..2CDGGv}}djdpdpGq  -.) 0 A59%|R_I_62!? %*JJ/@1#$F!-@-A ',kk2CaVQR&S#J)UYY))*58P8PR\Q\8]]ammpqstou. ) 		$$((1tww;"7;KQ;N"OPS__bcefagh/0 
 $emmJ&??<OJ5??:t#DETJ"'))ZK!Q"@ii[ 9qAG

 7+/[a+b(**Wqc2#6#7
 ))C488AK#8GQJ#GH<WZ[]^Y_`cvvC:$**-.E#2J%),E FKKNAv{{1~>AA^d^j^jAk   c**Wqc2!'<V]ab
c $kk'Aq6qAOL!r$   )NNFrS   r   r   r   r   r   r   r   s   @r%   rA  rA    s    9@@ r$   rA  c                   &     e Zd Z fdZddZ xZS )VitsDurationPredictorc                    t         |           |j                  }|j                  }t	        j
                  |j                        | _        t	        j                  |j                  |||dz        | _
        t	        j                  ||j                        | _        t	        j                  ||||dz        | _        t	        j                  ||j                        | _        t	        j                  |dd      | _        |j"                  dk7  r1t	        j                  |j"                  |j                  d      | _        y y )NrX   )r   epsr   r   )r   r   r  "duration_predictor_filter_channelsr   r   rC  r   r   r   conv_1r"  layer_norm_epsnorm_1conv_2norm_2projr   r   )r   r   r   r+  r   s       r%   r   zVitsDurationPredictor.__init__:  s    ;; CCzz&"C"CDii 2 2O[ZeijZjkll?8M8MNii+WbfgWghll?8M8MNIIoq!4	((A-		&"?"?ASASUVWDI .r$   c                 `   t        j                  |      }|)t        j                  |      }|| j                  |      z   }| j                  ||z        }t        j                  |      }| j                  |j                  dd            j                  dd      }| j                  |      }| j                  ||z        }t        j                  |      }| j                  |j                  dd            j                  dd      }| j                  |      }| j                  ||z        }||z  S r%  )r    rO  r   rh  relurj  r&  r   rk  rl  rm  )r   r=   r   r   s       r%   r   zVitsDurationPredictor.forwardI  s   f%*"',,/B"Cdii(;<<FVl23F#V--a45??2Ff%Vl23F#V--a45??2Ff%6L01$$r$   r,   ra  r   s   @r%   rc  rc  9  s    X%r$   rc  c                   &    e Zd ZdZdef fdZdej                  dedefdZ		 	 	 	 ddej                  d	e
ej                     d
e
ej                     de
ej                     dedeej                  e
ej                     f   fdZd Zd Zd Z xZS )VitsAttentionz?Multi-headed attention with relative positional representation.r   c                    t         |           |j                  | _        |j                  | _        |j                  | _        |j                  | _        | j                  | j
                  z  | _	        | j                  dz  | _
        | j                  | j
                  z  | j                  k7  r&t        d| j                   d| j
                   d      t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        t        j                  | j                  | j                  |j                        | _        | j                  rt        j&                  t)        j*                  d| j                  dz  dz   | j                        | j                  z        | _        t        j&                  t)        j*                  d| j                  dz  dz   | j                        | j                  z        | _        y y )NrM  zIhidden_size must be divisible by num_attention_heads (got `hidden_size`: z and `num_attention_heads`: z).)r   r   rX   )r   r   r   rI  num_attention_heads	num_headsattention_dropoutr   window_sizehead_dimscalingr\   r   Linearuse_biask_projv_projq_projout_projr:  r    rP  	emb_rel_k	emb_rel_vr   s     r%   r   zVitsAttention.__init__a  s   ++33//!--$..8}}d*MMDNN*t~~=[\`\j\j[k.t~~.>bB 
 iiV__UiiV__UiiV__U		$..$..vW\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN\\%++a9I9IA9MPQ9QSWS`S`*adhdpdp*pqDN r$   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S r?  )viewrt  rw  r&  
contiguous)r   r  r  r  s       r%   _shapezVitsAttention._shapez  s7    {{3GQQRSUVWbbddr$   r   key_value_statesattention_masklayer_head_maskoutput_attentionsr   c                 X	   |j                         \  }}}| j                  |      | j                  z  }	| j                  | j	                  |      d|      }
| j                  | j                  |      d|      }|| j                  z  d| j                  f} | j                  |	||      j                  | }	 |
j                  | }
 |j                  | }|
j                  d      }t        j                  |	|
j                  dd            }|j                         || j                  z  ||fk7  r/t        d|| j                  z  ||f d|j                                | j                  X| j                  | j                  |      }t        j                   |	|j                  dd            }| j#                  |      }||z  }|{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }t$        j&                  j)                  |d	      }||j                         | j                  fk7  r*t        d
| j                  f d|j                                |j                  dddd      |j                  || j                  ||      z  }|j                  || j                  z  ||      }|r?|j                  || j                  ||      }|j                  || j                  z  ||      }nd}t$        j&                  j+                  || j*                  | j,                        }t        j                  ||      }|j                         || j                  z  || j                  fk7  r7t        d|| j                  || j                  f d|j                                | j                  H| j                  | j.                  |      }| j1                  |      }t        j                   ||      }||z  }|j                  || j                  || j                        }|j                  dd      }|j3                  ||| j4                        }| j7                  |      }||fS )z#Input shape: Batch x Time x Channelr;   r   rX   z$Attention weights should be of size z	, but is NrN  z!Attention mask should be of size rT   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )rQ  r}  rx  r  r{  r|  rt  rw  r  r    bmmr&  r\   rv  _get_relative_embeddingsr  matmul'_relative_position_to_absolute_positionr   rJ   r^   r   r  r  '_absolute_position_to_relative_positionr1  rI  r~  )r   r   r  r  r  r  r  tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightskey_relative_embeddingsrelative_logitsrel_pos_biasattn_weights_reshaped
attn_probsattn_outputvalue_relative_embeddingsrelative_weightss                          r%   r   zVitsAttention.forward}  s    (,,.Wa {{=1DLL@ [[]!;RE
{{4;;}#=r3GDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 '&*&C&CDNNT[&\##ll<9P9Z9Z[]_a9bcOGGXLL(L%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 '(,(E(EdnnV](^%#KKJW <<(8:STL<'K!&&sDNNGT]]S!++Aq1 "))#wGmmK0111r$   c           	          t        || j                  dz   z
  d      }|dkD  r&t        j                  j	                  |dd||ddg      }t        | j                  dz   |z
  d      }|d|z  z   dz
  }|d d ||f   S )Nr   r   rX   )r[   rv  r   rJ   r8   )r   relative_embeddingsr5  
pad_lengthslice_start_positionslice_end_positions         r%   r  z&VitsAttention._get_relative_embeddings  s    4#3#3a#78!<
>"$--"3"34G!QPZ\fhiklIm"n"D$4$4q$8F#BAF1AJ>B"1&:;M&M#MNNr$   c                 N   |j                         \  }}}t        j                  j                  |g d      }|j	                  ||dz  |z  g      }t        j                  j                  |d|dz
  ddg      }|j	                  ||dz   d|z  dz
  g      }|d d d ||dz
  d f   }|S )N)r   r   r   r   r   r   rX   r   r   rQ  r   rJ   r8   r  r   xbatch_headsr5  r   x_flatx_finals          r%   r  z5VitsAttention._relative_position_to_absolute_position  s    !"VQ MMa!34 fqj6&9:;""6Avz1a+@A ++{FQJF
QGH!WfWfqjl23r$   c           	      F   |j                         \  }}}t        j                  j                  |d|dz
  ddddg      }|j	                  ||d|z  dz
  z  g      }t        j                  j                  ||dddg      }|j	                  ||d|z  g      d d d d dd f   }|S )Nr   r   rX   r  r  s          r%   r  z5VitsAttention._absolute_position_to_relative_position  s    !"VQ MMa!VaZAq!!<=fF
Q&?@A ""6FAq!+<=++{FAJ?@AqrJr$   )NNNF)r   r   r   r   r   r   r    Tensorr   r  r   boolr   r   r  r  r  r   r   s   @r%   rq  rq  ^  s    Irz r2eU\\ eC ec e 481526"'`2||`2 #5<<0`2 !.	`2
 "%,,/`2  `2 
u||Xell33	4`2DO
r$   rq  c                   $     e Zd Z fdZd Z xZS )VitsFeedForwardc                 d   t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j
                  |j                  |j                        | _        t        j                  |j                        | _        t        |j                  t              rt        |j                     | _        n|j                  | _        |j                  dkD  r/|j                  dz
  dz  }|j                  dz  }||ddddg| _        y d | _        y )Nr   rX   r   )r   r   r   r   r   ffn_dimffn_kernel_sizerh  rk  r   activation_dropoutr   
isinstance
hidden_actstrr
   act_fnr   )r   r   pad_left	pad_rightr   s       r%   r   zVitsFeedForward.__init__  s    ii 2 2FNNFDZDZ[ii0B0BFDZDZ[zz&";";<f''- !2!23DK ++DK!!A%..2q8H..!3I$iAq!<DLDLr$   c                    |j                  ddd      }|j                  ddd      }||z  }| j                  *t        j                  j	                  || j                        }| j                  |      }| j                  |      }| j                  |      }||z  }| j                  *t        j                  j	                  || j                        }| j                  |      }||z  }|j                  ddd      }|S )Nr   rX   r   )	r2  r   r   rJ   r8   rh  r  r   rk  )r   r   r   s      r%   r   zVitsFeedForward.forward  s    %--aA6#++Aq!4%4<<#MM--mT\\JMM2M2]3%4<<#MM--mT\\JMM2%4%--aA6r$   ra  r   s   @r%   r  r    s     $r$   r  c            	            e Zd Zdef fdZ	 	 ddej                  dej                  deej                     de	fdZ
 xZS )	VitsEncoderLayerr   c                 j   t         |           t        |      | _        t	        j
                  |j                        | _        t	        j                  |j                  |j                        | _        t        |      | _        t	        j                  |j                  |j                        | _        y )Nre  )r   r   rq  	attentionr   r   hidden_dropoutr   r"  r   ri  
layer_normr  feed_forwardfinal_layer_normr   s     r%   r   zVitsEncoderLayer.__init__/  sz    &v.zz&"7"78,,v'9'9v?T?TU+F3 "V-?-?VEZEZ [r$   r   r   r  r  c                 
   |}| j                  |||      \  }}| j                  |      }| j                  ||z         }|}| j                  ||      }| j                  |      }| j	                  ||z         }|f}|r||fz  }|S )N)r   r  r  )r  r   r  r  r  )r   r   r   r  r  r   r  rN   s           r%   r   zVitsEncoderLayer.forward7  s     !&*nn')/ '5 '
#| ]3=(@A ))-F]3--h.FG "&Gr$   r  )r   r   r   r   r   r    r  r!   r   r  r   r   r   s   @r%   r  r  .  sW    \z \ 26"'|| '' !.	
  r$   r  c                        e Zd Zdef fdZ	 	 	 	 ddej                  dej                  deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )VitsEncoderr   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        |j                  | _
        y c c}w r  )r   r   r   r   r   r   num_hidden_layersr  layersgradient_checkpointing	layerdropr  s      r%   r   zVitsEncoder.__init__V  s\    mmuVMeMeGf$g!%5f%=$gh&+#)) %hs   A4r   r   r  r  output_hidden_statesreturn_dictr   c                 L   |rdnd }|rdnd }|t        ||j                        }||z  }t               xs t        |       }	| j                  D ]  }
|r||fz   }t
        j                  j                  dd      }| j                  xr || j                  k  }|r|	rI| j                  r,| j                  r | j                  |
j                  ||||      }n |
||||      }|d   }|rd}|s|d   fz   } ||z  }|r||fz   }|st        d |||fD              S t        |||      S )Nr#   r   r   )r  r   r  )NNc              3   &   K   | ]	  }||  y wr,   r#   ).0vs     r%   	<genexpr>z&VitsEncoder.forward.<locals>.<genexpr>  s     mq_`_lms   )r(   r   r   )r   rL  r   r   r  rG   randomuniformr  r  r  _gradient_checkpointing_func__call__tupler   )r   r   r   r  r  r  r  all_hidden_statesall_self_attentionssynced_gpusencoder_layerdropout_probabilityskip_the_layerlayer_outputss                 r%   r   zVitsEncoder.forward]  sr    #7BD$5b4 %7H[H[\N%402R6LT6R![[ 	PM#$58H$H! #%))"3"3Aq"9!]]U0Cdnn0TN![..4==$($E$E%..%$&)%M %2%'5%1*;	%M !.a 0 , &9]1=M<O&O#?	PB &4 1]4D Dm]4EGZ$[mmm++*
 	
r$   )NNNN)r   r   r   r   r   r    r!   r   r  r  r   r   r   r   r   r   s   @r%   r  r  U  s    *z * 26,0/3&*B
((B
 ''B
 !.	B

 $D>B
 'tnB
 d^B
 
uo%	&B
r$   r  c                        e Zd ZdZdef fdZd Zd Z	 	 	 	 ddej                  dej                  deej                     d	ee   d
ee   dee   deeej                     ef   fdZ xZS )VitsTextEncoderzs
    Transformer encoder that uses relative positional representation instead of absolute positional encoding.
    r   c                 ,   t         |           || _        t        j                  |j
                  |j                  |j                        | _        t        |      | _
        t        j                  |j                  |j                  dz  d      | _        y )NrX   r   )r   )r   r   r   r   	Embedding
vocab_sizer   pad_token_idembed_tokensr  encoderr   r   projectr   s     r%   r   zVitsTextEncoder.__init__  so    LL):):F<N<NPVPcPcd"6*yy!3!3V5E5E5IWXYr$   c                     | j                   S r,   r  r   s    r%   get_input_embeddingsz$VitsTextEncoder.get_input_embeddings         r$   c                     || _         y r,   r  )r   rW   s     r%   set_input_embeddingsz$VitsTextEncoder.set_input_embeddings  s
    !r$   	input_idsr   r  r  r  r  r   c                    | j                  |      t        j                  | j                  j                        z  }| j                  ||||||      }|s|d   n|j                  }	| j                  |	j                  dd            j                  dd      |z  }
t        j                  |
| j                  j                  d      \  }}|s|	||f|dd  z   }|S t        |	|||j                  |j                        S )N)r   r   r  r  r  r  r   r   rX   rT   )r(   r)   r*   r   r   )r  r3  rf   r   r   r  r(   r  r&  r    r   r   r'   r   r   )r   r  r   r  r  r  r  r   encoder_outputsr(   r   r)   r*   rN   s                 r%   r   zVitsTextEncoder.forward  s    )))4tyyAXAX7YY,,'%)/!5# ' 
 7BOA.GhGh.88A>?II!QOR^^+0;;udkk>S>SYZ+[(((+7JKo^_^`NaaGN$/# 3)77&11
 	
r$   )NNNT)r   r   r   r   r   r   r  r  r    r  r!   r   r  r   r   r'   r   r   r   s   @r%   r  r    s    Zz Z!" 26,0/3&*#
<<#
 ''#
 !.	#

 $D>#
 'tn#
 d^#
 
uU\\"$99	:#
r$   r  c                   "    e Zd ZeZdZdZdZd Zy)VitsPreTrainedModelvitsr  Tc                 v   t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          yyt        |t        j                        rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t        j                        rt        j                  j                  |j                         |j                  jt        j                   |j"                  |j$                  |j&                  d   z  z        }t        j                  j)                  |j                  | |       yyt        |t        j*                        rz|j                  j                  j                  d| j                  j                         |j,                  2|j                  j                  |j,                     j                          yyy)zInitialize the weightsr<   )r   stdNrS   r   )r   r   )r  r   ry  r   datanormal_r   initializer_ranger   zero_r"  fill_r   initkaiming_normal_r3  rf   r  r   r   uniform_r  padding_idx)r   moduleks      r%   _init_weightsz!VitsPreTrainedModel._init_weights  s   fbii(MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S)		*GG##FMM2{{&IIfmmv/A/AFDVDVWXDY/YZ[  a 8 ' -MM&&CT[[5R5R&S!!-""6#5#56<<> . .r$   N)	r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr  r#   r$   r%   r  r    s    L!O&*#?r$   r  z@
    The complete VITS model, for text-to-speech synthesis.
    )custom_introc                        e Zd Zdef fdZd Ze	 	 	 	 	 	 	 ddeej                     deej                     dee
   dee   dee   d	ee   d
eej                     deee   ef   fd       Z xZS )	VitsModelr   c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        |j                  rt        |      | _        nt        |      | _        |j                  dkD  r/t        j                  |j                  |j                         | _        t%        |      | _        |j(                  | _        |j*                  | _        |j,                  | _        | j/                          y r9  )r   r   r   r  text_encoderr  r  r   decoder"use_stochastic_duration_predictionrA  duration_predictorrc  num_speakersr   r  r   embed_speakerr   posterior_encoderspeaking_raterX  noise_scale_duration	post_initr   s     r%   r   zVitsModel.__init__  s     +F3-f5	"6*44&Ef&MD#&;F&CD#"!#f.A.A6C`C`!aD "6f!= $11!--$*$?$?! 	r$   c                     | j                   S r,   )r  r  s    r%   get_encoderzVitsModel.get_encoder  r  r$   r  r  
speaker_idr  r  r  labelsr   c                 j   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j
                  j                  j                  j                  }|!|j                  d      j                  |      }	n3t        j                  |      j                  d      j                  |      }	| j                   j                  dkD  r|d|cxk  r| j                   j                  k  s(n t        d| j                   j                  dz
   d      t        |t               r"t        j"                  d|| j$                  	      }| j'                  |      j                  d      }
nd}
| j                  ||	||||
      }|s|d   n|j(                  }|j+                  dd      }|	j+                  dd      }	|s|d   n|j,                  }|s|d   n|j.                  }| j                   j0                  r!| j3                  ||	|
d| j4                        }n| j3                  ||	|
      }d| j6                  z  }t        j8                  t        j:                  |      |	z  |z        }t        j<                  t        j>                  |ddg      d      jA                         }t        jB                  |jE                         |j                  |j$                        }|j                  d      |j                  d      k  }|j                  d      j                  |	j                        }t        j                  |	d      t        j                  |d      z  }|jF                  \  }}}}t        jH                  |d      jK                  ||z  d      }t        jB                  ||j                  |j$                        }|j                  d      |k  }|j                  |j                        jK                  |||      }|tL        jN                  jQ                  |g d      ddddf   z
  }|j                  d      j+                  dd      |z  }t        jR                  |jU                  d      |      j+                  dd      }t        jR                  |jU                  d      |      j+                  dd      }|t        jV                  |      t        j:                  |      z  | jX                  z  z   }| j[                  |||
d      }||z  } | j]                  | |
      }!|!jU                  d      }!|t_        j`                  | j                   jb                        z  }"|s|!|"| f|dd z   }#|#S te        |!|"| |jf                  |jh                        S )a  
        speaker_id (`int`, *optional*):
            Which speaker embedding to use. Only used for multispeaker models.
        labels (`torch.FloatTensor` of shape `(batch_size, config.spectrogram_bins, sequence_length)`, *optional*):
            Float values of target spectrogram. Timesteps set to `-100.0` are ignored (masked) for the loss
            computation.

        Example:

        ```python
        >>> from transformers import VitsTokenizer, VitsModel, set_seed
        >>> import torch

        >>> tokenizer = VitsTokenizer.from_pretrained("facebook/mms-tts-eng")
        >>> model = VitsModel.from_pretrained("facebook/mms-tts-eng")

        >>> inputs = tokenizer(text="Hello - my dog is cute", return_tensors="pt")

        >>> set_seed(555)  # make deterministic

        >>> with torch.no_grad():
        ...     outputs = model(inputs["input_ids"])
        >>> outputs.waveform.shape
        torch.Size([1, 45824])
        ```
        Nz&Training of VITS is not supported yet.r;   r   r   z Set `speaker_id` in the range 0-.r   )rQ  
fill_valuerK  )r  r   r  r  r  r  rX   T)rA   rX  rS   )rL  rK  )r   r   r   r   r   r   r	   r  )r   r   r   r   r   )5r   r  r  use_return_dictNotImplementedErrorr  r  r   rL  	unsqueezerR  r    	ones_liker  r\   r  r   fullrK  r  r(   r&  r)   r*   r  r  r  r  ceilrI   rU  ra   longaranger[   r]   r_   r  r   rJ   r8   r  squeezer   rX  r  r  rG   prodr   r   r   r   )$r   r  r  r  r  r  r  r  
mask_dtypeinput_padding_maskspeaker_embeddingstext_encoder_outputr   r)   r*   r`  length_scaledurationpredicted_lengthsindicesoutput_padding_mask	attn_maskr4  r   output_lengthinput_lengthcum_durationvalid_indicespadded_indicesattnprior_latentsr^  r   r   r   rN   s$                                       r%   r   zVitsModel.forward  s>   J 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%&NOO&&33::@@
%!/!9!9"!=!@!@!L!&!;!E!Eb!I!L!LZ!X;;##a'J,B
=T[[%=%== #CDKKD\D\_`D`Caab!cdd*c*"ZZTjQUQ\Q\]
!%!3!3J!?!I!I"!M!%"//+)/!5# 0 
 7B+A.GZGlGl%//15/99!Q?4?)!,EXEdEd<G1!4M`MtMt;;9922"" 55 3 L  22=BTVhiLT///::eii58JJ\YZ!OOEIIhA,GKPPR ,,0446>O>U>U^o^v^vw%//25F5P5PQR5SS1;;A>AABTBZBZ[ OO$6:U__M`bd=ee	5>__2
A}l||Hb166zL7PRST,,}HNN8??[))!,|;%((9>>z<Yfg&):):=J\)]^_adbdad^d)ee''*44Q:YF ll4<<?K@JJ1aP#ll4<<?<OPZZ[\^_`#e&6&6{&CeiiPcFd&dgkgwgw&ww))M+>@R\`)a 33<<-?@##A&,rwwt{{7Q7Q/RR!1;?BUVWVXBYYGN-#-;;*55
 	
r$   )NNNNNNN)r   r   r   r   r   r  r   r   r    r  r   r  r!   r   r   r   r   r   r   r   s   @r%   r  r    s    z 4!  -115$(,0/3&*.2~
ELL)~
 !.~
 SM	~

 $D>~
 'tn~
 d^~
 **+~
 
uSz?*	+~
 ~
r$   r  )Fg      @MbP?r:  r:  )>r   r3  dataclassesr   typingr   r   r   r   numpyrG   r    torch.utils.checkpointr   activationsr
   integrations.deepspeedr   integrations.fsdpr   modeling_attn_mask_utilsr   modeling_outputsr   r   modeling_utilsr   r   r   r   configuration_vitsr   
get_loggerr   loggerr   r'   jitscriptr6   rQ   rK   Moduler   r   r   r   r  r  r  r)  r7  rA  rc  rq  r  r  r  r  r  r  __all__r#   r$   r%   <module>rL     s.     ! . .     ! @ 7 B < - , * 
		H	% :k : :@ :K : :>   G TE%PM5%((// M5`)299 )&;299 ;|U")) Up!		 !6		 &+%BII +%\(!299 (!V!BII !$a bii a H"%BII "%JcBII cL'bii 'T$ryy $NJ
")) J
Z5
bii 5
p ?/ ? ?4 
]
# ]

]
@ -
.r$   