
    Uh7Q                       d Z ddlZddlmZ ddlmZmZmZ ddlZddl	m
Z
 ddlmZ ddlmZ ddlmZmZmZ dd	lmZmZ dd
lmZ ddlmZ  ej4                  e      Z G d de
j:                        Z G d de
j:                        Z G d de
j:                        Z  G d de
j:                        Z! G d de
j:                        Z" G d de
j:                        Z# G d de
j:                        Z$ G d de
j:                        Z% G d de
j:                        Z& G d d e
j:                        Z' G d! d"e
j:                        Z( G d# d$e
j:                        Z) G d% d&e
j:                        Z*e G d' d(e             Z+ G d) d*e
j:                        Z,	 	 	 ded+ejZ                  d,e.d-ee/   d.e0d/e1f
d0Z2	 	 dfd+ejZ                  d1ee/e1f   d-ee/   d/e1fd2Z3 G d3 d4e
j:                        Z4 G d5 d6e
j:                        Z5 G d7 d8e
j:                        Z6 G d9 d:e
j:                        Z7 G d; d<e
j:                        Z8e G d= d>e             Z9 G d? d@e+      Z:e G dA dBe             Z; edCD       G dE dFe+             Z<e G dG dHe             Z= G dI dJe+      Z>e G dK dLe             Z?e G dM dNe             Z@e G dO dPe             ZAdQej                  j                  dRejZ                  dSejZ                  fdTZDdgdUejZ                  dVeejZ                     dSejZ                  fdWZE G dX dYe+      ZFe G dZ d[e             ZG G d\ d]e+      ZHe G d^ d_e             ZI G d` dae
j:                        ZJ G db dce+      ZKg ddZLy)hzPyTorch PatchTSMixer model.    N)	dataclass)OptionalTupleUnion)PreTrainedModel)ModelOutput   )NegativeBinomialOutputNormalOutputStudentTOutput)auto_docstringlogging)deprecate_kwarg   )PatchTSMixerConfigc                   2     e Zd ZdZdedef fdZd Z xZS )PatchTSMixerGatedAttentionz
    Module that applies gated attention to input data.

    Args:
        in_size (`int`): The input size.
        out_size (`int`): The output size.
    in_sizeout_sizec                     t         |           t        j                  ||      | _        t        j
                  d      | _        y )Ndim)super__init__nnLinear
attn_layerSoftmaxattn_softmax)selfr   r   	__class__s      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/patchtsmixer/modeling_patchtsmixer.pyr   z#PatchTSMixerGatedAttention.__init__-   s1    ))GX6JJ2.    c                 P    | j                  | j                  |            }||z  }|S N)r    r   )r!   inputsattn_weights      r#   forwardz"PatchTSMixerGatedAttention.forward2   s*    ''(?@+%r$   )__name__
__module____qualname____doc__intr   r)   __classcell__r"   s   @r#   r   r   $   s     / /s /
r$   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )PatchTSMixerBatchNormzP
    Compute batch normalization over the sequence length (time) dimension.
    configc                     t         |           t        j                  |j                  |j
                        | _        y )Neps)r   r   r   BatchNorm1dd_modelnorm_eps	batchnormr!   r3   r"   s     r#   r   zPatchTSMixerBatchNorm.__init__>   s(    FOOLr$   r'   c                 l    |j                  dd      }| j                  |      }|j                  dd      S )a  
        Parameters:
            inputs (`torch.Tensor` of shape `(batch_size, sequence_length, d_model)`):
                input for Batch norm calculation
        Returns:
            `torch.Tensor` of shape `(batch_size, sequence_length, d_model)`
        r      )	transposer:   )r!   r'   outputs      r#   r)   zPatchTSMixerBatchNorm.forwardB   s7     !!!Q''1%%r$   
r*   r+   r,   r-   r   r   torchTensorr)   r/   r0   s   @r#   r2   r2   9   s'    M1 M
&ell 
&r$   r2   c                   v     e Zd ZdZdef fdZededej                  fd       Z	de
j                  fdZ xZS )PatchTSMixerPositionalEncodingz'
    Class for positional encoding
    r3   c                     t         |           |j                  r| j                  |      | _        y t        j                  t        j                  |j                  |j                              | _        y r&   )r   r   use_positional_encoding_init_peposition_encr   	ParameterrA   zerosnum_patchesr8   r;   s     r#   r   z'PatchTSMixerPositionalEncoding.__init__T   sN    )) $f 5D "U[[9K9KV^^-\ ]Dr$   returnc                 `   | j                   dk(  rAt        j                  t        j                  | j
                  | j                        d      }|S | j                   dk(  r7t        j                  | j
                  | j                        }t        j                  d| j
                        j                  d      }t        j                  t        j                  d| j                  d      t        j                  d      | j                  z   z        }t        j                  ||z        |d d dd df<   t        j                  ||z        |d d dd df<   ||j                         z
  }||j!                         d	z  z  }t        j                  |d
      }|S t#        | j                    d      )NrandomTrequires_gradsincosr   r   r=   g     @
   FzN is not a valid positional encoder. Available types are 'random' and 'sincos'.)positional_encoding_typer   rI   rA   randnrK   r8   rJ   arange	unsqueezeexpmathlogsincosmeanstd
ValueError)r3   rH   positiondiv_terms       r#   rG   z'PatchTSMixerPositionalEncoding._init_pe\   sv    **h6<<F4F4F(WgklL  ,,8 ;;v'9'96>>JL||Av'9'9:DDQGHyya!CQXHY\b\j\jHjFk!klH$)IIh.A$BLADqD!$)IIh.A$BLADqD!',*;*;*==L'<+;+;+=+BCL<<EJL
  223  4B  C r$   patch_inputc                 $    || j                   z   }|S r&   )rH   )r!   ra   hidden_states      r#   r)   z&PatchTSMixerPositionalEncoding.forwardp   s    "T%6%66r$   )r*   r+   r,   r-   r   r   staticmethodr   rI   rG   rA   rB   r)   r/   r0   s   @r#   rD   rD   O   sN    ^1 ^ +   &5<< r$   rD   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )PatchTSMixerNormLayerzeNormalization block

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                     t         |           |j                  | _        d|j                  j                         v rt	        |      | _        y t        j                  |j                  |j                        | _        y )Nbatchr5   )
r   r   norm_mlplowerr2   normr   	LayerNormr8   r9   r;   s     r#   r   zPatchTSMixerNormLayer.__init__~   sT    foo++---f5DIV^^IDIr$   r'   c                 f   d| j                   j                         v rt        j                  ||j                  d   |j                  d   z  |j                  d   |j                  d   f      }| j                  |      }t        j                  ||j                        }|S | j                  |      }|S )a  
        Args:
            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
                Input to the normalization layer.
        Returns:
            `torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`
        rh   r   r   r=   r	   )ri   rj   rA   reshapeshaperk   )r!   r'   inputs_reshapeds      r#   r)   zPatchTSMixerNormLayer.forward   s     dmm))++#mmLLOfll1o5LLOLLOO #ii8O ]]?FLLAF
  YYv&Fr$   r@   r0   s   @r#   rf   rf   v   s'    J1 Jell r$   rf   c                   >     e Zd Z fdZdej
                  fdZ xZS )PatchTSMixerMLPc                 <   t         |           ||j                  z  }t        j                  ||      | _        t        j                  |j                        | _        t        j                  ||      | _	        t        j                  |j                        | _
        y r&   )r   r   expansion_factorr   r   fc1Dropoutdropoutdropout1fc2dropout2)r!   in_featuresout_featuresr3   
num_hiddenr"   s        r#   r   zPatchTSMixerMLP.__init__   sj     6#:#::
99[*5

6>>299Z6

6>>2r$   r'   c                     | j                  t        j                  j                  | j	                  |                  }| j                  |      }| j                  |      }|S )z
        Args:
            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
                Input to the MLP layer.
        Returns:
            `torch.Tensor` of the same shape as `inputs`
        )rx   r   
functionalgeluru   ry   rz   )r!   r'   s     r#   r)   zPatchTSMixerMLP.forward   sK     r}}11$((62BCD&!v&r$   )r*   r+   r,   r   rA   rB   r)   r/   r0   s   @r#   rr   rr      s    3ell r$   rr   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )$PatchTSMixerChannelFeatureMixerBlockzThis module mixes the features in the channel dimension.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                    t         |           t        |      | _        |j                  | _        t        |j                  |j                  |      | _        |j                  r't        |j                  |j                        | _	        y y Nr{   r|   r3   r   r   )
r   r   rf   rk   
gated_attnrr   num_input_channelsmlpr   gating_blockr;   s     r#   r   z-PatchTSMixerChannelFeatureMixerBlock.__init__   sv    )&1	 ++"1122
  :11F<U<U!D r$   r'   c                     |}| j                  |      }|j                  dddd      }| j                  r| j                  |      }| j	                  |      }|j                  dddd      }||z   }|S )z
        Args:
            inputs (`torch.Tensor` of shape `((batch_size, num_channels, num_patches, d_model))`):
                input to the MLP layer
        Returns:
            `torch.Tensor` of the same shape as `inputs`
        r   r	   r=   r   )rk   permuter   r   r   )r!   r'   residualouts       r#   r)   z,PatchTSMixerChannelFeatureMixerBlock.forward   ss     6"1a+??&&v.F&!1a+x
r$   r@   r0   s   @r#   r   r      s%    1  ell r$   r   c                       e Zd ZdZ	 	 	 	 	 	 ddededededededee   d	ee   f fd
Z	 e
dd       e
dd       e
dd      	 	 	 	 	 	 ddej                  deej                     deeej                        deej                     deej                     dedeej                     deej                  eej                     eeej                        f   fd                     Z xZS )PatchTSMixerAttentionz=Multi-headed attention from 'Attention Is All You Need' paper	embed_dim	num_headsrw   
is_decoderbias	is_causalr3   	layer_idxc	                    t         	|           || _        || _        || _        ||z  | _        || _        | j
                  |z  | j                  k7  rt        d| j                   d| d      | j
                  dz  | _        || _	        || _
        || _        |9| j                  r-t        j                  d| j                  j                   d       t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        t!        j"                  |||      | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: ).g      zInstantiating a decoder z without passing `layer_idx` is not recommended and will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.)r   )r   r   r   r   rw   head_dimr3   r^   scalingr   r   r   loggerwarning_oncer"   r*   r   r   k_projv_projq_projout_proj)
r!   r   r   rw   r   r   r   r3   r   r"   s
            r#   r   zPatchTSMixerAttention.__init__   s$    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$""*4>>+B+B*C D, , ii	94@ii	94@ii	94@		)YTBr$   key_value_statesz4.55)versionpast_key_valuecache_positionhidden_statesattention_masklayer_head_maskoutput_attentionsrL   c                    |j                         \  }}	}
| j                  |      j                  |d| j                  | j                        j                  dd      }|| j                  z  }| j                  |      j                  |d| j                  | j                        j                  dd      }| j                  |      j                  |d| j                  | j                        j                  dd      }|| j                  z  d| j                  f} |j                  | } |j                  | } |j                  | }|j                  d      }t        j                  ||j                  dd            }|j                         || j                  z  |	|fk7  r/t        d|| j                  z  |	|f d|j                                |_|ddddddd|j                  d   f   }|j                  || j                  |	|      |z   }|j                  || j                  z  |	|      }t        j                  j!                  |d      }||j                         | j                  fk7  r*t        d	| j                  f d|j                                |j                  dddd      |j                  || j                  |	|      z  }|j                  || j                  z  |	|      }|r?|j                  || j                  |	|      }|j                  || j                  z  |	|      }nd}t        j                  j#                  || j"                  | j$                  
      }t        j                  ||      }|j                         || j                  z  |	| j                  fk7  r9t        d|| j                  z  |	| j                  f d|j                                |j                  || j                  |	| j                        }|j                  dd      }|j                  ||	| j&                        }| j)                  |      }||dfS )z#Input shape: Batch x Time x Channelr   r   r=   z$Attention weights should be of size z	, but is Nr   z/Head mask for a single layer should be of size )ptrainingz `attn_output` should be of size )sizer   viewr   r   r>   r   r   r   rn   rA   bmmr^   ro   r   r   softmaxrw   r   r   r   )r!   r   r   r   r   r   r   r   bsztgt_len_query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                       r#   r)   zPatchTSMixerAttention.forward  s$    (,,.Wa {{=166sBPTP]P]^hhijlmn#dll2[[/44S"dnndmm\ffghjkl
{{=166sBPTP]P]^hhijlmnDNN*B>
+|++Z8'Z''4
+|++Z8//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %+Aq!5Kz7G7G7K5K,KLN',,S$..'7SVddL',,S4>>-A7GTL}},,\r,B&##%$..):: Et~~FWEX Y',,./1  +//2q!<|?P?PQTVZVdVdfmov?wwL',,S4>>-A7GTL
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
ii
L9#"6!OO2C$..4H'SWS`S`3a2b c$$&') 
 "&&sDNNGT]]S!++Aq1 "))#wGmmK01477r$   )        FTFNN)NNNNFN)r*   r+   r,   r-   r.   floatboolr   r   r   r   rA   rB   r   r)   r/   r0   s   @r#   r   r      s   G  /3#'%C%C %C 	%C
 %C %C %C +,%C C=%CP '8%v6%v6 488<1526"'15P8||P8 #5<<0P8 !u||!45	P8
 !.P8 "%,,/P8  P8 !.P8 
u||Xell3XeELL>Q5RR	SP8 7 7 9P8r$   r   c                   .     e Zd ZdZdef fdZd Z xZS )PatchMixerBlockzxThis module mixes the patch dimension.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                    t         |           t        |      | _        |j                  | _        |j
                  | _        t        |j                  |j                  |      | _        |j
                  r&t        |j                  |j                        | _
        |j                  rBt        |j                  |j                  |j                        | _        t        |      | _        y y )Nr   r   )r   r   rw   )r   r   rf   rk   	self_attnr   rr   rK   r   r   r   r   r8   self_attn_headsrw   self_attn_layer	norm_attnr;   s     r#   r   zPatchMixerBlock.__init__x  s    )&1	)) ++"**++
  :6CUCU`f`r`r sD#8 .. 00$D 
 36:DN r$   c                    |}| j                  |      }| j                  rR|j                  \  }}}}|j                  ||z  ||      }| j	                  |d      \  }}	}	|j                  ||||      }|j                  dd      }| j                  |      }| j                  r| j                  |      }|j                  dd      }| j                  r| j                  |z         }||z   }
|
S )z
        Args:
            hidden_state (`torch.Tensor`): Input tensor.

        Returns:
            `torch.Tensor`: Transformed tensor.
        F)r   r=   r	   )
rk   r   ro   rn   r   r>   r   r   r   r   )r!   rc   r   
batch_sizen_varsrK   r8   hidden_state_reshapedx_attnr   r   s              r#   r)   zPatchMixerBlock.forward  s      yy.>>7C7I7I4JW$0$8$8f9Lk[b$c!//0EY^/_LFAq^^JWMF $--a3xx-??,,\:L $--a3>>>>,*?@LX%
r$   r*   r+   r,   r-   r   r   r)   r/   r0   s   @r#   r   r   p  s    ;1 ;2!r$   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )FeatureMixerBlockzThis module mixes the hidden feature dimension.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    r3   c                    t         |           t        |      | _        |j                  | _        t        |j                  |j                  |      | _        |j                  r't        |j                  |j                        | _	        y y r   )
r   r   rf   rk   r   rr   r8   r   r   r   r;   s     r#   r   zFeatureMixerBlock.__init__  sn    )&1	 ++"
  :6>>\b\j\j kD r$   hiddenc                     |}| j                  |      }| j                  |      }| j                  r| j                  |      }||z   }|S )
        Args:
            hidden (`torch.Tensor` of shape `(batch_size, num_patches, d_model)`):
                Input tensor to the layer.

        Returns:
            `torch.Tensor`: Transformed tensor.
        )rk   r   r   r   )r!   r   r   r   s       r#   r)   zFeatureMixerBlock.forward  sK     6"&!??&&v.Fx
r$   r@   r0   s   @r#   r   r     s'    l1 l ell r$   r   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )PatchTSMixerLayerz
    The `PatchTSMixer` layer that does all three kinds of mixing.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    r3   c                     t         |           t        |      | _        t	        |      | _        |j                  | _        |j                  dk(  rt        |      | _        y y )Nr3   mix_channel)	r   r   r   patch_mixerr   feature_mixermoder   channel_feature_mixerr;   s     r#   r   zPatchTSMixerLayer.__init__  sR    *&9.f=KK	;;-')MU[)\D& (r$   r   c                     | j                   dk(  r| j                  |      }| j                  |      }| j                  |      }|S )r   r   )r   r   r   r   )r!   r   s     r#   r)   zPatchTSMixerLayer.forward  sE     99%//7F!!&)##F+r$   r@   r0   s   @r#   r   r     s'    	]1 	]ell r$   r   c                   6     e Zd ZdZdef fdZddefdZ xZS )PatchTSMixerBlockzThe main computing framework of the `PatchTSMixer` model.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                     t         |           |j                  }t        j                  t        |      D cg c]  }t        |       c}      | _        y c c}w Nr   )r   r   
num_layersr   
ModuleListranger   mixers)r!   r3   r   r   r"   s       r#   r   zPatchTSMixerBlock.__init__  sC    &&
mmuU_O`$a!%6f%E$ab$as   Aoutput_hidden_statesc                 x    g }|}| j                   D ]  } ||      }|s|j                  |         |r||fS |dfS )as  
        Args:
            hidden_state (`torch.Tensor`): The input tensor.
            output_hidden_states (`bool`, *optional*, defaults to False.):
                Whether to output the hidden states as well.

        Returns:
            `torch.Tensor`: The embedding. `list`: List of all hidden states if `output_hidden_states` is set to
            `True`.
        N)r   append)r!   rc   r   all_hidden_states	embeddingmods         r#   r)   zPatchTSMixerBlock.forward  sW      	;; 	4CII#!((3	4
  ///d?"r$   F)	r*   r+   r,   r-   r   r   r   r)   r/   r0   s   @r#   r   r     s#    c1 c#$ #r$   r   c                   0     e Zd ZdZddef fdZd Z xZS )PatchTSMixerForPredictionHeadzqPrediction Head for Forecasting

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                    t         |           |j                  | _        | j                  | j                  j                          t	        j
                  |j                        | _        |=t	        j                  |j                  |j                  z  |j                        | _        n-|j                  |j                  |j                  z        | _        t	        j                  d      | _        y )Nr   	start_dim)r   r   prediction_channel_indicessortr   rv   head_dropoutdropout_layerr   rK   r8   prediction_lengthbase_forecast_blockget_parameter_projectionFlattenflatten)r!   r3   distribution_outputr"   s      r#   r   z&PatchTSMixerForPredictionHead.__init__9  s    *0*K*K'**6++002ZZ(;(;<&')yy&2D2Dv~~2UX^XpXp'qD$':'S'S""V^^3(D$ zzB/r$   c                 ^     j                  |      } j                  |      } j                  |      }t        |t              rt	        d |D              }n|j                  dd      } j                  7t        |t              rt	         fd|D              }|S |d j                  f   }|S )ar  

        Args:
            hidden_features (`torch.Tensor` of shape `(batch_size, num_patch, d_model)` in `flatten` mode
                or `(batch_size, n_vars, num_patch, d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
                features.

        Returns:
            `torch.Tensor` of shape `(batch_size, prediction_length, nvars)`.

        c              3   @   K   | ]  }|j                  d d        yw)r   r   N)r>   ).0zs     r#   	<genexpr>z8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>\  s     CQQ[[R0Cs   r   r   c              3   @   K   | ]  }|d j                   f     yw).N)r   )r   r   r!   s     r#   r   z8PatchTSMixerForPredictionHead.forward.<locals>.<genexpr>b  s!      [Q3(G(G#G!H [s   .)r   r   r   
isinstancetupler>   r   r!   hidden_featuresforecasts   `  r#   r)   z%PatchTSMixerForPredictionHead.forwardK  s     ,,7,,_=++O<h&C(CCH))"b1H**6(E*  [RZ [[  $C)H)H$HIr$   r&   r   r0   s   @r#   r   r   1  s    01 0$r$   r   c                   0     e Zd ZdZddef fdZd Z xZS )PatchTSMixerLinearHeadzLinear head for Classification and Regression.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                 \   t         |           |j                  | _        |j                  | _        |j                  |j                  }nd}|| _        |@t        j                  |j                  |j                  z  |z  |j                        | _        n0|j                  |j                  |j                  z  |z        | _        |j                  t        j                  d      | _        nt        j                  d      | _        t        j                  |j                         | _        y )Nr   r   r   )r   r   head_aggregationoutput_rangerK   r   r   r   r8   r   num_targets
projectionr   r   r   rv   r   rw   )r!   r3   r   
mul_factorr"   s       r#   r   zPatchTSMixerLinearHead.__init__q  s     & 7 7"//""*++JJ#6 & ii!:!::ZG""DO
 2JJ!:!::ZGDO ""*::3DL::3DLzz&"5"56r$   c                 &   |j                  dd      }| j                  dk(  r|d   }nM| j                  dk(  r|j                  d      j                  }n!| j                  dk(  r|j	                  d      }| j
                  r| j                  |      }| j                  |      }| j                  |      }| j                  Q| j                  Et        j                  |      | j                  d   | j                  d	   z
  z  | j                  d	   z   }|S )
ai  
        Args:
            hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
                or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
                features.

        Returns:
            `torch.Tensor` of shape `(batch_size x num_targets)`.
        r   r   use_last).r   max_poolr   avg_poolr   r   )r>   r
  maxvaluesr\   r   rw   r  r   r  rA   sigmoid)r!   r  s     r#   r)   zPatchTSMixerLinearHead.forward  s
    *33B;  J.-g6O""j0-11b19@@O""j0-22r2:O<<"ll?;O,,7///:$$,43D3D3Po.$2C2CA2FIZIZ[\I]2]^aeararstauu  r$   r&   r   r0   s   @r#   r  r  i  s    71 78 r$   r  c                   "    e Zd ZeZdZdZdZd Zy)PatchTSMixerPreTrainedModelmodelpast_valuesFc                    t        |t              rG| j                  j                  dk(  r-t        j
                  j                  |j                  dd       yyt        |t        j                  t        j                  f      rJ|j                  j                  j                          |j                  j                  j                  d       yt        |t              r^|j                   j                  j                  j                          |j                   j                  j                  j                  d       yt        |t        j"                        rm|j                  j                  j                  d| j                  j$                         |j                  %|j                  j                  j                          yyy)zInitialize weightsrN   r   g?)r\   r]         ?N)r  rD   r3   rS   r   initnormal_rH   rl   r7   r   datazero_weightfill_r2   r:   r   init_std)r!   modules     r#   _init_weightsz)PatchTSMixerPreTrainedModel._init_weights  s.   f<={{33x? 3 3#3G @r~~ >?KK""$MM$$S) 56!!&&,,.##((..s3		*MM&&CT[[5I5I&J{{&  &&( ' +r$   N)	r*   r+   r,   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointingr$   r$   r#   r  r    s     &L#O&+#)r$   r  c                   .     e Zd ZdZdef fdZd Z xZS )PatchTSMixerPretrainHeadzcPretraining head.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                     t         |           t        j                  |j                        | _        t        j                  |j                  |j                        | _	        y r&   )
r   r   r   rv   r   r   r   r8   patch_lengthbase_pt_blockr;   s     r#   r   z!PatchTSMixerPretrainHead.__init__  sB    ZZ(;(;<YYv~~v7J7JKr$   c                 J    | j                  |      }| j                  |      }|S )a  
        Args:
            hidden_features (`torch.Tensor` of shape `(batch_size x num_patch x d_model)` in `flatten` mode
                or `(batch_size x n_vars x num_patch x d_model)` in `common_channel`/`mix_channel` mode.): Input hidden
                features.

        Returns:
            `torch.Tensor` of shape `(batch_size x n_vars x num_patch x patch_length)`.
        )r   r.  r  s      r#   r)   z PatchTSMixerPretrainHead.forward  s)     ,,_=%%o6r$   r   r0   s   @r#   r+  r+    s    L1 Lr$   r+  r'   
mask_ratiounmasked_channel_indiceschannel_consistent_masking
mask_valuec                    |dk  s|dk\  rt        d| d      | j                  \  }}}}| j                  }	t        |d|z
  z        }
|r-t	        j
                  |d||	      }|j                  d|d      }nt	        j
                  ||||	      }t	        j                  ||||	      }d|ddddd|
f<   t	        j                  |d      }t	        j                  |d      }t	        j                  |d|	      }|j                  d      j                  ddd|      }|d|dd|ddddf<   | j                  |j                         |      }||d
   fS )a  random_masking: Mask the input considering the control variables.

    Args:
        inputs (`torch.Tensor` of shape `(batch_size, num_channels, sequence_length, num_features)`):
            The input tensor to mask.
        mask_ratio (`float`):
            Masking ratio applied to mask the input data during random pretraining. It is the number between 0 and 1.
        unmasked_channel_indices (list, *optional*):
            Indices of channels that will not be masked.
        channel_consistent_masking (bool, *optional*, defaults to `False`):
            When true, masking will be same across all channels of a timeseries. Otherwise, masking positions will vary
            across channels.
        mask_value (int, *optional*, defaults to 0):
            Define the value of masked patches for pretraining.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as input Tensor and mask tensor of shape [bs x c x
        n]
    r   r   zMask ratio z has to be between 0 and 1.deviceNr   r   )r   index.r   )r^   ro   r6  r.   rA   randrepeatonesargsortgatherrV   masked_fillr   )r'   r0  r1  r2  r3  r   num_channelssequence_lengthnum_featuresr6  len_keepnoisemaskids_shuffleids_restoreinputs_masks                   r#   random_maskingrH    sQ   4 A~q;zl2MNOO>Dll;Jo|]]F?a*n56H!

:q/&IQa0 

:|_VT ::j,ODDAyy --2.K--4K<<"K8D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$r$   num_forecast_mask_patchesc                 P   t        |t              r|g}|D cg c]  }d }}| j                  \  }}}}	t        j                  |||| j
                        }
g }d}t        |      }t        ||      D ]H  \  }}|dk  s||k\  rt        d| d      t        ||z  |z        }|j                  |||g       ||z  }J t        |d       }||k  r|d   d   ||z
  z   |d   d<   n||kD  r|d	   d   ||z
  z   |d	   d<   d}|D ]  \  }}}||z   }d|
||d
d
| d
f<   |} t        j                  |
j                  d         }|
|   }
|
j                  d	      j                  ddd|	      }
|d|
d
d
|d
d
d
d
f<   | j                  |
j                         |      }||
d   fS c c}w )a  Forecast masking that masks the last K patches where K is from the num_forecast_mask_patches.
    If num_forecast_mask_patches is a list, samples in the batch will be randomly masked by numbers defined in the list.

    Parameters:
        inputs (`torch.Tensor`):
            Input of shape `(bs, num_channels, num_patch, patch_length)`
        num_forecast_mask_patches (`list`):
            Number of patches to be masked at the end of each batch sample. e.g. 4 or [3, 5].
        unmasked_channel_indices (`list`, *optional*):
            Indices of channels that are not masked.
        mask_value (`int`, *optional*, defaults to 0):
            Values in the masked patches will be filled by `mask_value`.

    Returns:
        `tuple(torch.Tensor)`: inputs_mask, masked input, same shape as inputs Tensor and Mask tensor of shape `(bs,
        num_channels , num_patch)` or `(bs, tsg1, tsg2, num_channels, num_patch)`
    r   r5  r   znum_forecast_mask_patches z6 should be greater than 0 and less than total patches.c                     | d   S Nr=   r)  )xs    r#   <lambda>z"forecast_masking.<locals>.<lambda>P  s
    !A$ r$   )keyr=   r   Nr8  )r  r.   ro   rA   rJ   r6  sumzipr^   r   sortedrandpermrV   r:  r>  r   )r'   rI  r1  r3  r   forecast_mask_ratiosr   r?  r@  rA  rD  t_listtotal_lengthtotal_ratior-  ratiotemp_lenbatch1	patch_lenbatch2permrG  s                         r#   forecast_maskingr^  $  s   0 +S1%>$?!'@A!AAA>Dll;Jo|;;z<WDFL*+K"#<>RS !e1 ?,\N:pq  zE)K78|UH56 ! F/Fj ay|zL'@Aq	!	
	"r
1
)BCr
1F"( 	1h("./VF]A	z{*+
 >>$**Q-(D:D>>"$$Q1l;D+23Q(!Q./$$TYY[*=KV$$O Bs   	F#c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )PatchTSMixerPatchifyz
    A class to patchify the time series sequence into different patches

    Returns:
        `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
    r3   c                    t         |           |j                  | _        |j                  | _        |j
                  | _        | j                  | j                  k  r&t        d| j                   d| j                   d      t        | j                  | j                        | j                  z
  | j
                  z  dz   | _        | j                  | j
                  | j                  dz
  z  z   }| j                  |z
  | _	        y )NzSequence length (z+) has to be greater than the patch length ()r   )
r   r   context_lengthr@  r-  patch_strider^   r  rK   sequence_start)r!   r3   new_sequence_lengthr"   s      r#   r   zPatchTSMixerPatchify.__init__q  s    %44"//"//4#4#44#D$8$8#99deievevdwwxy 
   4 4d6G6GH4K\K\\aeararruvv"//$2C2CtGWGWZ[G[2\\"225HHr$   r  c                 :   |j                   d   }|| j                  k7  rt        d| d| j                   d      |dd| j                  dddf   }|j	                  d| j
                  | j                        }|j                  dd      j                         }|S )a!  
        Parameters:
            past_values (`torch.Tensor` of shape `(batch_size, sequence_length, num_channels)`, *required*):
                Input for patchification

        Returns:
            `torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`
        r   zInput sequence length (z%) doesn't match model configuration (r   N)	dimensionr   stepr	  )	ro   r@  r^   re  unfoldr-  rd  r>   
contiguous)r!   r  r@  r?   s       r#   r)   zPatchTSMixerPatchify.forward  s     &++B/d222)/)::_`d`t`t_uuwx  Q 3 3 5q89$2C2C$J[J[\!!"b)446r$   r@   r0   s   @r#   r`  r`  i  s'    I1 I"5<< r$   r`  c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )PatchTSMixerMaskinga  
    Class to perform random or forecast masking.

    Parameters:
        config (`PatchTSMixerConfig`): model config
    Returns:
        x_mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
            Masked patched input
        mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
            Bool tensor indicating True on masked points
    r3   c                 <   t         |           |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        | j                  t        | j                        | _        y y r&   )	r   r   random_mask_ratior2  	mask_typerI  r1  r3  rR  r;   s     r#   r   zPatchTSMixerMasking.__init__  s    !'!9!9*0*K*K')))/)I)I&(.(G(G% ++((4,243P3P,QD) 5r$   ra   c                 r   | j                   dk(  r<t        || j                  | j                  | j                  | j
                        \  }}nY| j                   dk(  r1t        || j                  | j                  | j
                        \  }}nt        d| j                    d      |j                         }||fS )a  
        Parameters:
            patch_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`, *required*):
                Patch input

        Return:
            masked_input (`torch.Tensor` of shape `(batch_size, num_channels, num_patches, patch_length)`)
                Masked patched input
            mask (`torch.Tensor` of shape `(batch_size, num_channels, num_patches)`)
                Bool tensor indicating True on masked points

        rN   )r'   r0  r1  r2  r3  r  )r'   rI  r1  r3  zInvalid mask type .)
rp  rH  ro  r1  r2  r3  r^  rI  r^   r   )r!   ra   masked_inputrD  s       r#   r)   zPatchTSMixerMasking.forward  s     >>X%!/"11)-)F)F+/+J+J??"L$ ^^z)!1"*.*H*H)-)F)F??	"L$ 1$..1ACDD yy{T!!r$   r@   r0   s   @r#   rm  rm    s'    
	R1 	R!"5<< !"r$   rm  c            	            e Zd ZdZdef fdZdej                  dej                  deej                  ej                  ej                  f   fdZ	 xZ
S )PatchTSMixerStdScalerz
    Standardize features by calculating the mean and scaling along the first dimension, and then normalizes it by
    subtracting from the mean and dividing by the standard deviation.
    r3   c                     t         |           t        |d      r|j                  nd| _        t        |d      r|j
                  nd| _        t        |d      r|j                  | _        y d| _        y )Nscaling_dimr   keepdimTminimum_scalegh㈵>)r   r   hasattrrw  r   rx  ry  r;   s     r#   r   zPatchTSMixerStdScaler.__init__  s[    )0)G6%%Q)0)Cv~~5<V_5UV11[_r$   r  observed_indicatorrL   c                    |j                  | j                  | j                        }|j                  d      }||z  j                  | j                  | j                        |z  }||z
  |z  dz  j                  | j                  | j                        |z  }t	        j
                  || j                  z         }||z
  |z  ||fS )C  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
            observed_indicator (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Calculating the scale on the observed indicator.
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        rx  r  r=   )rP  r   rx  	clamp_minrA   sqrtry  )r!   r  r{  denominatorlocvariancescales          r#   r)   zPatchTSMixerStdScaler.forward  s     ),,TXXt||,L!++C0((--dhh-MP[[Sj$661<AA$((TXT`T`Aadoo

8d&8&889s
e#S%//r$   r*   r+   r,   r-   r   r   rA   rB   r   r)   r/   r0   s   @r#   ru  ru    sT    
`1 `0LL06;ll0	u||U\\5<<7	80r$   ru  c            	            e Zd ZdZdef fdZdej                  dej                  deej                  ej                  ej                  f   fdZ	 xZ
S )PatchTSMixerMeanScalerz
    Computes a scaling factor as the weighted average absolute value along the first dimension, and scales the data
    accordingly.
    r3   c                 &   t         |           t        |d      r|j                  nd| _        t        |d      r|j
                  nd| _        t        |d      r|j                  nd| _        t        |d      r|j                  | _        y d | _        y )Nrw  r   rx  Try  绽|=default_scale)r   r   rz  rw  r   rx  ry  r  r;   s     r#   r   zPatchTSMixerMeanScaler.__init__  su    )0)G6%%Q)0)Cv~~5<V_5UV11[`5<V_5UV11[_r$   r  r{  rL   c                    ||z  j                         j                  | j                  d      }|j                  | j                  d      }|t        j                  |d      z  }| j
                  Q|j                  d      }t        j                  |j                  d      d      }t        j                  ||z        }n"| j
                  t        j                  |      z  }t        j                  |dkD  ||      }t        j                  || j                        }||z  }	| j                  s|j                  | j                        }|	t        j                  |      |fS )r}  Tr~  r   minr   r   )absrP  r   rA   clampr  squeeze	ones_likewherery  rx  
zeros_like)
r!   r  r{  ts_sumnum_observedr  	batch_sumbatch_observationsr  scaled_datas
             r#   r)   zPatchTSMixerMeanScaler.forward  s.    ++00266txx6N)--dhh-E\q99 %

q
)I!&\-=-=a-@a!H!MM)6H*HIM ..1GGM L1,e]C Et'9'9:Ul||MMdhhM/EE,,U3U::r$   r  r0   s   @r#   r  r    sT    
`1 `&;LL&;6;ll&;	u||U\\5<<7	8&;r$   r  c            
            e Zd ZdZdef fdZ	 ddej                  deej                     de	ej                  ej                  ej                  f   fdZ
 xZS )	PatchTSMixerNOPScalerz|
    Assigns a scaling factor equal to 1 along the first dimension, and therefore applies no scaling to the input data.
    r3   c                     t         |           t        |d      r|j                  nd| _        t        |d      r|j
                  | _        y d| _        y )Nrw  r   rx  T)r   r   rz  rw  r   rx  r;   s     r#   r   zPatchTSMixerNOPScaler.__init__7  s@    )0)G6%%Q)0)Cv~~r$   r  r{  rL   c                     t        j                  |d      j                  | j                  | j                        }t        j
                  |d      j                  | j                  | j                        }|||fS )a  
        Parameters:
            data (`torch.Tensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                input for Batch norm calculation
        Returns:
            tuple of `torch.Tensor` of shapes
                (`(batch_size, sequence_length, num_input_channels)`,`(batch_size, 1, num_input_channels)`,
                `(batch_size, 1, num_input_channels)`)
        FrO   )r   rx  )rA   r  r\   r   rx  r  )r!   r  r{  r  r  s        r#   r)   zPatchTSMixerNOPScaler.forward<  si     E:??DHHVZVbVb?ct59>>488UYUaUa>bS%r$   r&   )r*   r+   r,   r-   r   r   rA   rB   r   r   r)   r/   r0   s   @r#   r  r  2  s`    N1 N PT LL 6>u||6L 	u||U\\5<<7	8 r$   r  c                   h    e Zd ZU dZdZeej                     ed<   dZ	ee
ej                        ed<   y)PatchTSMixerEncoderOutputa  
    Base class for `PatchTSMixerEncoderOutput`, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, d_model)`):
            Hidden-state at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Hidden-states of the model at the output of each layer.
    Nlast_hidden_stater   )r*   r+   r,   r-   r  r   rA   FloatTensor__annotations__r   r   r)  r$   r#   r  r  M  s9     6:x 1 1298<M8E%"3"345<r$   r  c                   z     e Zd ZdZdef fdZe	 	 d	dej                  de	e
   de	e
   deeef   fd       Z xZS )
PatchTSMixerEncoderz
    Encoder for PatchTSMixer which inputs patched time-series and outputs patched embeddings.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.
    r3   c                 J   t         |   |       |j                  | _        t        j                  |j
                  |j                        | _        |j                  rt        |      | _
        nd | _
        t        |      | _        |j                  r| j                          y y r   )r   r   use_return_dictr   r   r-  r8   patcherrF   rD   positional_encoderr   mlp_mixer_encoder	post_initr;   s     r#   r   zPatchTSMixerEncoder.__init__f  s     %55yy!4!4fnnE))&DF&SD#&*D#!2&!A NN r$   r  r   return_dictrL   c                     ||n| j                   }| j                  |      }| j                  | j                  |      }| j                  ||      \  }}|st	        d ||fD              S t        ||      S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to
            predict the masked portion. For a forecasting task, this denotes the history/past time series values.
            Similarly, for classification or regression tasks, it denotes the appropriate context values of the
            time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series,
            it is greater than 1.

        Returns:
            `torch.FloatTensor` of shape `(batch_size, n_vars, num_patches, d_model)`
        )r   c              3       K   | ]  }|  y wr&   r)  r   vs     r#   r   z.PatchTSMixerEncoder.forward.<locals>.<genexpr>  s          )r  r   )r  r  r  r  r  r  )r!   r  r   r  patchesr  r   s          r#   r)   zPatchTSMixerEncoder.forwardv  s    * &1%<k$BVBV ,,{+ "".--g6G+/+A+A'`t+A+u(=  &!   );L\ijjr$   )FN)r*   r+   r,   r-   r   r   r   rA   rB   r   r   r   r   r  r)   r/   r0   s   @r#   r  r  ]  st    1    05&*	(k\\(k 'tn(k d^	(k
 
u//	0(k (kr$   r  c                      e Zd ZU dZdZeej                     ed<   dZ	ee
ej                        ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   y)	PatchTSMixerModelOutputa  
    Base class for model's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor`  of shape `(batch_size, num_channels, num_patches, d_model)`):
            Hidden-state at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Hidden-states of the model at the output of each layer.
        patch_input (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches, patch_length)`):
            Patched input data to the model.
        mask: (`torch.FloatTensor` of shape `(batch_size, num_channels, num_patches)`,*optional*):
            Bool Tensor indicating True in masked patches and False otherwise.
        loc: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*):
            Gives the mean of the context window per channel. Used for revin denorm outside the model, if revin
            enabled.
        scale: (`torch.FloatTensor` of shape `(batch_size, 1, num_channels)`,*optional*):
            Gives the std dev of the context window per channel. Used for revin denorm outside the model, if revin
            enabled.
    Nr  r   ra   rD  r  r  )r*   r+   r,   r-   r  r   rA   r  r  r   r   ra   rD  r  r  r)  r$   r#   r  r    s    ( 6:x 1 1298<M8E%"3"345</3K%++,3(,D(5$$
%,'+C%##	$+)-E8E%%&-r$   r  z=
    The PatchTSMixer Model for time-series forecasting.
    )custom_introc                        e Zd Zd
dedef fdZe	 	 	 ddej                  de	ej                     de	e   de	e   de
f
d	       Z xZS )PatchTSMixerModelr3   
mask_inputc                    t         |   |       |j                  | _        t        |      | _        t        |      | _        |du rt        |      | _        nd| _        |j                  dk(  rt        |      | _        n>|j                  dk(  s|j                  du rt        |      | _        nt        |      | _        |j                  r| j                          yy)z
        mask_input (bool, *optional*, defaults to `False`):
            Whether to mask the input using the [`PatchTSMixerMasking`] module.
        TNr\   r]   )r   r   r  r  encoderr`  patchingrm  maskingr   r  scalerru  r  r  )r!   r3   r  r"   s      r#   r   zPatchTSMixerModel.__init__  s    
 	 %55*62,V4.v6DLDL>>V#08DK^^u$$(>/7DK/7DK NN r$   r  observed_maskr   r  rL   c           	         ||n| j                   }d}|t        j                  |      }| j                  ||      \  }}}| j	                  |      }	|	}
| j
                  | j                  |	      \  }
}| j                  |
||      }t        |t              rt        | }|s,t        d |j                  |j                  |	|||fD              S t        |j                  |j                  |	|||      S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:
            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        Nr   r  c              3       K   | ]  }|  y wr&   r)  r  s     r#   r   z,PatchTSMixerModel.forward.<locals>.<genexpr>        
 
r  )r  r   ra   rD  r  r  )r  rA   r  r  r  r  r  r  r  r  r  r   r  )r!   r  r  r   r  rD  scaled_past_valuesr  r  	patched_x	enc_inputencoder_outputs               r#   r)   zPatchTSMixerModel.forward  s   , &1%<k$BVBV !OOK8M)-[-)P&CMM"45		<<#"ll95OIt !5# & 
 ne,6GN 
 #44"00
 
 
 ',>>(66!
 	
r$   r   )NFN)r*   r+   r,   r   r   r   r   rA   rB   r   r  r)   r/   r0   s   @r#   r  r    s    1 t 6  15/4&*A
\\A
  -A
 'tn	A

 d^A
 
!A
 A
r$   r  c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   y) PatchTSMixerForPreTrainingOutputa  
    Output type of [`PatchTSMixerForPreTrainingOutput`].

    Args:
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, patch_length)`):
            Prediction output from the pretrain head.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Hidden-states of the model at the output of each layer.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
            Backbone embeddings before passing through the head.
        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
            Total loss
    Nlossprediction_outputsr  r   r*   r+   r,   r-   r  r   rA   r  r  r  r  r   r   r)  r$   r#   r  r  &  d     )-D(5$$
%,6:!2!23:59x 1 1298<M8E%"3"345<r$   r  c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  de	ej                     de	e
   de
de	e
   d	efd
       Z xZS )PatchTSMixerForPretrainingz
    `PatchTSMixer` for mask pretraining.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    Returns:
        `None`.
    r3   c                     t         |   |       t        |d      | _        t	        |      | _        |j                  | _        |j                  | _        |j                  r| j                          y y )NT)r  r   )	r   r   r  r  r+  headmasked_lossr  r  r;   s     r#   r   z#PatchTSMixerForPretraining.__init__H  s`     &v$?
,F;	!--%55 NN r$   r  r  r   return_lossr  rL   c                    ||n| j                   }| j                  du r!t        j                  j	                  d      }n t        j                  j	                  d      }| j                  ||||      }t        |t              rt        | }| j                  |j                        }|du r |||j                        }	nd}	| j                  du rM|	K|	j                  d      |j                  z  j                         |j                  j                         d	z   z  }	|s*t        d
 |	||j                  |j                  fD              S t!        |	||j                  |j                        S )aT  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:
            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        return_loss (`bool`,  *optional*):
            Whether to return the loss in the `forward` call.
        NTnone	reductionr\   r  r   r  r   r   r  c              3       K   | ]  }|  y wr&   r)  r  s     r#   r   z5PatchTSMixerForPretraining.forward.<locals>.<genexpr>         r  r  r  r  r   )r  r  rA   r   MSELossr  r  r  r  r  r  ra   r\   rD  rP  r   r  )
r!   r  r  r   r  r  r  model_outputx_hatloss_vals
             r#   r)   z"PatchTSMixerForPretraining.forwardS  so   2 &1%<k$BVBVt#88##f#5D88##f#5D zz'!5#	 " 
 lE*2LAL		,889$E<#;#;<HH t#(< "-0A0AAFFHLL]L]LaLaLcfkLklH   22 ..	   0$*<<&44	
 	
r$   NFTN)r*   r+   r,   r-   r   r   r   rA   rB   r   r   r  r)   r/   r0   s   @r#   r  r  <  s    		1 	  15/4 &*D
\\D
  -D
 'tn	D

 D
 d^D
 
*D
 D
r$   r  c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   dZeej                     ed<   dZeej                     ed<   y)	PatchTSMixerForPredictionOutputa  
    Output type of [`PatchTSMixerForPredictionOutput`].

    Args:
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, prediction_length, num_input_channels)`):
            Prediction output from the forecast head.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
            Backbone embeddings before passing through the head.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
            Total loss.
        loc (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
            Input mean
        scale (`torch.FloatTensor`, *optional* of shape `(batch_size, 1, num_input_channels)`):
            Input std dev

    Nr  r  r  r   r  r  )r*   r+   r,   r-   r  r   rA   r  r  r  r  r   r   r  r  r)  r$   r#   r  r    s    & )-D(5$$
%,6:!2!23:59x 1 1298<M8E%"3"345<'+C%##	$+)-E8E%%&-r$   r  c                   :    e Zd ZU dZdZeej                     ed<   y)"SamplePatchTSMixerPredictionOutputa9  
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.

    Args:
        sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, prediction_length, number_channels)`):
            Sampled values from the chosen distribution.
    N	sequences	r*   r+   r,   r-   r  r   rA   r  r  r)  r$   r#   r  r         .2Ix))*1r$   r  c                   :    e Zd ZU dZdZeej                     ed<   y)"SamplePatchTSMixerRegressionOutputa$  
    Base class for time series model's predictions outputs that contains the sampled values from the chosen
    distribution.

    Args:
        sequences (`torch.FloatTensor` of shape `(batch_size, num_samples, num_targets)`
                Sampled values from the chosen distribution.
    Nr  r  r)  r$   r#   r  r    r  r$   r  inputtargetrL   c                 &    | j                  |       S )zc
    Computes the negative log likelihood loss from input distribution with respect to target.
    )log_prob)r  r  s     r#   nllr    s     NN6"""r$   input_tensorweightsc                 P   |t        j                  |dk7  | |z  t        j                  |             }t        j                  |r|j	                  |      n|j	                         d      }|r|j	                  |      |z  S |j	                         |z  S | j                  |      S )aj  
    Computes the weighted average of a given tensor across a given `dim`, masking values associated with weight zero,
    meaning instead of `nan * 0 = nan` you will get `0 * 0 = 0`.

    Args:
        input_tensor (`torch.FloatTensor`):
            Input tensor, of which the average must be computed.
        weights (`torch.FloatTensor`, *optional*):
            Weights tensor, of the same shape as `input_tensor`.
        dim (`int`, *optional*):
            The dim along which to average `input_tensor`.

    Returns:
        `torch.FloatTensor`: The tensor with values averaged along the specified `dim`.
    r   r   r  r  )rA   r  r  r  rP  r\   )r  r  r   weighted_tensorsum_weightss        r#   weighted_averager    s      ++glL74JEL\L\]iLjkkk#'++#+"67;;=VYZ03###,R]]]9L9L9NR]]]  S ))r$   c                        e Zd ZdZdef fdZe	 	 	 	 	 ddej                  de	ej                     de	ej                     de	e
   de
d	e	e
   d
efd       Z	 ddej                  de	ej                     d
efdZ xZS )PatchTSMixerForPredictionz
    `PatchTSMixer` for forecasting application.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    Returns:
        `None`.
    r3   c                 2   t         |   |       |j                  | _        |j                  | _        |j                  | _        |j
                  | _        |j                  dk(  rd | _        nc|j                  }t        t        t        d}|j                  |j                  d       }| ||      | _        nt        d|j                         t        |      | _        t        || j                        | _        |j"                  r| j#                          y y )Nmse	student_tnormalnegative_binomialr   Unknown distribution output r3   r   )r   r   r  r  r   num_parallel_samplesr   r   r   r   r
   getr^   r  r  r   r  r  )r!   r3   r   distribution_output_mapoutput_classr"   s        r#   r   z"PatchTSMixerForPrediction.__init__  s     KK	%55*0*K*K'$*$?$?!;;%'+D$**C+&%;'#
 366v7Q7QSWXL'+7C+@( #?@Z@Z?[!\]]&v.
1 $ 8 8
	 NN r$   r  r  future_valuesr   r  r  rL   c           	         | j                   dk(  rt        j                  d      }n!| j                   dk(  rt        }nt	        d      ||n| j
                  }| j                  ||||      }t        |t              rt        | }| j                  |j                        }	d}
| j                  | j                  r|| j                  j                  |	|j                  d| j                  f   |j                   d| j                  f   	      }||d
u r |||d| j                  f         }
t#        |
      }
n|	|j                   d| j                  f   z  |j                  d| j                  f   z   }	||d
u r ||	|d| j                  f         }
n| j                  rM| j                  j                  |	|j                  |j                   	      }|D|d
u r@ |||      }
t#        |
      }
n+|	|j                   z  |j                  z   }	||d
u r	 ||	|      }
| j                  7|j                  d| j                  f   }|j                   d| j                  f   }n|j                  }|j                   }|s,t        d |
|	|j                  |j$                  ||fD              S t'        |
|	|j                  |j$                  ||      S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        observed_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
            Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
            in `[0, 1]`:
            - 1 for values that are **observed**,
            - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).
        future_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,:
            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
            Target values of the time series, that serve as labels for the model. The `future_values` is what the
            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
            required for a pretraining task.

            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
            pass the target data with all channels, as channel Filtering for both prediction and target will be
            manually applied before the loss computation.
        return_loss (`bool`,  *optional*):
            Whether to return the loss in the `forward` call.
        r  r\   r  r  2Invalid loss function: Allowed values: mse and nllNr  .r  r  Tc              3       K   | ]  }|  y wr&   r)  r  s     r#   r   z4PatchTSMixerForPrediction.forward.<locals>.<genexpr>  r  r  )r  r  r  r   r  r  )r  r   r  r  r^   r  r  r  r  r  r  r  r   r   distributionr  r  r  r   r  )r!   r  r  r   r   r  r  r  r  y_hatr  r  r  r  s                 r#   r)   z!PatchTSMixerForPrediction.forward!  s   H 99::/DYY%DQRR%0%<k$BVBV zz'!5#	 " 
 lE*2LAL 		,889**6''#77DD$((d.M.M)MN&,,S$2Q2Q-QR  E  
 !,1D#$%c4+J+J&JK H
  09H L..sD4S4S/STT"&&sD,K,K'KLM  !,1D#E=d>]>]9]+^_H''#77DD|//|7I7I  E   !,1D#L-@H/9H 2 22\5E5EE ,1D#E=9H**6""3(G(G#GHC &&sD,K,K'KLE""C &&E 
  22 ..
 
 
 /$*<<&44
 	
r$   c                 F   | j                   } | |d|d      }| j                  j                  |j                  |j                  |j
                        }t        |      D cg c]  }|j                          }}t        j                  |d      }t        |      S c c}w )a  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Args:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the future.

            observed_mask (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_input_channels)`, *optional*):
                Boolean mask to indicate which `past_values` were observed and which were missing. Mask values selected
                in `[0, 1]`:

                - 1 for values that are **observed**,
                - 0 for values that are **missing** (i.e. NaNs that were replaced by zeros).

        Return:
            [`SamplePatchTSMixerPredictionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
            number of samples, prediction_length, num_input_channels)`.
        NF)r  r   r  r   r  r   r   r  )r  r   r  r  r  r  r   samplerA   stackr  )r!   r  r  r  outputsr  r   sampless           r#   generatez"PatchTSMixerForPrediction.generate  s    0  $88 #'!&	
 //<<&&GKKw}} = 

 388L2MNQ<&&(NN ++g1-1GDD	 Os   "B)NNFTNr&   )r*   r+   r,   r-   r   r   r   rA   rB   r   r   r  r)   r  r  r/   r0   s   @r#   r  r    s    	1 @  1504/4 &*w
\\w
  -w
  -	w

 'tnw
 w
 d^w
 
)w
 w
x 15-E\\-E  --E 
,	-Er$   r  c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   y)-PatchTSMixerForTimeSeriesClassificationOutputa  
    Output type of [`PatchTSMixerForTimeSeriesClassificationOutput`].

    Args:
        prediction_outputs (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
            Prediction output from the classification head.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
            Backbone embeddings before passing through the head.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
            Total loss.
    Nr  r  r  r   r  r)  r$   r#   r  r    r  r$   r  c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  de	ej                     de	e
   de
de	e
   d	efd
       Z xZS )'PatchTSMixerForTimeSeriesClassificationz
    `PatchTSMixer` for classification application.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    Returns:
        `None`.
    r3   c                 :   t         |   |       t        |      | _        t	        |      | _        |j                  | _        |j                  dv r't        |j                  |j                        | _        nd | _        |j                  r| j                          y y )Nr   r]   r\   Tr8   rK   )r   r   r  r  r  r  r  r   InjectScalerStatistics4Dr8   rK   inject_scaler  r;   s     r#   r   z0PatchTSMixerForTimeSeriesClassification.__init__  s     &v.
*
	  &55>>22 8]c]o]o pD $D NN r$   r  target_valuesr   r  r  rL   c                 4   t         j                  j                         }||n| j                  }| j	                  |||      }t        |t              rt        | }| j                  7| j                  |j                  |j                  |j                        |_	        | j                  |j                        }||du r
 |||      }	nd}	|s*t        d |	||j                  |j                  fD              S t        |	||j                  |j                        S )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
            Target
            values of the time series, that serve as labels for the model. The `target_values` is what the
            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
            required for a pretraining task.

            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
            pass the target data with all channels, as channel Filtering for both prediction and target will be
            manually applied before the loss computation.

            For a classification task, it has a shape of `(batch_size,)`.

            For a regression task, it has a shape of `(batch_size, num_targets)`.
        return_loss (`bool`, *optional*):
            Whether to return the loss in the `forward` call.
        Nr  r  Tc              3       K   | ]  }|  y wr&   r)  r  s     r#   r   zBPatchTSMixerForTimeSeriesClassification.forward.<locals>.<genexpr>=  r  r  r  )rA   r   CrossEntropyLossr  r  r  r  r  r  r  r  r  r  r   r  )
r!   r  r  r   r  r  r  r  r  r  s
             r#   r)   z/PatchTSMixerForTimeSeriesClassification.forward  s.   H xx((*%0%<k$BVBVzz!5# " 

 lE*2LAL(-1->->.. $$"(( .? .L* 		,889$)<E=1HH   22 ..	   =$*<<&44	
 	
r$   r  )r*   r+   r,   r-   r   r   r   rA   rB   r   r   r  r)   r/   r0   s   @r#   r  r    s    	1 "  15/4 &*M
\\M
  -M
 'tn	M

 M
 d^M
 
7M
 M
r$   r  c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeeej                        ed<   y)PatchTSMixerForRegressionOutputa  
    Output type of [`PatchTSMixerForRegressionOutput`].

    Args:
        regression_outputs (`torch.FloatTensor` of shape `(batch_size, num_targets)`):
            Prediction output from the regression head.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_input_channels, num_patches, d_model)`):
            Backbone embeddings before passing through the head.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        loss (*optional*, returned when `y` is provided, `torch.FloatTensor` of shape `()`):
            Total loss.
    Nr  regression_outputsr  r   )r*   r+   r,   r-   r  r   rA   r  r  r  r  r   r   r)  r$   r#   r  r  O  r  r$   r  c                   ~     e Zd Zd	dededef fdZdej                  dej                  dej                  fdZ xZS )
r  r8   rK   	expansionc                 &   t         |           t        j                  |dz   ||z        | _        t        j                  ||z  |      | _        t        j                  dd|z        | _        t        j                  d|z  d      | _        || _        y rL  )	r   r   r   r   inverse_trans_expansioninverse_trans_compressionmap_scale_expansionmap_scale_compressionrK   )r!   r8   rK   r  r"   s       r#   r   z!InjectScalerStatistics4D.__init__f  sx    ')yy1i'>Q'R$)+9w3F)P&#%99QI#> %'YYq9}a%@"&r$   r'   r  r  c                    |j                  dd      }|j                  d      }|j                  dd| j                  d      }|j                  dd      }|j                  d      }|j                  dd| j                  d      }t	        j
                  ||gd      }| j                  |      }| j                  |      }t	        j
                  ||gd      }| j                  |      }| j                  |      }|S )a  
        Args:
            inputs (`torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`)
            loc (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
            scale (`torch.Tensor` of shape `(batch_size, 1, num_input_channels)`)
        Returns:
            `torch.Tensor` of shape `(batch_size, num_input_channels, num_patch, d_model)`
        r   r   r   r   )
r>   rV   r:  rK   rA   catr#  r$  r!  r"  )r!   r'   r  r  r\   stdevconcat_statss          r#   r)   z InjectScalerStatistics4D.forwardo  s     }}R$~~b!{{1a!1!115B'#Q4#3#3Q7yy$B7//=11,?FL1r:--f5//7r$   )r=   )	r*   r+   r,   r.   r   rA   rB   r)   r/   r0   s   @r#   r  r  e  sC    ' '# '# 'ell  ell r$   r  c                        e Zd ZdZdef fdZe	 	 	 	 ddej                  de	ej                     de	e
   de
de	e
   d	efd
       Zdej                  d	efdZ xZS )PatchTSMixerForRegressionz
    `PatchTSMixer` for regression application.

    Args:
        config (`PatchTSMixerConfig`):
            Configuration.

    Returns:
        `None`.
    r3   c                    t         |   |       t        |      | _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  dk(  rd | _        n`t        t        t        d}|j                  |j
                        }| ||j                        | _        nt        d|j
                         |j                  dv r't        |j                   |j"                        | _        nd | _        t'        || j
                        | _        |j*                  r| j+                          y y )Nr  r  r   r  r  r  r  )r   r   r  r  r  r   r  r  r   r   r
   r  r  r^   r   r  r8   rK   r  r  r  r  )r!   r3   r  r  r"   s       r#   r   z"PatchTSMixerForRegression.__init__  s$    &v.
KK	#)#=#= %55$*$?$?!;;%'+D$ ,&%;'#
 366v7Q7QRL'+7F<N<N+O( #?@Z@Z?[!\]]>>22 8]c]o]o pD $D* $ 8 8
	 NN r$   r  r  r   r  r  rL   c           	         | j                   dk(  rt        j                  d      }n!| j                   dk(  rt        }nt	        d      ||n| j
                  }| j                  |||      }t        |t              rt        | }| j                  7| j                  |j                  |j                  |j                        |_        | j                  |j                        }||d	u r| j                  r| j                  d
k(  r#t!        j"                  |dk        rt%        d      | j                  j'                  |      }	t        |D 
cg c](  }
|
j)                  d| j*                  j,                        * c}
      } ||	|      }t/        |      }n |||      }nd}|s*t        d |||j                  |j0                  fD              S t3        |||j                  |j0                        S c c}
w )a  
        past_values (`torch.FloatTensor` of shape `(batch_size, seq_length, num_input_channels)`):
            Context values of the time series. For a pretraining task, this denotes the input time series to predict
            the masked portion. For a forecasting task, this denotes the history/past time series values. Similarly,
            for classification or regression tasks, it denotes the appropriate context values of the time series.

            For univariate time series, `num_input_channels` dimension should be 1. For multivariate time series, it is
            greater than 1.
        target_values (`torch.FloatTensor` of shape `(batch_size, target_len, num_input_channels)` for forecasting,
            `(batch_size, num_targets)` for regression, or `(batch_size,)` for classification, *optional*):
            Target values of the time series, that serve as labels for the model. The `target_values` is what the
            Transformer needs during training to learn to output, given the `past_values`. Note that, this is NOT
            required for a pretraining task.

            For a forecasting task, the shape is be `(batch_size, target_len, num_input_channels)`. Even if we want
            to forecast only specific channels by setting the indices in `prediction_channel_indices` parameter,
            pass the target data with all channels, as channel Filtering for both prediction and target will be
            manually applied before the loss computation.

            For a classification task, it has a shape of `(batch_size,)`.

            For a regression task, it has a shape of `(batch_size, num_targets)`.
        return_loss (`bool`, *optional*):
            Whether to return the loss in the `forward` call.
        r  r\   r  r  r  Nr  r  Tr  r   zDtarget_values cannot be negative for negative_binomial distribution.r   c              3       K   | ]  }|  y wr&   r)  r  s     r#   r   z4PatchTSMixerForRegression.forward.<locals>.<genexpr>  r  r  )r  r  r  r   )r  r   r  r  r^   r  r  r  r  r  r  r  r  r  r  r   rA   any	Exceptionr  r   r3   r  r  r   r  )r!   r  r  r   r  r  r  r  r  r  itemr  s               r#   r)   z!PatchTSMixerForRegression.forward  s   F 99::/DYY%DQRR%0%<k$BVBVzz!5# " 

 lE*2LAL(-1->->.. $$"(( .? .L* 		,889$)<''++/BBuyyQ^abQbGc#$jkk#77DDUKRWX$tyyT[[-D-DEXYm<+H5}5H   22 ..	   /$*<<&44	
 	
) Ys   
-G.c                 `   | j                   } | |dd      }| j                  j                  |j                        }t	        |      D cg c]  }|j                          }}t        j                  |d      j                  d|| j                  j                        }t        |      S c c}w )a
  
        Generate sequences of sample predictions from a model with a probability distribution head.

        Args:
            past_values (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_input_channels)`):
                Past values of the time series that serves as context in order to predict the target values.

        Return:
            [`SamplePatchTSMixerRegressionOutput`] where the outputs `sequences` tensor will have shape `(batch_size,
            number of samples, num_targets)`.
        NF)r  r  r   r   r   r   r  )r  r   r  r  r   r	  rA   r
  r   r3   r  r  )r!   r  r  r  r  r   r  s          r#   r  z"PatchTSMixerForRegression.generate  s       $88 #!&
 //<<W=W=WX ,11E+F
&'L!
 

 ++g1-2227KT[[MdMde1GDD
s   
B+r  )r*   r+   r,   r-   r   r   r   rA   rB   r   r   r  r)   r  r  r/   r0   s   @r#   r*  r*    s    	%1 %N  15/4 &*Z
\\Z
  -Z
 'tn	Z

 Z
 d^Z
 
)Z
 Z
x#E\\#E 
,#Er$   r*  )r  r  r  r  r  r*  )NFr   )Nr   )NN)Mr-   rX   dataclassesr   typingr   r   r   rA   torch.nnr   transformers.modeling_utilsr   transformers.utilsr   time_series_utilsr
   r   r   utilsr   r   utils.deprecationr   configuration_patchtsmixerr   
get_loggerr*   r   Moduler   r2   rD   rf   rr   r   r   r   r   r   r   r   r  r  r+  rB   r   listr   r.   rH  r^  r`  rm  ru  r  r  r  r  r  r  r  r  r  r  r  distributionsDistributionr  r  r  r  r  r  r  r*  __all__r)  r$   r#   <module>rA     s    "  ! ) )   7 * U U , 0 : 
		H	% *&BII &,$RYY $N.BII .bbii .-299 -b~8BII ~8BBbii BJ*		 *Z#		 #L&#		 &#R5BII 5pDRYY DN )/ ) )2ryy D 04',7%LL7%7% 'tn7% !%	7%
 7%| 04	A%LLA%$T3Y/A% 'tnA% 	A%J-299 -b9"")) 9"z 0BII  0H3;RYY 3;n BII  6 = = =Bk5 BkJ .k . .: 
^
3 ^

^
B ={ = =*\
!< \
~ .k . .8 
2 
2 
2 
2 
2 
2#u""// # #%,, #*5<< *(5<<:P *fkfrfr *0SE ; SEl =K = =*k
.I k
\ =k = =*%ryy %PsE ; sElr$   