
    Uh                       d Z ddlZddlZddlmZmZmZ ddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ ddlmZmZm Z  ddl!m"Z"m#Z#m$Z$m%Z%m&Z& ddl'm(Z(  e#       rddl)m*Z* ddl+m,Z,  e&jZ                  e.      Z/dZ0	 ddl1m2Z2 dZ0e/jg                  d        G d dejn                        Z8e0se2Z8 ejr                  e8        G d dejn                        Z: G d dejn                        Z; G d d ejn                        Z< G d! d"ejn                        Z= G d# d$ejn                        Z> G d% d&ejn                        Z? G d' d(ejn                        Z@e" G d) d*e             ZA G d+ d,eA      ZB G d- d.ejn                        ZC e"d/0       G d1 d2eAe             ZDd2d*gZEy# e4$ r Y e5$ r e/jm                  d       Y +w xY w)3zPyTorch Pop2Piano model.    N)OptionalTupleUnion)nn)CrossEntropyLoss)GenerationConfig   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutput)PreTrainedModel)ALL_LAYERNORM_LAYERS find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringis_torch_flex_attn_availableis_torch_fx_proxyis_torchdynamo_compilinglogging   )Pop2PianoConfig)	BlockMask)make_flex_block_causal_maskT)FusedRMSNormFzVDiscovered apex.normalization.FusedRMSNorm - will use it instead of Pop2PianoLayerNormzIDiscovered apex but it failed to load, falling back to Pop2PianoLayerNormc                   &     e Zd Zd fd	Zd Z xZS )Pop2PianoLayerNormc                     t         |           t        j                  t	        j
                  |            | _        || _        y)zj
        Construct a layernorm module in the Pop2Piano style. No bias and no subtraction of mean.
        N)super__init__r   	Parametertorchonesweightvariance_epsilon)selfhidden_sizeeps	__class__s      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/pop2piano/modeling_pop2piano.pyr%   zPop2PianoLayerNorm.__init__@   s1     	ll5::k#:; #    c                    |j                  t        j                        j                  d      j	                  dd      }|t        j
                  || j                  z         z  }| j                  j                  t        j                  t        j                  fv r%|j                  | j                  j                        }| j                  |z  S )N   T)keepdim)tor'   float32powmeanrsqrtr*   r)   dtypefloat16bfloat16)r+   hidden_statesvariances      r/   forwardzPop2PianoLayerNorm.forwardH   s     !##EMM266q9>>r4>P%Ht?T?T4T(UU ;; ??),,T[[->->?M{{]**r0   )gư>)__name__
__module____qualname__r%   r?   __classcell__r.   s   @r/   r"   r"   ?   s    $+r0   r"   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoDenseActDenseconfigc                 ^   t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _
        t        |j                     | _        y NFbias)r$   r%   r   Lineard_modeld_ffwiwoDropoutdropout_ratedropoutr
   dense_act_fnactr+   rG   r.   s     r/   r%   zPop2PianoDenseActDense.__init__`   sn    ))FNNFKKeD))FKKeDzz&"5"56&--.r0   c                    | j                  |      }| j                  |      }| j                  |      }t        | j                  j
                  t        j                        r|j                  | j                  j
                  j                  k7  r`| j                  j
                  j                  t        j                  k7  r/|j                  | j                  j
                  j                        }| j	                  |      }|S N)rO   rU   rS   
isinstancerP   r)   r'   Tensorr:   int8r5   )r+   r=   s     r/   r?   zPop2PianoDenseActDense.forwardg   s    ./]3tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r0   r@   rA   rB   r   r%   r?   rC   rD   s   @r/   rF   rF   _   s    / /r0   rF   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoDenseGatedActDenserG   c                    t         |           t        j                  |j                  |j
                  d      | _        t        j                  |j                  |j
                  d      | _        t        j                  |j
                  |j                  d      | _        t        j                  |j                        | _        t        |j                     | _        y rI   )r$   r%   r   rL   rM   rN   wi_0wi_1rP   rQ   rR   rS   r
   rT   rU   rV   s     r/   r%   z$Pop2PianoDenseGatedActDense.__init__w   s    IIfnnfkkF	IIfnnfkkF	))FKKeDzz&"5"56&--.r0   c                 ,   | j                  | j                  |            }| j                  |      }||z  }| j                  |      }t	        | j
                  j                  t        j                        r|j                  | j
                  j                  j                  k7  r`| j
                  j                  j                  t        j                  k7  r/|j                  | j
                  j                  j                        }| j                  |      }|S rX   )rU   r`   ra   rS   rY   rP   r)   r'   rZ   r:   r[   r5   )r+   r=   hidden_geluhidden_linears       r/   r?   z#Pop2PianoDenseGatedActDense.forward   s    hhtyy78		-0#m3]3 tww~~u||4##tww~~';';;$$

2),,TWW^^-A-ABM.r0   r\   rD   s   @r/   r^   r^   v   s    / /r0   r^   c                   *     e Zd Zdef fdZd Z xZS )Pop2PianoLayerFFrG   c                    t         |           |j                  rt        |      | _        nt        |      | _        t        |j                  |j                        | _	        t        j                  |j                        | _        y )Nr-   )r$   r%   is_gated_actr^   DenseReluDenserF   r"   rM   layer_norm_epsilon
layer_normr   rQ   rR   rS   rV   s     r/   r%   zPop2PianoLayerFF.__init__   s_    "=f"ED"8"@D,V^^AZAZ[zz&"5"56r0   c                 r    | j                  |      }| j                  |      }|| j                  |      z   }|S rX   )rl   rj   rS   )r+   r=   forwarded_statess      r/   r?   zPop2PianoLayerFF.forward   s=    ??=9../?@%5E(FFr0   r\   rD   s   @r/   rf   rf      s    7 7r0   rf   c                   n     e Zd Z	 	 ddedee   f fdZd Zed	d       Z	d
dZ
	 	 	 	 	 	 	 	 	 ddZ xZS )Pop2PianoAttentionrG   	layer_idxc                    t         |           |j                  | _        || _        |j                  | _        |j
                  | _        |j                  | _        |j                  | _        |j                  | _
        |j                  | _        | j                  | j                  z  | _        || _        |9| j                  r-t        j!                  d| j"                  j$                   d       t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        t'        j(                  | j                  | j                  d      | _        | j                  r/t'        j2                  | j                  | j                        | _        t7               | _        d| _        y )NzInstantiating a decoder z without passing `layer_idx` is not recommended and will to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` when creating this class.FrJ   )r$   r%   
is_decoderhas_relative_attention_biasrelative_attention_num_bucketsrelative_attention_max_distancerM   d_kvkey_value_proj_dim	num_headsn_headsrR   rS   	inner_dimrq   loggerwarning_oncer.   r@   r   rL   qkvo	Embeddingrelative_attention_biassetpruned_headsgradient_checkpointingr+   rG   rt   rq   r.   s       r/   r%   zPop2PianoAttention.__init__   ss    	 +++F(.4.S.S+/5/U/U,~~"(++''**(?(??"*4>>+B+B*C D, , 4<<eD4<<eD4<<eD4>>4<<eD+++-<<8[8[]a]i]i+jD(E&+#r0   c                    t        |      dk(  ry t        || j                  | j                  | j                        \  }}t        | j                  |      | _        t        | j                  |      | _        t        | j                  |      | _        t        | j                  |d      | _	        | j                  t        |      z
  | _        | j                  | j                  z  | _
        | j                  j                  |      | _        y )Nr   r   dim)lenr   rz   rx   r   r   r~   r   r   r   r{   union)r+   headsindexs      r/   prune_headszPop2PianoAttention.prune_heads   s    u:?74<<!8!8$:K:K
u $DFFE2#DFFE2#DFFE2#DFFEq9||c%j0004<<? --33E:r0   c                 T   d}|rC|dz  }|| dkD  j                  t        j                        |z  z  }t        j                  |       } n*t        j                  | t        j
                  |              } |dz  }| |k  }|t        j                  | j                         |z        t        j                  ||z        z  ||z
  z  j                  t        j                        z   }t        j                  |t        j                  ||dz
              }|t        j                  || |      z  }|S )a  
        Adapted from Mesh Tensorflow:
        https://github.com/tensorflow/mesh/blob/0cb87fe07da627bf0b7e60475d59f95ed6b5be3d/mesh_tensorflow/transformer/transformer_layers.py#L593

        Translate relative position to a bucket number for relative attention. The relative position is defined as
        memory_position - query_position, i.e. the distance in tokens from the attending position to the attended-to
        position. If bidirectional=False, then positive relative positions are invalid. We use smaller buckets for
        small absolute relative_position and larger buckets for larger absolute relative_positions. All relative
        positions >=max_distance map to the same bucket. All relative positions <=-max_distance map to the same bucket.
        This should allow for more graceful generalization to longer sequences than the model has been trained on

        Args:
            relative_position: an int32 Tensor
            bidirectional: a boolean - whether the attention is bidirectional
            num_buckets: an integer
            max_distance: an integer

        Returns:
            a Tensor with the same shape as relative_position, containing int32 values in the range [0, num_buckets)
        r   r2   r   )r5   r'   longabsmin
zeros_likelogfloatmath	full_likewhere)relative_positionbidirectionalnum_bucketsmax_distancerelative_buckets	max_exactis_smallrelative_position_if_larges           r/   _relative_position_bucketz,Pop2PianoAttention._relative_position_bucket   s(   , AK!2Q!6 : :5:: F TT %		*; <!&+<e>N>NO`>a!b b  1$	$y0 &/II'--/);<hh|i/01Y&( "UZZ.	&"
 &+YY&8RT_bcTc(d&
" 	EKK2CE_``r0   c                    | | j                   j                  j                  }|.t        j                  |t        j
                  |      dddf   }n|dddf   j                  |      }t        j                  |t        j
                  |      dddf   }||z
  }| j                  || j                   | j                  | j                        }| j                  |      }	|	j                  g d      j                  d      }	|	S )z%Compute binned relative position biasN)r:   device)r   r   r   )r2   r   r   r   )r   r)   r   r'   aranger   r5   r   rs   ru   rv   permute	unsqueeze)
r+   query_length
key_lengthr   cache_positioncontext_positionmemory_positionr   relative_position_bucketvaluess
             r/   compute_biaszPop2PianoAttention.compute_bias  s    >1188??F!$||L

SYZ[\^b[bc-ag699&A,,zFSTXZ[T[\+.>>#'#A#A#.;;==	 $B $
  --.FG	*44Q7r0   c                    |j                   dd \  }}|du}| j                  |      }|j                  |d| j                  | j                        j                  dd      }|@|j                  j                  | j                        }|r|j                  }n|j                  }|r|n|}|r7|5r3j                  | j                     }|j                  | j                     }n| j                  |      }| j                  |      }|j                  |d| j                  | j                        j                  dd      }|j                  |d| j                  | j                        j                  dd      }|D|s|
nd}
j                  ||| j                  d|
i      \  }}|rd|j                  | j                  <   t!        j"                  ||j                  dd            }||j                   d   }||n|
d   dz   }| j$                  sZt!        j&                  d| j                  ||f|j(                  |j*                  	      }| j,                  rE| j.                  r9d|_        n1| j3                  |||j(                  |

      }|dddd| dddf   }|#|ddddddd|j                   d   f   }||z   }| j4                  rRt!        j6                  |j                   d         }d|t9        | j4                        <   |dd|j;                         f   }n|}||z  }t<        j>                  jA                  |jC                         d      jE                  |      }t<        j>                  jG                  || jF                  | j.                        }|||z  }t!        j"                  ||      }|j                  dd      jI                         }|j                  |d| jJ                        }| jM                  |      }|||f}|	r||fz   }|S )z
        Self-attention (if key_value_states is None) or attention over source sentence (provided by key_value_states).
        Nr2   r3   r   r   Tr	   )r   r:   )r   r   r   r   )ptraining)'shaper~   viewrz   rx   	transpose
is_updatedgetrq   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   updater'   matmulrt   zerosr   r:   r   r   requires_gradr   r   r(   listboolr   
functionalsoftmaxr   type_asrS   
contiguousr{   r   )r+   r=   maskkey_value_statesposition_biaspast_key_valuelayer_head_maskr   	use_cacheoutput_attentionsr   
batch_size
seq_lengthis_cross_attentionquery_statesr   curr_past_key_valuecurrent_states
key_statesvalue_statesscoresr   real_seq_lengthcausal_maskposition_bias_maskedattn_weightsattn_outputoutputss                               r/   r?   zPop2PianoAttention.forward  s   $ "/!4!4Ra!8
J .T9vvm,#((RtG^G^_iijkmno%'2266t~~FJ!&4&J&J#&4&I&I#-?)]."<,66t~~FJ.::4>>JL/J66.1L#RtG^G^_iijkmnoJ',,ZT\\4KbKbcmmnoqrsL)7It+>+E+Ednn?OQ_>`,(
L &@DN--dnn= lJ,@,@A,FG #))"-J.:.FlN[]L^abLbO33 %j*=fmm[a[g[g! ..4==26M/ $ 1 1#ZVd !2 ! !.aZKL!.C D"1a,Bj.>.>r.B,B#BC - ;::m11!45D,-Dd''()#0DIIK#@ #0 && }},,V\\^,DLLVT}},,\T\\TXTaTa,b &'/9Lll<>!++Aq1<<>!&&z2t~~Fff[)>/Gr0   FN)T       )NN)	NNNNNNFFN)r@   rA   rB   r   r   intr%   r   staticmethodr   r   r?   rC   rD   s   @r/   rp   rp      si     %*#'	!,!, C=	!,F;  -  - ^. ir0   rp   c                   B     e Zd Zddee   f fdZ	 	 	 	 	 	 	 ddZ xZS )Pop2PianoLayerSelfAttentionrq   c                     t         |           t        |||      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )Nrt   rq   rh   )r$   r%   rp   SelfAttentionr"   rM   rk   rl   r   rQ   rR   rS   r   s       r/   r%   z$Pop2PianoLayerSelfAttention.__init__  sT    /0KW`
 -V^^AZAZ[zz&"5"56r0   c	           
          | j                  |      }	| j                  |	|||||||      }
|| j                  |
d         z   }|f|
dd  z   }|S )N)r   r   r   r   r   r   r   r   r   )rl   r   rS   )r+   r=   attention_maskr   r   r   r   r   r   normed_hidden_statesattention_outputr   s               r/   r?   z#Pop2PianoLayerSelfAttention.forward  sv      $}=-- '+)/) . 	
 &5Ea5H(II "%5ab%99r0   r   )NNNNFFNr@   rA   rB   r   r   r%   r?   rC   rD   s   @r/   r   r     s0    7XVY] 7 r0   r   c                   D     e Zd Zddee   f fdZ	 	 	 	 	 	 	 	 ddZ xZS )Pop2PianoLayerCrossAttentionrq   c                     t         |           t        |d|      | _        t	        |j
                  |j                        | _        t        j                  |j                        | _        y )NFr   rh   )r$   r%   rp   EncDecAttentionr"   rM   rk   rl   r   rQ   rR   rS   )r+   rG   rq   r.   s      r/   r%   z%Pop2PianoLayerCrossAttention.__init__  sO    1&V[gpq,V^^AZAZ[zz&"5"56r0   c                     | j                  |      }| j                  |||||||||	|

      }|| j                  |d         z   }|f|dd  z   }|S )N)	r   r   r   r   r   r   r   r   r   r   r   )rl   r   rS   )r+   r=   r   r   r   r   r   r   r   r   r   r   r   layer_outputr   s                  r/   r?   z$Pop2PianoLayerCrossAttention.forward  s{      $}=// -'+)%/) 0 
 %t||4DQ4G'HH/$4QR$88r0   rX   )NNNNFNFNr   rD   s   @r/   r   r     s2    7(3- 7 r0   r   c                   L     e Zd Zddee   f fdZ	 	 	 	 	 	 	 	 	 	 	 	 ddZ xZS )Pop2PianoBlockrq   c                 p   t         |           |j                  | _        t        j                         | _        | j
                  j                  t        |||             | j                  r&| j
                  j                  t        ||             | j
                  j                  t        |             y )Nr   )rq   )
r$   r%   rs   r   
ModuleListlayerappendr   r   rf   r   s       r/   r%   zPop2PianoBlock.__init__  s     ++]]_


'4O[d	

 ??JJ:6YWX

*623r0   c                     | j                   d   |||||	|
||      }|d d \  }}	|dd  }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }| j                  xr |d u}|r | j                   d   ||||||	|d   dz   |
|	      }|d d \  }}	|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }||dd  z   } | j                   d   |      }|j                  t        j                  k(  rt        j                  t        j
                  |      j                         t        j                  |j                        j                  dz
  t        j                  |j                        j                        }t        j                  || |      }|f}|
r||	fz   |z   }|S ||z   }|S )	Nr   )r   r   r   r   r   r   r   r2   i  )r   maxr   r3   )r   r   r   r   r   r   r   r   )r   r:   r'   r;   r   isinfanyfinfor   clamprs   )r+   r=   r   r   encoder_hidden_statesencoder_attention_maskencoder_decoder_position_biasr   cross_attn_layer_head_maskr   r   r   return_dictr   self_attention_outputsattention_outputsclamp_valuedo_cross_attentioncross_attention_outputsr   s                       r/   r?   zPop2PianoBlock.forward  s     "/A)'+)/)	"
 )?r(B%~2126 %--/++M*..0M//044t;M//044K
 "KKK<[YM!__R1Fd1R&3djjm!65; :-+B/!3#"3
'# -DBQ,G)M> ""emm3#kkKK.224KK 3 34884?KK 3 3488
 !&M|Q\ ] !24KAB4O O '

2}5 %--/++M*..0M//044t;M//044K
 "KKK<[YM " 114EEG   11Gr0   r   )NNNNNNNNFFTNr   rD   s   @r/   r   r     s@    4XVY] 4" "#&*#'Tr0   r   c                   <    e Zd ZeZdZdZdZdZdZ	dgZ
dgZd Zd Zy)	Pop2PianoPreTrainedModeltransformerFTr   rP   c                    | j                   j                  }t        |t              r)|j                  j
                  j                  |dz         yt        |t              r5|j                  j                  j
                  j                  d|dz         yt        |t              r|j                  j                  j
                  j                  d|dz         t        |d      rL| j                   j                  s5|j                  j                  j
                  j                  d|dz         yyyt        |t              rM|j                   j                  j
                  j                  d|| j                   j"                  dz  z         t        |j                   d      rD|j                   j$                  .|j                   j$                  j
                  j'                          |j(                  j                  j
                  j                  d|| j                   j*                  dz  z         t        |j(                  d      rF|j(                  j$                  /|j(                  j$                  j
                  j'                          yyyt        |t,              r|j.                  j                  j
                  j                  d|| j                   j"                  dz  z         t        |j.                  d      rD|j.                  j$                  .|j.                  j$                  j
                  j'                          |j0                  j                  j
                  j                  d|| j                   j"                  dz  z         t        |j0                  d      rD|j0                  j$                  .|j0                  j$                  j
                  j'                          |j(                  j                  j
                  j                  d|| j                   j*                  dz  z         t        |j(                  d      rF|j(                  j$                  /|j(                  j$                  j
                  j'                          yyyt        |t2              ri| j                   j"                  }| j                   j4                  }| j                   j6                  }|j8                  j                  j
                  j                  d|||z  dz  z         |j:                  j                  j
                  j                  d||dz  z         |j<                  j                  j
                  j                  d||dz  z         |j>                  j                  j
                  j                  d|||z  dz  z         |j@                  r8|jB                  j                  j
                  j                  d||dz  z         yyy)zInitialize the weights      ?        )r8   stdlm_head      rK   N)"rG   initializer_factorrY   r"   r)   datafill_Pop2PianoConcatEmbeddingToMel	embeddingnormal_!Pop2PianoForConditionalGenerationsharedhasattrtie_word_embeddingsr  rF   rO   rM   rK   zero_rP   rN   r^   r`   ra   rp   rw   ry   r~   r   r   r   rt   r   )r+   modulefactorrM   rx   rz   s         r/   _init_weightsz&Pop2PianoPreTrainedModel._init_weightsI  sq   //f01MM$$Vc\2 =>##((00cv|0L AB MM  %%--3FSL-Ivy)$++2Q2Q%%**22#2N 3R) 67 II!!))s4;;CVCV[_B_8`)avyy&)fiinn.H		##))+II!!))s4;;CSCSX\B\8])^vyy&)fiinn.H		##))+ /I) ;<KK##++&T[[EXEX]aDa:b+cv{{F+0@0@0L  %%++-KK##++&T[[EXEX]aDa:b+cv{{F+0@0@0L  %%++-II!!))s4;;CSCSX\B\8])^vyy&)fiinn.H		##))+ /I) 23 kk))G!%!1!1kk++GHHOO  ((cv'L^B^cgAg7h(iHHOO  ((cv$7O(PHHOO  ((cv$7O(PHHOO  ((cv'L^B^cgAg7h(i11..55::BBQW\chl[lQmBn 2 4r0   c                    | j                   j                  }| j                   j                  }|t        d      t	        |      rGt        j                  |j                  d d dz   |      }t        j                  ||dd df   gd      }n>|j                  |j                        }|dd df   j                         |ddd f<   ||d<   |t        d      |j                  |d	k(  |       |S )
Nzoself.model.config.decoder_start_token_id has to be defined. In Pop2Piano it is usually set to the pad_token_id.r3   )r   .r   r   ).r   z1self.model.config.pad_token_id has to be defined.)rG   decoder_start_token_idpad_token_id
ValueErrorr   r'   fullr   cat	new_zerosclonemasked_fill_)r+   	input_idsr!  r"  shifted_input_idss        r/   _shift_rightz%Pop2PianoPreTrainedModel._shift_rightw  s    !%!C!C{{//!) B 
 Y' %

9??3B+?$+FH^ _ %		+<iSbS>Q*RXZ [ ) 3 3IOO D)238)<)B)B)Dc12g&(>f%PQQ&&'8D'@,O  r0   N)r@   rA   rB   r   config_classbase_model_prefixis_parallelizablesupports_gradient_checkpointing_supports_cache_class_supports_static_cache_no_split_modules_keep_in_fp32_modulesr  r+   r0   r/   r	  r	  >  sB    "L%&*# ")*!F,o\!r0   r	  c                       e Zd Zd fd	Zd Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 ddZ	 ddeej                  df   dej                  dej                  d	e
d
ef
dZedej                  dededej                  dej                  defd       Z xZS )Pop2PianoStackc                    t         |   |       || _        |j                  | _        t	        j
                  t        |j                        D cg c]  }t        |t        |dk(        |       c}      | _
        t        |j                  |j                        | _        t	        j                  |j                         | _        | j%                          d| _        d | _        d| _        y c c}w )Nr   r   rh   F)r$   r%   embed_tokensrs   r   r   range
num_layersr   r   blockr"   rM   rk   final_layer_normrQ   rR   rS   	post_initmodel_parallel
device_mapr   )r+   rG   r8  ir.   s       r/   r%   zPop2PianoStack.__init__  s     ( ++]] v001 v4Q<[\]

 !36>>vG`G` azz&"5"56 	#&+#s   !C,c                     | j                   S rX   r8  r+   s    r/   get_input_embeddingsz#Pop2PianoStack.get_input_embeddings  s       r0   c                     || _         y rX   rB  r+   new_embeddingss     r/   set_input_embeddingsz#Pop2PianoStack.set_input_embeddings  s
    *r0   c                 
   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|$|"| j
                  rdnd}t        d| d| d      |&|j                         }|j                  d|d         }n8||j                         d d }n"| j
                  rdnd}t        d| d| d	      | j                  r%| j                  r|	rt        j                  d
       d}	|(| j                  t        d      | j                  |      }|\  }}|	du r| j
                  st        d|  d      d}d}| j
                  r|	s|t        |t              r't        |t               sd}t!        |t#                     }njt        |t               s-d}t        j                  d       t!        j$                  |      }n-|+t!        t#               t#                     }n| j
                  sd }||j'                         nd}|%t)        j*                  |||z   |j,                        }|1t/               s'||z   }t)        j0                  |||j,                        }| j                   j
                  r$| j3                  |||||j4                  nd |
      }nX|d d d d d d f   }|j7                  |j8                        }d|z
  t)        j:                  |j8                        j<                  z  }| j
                  rO|M|j                         \  }}}||f}|!t)        j0                  ||j,                        }| j?                  |      }nd }| jA                  || j                   jB                        }| jA                  || j                   jB                        }|rdnd }|
rdnd }|
r| j
                  rdnd }d }d } | jE                  |      }!tG        | jH                        D ]  \  }"}#||"   }$||"   }%|r||!fz   }| j                  r4| j                  r(| jK                  |#jL                  |!||||| |$|%d |	|
|      }&n |#|!||||| |$|%||	|
|      }&|	du r|&d d dz   |&dd  z   }&|&d d \  }!}'|&d   }| j
                  r|	|&|
rdnd   } |
s||&d   fz   }| j
                  s||&d   fz   } | jO                  |!      }!| jE                  |!      }!|r||!fz   }|	r'nd }(|r|j4                  }(|r|jQ                         }(|stS        d |!|(|||fD              S tU        |!|(|||      S )Ndecoder_ zYou cannot specify both zinput_ids and zinputs_embeds at the same timer3   zYou have to specify either zinput_ids or inputs_embedszZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...Fz<You have to initialize the model with valid token embeddingsTz)`use_cache` can only be set to `True` if z is used as a decoderzPassing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.r   r   )r:   r  r4  )r   r   r   r   r   r   r  r   r   r   r   r   rX   r2      r	      c              3   $   K   | ]  }|| 
 y wrX   r4  ).0r   s     r/   	<genexpr>z)Pop2PianoStack.forward.<locals>.<genexpr>t  s      
 = 
s   )last_hidden_statepast_key_valuesr=   
attentionscross_attentions)+rG   r   r   output_hidden_statesuse_return_dictrs   r#  sizer   r   r   r|   r}   r8  rY   r   r   r   from_legacy_cacheget_seq_lengthr'   r   r   r   r(   _update_causal_maskr   r5   r:   r   r   invert_attention_maskget_head_maskr:  rS   	enumerater;  _gradient_checkpointing_funcr?   r<  to_legacy_cachetupler   ))r+   r)  r   r   r   rL  	head_maskcross_attn_head_maskrT  r   r   rW  r  r   err_msg_prefixinput_shaper   r   return_legacy_cachereturn_self_attention_cachepast_key_values_lengthmask_seq_lengthr   encoder_batch_sizeencoder_sequence_length_encoder_hidden_shapeencoder_extended_attention_maskall_hidden_statesall_attentionsall_cross_attentionsr   r   r=   r@  layer_moduler   r  layer_outputsnext_decoder_cache
next_caches)                                            r/   r?   zPop2PianoStack.forward  s     "+!6IDKK<Q<Q	1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B] ]%>+/??ZN*>*:.HXXvw  "#..*K!r;r?;I&',,.s3K+/??ZN:>:J-XfWggtuvv&&4==##p "	   ( !_`` --i8M!,
J?? #LTFRg!hii $&+#??	_-H/51*_Vi:j.2+"5o|~"V1DE&*###`
 #6"G"G"X ("5lnln"U #OETE`!?!?!Afg!"\\&(>(KTaThThN !*B*D4zAO"ZZ
OML`L`aN;;!!228G8S44Y]!K )D$)9:K%..}/B/B.CK,M<O<O0P0T0TTK ??4@=R=W=W=Y: 7$68O#P %-).4HQ^QeQe)f&.2.H.HI_.`+.2+ &&y$++2H2HI	#112FH^H^_"6BD0d&7DOOrRV(,%]3(4 7	VOA|'lO)=a)@&#$58H$H!**t}} $ A A ((!!)31#.%"!  !-!#."/*?+J2O$3/I#2'&7#1!" E! -bq 1G ;mAB>O O0=bq0A-M-
 *!,M#8#D0=CTaZ[0\- !/=3C2E!E??+?=QRCSBU+U(o7	Vr --m<]3   1]4D D+4'$
&(==J(88:J 
 "%"(
 
 
 9+&+%1
 	
r0   r   r   input_tensorr   rT  r   c           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )Nflash_attention_2r  flex_attentionr   Fsdpa)rL  ri  is_trainingr   r3   )sequence_lengthtarget_lengthr:   r   r   )cudaxpunpu)rG   _attn_implementationr   rY   r'   rZ   r   r[  is_compileabler   _ignore_causal_mask_sdpar   r:   r   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   typer   r   _unmask_unattended)r+   r   rw  r   rT  r   past_seen_tokensusing_compilable_cacher:   r}  r~  r   	min_dtypes                r/   r\  z"Pop2PianoStack._update_causal_mask  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr0   r}  r~  r:   r   c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        NrN  )
fill_valuer:   r   r   )diagonalrM  r3   r   )r   r'   r   r   r$  r   triur   reshapeexpandr'  r   r5   masked_fill)r   r}  r~  r:   r   r   kwargsr   r  mask_lengthpadding_masks              r/   r  zDPop2PianoStack._prepare_4d_causal_attention_mask_with_cache_position  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r0   rX   )NNNNNNNNNNNNN)F)r@   rA   rB   r%   rD  rH  r?   r   r'   rZ   r   r   r\  r   r   r:   r  rC   rD   s   @r/   r6  r6    s    ,.!+
 "#!!S
x #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r0   r6  c                   (     e Zd ZdZ fdZd Z xZS )r  z'Embedding Matrix for `composer` tokens.c                     t         |           t        j                  |j                  |j
                        | _        y )N)num_embeddingsembedding_dim)r$   r%   r   r   composer_vocab_sizerM   r  rV   s     r/   r%   z&Pop2PianoConcatEmbeddingToMel.__init__  s-    V5O5O_e_m_mnr0   c                     ||z
  }| j                  |      j                  d      }t        j                  ||gd      }|S )Nr   r   )r  r   r'   r%  )r+   featureindex_valueembedding_offsetindex_shiftedcomposer_embeddingrL  s          r/   r?   z%Pop2PianoConcatEmbeddingToMel.forward  sC    #&66!^^M:DDQG		#5w"?QGr0   )r@   rA   rB   __doc__r%   r?   rC   rD   s   @r/   r  r    s    1or0   r  zA
    Pop2Piano Model with a `language modeling` head on top.
    )custom_introc            *       >    e Zd Zg dZdef fdZd Zd Zd Zd Z	d Z
d	 Z	 d$d
ej                  dededeej                     fdZe	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d%deej&                     deej                     deej&                     deej(                     deej                     deej                     deej*                     deeeej*                           deeeej*                           deej                     d
eej                     deej                     deej&                     dee   dee   dee   dee   deej&                     deeej                     ef   f&d        Z ej6                         	 	 	 d& fd!	       Zdej*                  fd"Zd# Z xZS )'r  )zencoder.embed_tokens.weightzdecoder.embed_tokens.weightzlm_head.weightrG   c                 |   t         |   |       || _        |j                  | _        t        j                  |j                  |j                        | _        t        |      | _
        t        j                  |      }d|_        d|_        d|_        t!        || j                        | _        t        j                  |      }d|_        d|_        |j$                  |_        t!        || j                        | _        t        j*                  |j                  |j                  d      | _        | j/                          y )NFTrJ   )r$   r%   rG   rM   	model_dimr   r   
vocab_sizer  r  mel_conditionercopydeepcopyrs   r   is_encoder_decoderr6  encodernum_decoder_layersr:  decoderrL   r  r=  )r+   rG   encoder_configdecoder_configr.   s       r/   r%   z*Pop2PianoForConditionalGeneration.__init__  s     ll6#4#4fnnE<VDv.$)!#( ,1)%ndkkBv.$(!,1)$*$=$=!%ndkkByy1B1BO 	r0   c                     | j                   S rX   )r  rC  s    r/   rD  z6Pop2PianoForConditionalGeneration.get_input_embeddings6  s    {{r0   c                 ~    || _         | j                  j                  |       | j                  j                  |       y rX   )r  r  rH  r  rF  s     r/   rH  z6Pop2PianoForConditionalGeneration.set_input_embeddings9  s-    $)).9)).9r0   c                     || _         y rX   r  rF  s     r/   set_output_embeddingsz7Pop2PianoForConditionalGeneration.set_output_embeddings>  s	    %r0   c                     | j                   S rX   r  rC  s    r/   get_output_embeddingsz7Pop2PianoForConditionalGeneration.get_output_embeddingsA      ||r0   c                     | j                   S rX   )r  rC  s    r/   get_encoderz-Pop2PianoForConditionalGeneration.get_encoderD  r  r0   c                     | j                   S rX   )r  rC  s    r/   get_decoderz-Pop2PianoForConditionalGeneration.get_decoderG  r  r0   input_featurescomposergeneration_configr   c                    |j                   }||j                         vr(t        dt        |j                                d|       ||   }t	        j
                  || j                        }|j                  |j                  d         }t        |j                               }| j                  |||      }|Od||dddf   j                          <   t	        j                  |dddf   j                  dd	      |gd	
      }||fS |dfS )a  
        This method is used to concatenate mel conditioner tokens at the front of the input_features in order to
        control the type of MIDI token generated by the model.

        Args:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                input features extracted from the feature extractor.
            composer (`str`):
                composer token which determines the type of MIDI tokens to be generated.
            generation_config (`~generation.GenerationConfig`):
                The generation is used to get the composer-feature_token pair.
            attention_mask (``, *optional*):
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
        zPlease choose a composer from z. Composer received - rM  r   )r  r  r  Nr  r3   r   )axis)composer_to_feature_tokenkeysr#  r   r'   tensorr   repeatr   r   r   r  r   concatenater   )r+   r  r  r  r   r  composer_valuer  s           r/   get_mel_conditioner_outputsz=Pop2PianoForConditionalGeneration.get_mel_conditioner_outputsJ  s.   0 %6$O$O!499;;06O6T6T6V1W0XXnownxy  38<nT[[I'..~/C/CA/FG8??AB--"&- . 

 %;>NN1a4055778 #..q!t0D0I0I"a0PR`/ahijN!>11t##r0   r)  decoder_input_idsdecoder_attention_maskrc  decoder_head_maskrd  encoder_outputsrT  rL  decoder_inputs_embedslabelsr   r   rW  r  r   returnc                    ||n| j                   j                  }||n| j                   j                  }|
|t        d      ||
|}
|| j	                  |||
||||      }nI|rGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }|d   }|||| j                  |      }| j                  ||||	|||||||||      }|d   }| j                   j                  r|| j                  d	z  z  }| j                  |      }d}|Ct        d
      } ||j                  d|j                  d            |j                  d            }|s|f|dd z   |z   }||f|z   S |S t!        |||j"                  |j$                  |j&                  |j(                  |j*                  |j$                  |j&                  	      S )a`  
        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Pop2Piano is a model with relative position embeddings
            so you should be able to pad the inputs on both the right and the left. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for detail.
            [What are input IDs?](../glossary#input-ids) To know more on how to prepare `input_ids` for pretraining
            take a look a [Pop2Piano Training](./Pop2Piano#training).
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Indices of decoder input sequence tokens in the vocabulary. Indices can be obtained using
            [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
            [What are decoder input IDs?](../glossary#decoder-input-ids) Pop2Piano uses the `pad_token_id` as the
            starting token for `decoder_input_ids` generation. If `past_key_values` is used, optionally only the last
            `decoder_input_ids` have to be input (see `past_key_values`). To know more on how to prepare
        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
            be used by default.
        decoder_head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the self-attention modules in the decoder. Mask values selected in `[0,
            1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        cross_attn_head_mask (`torch.Tensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in
            `[0, 1]`:
            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Does the same task as `inputs_embeds`. If `inputs_embeds` is not present but `input_features` is present
            then `input_features` will be considered as `inputs_embeds`.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
            labels in `[0, ..., config.vocab_size]`
        NzSBoth `inputs_embeds` and `input_features` received! Please provide only one of them)r)  r   rL  rc  r   rW  r  r   r   r2   )rS  r=   rU  )r)  r   rL  rT  r   r   rc  rd  r   r   rW  r  r   r  r   )ignore_indexr3   )	losslogitsrT  decoder_hidden_statesdecoder_attentionsrV  encoder_last_hidden_stater   encoder_attentions)rG   r   rX  r#  r  rY   r   r   r+  r  r  r  r  r   r   rY  r   rT  r=   rU  rV  rS  )r+   r)  r   r  r  rc  r  rd  r  rT  rL  r  r  r  r   r   rW  r  r   r=   decoder_outputssequence_output	lm_logitsr  loss_fctoutputs                             r/   r?   z)Pop2PianoForConditionalGeneration.forward{  s>   p "+!6IDKK<Q<Q	%0%<k$++B]B]$)Crss'M,A*M ""ll#-+#"3%9' + O O_!M-"1!"4474H14Loa0RV14_1E1I?1-tO (*"3";@U@] $ 1 1& 9 ,,'1/+"/#1'!5/!5#) ' 
  *!,;;** .1EFOLL1	'T:HINN2y~~b/ABFKKPROTD\OAB$77/IF)-)9TGf$EvE+;;"1"?"?.99,==&5&G&G"1"?"?.99

 
	
r0   c                    || j                   } |j                  d	i | t        |d      st        d      t	        |j
                        | j                  j                  k7  r9t        d| j                  j                   dt	        |j
                         d      | j                  ||||      \  }}t        | (  d	d|||d|S )
a  
        Generates token ids for midi outputs.

        <Tip warning={true}>

        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
        model's default generation configuration. You can override any `generation_config` by passing the corresponding
        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`. For an overview of generation
        strategies and code examples, check out the [following guide](./generation_strategies).

        </Tip>

        Parameters:
            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
                This is the featurized version of audio generated by `Pop2PianoFeatureExtractor`.
            attention_mask:
                For batched generation `input_features` are padded to have the same shape across all examples.
                `attention_mask` helps to determine which areas were padded and which were not.
                - 1 for tokens that are **not padded**,
                - 0 for tokens that are **padded**.
            composer (`str`, *optional*, defaults to `"composer1"`):
                This value is passed to `Pop2PianoConcatEmbeddingToMel` to generate different embeddings for each
                `"composer"`. Please make sure that the composet value is present in `composer_to_feature_token` in
                `generation_config`. For an example please see
                https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.json .
            generation_config (`~generation.GenerationConfig`, *optional*):
                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
                passed to generate matching the attributes of `generation_config` will override them. If
                `generation_config` is not provided, the default will be used, which had the following loading
                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
                default values, whose documentation should be checked to parameterize generation.
            kwargs:
                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
                forwarded to the `forward` function of the model. If the model is an encoder-decoder model, encoder
                specific kwargs should not be prefixed and decoder specific kwargs should be prefixed with *decoder_*.
        Return:
            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`.
                Since Pop2Piano is an encoder-decoder model (`model.config.is_encoder_decoder=True`), the possible
                [`~utils.ModelOutput`] types are:
                    - [`~generation.GenerateEncoderDecoderOutput`],
                    - [`~generation.GenerateBeamEncoderDecoderOutput`]
        Nr  z`composer_to_feature_token` was not found! Please refer to https://huggingface.co/sweetcocoa/pop2piano/blob/main/generation_config.jsonand parse a dict like that.ztconfig.composer_vocab_size must be same as the number of keys in generation_config.composer_to_feature_token! Found z vs .)r  r   r  r  )inputsrL  r   r  r4  )r  r   r  r#  r   r  rG   r  r  r$   generate)r+   r  r   r  r  r  r.   s         r/   r  z*Pop2PianoForConditionalGeneration.generate  s   l $ $ 6 6   *6* (*EF.   ::;t{{?^?^^889cBSBmBm>n=oopr  *.)I)I))/	 *J *
& w 
()/	

 
 	
r0   c                 $    | j                  |      S rX   )r+  )r+   r  s     r/   %prepare_decoder_input_ids_from_labelszGPop2PianoForConditionalGeneration.prepare_decoder_input_ids_from_labels]  s      ((r0   c           	         |t         j                  d       |S d}|D ]  }d}|D ]1  }||j                  d|j                  |j                              fz   }3 |d   j
                  |d   j
                  k7  r,t        d|d   j
                   d|d   j
                   d      t        |      t        |      k7  r$t        dt        |       dt        |       d      ||fz   } |S )	NzHYou might want to consider setting `use_cache=True` to speed up decodingr4  r   z%reordered_layer_past_states[0] shape z  and layer_past_states[0] shape z mismatchedz&length of reordered_layer_past_states z! and length of layer_past_states )r|   warningindex_selectr5   r   r   r#  r   )r+   rT  beam_idxreordered_decoder_pastlayer_past_statesreordered_layer_past_stateslayer_past_states          r/   _reorder_cachez0Pop2PianoForConditionalGeneration._reorder_cache`  su    "NNef""!#!0 	] +-'$5  .I$11!X[[AQAXAX5YZM /+ +1-337H7K7Q7QQ ;<WXY<Z<`<`;a  bB  CT  UV  CW  C]  C]  B^  ^i  j  ./37H3II <SA\=]<^^  AD  EV  AW  @X  Xc  d  &<?Z>\%\"'	]( &%r0   rX   )NNNNNNNNNNNNNNNNNN)N	composer1N) r@   rA   rB   _tied_weights_keysr   r%   rD  rH  r  r  r  r  r'   FloatTensorstrr   r   r  r   
LongTensor
BoolTensorrZ   r   r   r   r   r?   no_gradr  r  r  rC   rD   s   @r/   r  r    s    j 6:
& 7;/$))/$ /$ ,	/$
 !!2!23/$b  156:8<=A159=7;@D@D596:=A-1$(,0/3&*59'E
E,,-E
 !!2!23E
 $E$4$45	E

 !))9)9 :E
 E--.E
 $E$5$56E
 'u||4E
 "%ell(;"<=E
 "%ell(;"<=E
   1 12E
 !!2!23E
  ((9(9:E
 ))*E
 D>E
  $D>!E
" 'tn#E
$ d^%E
& !!1!12'E
( 
uU&&'8	9)E
 E
N U]]_ W
 W
r)ELL )&r0   r  )Fr  r  r   typingr   r   r   r'   r   torch.nnr   transformers.generationr   activationsr
   cache_utilsr   r   r   
generationr   modeling_attn_mask_utilsr   modeling_outputsr   r   r   modeling_utilsr   pytorch_utilsr   r   r   utilsr   r   r   r   r   configuration_pop2pianor   !torch.nn.attention.flex_attentionr   integrations.flex_attentionr   
get_loggerr@   r|   _load_pop2piano_layer_normapex.normalizationr    infoImportError	Exceptionr  Moduler"   r   rF   r^   rf   rp   r   r   r   r	  r6  r  r  __all__r4  r0   r/   <module>r     s      ) )   % 4 ! C C ) > k k - g g w w 4  !;J 
		H	%! 	/!&
KKhi+ +2 "%    . /RYY .")) <ryy &a aJ!")) !J#299 #NcRYY cL P! P! P!fp- pfBII  
d&(@/ d&
d&N /0J
KS*  	 	
NN^_	s   G G"	G"!G"