
    Uhr                    *   d dl mZmZmZmZ d dlZd dlZd dlm	Z	 ddl
mZ ddlmZmZmZ ddlmZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZmZmZmZmZ ddl m!Z!m"Z" ddl#m$Z$m%Z% ddl&m'Z' ddl(m)Z)m*Z*m+Z+m,Z, ddl-m.Z.  e+       rd dl/m0Z0 ddl1m2Z2  e,jf                  e4      Z5 G d de	jl                        Z7 G d de	jl                        Z8dejr                  de:dejr                  fdZ;	 dCde	jl                  dejr                  dejr                  dejr                  deejr                     d e<d!e<fd"Z=d# Z>dDd$Z? G d% d&e	jl                        Z@ G d' d(e	jl                        ZA G d) d*e      ZB G d+ d,e      ZCe) G d- d.e%             ZD G d/ d0eD      ZEe) G d1 d2eD             ZF	 	 dEd3ee:e:f   d4e<d5e:deej                     d6e:dej                  fd7ZIe) G d8 d9eD             ZJd:ejr                  d;e:d<e:fd=ZK e)d>?       G d@ dAeDe             ZLg dBZMy)F    )CallableOptionalTupleUnionN   )ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)AttentionMaskConverter_prepare_4d_attention_mask#_prepare_4d_attention_mask_for_sdpa)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPast)BaseModelOutputWithPastAndCrossAttentionsSeq2SeqLMOutputSeq2SeqModelOutput)ROPE_INIT_FUNCTIONSdynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_flex_attn_availablelogging   )MoonshineConfig)	BlockMask)make_flex_block_causal_maskc                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineEncoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                        | _	        t        j                  |j                  |j                        | _
        y Nsuper__init__configr   activation_fnnnLinearhidden_sizeintermediate_sizefc1fc2selfr+   
hidden_act	__class__s      /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/moonshine/modeling_moonshine.pyr*   zMoonshineEncoderMLP.__init__=   s^    #J/99V//1I1IJ99V55v7I7IJ    hidden_statesreturnc                 l    | j                  |      }| j                  |      }| j                  |      }|S r'   )r1   r,   r2   )r4   r9   s     r7   forwardzMoonshineEncoderMLP.forwardD   s4    /**=9/r8   __name__
__module____qualname__r*   torchTensorr<   __classcell__r6   s   @r7   r%   r%   <   s$    KU\\ ell r8   r%   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )MoonshineDecoderMLPc                    t         |           || _        t        |   | _        t        j                  |j                  |j                  dz        | _	        t        j                  |j                  |j                        | _
        y )N   r(   r3   s      r7   r*   zMoonshineDecoderMLP.__init__L   sc    #J/99V//1I1IA1MN99V55v7I7IJr8   r9   r:   c                     | j                  |      }|j                  dd      \  }}| j                  |      |z  }| j                  |      }|S )NrH   dim)r1   chunkr,   r2   )r4   r9   gates      r7   r<   zMoonshineDecoderMLP.forwardS   sS    /+11!1<t**40=@/r8   r=   rD   s   @r7   rF   rF   K   s$    KU\\ ell r8   rF   r9   n_repr:   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r    N)shapeexpandreshape)r9   rO   batchnum_key_value_headsslenhead_dims         r7   	repeat_kvrX   [   so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTr8   modulequerykeyvalueattention_maskscalingdropoutc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )NrH   r   rJ   )rL   dtype)ptrainingr    )rX   num_key_value_groupsrA   matmul	transposerQ   r-   
functionalsoftmaxfloat32torb   r_   rd   
contiguous)rY   rZ   r[   r\   r]   r^   r_   kwargs
key_statesvalue_statesattn_weightscausal_maskattn_outputs                r7   eager_attention_forwardrs   g   s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$r8   c                 |    | ddddf   }| ddddf   }t        j                  | |fd      j                  d      S )	z*Rotates half the hidden dims of the input..r   NrH   r    rJ   rK   ra   )rA   stackflatten)xx1x2s      r7   rotate_halfrz      sJ    	
319B	
319B;;Ryb)11"55r8   c                    |j                  |      }|j                  |      }|dd|j                  d   dz  f   j                  dd      }|dd|j                  d   dz  f   j                  dd      }|j                  d   }| dd|f   | d|df   }}|dd|f   |d|df   }
}	||z  t        |      |z  z   }|	|z  t        |	      |z  z   }t	        j
                  ||gd      }t	        j
                  ||
gd      }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        position_ids (`torch.Tensor`, *optional*):
            Deprecated and unused.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    .NrJ   rH   rK   )	unsqueezerQ   repeat_interleaverz   rA   cat)qkcossinposition_idsunsqueeze_dim
rotary_dimq_rotq_passk_rotk_passq_embedk_embeds                r7   apply_rotary_pos_embr      sD   ( --
&C
--
&C c'SYYr]a'''
(
:
:1"
:
EC
c'SYYr]a'''
(
:
:1"
:
EC 2Jc;J;&'3
+;)<6Ec;J;&'3
+;)<6E s{{51C78Gs{{51C78G ii&)r2Gii&)r2GGr8   c                   l    e Zd ZdZdededededef
 fdZ	 	 	 	 	 ddej                  d	e
eej                  ej                  f      d
e
ej                     de
e   de
ej                     de
ej                     dee   deej                  e
ej                     e
eej                        f   fdZ xZS )MoonshineAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr+   	layer_idx	is_causalnum_attention_headsrU   c                 8   t         |           |j                  ||d       || _        || _        t        |d|j                  |j                  z        | _        |j                  |j                  z  | _
        | j                  dz  | _        |j                  | _        || _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  |j                  | j                  z  |j                         | _        t        j                  |j                  | j                  z  |j                  d      | _        | j                  j*                  C| j                  j*                  }|| j                  |z   dz
  |z  z  }|| j                  z
  | _        y d| _        y )N)r   rU   rW   g      ࿩biasFr    r   )r)   r*   updater+   r   getattrr/   r   rW   rU   re   r^   attention_dropoutr   r-   r.   attention_biasq_projk_projv_projo_projpad_head_dim_to_multiple_ofhead_dim_padding)	r4   r+   r   r   r   rU   target_multipletarget_head_dimr6   s	           r7   r*   zMoonshineAttention.__init__   s    	.AZmno"
F4F4F&JdJd4de$*$>$>&B\B\$\!}}d*!'!9!9"ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JQWQfQf
 ii : :T]] JFL^L^ejk ;;22>"kkEEO-$--/2QTU2UZi1ijO$3dmm$CD!$%D!r8   r9   position_embeddingsr]   past_key_valuecache_positionkey_value_statesrm   r:   c                    |j                   d d \  }}	| j                  |      j                  ||	| j                  j                  | j
                        j                  dd      }
|d u}|Y|j                  j                  | j                        }|r&d|j                  | j                  <   |j                  }n|j                  }||n|}|r7|r5r3|j                  | j                     }|j                  | j                     }n| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }| j                  |      j                  |d| j                  j                  | j
                        j                  dd      }|r%|#|j!                  ||| j                  d|i      \  }}|s?|\  }}t#        |
|||      \  }
}|'|||d}|j!                  ||| j                  |      \  }}t$        }| j                  j&                  dk7  r^| j                  j&                  dk(  r(|j                  d	d
      rt(        j+                  d       nt,        | j                  j&                     }| j.                  r	||	dkD  rdnd
}| j0                  dkD  rt2        j4                  j6                  j9                  |
d| j0                  f      }
t2        j4                  j6                  j9                  |d| j0                  f      }t2        j4                  j6                  j9                  |d| j0                  f      } || |
|||f| j:                  sdn| j<                  | j>                  |d|\  }}| j0                  dkD  r|dd | j0                   f   }|jA                  ||	d      jC                         }| jE                  |      }||fS )NrJ   r    rH   Tr   )r   r   r   eagersdpaoutput_attentionsFz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r           )r_   r^   r   .)#rQ   r   viewr+   rU   rW   rg   
is_updatedgetr   cross_attention_cacheself_attention_cache	key_cachevalue_cacher   r   r   r   rs   _attn_implementationloggerwarning_oncer   r   r   rA   r-   rh   padrd   r   r^   rS   rl   r   )r4   r9   r   r]   r   r   r   rm   bszq_lenquery_statesis_cross_attentionr   current_statesrn   ro   r   r   cache_kwargsattention_interfacer   rr   rp   s                          r7   r<   zMoonshineAttention.forward   s    #(("-
U KK&++C8W8WY]YfYfgqqrsuvw 	 .T9%'2266t~~FJ!<@))$..9!/!E!E!/!D!D .>-I)}.Z'11$..AJ)55dnnEL N+c2t{{>>N1a  N+c2t{{>>N1a 
 "n&@+9+@+@dnn?OQ_>`,(
L "*HC';L*VY[^'_$L*)'*3.Y+9+@+@dnnl,(
L )@;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_# NN~/E%RS)DY^	  1$ 88..22<!TEZEZA[\L,,00aAVAV=WXJ 88..22<!TEZEZA[\L$7
%
  $}}C$2H2HLL
%
 
%
!\   1$%c+Cd.C.C-C+C&CDK!))#ub9DDFkk+.L((r8   )NNNNN)r>   r?   r@   __doc__r!   intboolr*   rA   rB   r   r   r	   
LongTensorr   r   r<   rC   rD   s   @r7   r   r      s   G#&#& #& 	#&
 !#& !#&P LP15*.5937[)||[) &eELL%,,,F&GH[) !.	[)
 ![) !!1!12[) #5<<0[) -.[) 
u||Xell3XeELL>Q5RR	S[)r8   r   c                   ^     e Zd Zddef fdZ ej                         ed               Z xZ	S )MoonshineRotaryEmbeddingr+   c                    t         |           t        |d      rG|j                  ;|j                  j	                  d|j                  j	                  d            | _        nd| _        |j                  | _        |j                  | _        || _	        t        | j
                     | _        | j                  | j                  |      \  }| _        | j                  d|d       | j                  | _        y )Nrope_scaling	rope_typetypedefaultinv_freqF)
persistent)r)   r*   hasattrr   r   r   max_position_embeddingsmax_seq_len_cachedoriginal_max_seq_lenr+   r   rope_init_fnattention_scalingregister_bufferr   original_inv_freq)r4   r+   devicer   r6   s       r7   r*   z!MoonshineRotaryEmbedding.__init__9  s    6>*v/B/B/N#0044[&BUBUBYBYZ`BabDN&DN"("@"@$*$B$B!/?+/+<+<T[[&+Q($(ZeD!%r8   c                 b   | j                   d d d d f   j                         j                  |j                  d   dd      j	                  |j
                        }|d d d d d f   j                         }t        |j
                  j                  t              r/|j
                  j                  dk7  r|j
                  j                  nd}t        j                  |d      5  |j                         |j                         z  j                  dd      }t        j                  ||fd	      }|j                         | j                  z  }|j                         | j                  z  }	d d d        j	                  |j                   
      	j	                  |j                   
      fS # 1 sw Y   AxY w)Nr   rJ   r    mpscpuF)device_typeenabledrH   rK   rb   )r   floatrR   rQ   rk   r   
isinstancer   strrA   autocastrg   r~   r   r   r   rb   )
r4   rw   r   inv_freq_expandedposition_ids_expandedr   freqsembr   r   s
             r7   r<   z MoonshineRotaryEmbedding.forwardJ  sV    !MM$4-8>>@GGHZHZ[\H]_acdehhijiqiqr ,QaZ 8 > > @'1!((--'E!((--[`J`ahhmmfk^^UC 	5&,,.1F1L1L1NNYYZ[]^_E))UEN3C'')d444C'')d444C		5 vvAGGv$cff177f&;;;	5 	5s    BF%%F.r'   )
r>   r?   r@   r!   r*   rA   no_gradr   r<   rC   rD   s   @r7   r   r   8  s3    / /" U]]_<  <r8   r   c                   p    e Zd Zdedef fdZ	 	 	 	 	 	 	 ddej                  deej                     deej                     dee
   dee   d	ee   d
eej                     deeej                  ej                  f      dee   deej                  eeej                  ej                  f      f   fdZ xZS )MoonshineEncoderLayerr+   r   c                 d   t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||j                        | _	        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NFr+   r   r   r   rU   r   )r)   r*   r/   r   encoder_num_attention_headsencoder_num_key_value_heads	self_attnr%   encoder_hidden_actmlpr-   	LayerNorminput_layernormpost_attention_layernormr4   r+   r   r6   s      r7   r*   zMoonshineEncoderLayer.__init__[  s    !--+ & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%r8   r9   r]   r   r   r   	use_cacher   r   rm   r:   c	                     |}
| j                  |      } | j                  d||||||||d|	\  }}|
|z   }|}
| j                  |      }| j                  |      }|
|z   }|f}|r||fz  }|S )Nr9   r]   r   r   r   r   r   r    )r   r   r   r   )r4   r9   r]   r   r   r   r   r   r   rm   residualself_attn_weightsoutputss                r7   r<   zMoonshineEncoderLayer.forwardk  s     !,,]; ,:4>> 
,
')%)/) 3
,
 
,
(( !=0 !55mD/ =0 ")++Gr8   )NNNFFNN)r>   r?   r@   r!   r   r*   rA   rB   r   r   r	   r   r   r   r   FloatTensorr<   rC   rD   s   @r7   r   r   Z  s   U U3 U& 2637*.,1$)59KO'||' !.' u//0	'
 !' $D>' D>' !!1!12' &eELL%,,,F&GH' -.' 
u  (51B1BEDUDU1U+V"WW	X'r8   r   c                        e Zd Zddedee   f fdZ	 	 	 	 	 	 	 	 	 	 	 ddej                  deej                     deej                     deej                     deej                     d	eej                     d
ee
   dee   dee   deej                     deeej                  ej                  f      deeej                  ej                  f      deej                  eeej                  ej                  f      f   fdZ xZS )MoonshineDecoderLayerr+   r   c                    t         |           |j                  | _        t        ||d|j                  |j
                        | _        t        ||d|j                  |j
                        | _        t        ||j                        | _
        t        j                  |j                  d      | _        t        j                  |j                  d      | _        t        j                  |j                  d      | _        y )NTr   Fr   )r)   r*   r/   r   decoder_num_attention_headsdecoder_num_key_value_headsr   encoder_attnrF   decoder_hidden_actr   r-   r   r   r   final_layernormr   s      r7   r*   zMoonshineDecoderLayer.__init__  s    !--+ & B B & B B
 / & B B & B B
 'vv/H/HI!||F,>,>UK(*V5G5Ge(T%!||F,>,>UKr8   r9   r]   encoder_hidden_statesencoder_attention_maskr   encoder_position_idsr   r   r   r   r   encoder_position_embeddingsr:   c                 H   |}| j                  |      } | j                  d||||||	|
|d|\  }}||z   }d }|2|}| j                  |      }| j                  ||||||	      \  }}||z   }|}| j	                  |      }| j                  |      }||z   }|f}|r|||fz  }|S )Nr   )r9   r   r]   r   r   r   r   )r   r   r   r   r   r   )r4   r9   r]   r   r   r   r   r   r   r   r   r   r  rm   r   r   cross_attn_weightsr   s                     r7   r<   zMoonshineDecoderLayer.forward  s     !,,]; ,:4>> 
,
')%)/) 3
,
 
,
(( !=0 " ,$H 99-HM040A0A+!65-"3# 1B 1-M- %}4M !,,];/ =0 ")+=>>Gr8   r'   )NNNNNNFFNNN)r>   r?   r@   r!   r   r   r*   rA   rB   r   r	   r   r   r   r<   rC   rD   s   @r7   r   r     sj   L L8C= L6 268<9=37;?*.,1$)59KOSW<||< !.<  (5	<
 !) 6< u//0< 'u'7'78< !< $D>< D>< !!1!12< &eELL%,,,F&GH< &.eELL%,,4N.O%P< 
u  (51B1BEDUDU1U+V"WW	X<r8   r   c                   Z    e Zd ZeZdZdZdZddgZdZ	dZ
dZdZd Zdej                  fdZy	)
MoonshinePreTrainedModelmodelinput_valuesTr   r   c                 8   | j                   j                  }t        |t        j                  t        j
                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                  t        j                  f      rW|j                  j                  j                  d       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j                  j                  j                  d|       |j                  2|j                  j                  |j                     j                          y y y )Nr   )meanstdg      ?)r+   initializer_ranger   r-   r.   Conv1dweightdatanormal_r   zero_	GroupNormr   fill_	Embeddingpadding_idx)r4   rY   r
  s      r7   _init_weightsz&MoonshinePreTrainedModel._init_weights  s    kk++fryy"))45MM&&CS&9{{&  &&( 'r|| <=MM$$S){{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .r8   input_lengthsc                 ~    t        |dz
  dz  dz         }t        |dz
  dz  dz         }t        |dz
  dz  dz         }|S )zH
        Computes the output length of the convolutional layers
           @   r       r   rH   )r   )r4   r  output_conv1_lengthoutput_conv2_lengthoutput_conv3_lengths        r7    _get_feat_extract_output_lengthsz9MoonshinePreTrainedModel._get_feat_extract_output_lengths  sZ     "=3#6""<q"@A!#6#:a"?!"CD!#6#:a"?!"CD""r8   N)r>   r?   r@   r!   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modules_supports_flash_attn_2_supports_sdpa_supports_cache_class_supports_static_cacher  rA   r   r  r   r8   r7   r  r    sR    "L$O&*#02IJ!N !?#e>N>N #r8   r  c                        e Zd ZdZdZdef fdZdej                  fdZ	dej                  fdZ
e	 	 	 	 ddeej                     d	eej                     d
ee   dee   dee   defd       Z xZS )MoonshineEncoderz
    Transformer encoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MoonshineEncoderLayer`]

    Args:
        config: MoonshineConfig
    r  r+   c           	      b   t         |   |       || _        |j                  }t	        j
                  d|ddd      | _        t	        j
                  |d|z  dd	      | _        t	        j
                  d|z  |dd	      | _        t	        j                  d|d
      | _
        t        |      | _        t	        j                  t        |j                        D cg c]  }t!        ||       c}      | _        t	        j$                  |d      | _        d| _        | j+                          y c c}w )Nr    r  r  F)kernel_sizestrider   rH   r  r   )r+  r,  gh㈵>)
num_groupsnum_channelsepsr+   r   )r)   r*   r+   r/   r-   r  conv1conv2conv3r  	groupnormr   
rotary_emb
ModuleListrangeencoder_num_hidden_layersr   layersr   
layer_normgradient_checkpointing	post_init)r4   r+   	embed_dimidxr6   s       r7   r*   zMoonshineEncoder.__init__  s     &&	YYq)ReT
YYy!i-QqQ
YYq9}iQqQ
PTU2&Amm;@AaAa;bcC"63/c
 ,,yu=&+# ds   D,r:   c                     | j                   S r'   r1  r4   s    r7   get_input_embeddingsz%MoonshineEncoder.get_input_embeddings1  s    zzr8   r\   c                     || _         y r'   r@  r4   r\   s     r7   set_input_embeddingsz%MoonshineEncoder.set_input_embeddings4  s	    
r8   r]   r   output_hidden_statesflash_attn_kwargsc           	         ||n| j                   j                  }||n| j                   j                  }|t        d      |j	                  d      }t
        j                  j                  | j                  |            }| j                  |      }t
        j                  j                  | j                  |            }t
        j                  j                  | j                  |            }|j                  ddd      }|| j                  |j                  d         }d}|ddd|f   dd|f   }| j                   j                   d	k(  r|d
k(  j#                         r|nd}nH| j                   j                   dk(  r|st%        ||j&                        }nt)        ||j&                        }t+        j,                  d|j                  d   |j.                        j	                  d      }	| j1                  ||	      }
|rdnd}|rdnd}| j2                  D ])  }|r||fz  } ||f||	||
d|}|d   }|s!||d   fz  }+ | j5                  |      }|r||fz  }t7        |||      S )a  
        Args:
            input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
                Float values of the raw speech waveform. Raw speech waveform can be
                obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
                `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
                `input_values`, the [`AutoFeatureExtractor`] should be used for padding
                and conversion into a tensor of type `torch.FloatTensor`.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding indices in `input_values`. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
                tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
                more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NzYou must specify input_values.r    r   rH   rJ     .flash_attention_2r   r   r   r   )r]   r   r   r   last_hidden_stater9   
attentions)r+   r   rF  
ValueErrorr|   r-   rh   tanhr1  r4  gelur2  r3  permuter  rQ   r   anyr   rb   r   rA   aranger   r5  r9  r:  r   )r4   r  r]   r   rF  rG  r9   mask_lendownsample_strider   r   all_hidden_statesall_self_attnsencoder_layerlayer_outputss                  r7   r<   zMoonshineEncoder.forward7  s   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 =>> $--a0**4::l+CD}5**4::m+DE**4::m+DE%--aA6 %<<^=Q=QRT=UVH *+C1D3D1D,DEc9H9nUN{{//3FF4Bc4I3N3N3PVZ 11V;DU!D^UbUhUh!i "<NML_L_!`||A}':':1'=mFZFZ[eefgh #oom\J #7BD0d![[ 	6M#!m%55!)-)"3$7 $M *!,M =#3"55!	6$ 6  -!11&++%
 	
r8   )NNNN)r>   r?   r@   r   r!  r!   r*   r-   ModulerB  rE  r   r   rA   r   rB   r   r   r   r   r<   rC   rD   s   @r7   r)  r)    s     %O (bii "))   5915,0/3c
u001c
 !.c
 $D>	c

 'tnc
 $$89c
 
!c
 c
r8   r)  c                   F    e Zd ZdZdef fdZd Zd Zee		 	 	 	 	 	 	 	 	 	 	 dde
ej                     de
ej                     de
ej                     de
e   d	e
ej                     d
e
e   de
e   de
e   de
ej                     de
ej                     de
ej                     dee   deeef   fd              Z	 ddeej                  df   dej                  dej                  dedef
dZedej                  dededej4                  dej                  defd       Z xZS )MoonshineDecoder	input_idsr+   c           	         t         |   |       |j                  | _        |j                  | _        t        j                  |j                  |j                  | j                        | _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        t        j                  |j                  d      | _        t!        |      | _        d| _        | j'                          y c c}w )NFr   r0  )r)   r*   pad_token_idr  
vocab_sizer-   r  r/   embed_tokensr6  r7  decoder_num_hidden_layersr   r9  r   normr   r5  r;  r<  )r4   r+   r>  r6   s      r7   r*   zMoonshineDecoder.__init__  s     !.. ++LL):):F<N<NPTP`P`amm;@AaAa;bcC"63/c
 LL!3!3%@	2&A&+# 	 ds   Dc                     | j                   S r'   rb  rA  s    r7   rB  z%MoonshineDecoder.get_input_embeddings  s       r8   c                     || _         y r'   rf  rD  s     r7   rE  z%MoonshineDecoder.set_input_embeddings  s
    !r8   r]   r   past_key_valuesinputs_embedsr   r   rF  r   r   r   rG  r:   c                 $   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|du |duz  rt	        d      | j
                  r%| j                  r|rt        j                  d       d}|| j                  |      }|r"| t               }t               }t        ||      }|	F||j                         nd}t        j                  |||j                  d   z   |j                         }	||	j#                  d      }| j%                  |||	||      }|}| j'                  ||      }|rdnd}|rdnd}|r|
dnd}||
j                  d	   }d
}|ddd|f   dd|f   }| j                   j(                  dk(  r|dk(  j+                         r|nd}nd| j                   j(                  dk(  r'|s%t-        ||j.                  |j                  d	         }n$t1        ||j.                  |j                  d	         }| j2                  D ]:  }|r||fz  } ||f|||
|||||	|d	|}|d   }|s&||d   fz  }|
2||d   fz  }< | j5                  |      }|r||fz  }t7        ||r|nd|||      S )a  
        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
            of the decoder.
        encoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding indices in `encoder_hidden_states`. Mask values selected in `[0, 1]`:
            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.
            [What are attention masks?](../glossary#attention-mask)
        Nz:You must specify exactly one of input_ids or inputs_embedszX`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.Fr   r    rK  r   ra   rI  .rJ  r   r   )	r]   r   r   r   r   r   r   r   r   rH   )rM  rh  r9   rN  cross_attentions)r+   r   rF  r   rO  r;  rd   r   r   rb  r
   r   get_seq_lengthrA   rT  rQ   r   r|   _update_causal_maskr5  r   rS  r   rb   r   r9  rd  r   )r4   r^  r]   r   rh  ri  r   r   rF  r   r   r   rG  r   r   past_seen_tokensrq   r9   r   rW  rX  all_cross_attentionsrU  rV  decoder_layerrZ  s                             r7   r<   zMoonshineDecoder.forward  sF   6 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	-t";<YZZ&&4==Yj I  --i8M0#/> $0N!12FH]^O!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6L..M>?L]
 & #oom\J #7BD0d&7<Q<]rdh "-,2226H *%;CATCTAT<T%UVY[d\d[dVd%e"{{//3FFDZ^aDaCfCfCh)?nr& 11V;DU)L*M,?,?ATATUWAX*&
 *D*M,?,?ATATUWAX*& "[[ 	@M#!m%55!)*'=&;)."3#-$7 $M *!,M =#3"55(4(]1-=,??(1	@4 		-0  -!118+/8Od+%1
 	
r8   r"   input_tensorc           	         | j                   j                  dk(  r||dk(  j                         r|S y | j                   j                  dk(  r't        |t        j
                        rt        |      }|S ||j                         nd}||j                  nd}| j                   j                  dk(  r(|s&|s$t        j                  |||| j                        ry |j                  }|j                  d   }	|r|j                         }
n1t        |t        j
                        r|j                  d	   n||	z   dz   }
| j                  ||	|
|||j                  d   
      }| j                   j                  dk(  rQ|O|j                   j"                  dv r7|s5t	        j$                  |      j&                  }t        j(                  ||      }|S )NrJ  r   flex_attentionr   Fr   )ri  past_key_values_lengthis_trainingr    rJ   )sequence_lengthtarget_lengthrb   r   
batch_size)cudaxpunpu)r+   r   rS  r   rA   rB   r#   rl  is_compileabler   _ignore_causal_mask_sdpard   rb   rQ   get_max_cache_shape5_prepare_4d_causal_attention_mask_with_cache_positionr   r   finfomin_unmask_unattended)r4   r]   rq  r   rh  r   rn  using_compilable_cacherb   rv  rw  rq   	min_dtypes                r7   rm  z$MoonshineDecoder._update_causal_mask=  s    ;;++/BB)~/D.I.I.K%%;;++/??.%,,7!<^!L!!
 @O?Z?99;`aCRC^!?!?di ;;++v5>T]n%>>*'7 MM	 ""&,,Q/!+??AM nell; $$R(%7!;  PP+')#))!, Q 
 KK,,6*%%**.DD%
 E*..I0CCKQZ[Kr8   rv  rw  rb   rx  c                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuerb   r   r    )diagonalrK  rJ   r   )rL   rA   r  r  fullr   triurT  rS   rR   clonerQ   rk   masked_fill)r]   rv  rw  rb   r   rx  rm   rq   r  mask_lengthpadding_masks              r7   r  zFMoonshineDecoder._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r8   )NNNNNNNNNNN)F)r>   r?   r@   r!  r!   r*   rB  rE  r   r   r   rA   r   rB   r	   r   r   r   r   r   r   r   r<   rm  staticmethodr   rb   r  rC   rD   s   @r7   r]  r]    s   !O  !"  151537+/59$(,0/359=A9=A
E,,-A
 !.A
 u//0	A

 "%A
   1 12A
 D>A
 $D>A
 'tnA
 !!1!12A
  ((9(9:A
 !) 6A
 $$89A
 
u--	.A
  A
R #(BellK78B llB 	B
 B  BH 444 4 {{	4
 4 4 4r8   r]  rQ   	mask_probr  	min_masksc                    | \  }dk  rt        d      kD  rt        d d d      t        j                  j                  d      j	                         fd}|-|j                         j                  d      j                         nt        |      D cg c]  } c}}t        j                  |ft        	      }	g }
 |      }|d
k(  r|	S |D ]  } ||      }t        j                  j                  t        j                  |dz
  z
        |d      }t        |      d
k(  rdz
  }n|d
   }t        j                  |t        j                  ||z
  t        j                   	      |z  g      }|
j#                  |        t        j$                  |
      }
t        j&                  |
dddddf   ||f      }
|
j)                  ||z        }
t        j                        ddddf   }t        j&                  |||f      j)                  ||z        }|
|z   }
|
j+                         dz
  kD  rdz
  |
|
dz
  kD  <   t        j,                  |	|
dd       |	S c c}w )af  
    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
    ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
    CPU as part of the preprocessing during training.

    Args:
        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
               the first element is the batch size and the second element is the length of the axis to span.
        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
                    independently generated mask spans of length `mask_length` is computed by
                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
                    actual percentage will be smaller.
        mask_length: size of the mask
        min_masks: minimum number of masked spans
        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
                        each batch dimension.
    r    z&`mask_length` has to be bigger than 0.zO`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: z and `sequence_length`: `c                     t        | z  z  z         }t        |      }|z  kD  rz  }| dz
  z
  |k  rt        | dz
  z
  d      }|S )z;Given input length, compute how many spans should be maskedr    r   )r   max)input_lengthnum_masked_spanepsilonr  r  r  rv  s     r7   compute_num_masked_spanz6_compute_mask_indices.<locals>.compute_num_masked_span  so    i,6DwNOoy9 [(?:-<O ;?+o=!,+/"BAFOr8   NrJ   r   r   F)replace)rO  nprandomranditemdetachsumtolistr7  zerosr   choicerT  lenconcatenateonesint32appendarraybroadcast_torS   r  put_along_axis)rQ   r  r  r]   r  rx  r  _r  spec_aug_maskspec_aug_mask_idxsmax_num_masked_spanr  r  spec_aug_mask_idxdummy_mask_idxoffsetsr  rv  s    `` `            @@r7   _compute_mask_indicesr    s   0 #(JQABB_$]^i]j&&7q:
 	
 iinnQ$$&G $ % 	##B'..0',Z'89!o9  HHj/:$GM1/Ba% 51,? II,,IIlkAo67RW - 
  !Q& -q0N.q1NNN(;o(MUWU]U] ^ao op
 	!!"34/52 "45 1a:&5H+(V ,33J@SVa@ab ii$T4]3Goog
4G'UV^^'+5G ,g5 /A"55GVYZGZ-!0CCD m%7B?w :s   $	I+c                       e Zd Zdef fdZd Zd Zd Zd Zd Z		 dde
j                  d	ee
j                     fd
Zee	 	 	 	 	 	 	 	 	 	 	 	 ddee
j                     d	ee
j                     dee
j                     dee
j                     deeee
j                           deeeee
j                     f      deee
j                        deee
j                        dee   dee   dee   dee
j                     defd              Z xZS )MoonshineModelr+   c                     t         |   |       t        |      | _        t	        |      | _        | j                          y r'   )r)   r*   r)  encoderr]  decoderr<  r4   r+   r6   s     r7   r*   zMoonshineModel.__init__2  s2     '/'/r8   c                 .    | j                   j                  S r'   r  rb  rA  s    r7   rB  z#MoonshineModel.get_input_embeddings:  s    ||(((r8   c                 &    || j                   _        y r'   r  rD  s     r7   rE  z#MoonshineModel.set_input_embeddings=  s    $)!r8   c                     | j                   S r'   )r  rA  s    r7   get_encoderzMoonshineModel.get_encoder@      ||r8   c                     | j                   S r'   )r  rA  s    r7   get_decoderzMoonshineModel.get_decoderC  r  r8   c                 8    | j                   j                          y)z
        Calling this function will disable the gradient computation for the Moonshine encoder so that its parameters will
        not be updated during training.
        N)r  _freeze_parametersrA  s    r7   freeze_encoderzMoonshineModel.freeze_encoderF  s    
 	'')r8   input_featuresr]   c                 2   t        | j                  dd      s|S |j                         \  }}}| j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  || j                  j                        }t        j                  ||j                  t        j                        }|dddf   j                  d|d      }d||<   | j                  j                  dkD  r| j                  rt        ||f| j                  j                  | j                  j                  | j                  j                        }t        j                  ||j                  t        j                        }d||<   |S )	z
        Masks extracted features along time axis and/or along feature axis according to
        [SpecAugment](https://arxiv.org/abs/1904.08779).
        apply_spec_augmentTr   )r  r  r]   r  )r   rb   NrJ   )r  r  r  )r   r+   sizemask_time_probrd   r  mask_time_lengthmask_time_min_masksrA   tensorr   r   rR   mask_feature_probmask_feature_lengthmask_feature_min_masks)r4   r  r]   rx  r/   rv  mask_time_indicesmask_feature_indicess           r7   _mask_input_featuresz#MoonshineModel._mask_input_featuresM  s[    t{{$8$?!! 4B3F3F3H0
K;;%%)dmm 5_-++44 KK88-++99! !&->~G\G\didndn o 1!T' : A A"kSU V01N,-;;((1,#8[)++77 KK;;++<<	$  $)<<0D^MbMbjojtjt#u 34N/0r8   r  decoder_input_idsdecoder_attention_maskencoder_outputsrh  decoder_inputs_embedsdecoder_position_idsr   r   rF  r   r:   c                 n   |
|
n| j                   j                  }
||n| j                   j                  }|	|	n| j                   j                  }	|| j	                  |||
|      }nGt        |t              s7t        |d   t        |      dkD  r|d   ndt        |      dkD  r|d   nd      }| j                  ||||j                  ||||	|
||      }t        |j                  |j                  |j                  |j                  |j                  |j                  |j                  |j                        S )	a\  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoFeatureExtractor, MoonshineModel
        >>> from datasets import load_dataset

        >>> model = MoonshineModel.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
        >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values
        >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
        >>> last_hidden_state = model(input_values, decoder_input_ids=decoder_input_ids).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 2, 288]
        ```
        N)r]   r   rF  r   r    rH   rL  )r^  r]   r   r   rh  ri  r   r   r   rF  r   )rM  rh  decoder_hidden_statesdecoder_attentionsrk  encoder_last_hidden_stater   encoder_attentions)r+   r   rF  r   r  r   r   r  r  rM  r   rh  r9   rN  rk  )r4   r  r]   r  r  r  rh  r  r  r   r   rF  r   decoder_outputss                 r7   r<   zMoonshineModel.forwardx  s[   ` 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	"/3||-"3%9	 0< 0O O_=-"1!"4474H14Loa0RV14_1E1I?1-tO FJ\\'1#1"1"C"C+/-/!5) FR F
 "-??+;;"1"?"?.99,==&5&G&G"1"?"?.99	
 		
r8   r'   )NNNNNNNNNNNN)r>   r?   r@   r!   r*   rB  rE  r  r  r  rA   r   r   r   r  r   r   r   r   r   r   r   r<   rC   rD   s   @r7   r  r  0  s    )** 6:)))) !!1!12)V  59598<=AEIZ^DHBF$(,0/359{
u001{
 !!1!12{
 $E$4$45	{

 !))9)9 :{
 "%e.?.?(@"AB{
 "%(;U5CTCT=U(U"VW{
  (e.?.?(@A{
 'uU-=-='>?{
 D>{
 $D>{
 'tn{
 !!1!12{
 
{
  {
r8   r  r^  r`  decoder_start_token_idc                     | j                  | j                        }| ddddf   j                         |ddddf<   ||dddf<   |t        d      |j	                  |dk(  |       |S )z1
    Shift input ids one token to the right.
    NrJ   r    r   z1self.model.config.pad_token_id has to be defined.i)	new_zerosrQ   r  rO  masked_fill_)r^  r`  r  shifted_input_idss       r7   shift_tokens_rightr    s}     "++IOO<(CRC0668ae4adLMM""#4#<lKr8   zj
    The Moonshine Model with a language modeling head. Can be used for automatic speech recognition.
    )custom_introc                       e Zd ZdgZdef fdZd Zd Zd Zd Z	de
j                  fd	Zee	 	 	 	 	 	 	 	 	 	 	 	 	 dd
eej"                     deej$                     deej$                     deej$                     deeeej"                           deeeeej"                     f      deeej"                        deeej$                        dee   dee   dee   deej$                     deej$                     defd              Z xZS )!MoonshineForConditionalGenerationzproj_out.weightr+   c                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NFr   )
r)   r*   r  r  r-   r.   r/   ra  proj_outr<  r  s     r7   r*   z*MoonshineForConditionalGeneration.__init__  sH     #F+
		&"4"4f6G6GeT 	r8   c                 6    | j                   j                         S r'   )r  r  rA  s    r7   r  z-MoonshineForConditionalGeneration.get_encoder      zz%%''r8   c                 6    | j                   j                         S r'   )r  r  rA  s    r7   r  z-MoonshineForConditionalGeneration.get_decoder  r  r8   c                     | j                   S r'   r  rA  s    r7   get_output_embeddingsz7MoonshineForConditionalGeneration.get_output_embeddings  s    }}r8   c                     || _         y r'   r  )r4   new_embeddingss     r7   set_output_embeddingsz7MoonshineForConditionalGeneration.set_output_embeddings!  s	    &r8   r:   c                 6    | j                   j                         S r'   )r  rB  rA  s    r7   rB  z6MoonshineForConditionalGeneration.get_input_embeddings$  s    zz..00r8   r  r]   r  r  r  rh  r  r  r   r   rF  r   labelsc                    |9|7|5t        || j                  j                  | j                  j                        }| j	                  |||||||||	|
||      }| j                  |j                        }d}|(| j                  ||| j                  j                        }t        |||j                  |j                  |j                  |j                  |j                  |j                  |j                   	      S )a  
        input_values (`torch.FloatTensor` of shape `(batch_size, audio_length)`):
            Float values of the raw speech waveform. Raw speech waveform can be
            obtained by loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a
            `numpy.ndarray`, *e.g.* via the soundfile library (`pip install soundfile`). To prepare the array into
            `input_values`, the [`AutoFeatureExtractor`] should be used for padding
            and conversion into a tensor of type `torch.FloatTensor`.
        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
            it.

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            [What are input IDs?](../glossary#input-ids)
        decoder_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

            - 1 for tokens that are **not masked**,
            - 0 for tokens that are **masked**.

            [What are attention masks?](../glossary#attention-mask)

            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
            [`PreTrainedTokenizer.__call__`] for details.

            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
            `past_key_values`).

            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
            and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
            information on the default strategy.

            - 1 indicates the head is **not masked**,
            - 0 indicates the head is **masked**.
        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded representation. This
            is useful if you want more control over how to convert `decoder_input_ids` indices into associated vectors than the
            model's internal embedding lookup matrix.
        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
            config.n_positions - 1]`.

            [What are position IDs?](../glossary#position-ids)
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
            or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
            only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, MoonshineForConditionalGeneration
        >>> from datasets import load_dataset

        >>> processor = AutoProcessor.from_pretrained("UsefulSensors/moonshine-tiny")
        >>> model = MoonshineForConditionalGeneration.from_pretrained("UsefulSensors/moonshine-tiny")

        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")

        >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
        >>> input_values = inputs.input_values

        >>> generated_ids = model.generate(input_values, max_new_tokens=100)

        >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> transcription
        'Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
        ```N)r]   r  r  r  rh  r  r  r   r   rF  r   )logitsr  ra  )	lossr  rh  r  r  rk  r  r   r  )r  r+   r`  r  r  r  rM  loss_functionra  r   rh  r  r  rk  r  r   r  )r4   r  r]   r  r  r  rh  r  r  r   r   rF  r   r  r   r  r  s                    r7   r<   z)MoonshineForConditionalGeneration.forward'  s
   r  (-B-J$6DKK44dkk6X6X%! '+jj)/+#9+"7!5/!5) '1 '
 w889%%VFt{{OeOe%fD#33")"?"?&99$55&-&G&G")"?"?&99

 
	
r8   )NNNNNNNNNNNNN)r>   r?   r@   _tied_weights_keysr!   r*   r  r  r  r  r-   r[  rB  r   r   r   rA   r   r   r   r   r   r   r   r<   rC   rD   s   @r7   r  r    s    ,, (('1bii 1  59598<=AEIZ^DHBF$(,0/359-1{
u001{
 !!1!12{
 $E$4$45	{

 !))9)9 :{
 "%e.?.?(@"AB{
 "%(;U5CTCT=U(U"VW{
  (e.?.?(@A{
 'uU-=-='>?{
 D>{
 $D>{
 'tn{
 !!1!12{
 ))*{
 
{
  {
r8   r  )r  r  r  )r   )Nr    )Nr   )Ntypingr   r   r   r   numpyr  rA   torch.nnr-   activationsr   cache_utilsr	   r
   r   
generationr   modeling_attn_mask_utilsr   r   r   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   r   modeling_rope_utilsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   configuration_moonshiner!   !torch.nn.attention.flex_attentionr"   integrations.flex_attentionr#   
get_loggerr>   r   r[  r%   rF   rB   r   rX   r   rs   rz   r   r   r   r   r   r  r)  r]  r   ndarrayr  r  r  r  __all__r   r8   r7   <module>r     s  * 4 3    ! C C ) 
 C 9  L F & \ \ 4  !;J 
		H	%")) "))  	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %46'TC) C)L<ryy <D86 8vU6 Up "# "# "#JH
/ H
V W/ W W| 26tc?tt t U--.	t
 t ZZtn D
- D
 D
N%,, c [^   
W
(@/ W

W
t ^r8   