
    Uh                        d dl Z d dlmZ d dlmZmZmZmZmZ d dl	Z	d dl	m
Z
 ddlmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZm Z  ddl!m"Z" ddl#m$Z$m%Z%m&Z&m'Z'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.  e&       r	d dl/m
c m0Z1  e'jd                  e3      Z4e$ G d de              Z5e G d de             Z6e G d de             Z7e G d de             Z8 G d de
jr                        Z:de	jv                  de<d e	jv                  fd!Z=	 dTd"e
jr                  d#e	jv                  d$e	jv                  d%e	jv                  d&ee	jv                     d'e>d(e>fd)Z? G d* d+e
jr                        Z@ G d, d-e
jr                        ZA G d. d/e      ZB G d0 d1e
jr                        ZCe$ G d2 d3e5             ZD G d4 d5e
jr                        ZE G d6 d7e
jr                        ZF G d8 d9e
jr                        ZG G d: d;e
jr                        ZH G d< d=e
jr                        ZI G d> d?e
jr                        ZJ G d@ dAe
jr                        ZK G dB dCe
jr                        ZL G dD dEe
jr                        ZM e$dFG       G dH dIe5             ZN G dJ dKe
jr                        ZO G dL dMe
jr                        ZP e$dNG       G dO dPe5             ZQ G dQ dRe5e      ZRg dSZSy)U    N)	dataclass)CallableListOptionalTupleUnion)nn   )ACT2FN)Cache)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)auto_docstringcan_return_tupleis_torch_availablelogging	torch_int   )	AutoModel   )JanusConfigJanusVisionConfigJanusVQVAEConfigc                   D    e Zd ZeZdZdZdgZddgZdZ	dZ
dZdZdZdZd Zy)	JanusPreTrainedModelmodelTLlamaDecoderLayerpast_key_valuescausal_maskFc                    t        | j                  d      r | j                  j                  j                  n| j                  j                  }t	        |t
        j                  t
        j                  f      rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                  t
        j                  f      rJ|j                  j                  j                          |j                  j                  j                  d       y t	        |t
        j                         rf|j                  j                  j                  d|       |j"                  2|j                  j                  |j"                     j                          y y y )Nvision_config        )meanstdg      ?)hasattrconfigr,   initializer_range
isinstancer	   LinearConv2dweightdatanormal_biaszero_	GroupNorm	LayerNormfill_	Embeddingpadding_idx)selfmoduler/   s      z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/janus/modeling_janus.py_init_weightsz"JanusPreTrainedModel._init_weightsF   s;    t{{O4 KK%%77.. 	
 fryy"))45MM&&CS&9{{&  &&( 'r|| <=KK""$MM$$S)-MM&&CS&9!!-""6#5#56<<> . .    N)__name__
__module____qualname__r"   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_cache_class_supports_static_cache!_supports_param_buffer_assignmentrC    rD   rB   r&   r&   8   sO    L&*#,-#4m"D!N $ !(-%?rD   r&   c                   \    e Zd ZU dZdZeej                     ed<   dZ	ej                  ed<   y)JanusVQVAEOutputaM  
    Base class for Janus VQ-VAE mode model outputs.
    Args:
        decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            Reconstructed pixel values after encoding and decoding the input.
        embedding_loss (`torch.FloatTensor`):
            Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)
rE   rF   rG   __doc__rV   r   torchFloatTensor__annotations__rW   rS   rD   rB   rU   rU   Y   s/     9=(5#4#45<(,NE%%,rD   rU   c                       e Zd ZU dZdZeej                     ed<   dZ	ee
e
ej                           ed<   dZee
ej                        ed<   dZee
ej                        ed<   dZee
ej                        ed<   y)JanusBaseModelOutputWithPasta	  
    Base class for Janus model's outputs that may also contain a past key/values (to speed up sequential decoding).

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.

            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and optionally if
            `config.is_encoder_decoder=True` 2 additional tensors of shape `(batch_size, num_heads,
            encoder_sequence_length, embed_size_per_head)`.

            Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
            `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
            input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlast_hidden_stater)   hidden_states
attentionsimage_hidden_states)rE   rF   rG   rX   r^   r   rY   rZ   r[   r)   r   r_   r`   ra   rS   rD   rB   r]   r]   h   s    "H 6:x 1 129AEOXeE%*;*;$<=>E8<M8E%"3"345<59Ju00129>B%(9(9":;BrD   r]   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   y)	JanusCausalLMOutputWithPasta  
    Base class for Janus causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.

            image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
    Nlosslogitsr)   r_   r`   ra   )rE   rF   rG   rX   rd   r   rY   rZ   r[   re   r)   r   r_   r   r`   ra   rS   rD   rB   rc   rc      s    @ )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju00129>B%(9(9":;BrD   rc   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e	dej                  fd
Z
 xZS )JanusVisionEmbeddingsr1   c                 f   t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  |j                  | j                  | j                  | j                  d      | _
        | j
                  | j                  z  dz  | _        | j                  | _        t        j                  | j                  | j                        | _        | j                  dt!        j"                  | j                        j%                  d      d       y )Nvalid)in_channelsout_channelskernel_sizestridepaddingr   position_ids)r!   F)
persistent)super__init__r1   hidden_size	embed_dim
image_size
patch_sizer	   r5   num_channelspatch_embeddingnum_patchesnum_positionsr>   position_embeddingregister_bufferrY   arangeexpandr@   r1   	__class__s     rB   rs   zJanusVisionEmbeddings.__init__   s    ++ ++ ++!yy++?? 
 !OOt>1D!--"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]joprD   
embeddingsheightwidthreturnc                    |j                   d   }| j                  j                  j                   d   }t        j                  j                         s%||k(  r ||k(  r| j                  | j                        S | j                  j                  j                  d      }|j                   d   }|| j                  z  }|| j                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r!   r   rp   g      ?r
   r   bicubicF)sizemodealign_corners)shaper|   r6   rY   jit
is_tracingro   	unsqueezerw   r   reshapepermuter	   
functionalinterpolateview)r@   r   r   r   rz   r{   patch_pos_embeddim
new_height	new_widthsqrt_num_positionss              rB   interpolate_pos_encodingz.JanusVisionEmbeddings.interpolate_pos_encoding   sE    !&&q)//66<<Q? yy##%+*F6UZ?**4+<+<==1188BB1Er"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#NrD   pixel_valuesr   c                 X   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r| j                  |||      }	n| j                  | j                        }	||	z   }|S )N)dtyper   r!   )
r   ry   r6   r   toflatten	transposer   r|   ro   )
r@   r   r   _r   r   target_dtypepatch_embedsr   
pos_embedss
             rB   forwardzJanusVisionEmbeddings.forward   s    *001fe++2288++LOO,O,OP!))!,66q!<
#66z65QJ001B1BCJ*,
rD   F)rE   rF   rG   r#   rs   rY   Tensorintr   boolr   __classcell__r   s   @rB   rg   rg      se    q0 q($5<< $ $UX $]b]i]i $LELL D ]b]i]i rD   rg   r_   n_repr   c                     | j                   \  }}}}|dk(  r| S | dddddddddf   j                  |||||      } | j                  |||z  ||      S )z
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    r!   N)r   r   r   )r_   r   batchnum_key_value_headsslenhead_dims         rB   	repeat_kvr   
  so    
 2?1D1D.Ehz!!Qa"23::5BUW\^bdlmM  (;e(CT8TTrD   rA   querykeyvalueattention_maskscalingdropoutc                 T   t        || j                        }t        || j                        }	t        j                  ||j	                  dd            |z  }
|#|d d d d d d d |j
                  d   f   }|
|z   }
t        j                  j                  |
dt        j                        j                  |j                        }
t        j                  j                  |
|| j                        }
t        j                  |
|	      }|j	                  dd      j                         }||
fS )Nr   r
   rp   )r   r   )ptrainingr!   )r   num_key_value_groupsrY   matmulr   r   r	   r   softmaxfloat32r   r   r   r   
contiguous)rA   r   r   r   r   r   r   kwargs
key_statesvalue_statesattn_weightsr*   attn_outputs                rB   eager_attention_forwardr     s    3 ; ;<JUF$?$?@L<<z';';Aq'ABWLL!$Q1.D
0@0@0D.D%DE#k1==((2U]](SVVW\WbWbcL==((6??([L,,|\:K''1-88:K$$rD   c            
            e Zd ZdZdef fdZ	 	 d	dej                  deej                     deej                     de	e
   fdZ xZS )
JanusVisionAttentionz(Attention Class for Janus Vision Encoderr1   c                 F   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        d| _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                        | _        |dkD  rt        j,                  |      nt        j.                         | _        |rt        j0                  | j                        nt        j.                         | _        |r%t        j0                  | j                        | _        y t        j.                         | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      Fr!   r9   r   )rr   rs   r1   rt   ru   num_attention_heads	num_headsr   
ValueErrorscaleattention_dropoutprojection_dropoutuse_qk_norm	is_causalr   r	   r4   attention_biasq_projk_projv_projprojection_layerDropoutIdentityr<   q_normk_norm)r@   r1   proj_dropoutqk_normr   s       rB   rs   zJanusVisionAttention.__init__3  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=rD   r_   r   output_attentionsr   c                    |j                         \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  d| j
                  | j                        }| j                  |      }|	j	                  d| j
                  | j                        }	| j                  |	      }	|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j                  ||| j
                  | j                        j                  dd      }
t        }| j                  j                  dk7  r^| j                  j                  dk(  r(|j                  dd      rt        j!                  d       nt"        | j                  j                     } || ||	|
|f| j$                  sd	n| j&                  | j(                  | j*                  d
|\  }}|j	                  ||| j,                        }| j/                  |      }| j1                  |      }|r||f}|S |d f}|S )Nrp   r!   r   eagersdpar   Fz`torch.nn.functional.scaled_dot_product_attention` does not support `output_attentions=True`. Falling back to eager attention. This warning can be removed using the argument `attn_implementation="eager"` when loading the model.r-   )r   r   r   )r   r   r   r   r   r   r   r   r   r   r   r   r1   _attn_implementationgetloggerwarning_oncer   r   r   r   r   ru   r   r   )r@   r_   r   r   r   
batch_sizeseq_lenr   query_statesr   r   attention_interfacer   r   outputoutputss                   rB   r   zJanusVisionAttention.forwardP  s0    "/!3!3!5
GQ{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?;;++w6{{//69fjjI\^c>d##L
 '>dkk>^>^&_#$7
%
  $}}C$2H2HJJnn
%
 
%
!\ "))*gt~~N&&{3((0,=6<( EKD>rD   )NN)rE   rF   rG   rX   r#   rs   rY   r   r   r   r   r   r   r   s   @rB   r   r   0  se    2Q0 Q@ 2648	2||2 !.2 $ELL1	2
 -.2rD   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )JanusVisionMLPr1   c                    t         |           || _        t        |j                  |j
                  z        | _        t        |j                     | _	        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y N)rr   rs   r1   r   rt   	mlp_ratiointermediate_sizer   
hidden_actactivation_fnr	   r4   fc1fc2r   hidden_dropout_ratedropout1dropout2r   s     rB   rs   zJanusVisionMLP.__init__  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>rD   r_   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   r@   r_   s     rB   r   zJanusVisionMLP.forward  sP    /**=9m4/m4rD   )	rE   rF   rG   r#   rs   rY   r   r   r   r   s   @rB   r   r     s+    ?0 ?U\\ ell rD   r   c            
            e Zd Zdef fdZ	 ddej                  dej                  dee   de	ej                     fdZ xZS )	JanusVisionEncoderLayerr1   c                 R   t         |           |j                  | _        t	        j
                  | j                  |j                        | _        t        |      | _	        t	        j
                  | j                  |j                        | _
        t        |      | _        || _        y N)eps)rr   rs   rt   ru   r	   r<   layer_norm_epslayer_norm1r   	self_attnlayer_norm2r   mlpr1   r   s     rB   rs   z JanusVisionEncoderLayer.__init__  st    ++<<F<Q<QR-f5<<F<Q<QR!&)rD   r_   r   r   r   c                     |}| j                  |      }| j                  |||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )a=  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*, defaults to `False`):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r_   r   r   )r  r  r  r  )r@   r_   r   r   residualr   r   s          rB   r   zJanusVisionEncoderLayer.forward  s      !((7&*nn')/ '5 '
#|
 !=0 ((7/ =0 "&GrD   r   )rE   rF   rG   r#   rs   rY   r   r   r   r   rZ   r   r   r   s   @rB   r   r     sW    0  -2	$||$ $ $D>	$
 
u  	!$rD   r   c            
       x     e Zd ZdZdef fdZe	 	 	 d	deej                     dee
   dee
   defd       Z xZS )
JanusVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`JanusVisionEncoderLayer`].

    Args:
        config: JanusVisionConfig
    r1   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
rr   rs   r1   r	   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointingr@   r1   r   r   s      rB   rs   zJanusVisionEncoder.__init__  sP    mmeTZTlTlNm$n%<V%D$no&+# %os   A#r   r   output_hidden_statesr   c                    ||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}| j                  D ]&  }|r||fz   } ||||      }	|	d   }|s||	d   fz   }( |r||fz   }t	        |||      S )ad  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrS   )r   r   r!   )r^   r_   r`   )r1   r   r  r  r   )
r@   inputs_embedsr   r   r  encoder_statesall_attentionsr_   encoder_layerlayer_outputss
             rB   r   zJanusVisionEncoder.forward  s    < 2C1N-TXT_T_TqTq$8$D $++JjJj 	  40d%![[ 	FM#!/=2B!B)"3M *!,M !/=3C2E!E	F  +}.>>N+(%
 	
rD   NNN)rE   rF   rG   rX   r#   rs   r   r   rY   r   r   r   r   r   r   s   @rB   r	  r	    sm    ,0 ,  26,0/3<
 !.<
 $D>	<

 'tn<
 
<
 <
rD   r	  c                        e Zd ZdZeZdef fdZe	 	 	 	 	 ddee	j                     dee   dee   dee   dedeeef   fd	       Zd
 Z xZS )JanusVisionModelr   r1   c                     t         |   |       || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        | j                          y r   )rr   rs   r1   rt   rg   r   r	  encoderr	   r<   r  post_layernorm	post_init)r@   r1   ru   r   s      rB   rs   zJanusVisionModel.__init__  s]     &&	/7)&1 ll9&:O:OPrD   r   r  return_dictr   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  ||      }| j                  ||||      }|d   }| j                  |      }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )Nz You have to specify pixel_values)r   )r  r   r  r   r   r!   )r^   pooler_outputr_   r`   )r1   r   r  use_return_dictr   r   r  r  r   r_   r`   )
r@   r   r   r  r   r   r_   encoder_outputsr^   pooled_outputs
             rB   r   zJanusVisionModel.forward*  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]?@@Ogh,,'/!5#	 ' 
 ,A. //0AB)!Q'2++M:%}58KKK)/')77&11	
 	
rD   c                     | j                   S r   )r   r@   s    rB   get_input_embeddingsz%JanusVisionModel.get_input_embeddingsU  s    rD   )NNNNF)rE   rF   rG   main_input_namer#   rH   rs   r   r   rY   rZ   r   r   r   r   r   r(  r   r   s   @rB   r  r    s    $O$L	0 	  59,0/3&*).(
u001(
 $D>(
 'tn	(

 d^(
 #'(
 
u00	1(
 (
TrD   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr1   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w Nr!   )rr   rs   r	   r4   rt   projection_dimr   r  r  depthhidden_layersr   r   r   r  s      rB   rs   zJanusVisionAlignerMLP.__init__Z  s    99V//1F1FG]]NSTUW]WcWcNdeRYYv,,f.C.CDe
 $F$5$56 f   &1B<c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r   r0  r   r@   r_   layers      rB   r   zJanusVisionAlignerMLP.forwardc  G    /'' 	1E ..}=M!-0M	1 rD   )rE   rF   rG   r#   rs   r   r   r   s   @rB   r+  r+  Y  s    70 7rD   r+  c                        e Zd ZdZdef fdZdej                  fdZdej                  dej                  fdZ xZS )	JanusVQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r1   c                    t         |           |j                  | _        |j                  | _        t        |dd      | _        t        j                  | j                  | j                        | _	        |j                  gdz  | _        y )Nbetag      ?r   )rr   rs   num_embeddingsru   embedding_dimgetattrr:  r	   r>   	embeddingrz   quant_state_dimsr   s     rB   rs   z"JanusVQVAEVectorQuantizer.__init__v  sn    $33#--FFD1	d&9&94;M;MN!'!3!3 4q 8rD   hidden_statec           
      L   |j                  dddd      j                         }|j                  d| j                        }t	        j
                  |dz  dd      t	        j
                  | j                  j                  dz  d      z   dt	        j                  d	|| j                  j                  j                  dd            z  z
  }t	        j                  |d      }| j                  |      j                  |j                        }t	        j                  |j                         |z
  dz        | j                  t	        j                  ||j                         z
  dz        z  z   }|||z
  j                         z   }|j                  dddd      j                         }|||fS )
Nr   r   r
   r!   rp   T)r   keepdimr   z	bd,dn->bn)r   r   r   r<  rY   sumr>  r6   einsumr   argminr   r.   detachr:  )r@   r@  hidden_state_flattened	distancesmin_encoding_indiceshidden_state_quantrd   s          rB   r   z!JanusVQVAEVectorQuantizer.forward  s   #++Aq!Q7BBD!-!2!22t7I7I!J II,a/QEii--q0a89%,,{,BDNNDYDYDcDcdeghDijjk 	  %||I1=!^^,@AFF|GYGYZ zz-446E!KLtyy[`[e[e,"5"5"77A=\
 P
 

 *-?,-N,V,V,XX 0771aCNNP!4)===rD   image_tokensr   c                 B   |j                   d   }| j                  j                  j                   d   }| j                  |      }t        j                  |dd      }|j                  |g| j                  |      }|j                  dddd      j                         }|S )Nr   rp   r   )r   r   r
   r!   )	r   r>  r6   F	normalizer   r?  r   r   )r@   rL  r   emb_dimrK  s        rB   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!rD   )rE   rF   rG   rX   r$   rs   rY   r   r   
LongTensorrZ   rQ  r   r   s   @rB   r8  r8  k  sD    9/ 9>ELL >6"u/?/? "EDUDU "rD   r8  c                   *     e Zd Z	 	 d fd	Zd Z xZS )JanusVQVAEResnetBlockc                    t         |           || _        ||n|| _        || _        t
        j                  j                  d|dd      | _        t
        j                  j                  ||ddd      | _
        t
        j                  j                  d|dd      | _        t
        j                  j                  |j                        | _        t
        j                  j                  ||ddd      | _        | j                  | j                  k7  r`| j                  r*t
        j                  j                  ||ddd      | _        y t
        j                  j                  ||ddd      | _        y y )	N    ư>T
num_groupsrx   r   affiner
   r!   rl   rm   rn   r   )rr   rs   rj   rk   use_conv_shortcutrY   r	   r;   norm1r5   conv1norm2r   r   conv2conv_shortcutnin_shortcut)r@   r1   rj   rk   ra  r   s        rB   rs   zJanusVQVAEResnetBlock.__init__  s1    	&+7+?K\!.XX''2KUYbf'g
XX__[,AVWab_c
XX''2LVZcg'h
xx''7XX__\<QWXbc_d
t000%%%*XX__[,\]fgqr_%s"$)HHOOK[\efpqO$r!	 1rD   c                    |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  |      }| j                  | j                  k7  r3| j                  r| j                  |      }||z   S | j                  |      }||z   S r   )r]  rY   sigmoidr^  r_  r   r`  rj   rk   r\  ra  rb  )r@   r_   r  s      rB   r   zJanusVQVAEResnetBlock.forward  s     

=1}55

=1

=1}55]3

=1t000%%--h7 -''  ,,X6-''rD   r  rE   rF   rG   rs   r   r   r   s   @rB   rT  rT    s    
 s.(rD   rT  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEAttnBlockc                    t         |           || _        t        j                  j                  d|dd      | _        t        j                  j                  ||ddd      | _        t        j                  j                  ||ddd      | _	        t        j                  j                  ||ddd      | _
        t        j                  j                  ||ddd      | _        y )NrV  rW  TrX  r!   r   r[  )rr   rs   rj   rY   r	   r;   normr5   qkvproj_outr@   rj   r   s     rB   rs   zJanusVQVAEAttnBlock.__init__  s    &HH&&";TXae&f	kqQR\]^kqQR\]^kqQR\]^[aXYcderD   c                 t   |}| j                  |      }| j                  |      }| j                  |      }| j                  |      }|j                  \  }}}}	|j                  ||||	z        j                  ddd      }|j                  ||||	z        }t        j                  ||      }
|
t        |      dz  z  }
t        j                  |
d      }
|j                  ||||	z        }|
j                  ddd      }
t        j                  ||
      j                  ||||	      }| j                  |      }||z   S )Nr   r   r!   r   rC  )ri  rj  rk  rl  r   r   r   rY   bmmr   rN  r   rm  )r@   r_   r  r   r   r   r   channelsr   r   r   r   s               rB   r   zJanusVQVAEAttnBlock.forward  s5    		-0vvm,VVM*
vvm, /;.@.@+
Hfe#++J&5.QYYZ[]^`ab''
HfunM
yyz:#s8}'>?yy15 $++J&5.Q#++Aq!4iil;CCJPXZ`bghmmK0+%%rD   re  r   s   @rB   rg  rg    s    f&rD   rg  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvDownsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr
   r   r   r[  )rr   rs   r	   r5   convrn  s     rB   rs   z!JanusVQVAEConvDownsample.__init__  s'    IIk;AaYZ[	rD   c                 Z    t        j                  |ddd      }| j                  |      }|S )N)r   r!   r   r!   constantr   )padr   r   )rN  rx  ru  r   s     rB   r   z JanusVQVAEConvDownsample.forward  s+    mJVWX		-0rD   re  r   s   @rB   rs  rs    s    \rD   rs  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                 t    t         |           t        j                  j	                  ||ddd      | _        y )Nr
   r!   r[  )rr   rs   rY   r	   r5   ru  rn  s     rB   rs   zJanusVQVAEConvUpsample.__init__  s.    HHOOK!TU_`Oa	rD   c                 X    t        j                  |dd      }| j                  |      }|S )Ng       @nearest)scale_factorr   )rN  r   ru  r   s     rB   r   zJanusVQVAEConvUpsample.forward
  s(    m#IV		-0rD   re  r   s   @rB   rz  rz    s    brD   rz  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEMidBlockr1   rq  c                     t         |           t        |||      | _        t	        |      | _        t        |||      | _        y )Nr1   rj   rk   )rr   rs   rT  block_1rg  attn_1block_2)r@   r1   rq  r   s      rB   rs   zJanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
rD   r_   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r  r  r   s     rB   r   zJanusVQVAEMidBlock.forward  s2    ]3M2]3rD   )
rE   rF   rG   r$   r   rs   rY   r   r   r   r   s   @rB   r  r    s2    
/ 
3 
U\\ ell rD   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )JanusVQVAEEncoderc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }|j                  }|j                  }t        j                  j                  ||ddd      | _        dt        |      z   }|| _        t        j                          | _        t%        | j                        D ]   }t        j                          }	t        j                          }
|||   z  }|||   z  }t%        | j
                        D ]N  }|	j'                  t)        |||             |}|| j                  dz
  k(  s5|
j'                  t+        |             P t        j,                         }|	|_        |
|_        || j                  dz
  k7  rt3        |      |_        | j"                  j'                  |        t7        |      | _        t        j                  j;                  d|dd	      | _        t        j                  j                  ||rd
|z  n|ddd      | _        y )Nr
   r!   r[  )r!   r  rV  rW  TrX  r   ) rr   rs   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrj   double_latentlatent_channelsrY   r	   r5   conv_intuplein_channel_multiplierr  downr  appendrT  rg  Moduleblockattnrs  
downsampler  midr;   norm_outconv_out)r@   r1   r  rj   r  r  r  r  i_levelr  r  block_in	block_outi_blockr  r   s                  rB   rs   zJanusVQVAEEncoder.__init__'  s   "6#<#<=$33,,((,, 00#66xx{MqYZdef $u-?'@ @%:"MMO	T112 	#GMMOE==?D$'<W'EEH%(:7(CCI !4!45 
?)%$,%. %d22Q66KK 3H =>
? 99;DDJDI$..22":8"DIIT"-	#0 &fh7**bxUYbf*g#0Ao ( 
rD   r   c                    | j                  |      g}t        | j                        D ]  }t        | j                        D ]  } | j                  |   j
                  |   |d         }t        | j                  |   j                        dkD  r" | j                  |   j                  |   |      }|j                  |        || j                  dz
  k7  s|j                  | j                  |   j                  |d                 |d   }| j                  |      }| j                  |      }|t        j                  |      z  }| j                  |      }|S )Nrp   r   r!   )r  r  r  r  r  r  r  r  r  r  r  r  rY   rd  r  )r@   r   r_   r  r  r@  r^   s          rB   r   zJanusVQVAEEncoder.forwardZ  sT   l34T112 		WG !4!45 3@tyy177@!"%  tyy)../!3#C499W#5#:#:7#CL#QL$$\23 $..22$$TYYw%7%B%B=QSCT%UV		W *"- HH%67 !MM*;<U]]+<== MM*;<  rD   )rE   rF   rG   rs   rY   rR  r   r   r   s   @rB   r  r  &  s    1
f!E$4$4 !rD   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusVQVAEDecoderc           	      v   t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }||j                  | j                  dz
     z  }t        j                  j                  ||ddd      | _        t        ||      | _        t        j                         | _        t#        t%        | j                              D ]  }t        j                         }t        j                         }||j                  |   z  }	t%        | j
                  dz         D ]N  }
|j'                  t)        |||	             |	}|| j                  dz
  k(  s5|j'                  t+        |             P t        j,                         }||_        ||_        |dk7  rt3        |      |_        | j                   j'                  |        t        j                  j7                  d|dd	      | _        t        j                  j                  ||ddd      | _        y )
Nr!   r
   r[  r  r   rV  rW  TrX  )rr   rs   r  r  r  r  r  r  rk   rY   r	   r5   r  r  r  r  upreversedr  r  rT  rg  r  r  r  rz  upsampler;   r  r  )r@   r1   r  r  rk   r  r  r  r  r  r  r  r   s               rB   rs   zJanusVQVAEDecoder.__init__t  s   "6#<#<=$33,, 00** !6#<#<T=Q=QTU=U#VV xxaXYcde &fh7 --/d&:&: ;< 	GMMOE==?D%(A(A'(JJI !4!4q!89 
?)%$,%. %d22Q66KK 3H =>
? BBHBG!|4X>GGNN2)	. **bxUYbf*g,AVWabcrD   r@  r   c                 b   | j                  |      }| j                  |      }t        | j                        D ]  }t        | j                  dz         D ]l  } | j
                  |   j                  |   |      }t        | j
                  |   j                        dkD  sK | j
                  |   j                  |   |      }n || j                  dz
  k7  s| j
                  |   j                  |      } | j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr!   r   )r  r  r  r  r  r  r  r  r  r  r  rY   rd  r  )r@   r@  r  r  s       rB   r   zJanusVQVAEDecoder.forward  s    ||L1 xx- T112 	GG !4!4q!89 P>twww/55g>|Ltwww',,-1#A4777#3#8#8#A,#OLP $..22#www/88F	G }}\2l33}}\2rD   )rE   rF   rG   rs   rY   rZ   r   r   r   s   @rB   r  r  s  s)    ,d\E$5$5 %:K:K rD   r  aG  
    The VQ-VAE model used in Janus for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv Taigman](https://arxiv.org/abs/2203.13131).
    )custom_introc                        e Zd ZeZg dZdZdef fdZdej                  fdZ
dej                  dej                  fdZeedej                  deej                  ej                  f   fd	              Z xZS )

JanusVQVAE)rg  rT  r8  r   r1   c                    t         |   |       t        |      | _        t	        |      | _        t        j                  j                  |j                  |j                  d      | _        t        j                  j                  |j                  |j                  d      | _        | j                          t        |      | _        d| _        | j#                          y )Nr!   F)rr   rs   r  r  r8  quantizerY   r	   r5   r  ru   
quant_convpost_quant_convevalr  decoderr  r  r   s     rB   rs   zJanusVQVAE.__init__  s     (01&9((//&*@*@&BRBRTUV$xxv/?/?AWAWYZ[		(0&+# 	rD   c                 z    | j                  |      }| j                  |      }| j                  |      \  }}}|||fS r   )r  r  r  )r@   r   r_   quantemb_lossindicess         rB   encodezJanusVQVAE.encode  s@    \26#'==#? xh''rD   rL  r   c                    |j                   d   | j                  j                  d   | j                  j                  d   z  k7  rMt        d| j                  j                  d   | j                  j                  d   z   d|j                    d      | j                  j	                  |      }| j                  |      }| j                  |      }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        r!   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   r  r?  r   rQ  r  r  )r@   rL  codebook_entryr_   r   s        rB   decodezJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2rD   c                     |j                   d   }| j                  |      \  }}}| j                  |j                  |d            }t	        ||      }|S )Nr   rp   )r   r  r  r   rU   )r@   r   r   r  rW   r  rV   r   s           rB   r   zJanusVQVAE.forward  sU     "''*
)-\)B&~w#{{7<<
B+GH!"6GrD   )rE   rF   rG   r$   rH   rK   r)  rs   rY   rR  r  rZ   r  r   r   r   r   r   r   s   @rB   r  r    s     $L
 %O/ (5#3#3 (5#3#3 8I8I & 	''	 
u  %"3"33	4	  	rD   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr1   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w r-  )rr   rs   r	   r4   ru   r.  r   r  r  r  r0  r   r   r   r  s      rB   rs   zJanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqRYYv,,f.C.CDq
 $F$5$56 rr1  c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r3  r4  s      rB   r   zJanusVQVAEAlignerMLP.forward  r6  rD   )rE   rF   rG   r$   rs   r   r   r   s   @rB   r  r    s    7/ 7rD   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r1   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j
                  |j                        | _        y r   )rr   rs   r	   r4   image_token_embed_dimr.  rm  r   r   r   r;  vision_headr   s     rB   rs   zJanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRrD   r_   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rm  r   r  r   s     rB   r   zJanusVQVAEHead.forward  s6    m4**=9((7rD   )rE   rF   rG   rX   r$   rs   rY   r   tensorr   r   r   s   @rB   r  r    s0    YS/ SU\\ ell rD   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                   \    e Zd Zdef fdZd Zd Zd Zee		 	 	 	 	 	 	 	 	 	 	 dde
j                  de
j                  dee
j                     d	ee
j                     d
ee   dee
j                     dee
j                     dee   dee   dee   deee
j                  f   fd              Z xZS )
JanusModelr1   c                    t         |   |       || _        t        j	                  |j
                        | _        t        | j                  j                        | _        t        j	                  |j                        | _        t        j                  | j                  j                  j                  | j                  j                  j                        | _        t#        | j                  j                        | _        t'        | j                  j                        | _        t+        j,                  |j.                        | _        d| _        | j5                          y )N)r1   F)rr   rs   r1   r  _from_configr,   vision_modelr+  alignerr  	vq_configvqmodelr	   r>   r;  ru   generation_embeddingsr  generation_alignerr  generation_headr    from_configtext_configlanguage_modelr  r  r   s     rB   rs   zJanusModel.__init__$  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#rD   c                 6    | j                   j                         S r   )r  r(  r'  s    rB   r(  zJanusModel.get_input_embeddings9  s    ""7799rD   c                 :    | j                   j                  |       y r   )r  set_input_embeddingsr@   r   s     rB   r  zJanusModel.set_input_embeddings<  s    007rD   c                 ^    | j                  |      }| j                  |j                        }|S r   )r  r  r^   )r@   r   image_embedss      rB   get_image_featureszJanusModel.get_image_features?  s,    ((6||L$B$BCrD   	input_idsr   r   ro   r)   cache_positionr  	use_cacher   r  logits_to_keepc                 D   |	|	n| j                   j                  }	|
|
n| j                   j                  }
|d u |d uz  rt        d      | j                  r%| j
                  r|rt        j                  d       d}||t        d      | | j                         |      }|| j                  |      }|| j                   j                  k(  }|j                  d   }|j                  d|      }|j                  d      j                  dd|      }|j                  |j                   |j"                        }|j%                  ||      } | j&                  d||||||	|
||d	|}t)        |j*                  |j,                  |j.                  |j0                  |nd       }|S )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either onezZ`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...FzdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either onerp   )	r  r   ro   r)   r  r   r  r  r  )r^   r)   r_   r`   ra   rS   )r1   r   r  r   r  r   r   r   r(  r  image_token_idr   r   r   r   r   devicer   masked_scatterr  r]   r^   r)   r_   r`   )r@   r  r   r   ro   r)   r  r  r  r   r  r  r   r  image_attention_maskru   image_features	lm_outputr   s                      rB   r   zJanusModel.forwardD  s   " 2C1N-TXT_T_TqTq$8$D $++JjJj 	 -t";<s  &&4==##p "	#(Av   7D557	BM#22<@L#,0J0J#J %++B/I)11"i@N#7#A#A"#E#L#LRQSU^#_ +..}/C/C]EXEXYN)889M~^M'D'' 
')%+/!5))
 
	 .'99%55#11 ++0<0Hd
 rD   )NNNNNNNNNNr   )rE   rF   rG   r"   rs   r(  r  r  r   r   rY   rR  rZ   r   r   r   r   r   r   r   r   r   s   @rB   r  r    s*   { *:8
  '+*.1537+/5959$(,0/334H##H ''H !.	H
 u//0H "%H !!1!12H   1 12H D>H $D>H 'tnH c5<</0H  HrD   r  c                   |    e Zd ZddgZdZdef fdZd Zd Zde	j                  d	e	j                  fd
Zd Zd Zd Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 d!de	j$                  de	j&                  dee	j                     dee	j$                     dee   dee	j$                     dee	j&                     dee	j$                     dee   dee   dee   deee	j                  f   fd              Z	 	 	 	 	 	 d" fd	Zde	j                  fdZe	j8                  	 	 	 d#de	j                  dee	j$                     dee   f fd        Z xZS )$JanusForConditionalGenerationz(model.language_model.embed_tokens.weightzlm_head.weightTr1   c                     t         |   |       || _        t        |      | _        t        j                  |j                  j                  |j                  j                  d      | _
        | j                          y )NFr   )rr   rs   r1   r  r'   r	   r4   r  rt   
vocab_sizelm_headr  r   s     rB   rs   z&JanusForConditionalGeneration.__init__  s\     '
yy!3!3!?!?ASASA^A^ejk 	rD   c                 J    | j                   j                  j                         S r   )r'   r  r(  r'  s    rB   r(  z2JanusForConditionalGeneration.get_input_embeddings  s    zz((==??rD   c                 N    | j                   j                  j                  |       y r   )r'   r  r  r  s     rB   r  z2JanusForConditionalGeneration.set_input_embeddings  s    

!!66u=rD   inputsr   c                 r    | j                   j                  |      }| j                   j                  |      }|S r   )r'   r  r  )r@   r  r@  s      rB   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generation  s0    zz77?zz44\BrD   c                     | j                   S r   r  r'  s    rB   get_output_embeddingsz3JanusForConditionalGeneration.get_output_embeddings  s    ||rD   c                     || _         y r   r  )r@   new_embeddingss     rB   set_output_embeddingsz3JanusForConditionalGeneration.set_output_embeddings  s	    %rD   c                     || _         y r   r'   )r@   r  s     rB   set_decoderz)JanusForConditionalGeneration.set_decoder  s	    
rD   c                     | j                   S r   r  r'  s    rB   get_decoderz)JanusForConditionalGeneration.get_decoder  s    zzrD   r  r   r   ro   r)   r  r  labelsr  r   r  r  c                    |
|
n| j                   j                  }
||n| j                   j                  } | j                  d|||||||	|
||d
|}|j                  }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                  |j                         }|S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        N)
r  r   r   ro   r)   r  r  r   r  r  )re   r  r  )rd   re   r)   r_   r`   ra   rS   )r1   r   r  r'   r^   r3   r   slicer  loss_functionr  r  rc   r)   r_   r`   ra   )r@   r  r   r   ro   r)   r  r  r  r  r   r  r  r   r   r_   slice_indicesre   rd   r   s                       rB   r   z%JanusForConditionalGeneration.forward  s(   0 2C1N-TXT_T_TqTq$8$D $++JjJj 	 $** 
%)%+'/!5)
 
  118B>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD,#33!//)) ' ; ;
 rD   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r)   r  r   r  r  r   r   )rr   prepare_inputs_for_generation)r@   r  r   r)   r   r  r  r  r   model_inputsr   s             rB   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation  sT     w<
+')))
 
 !!+7L(rD   rL  c                 x    | j                   j                  j                  |      }|j                  dddd      }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r
   r!   )r'   r  r  r   )r@   rL  decoded_images      rB   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9rD   logits_processorc           	         |j                  d| j                        }t        j                  |      }|j                  dd      }|dk(  rt	        %|   d|||d d|S  |j                  di |}|j                         t        j                  t        j                  fvrt        d      |j                          | j                  |j                                ||n	t               }d|d<   |j                  t         j#                  d       d	|_        |j                  |d
<   | j%                  ||j&                  |      \  }}	}|j(                  |j*                  }}
t-        |j.                        dk7  rt        d|j.                   d      |d u}| j1                  |||j*                         |j                  r:|j                  dkD  r+|j3                  t5        |j                               d |_        | j7                  ||j.                  d   |d ||      } | j8                  d|||j:                  d|\  }}| j<                  j>                  j@                  jB                  }|j.                  \  }}|jE                  dd      }|j                  dd       }|jE                  dd      }||d<   ||d d d f   |j&                  k7  ||d d d f   |jF                  d   k7  z  }||d d d f   jI                  ||jJ                          | jM                         |      }| jO                  |||      }|jQ                  dd       A| jS                  |jT                  xs d|dz  tW        |jX                  ||z         ||      |d<   t[        j\                  ||f|
|      }|j^                  }|j`                  }|jb                  }|jd                  }|jf                  }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }ti        |      D ]x  } | jj                  d||d|}|d   jm                  |j*                        |d<   |d   jm                  |j*                        |d<    | j<                  jn                  di |||d}| jq                  ||      }|jr                  d d dd d f   ju                         } | j<                  jw                  |       }! |||!      }"|jx                  r>t[        jz                  |"d      }#t[        j|                  |#d      j                  d      }$nt[        j                  |"d      }$|$|d d |f<   t[        j                  |$|$g      }$|$j                  d      }$| j                  |$      }{ |r@|r|!fz  }|r| j                         fz  }|r|j                  z  }|r|j                  z  }|rt        |!|||j                        S |S ) Ngeneration_configgeneration_modetext)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1` and `num_beam_groups=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  r!   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr)   static)cache_implementationr   max_cache_lenr  model_kwargs)r   r  rS   )r  r  r  )r   r  rp   rC  )num_samples)	sequencesscoresre   r`   r_   r)   )Ipopr  copydeepcopyrr   generateupdateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargsr   r  r   warning_prepare_model_inputsbos_token_idr   r  r  r   _prepare_special_tokensr  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr'   r  r1   num_image_tokensrepeatgeneration_kwargsmasked_fill_pad_token_idr(  _get_initial_cache_positionr   
_get_cacher  max
max_lengthrY   zerosr   r  output_scoresoutput_logitsreturn_dict_in_generater  r  r   r  #_update_model_kwargs_for_generationr^   cloner  	do_sampler   multinomialsqueezeargmaxcatr   r  floatr`   r_   r   r)   )&r@   r  r   r  r   r  r	  r  r  model_input_namer   r  kwargs_has_attention_maskr*  r   r   input_tokensmaskr  generated_tokensr   r  r4  r5  r6  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r   r@  r  next_token_scoresprobs
next_tokenr   s&                                        rB   r  z&JanusForConditionalGeneration.generate  s    #JJ':D<R<RS MM*;< !**%6?f$7# -"3#	
   0(//9&9 002>;P;PR`RnRn:ool  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N%22L5
1	#\ ")9)9vy1$MiooM^EF  %3$$>!$$%68QZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #E$"D"D #
))>>#
 	#
	<  ::2299JJ'oo
G ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW3113LA77V-t4<.2oo%6%K%K%Wx%>!"3">">@PSZ@Z[) /> /L*+ !;;
4D'EU[ab .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'( #	UA=4== +|GSL .::J-K-N-N}OcOc-dL)*-9:J-K-N-N}OcOc-dL)*/djj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMG#	UJ #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#rD   )NNNNNNNNNNNr   )NNNNNNr  ) rE   rF   rG   _tied_weights_keysrQ   r"   rs   r(  r  rY   r   r  r  r  r  r  r   r   rR  rZ   r   r   r   r   r   r   r  r  no_gradr   r  r   r   s   @rB   r  r    s   DFVW!{ @>ell u|| 
&  '+*.1537+/5959-1$(,0/3349##9 ''9 !.	9
 u//09 "%9 !!1!129   1 129 ))*9 D>9 $D>9 'tn9 c5<</09  9| <
 
 ]]  $59:>	}$}$ !!1!12}$ ##67	}$ }$rD   r  )r&   r  r  r  r  )r-   )Tr  dataclassesr   typingr   r   r   r   r   rY   r	   activationsr   cache_utilsr   
generationr   r   r   r   generation.utilsr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   autor    configuration_janusr"   r#   r$   torch.nn.functionalr   rN  
get_loggerrE   r   r&   rU   r]   rc   r  rg   r   r   r   r>  r   r   r   r   r	  r  r+  r8  rT  rg  rs  rz  r  r  r  r  r  r  r  r  __all__rS   rD   rB   <module>r_     s	  ,  ! 9 9   !   u u 9 B 9 X X F &   Q Q ## 
		H	% ?? ? ?@ -{ - - )C; )C )CX &C+ &C &CRHBII HV	UU\\ 	U# 	U%,, 	U& %II%<<% 
% <<	%
 U\\*% % %4R299 RjRYY (.8 .bM
 M
` ;+ ; ;|BII $<"		 <"~)(BII )(X &"))  &F	ryy 	RYY  ,J!		 J!ZA		 AH ;% ;;|299 $RYY   
k% k
k\I$$8/ I$X
 trD   