
    UhtC                     ^   d Z ddlmZmZmZmZ ddlZddlZddlmZ ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ  ej4                  e      Ze G d de             Z G d dee      Z ed       G d de             Z ed       G d dee
             Z g dZ!y)zPyTorch Fuyu model.    )ListOptionalTupleUnionN)nn   )GenerationMixin)FlashAttentionKwargs)CausalLMOutputWithPast)PreTrainedModel)	AutoModel)
LossKwargsauto_docstringcan_return_tuplelogging   )
FuyuConfigc                   6    e Zd ZeZdZdZdZdZdZ	dZ
g ZdZd Zy)FuyuPreTrainedModelfuyuTpast_key_valuesc                    | j                   j                  }t        |t        j                        rY|j
                  j                  j                  d|       |j                  %|j                  j                  j                          y y t        |t        j                        rf|j
                  j                  j                  d|       |j                  2|j
                  j                  |j                     j                          y y y )Ng        )meanstd)configinitializer_range
isinstancer   Linearweightdatanormal_biaszero_	Embeddingpadding_idx)selfmoduler   s      x/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/fuyu/modeling_fuyu.py_init_weightsz!FuyuPreTrainedModel._init_weights/   s    kk++fbii(MM&&CS&9{{&  &&( '-MM&&CS&9!!-""6#5#56<<> . .    N)__name__
__module____qualname__r   config_classbase_model_prefixsupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_2_supports_sdpa_supports_flex_attn_no_split_modules_skip_keys_device_placementr)    r*   r(   r   r   #   s;    L&*#"&!N"3	?r*   r   c                       e Zd Zy)KwargsForCausalLMN)r+   r,   r-   r7   r*   r(   r9   r9   ;   s    r*   r9   zt
    The Fuyu model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                       e Zd ZddiZdef fdZd Zd Zdej                  de
ej                     d	ej                  d
ej                  fdZdej                  fdZe	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  dej                  deej                     deej                     dee
ej                        deej                     dee   dee   dee   dee   d
eeef   fd       Z xZS )	FuyuModelzlanguage_model.modellanguage_modelr   c                    t         |   |       |j                  | _        |j                  j
                  | _        t        j                  |j                        | _        t        j                  |j                  |j                  z  |j                  z  |j                        | _        d| _        | j!                          y )NF)super__init__pad_token_idr%   text_config
vocab_sizer   from_configr=   r   r   
patch_sizenum_channelshidden_sizevision_embed_tokensgradient_checkpointing	post_initr&   r   	__class__s     r(   r@   zFuyuModel.__init__F   s     !.. ,,77'33F4F4FG#%99 1 11F4G4GGI[I[$
  ',#r*   c                 6    | j                   j                         S N)r=   get_input_embeddingsr&   s    r(   rO   zFuyuModel.get_input_embeddingsS   s    ""7799r*   c                 :    | j                   j                  |       y rN   )r=   set_input_embeddingsr&   values     r(   rR   zFuyuModel.set_input_embeddingsV   s    007r*   word_embeddingscontinuous_embeddingsimage_patch_input_indicesreturnc           
         |j                   d   t        |      k(  s't        dt        |      d|j                   d         |j                         }t	        |j                   d         D ]  }t        j                  ||   dk\  d      d   }||   |   }|j                   d   ||   j                   d   kD  r,t        d||   j                   d|j                   d| d	      ||   |   j                  |j                        |||f<    |S )
a  This function places the continuous_embeddings into the word_embeddings at the locations
        indicated by image_patch_input_indices. Different batch elements can have different numbers of continuous
        embeddings.

        Args:
            word_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Tensor of word embeddings.
            continuous_embeddings (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
                Tensor of continuous embeddings. The length of the list is the batch size. Each entry is shape
                [num_image_embeddings, hidden], and num_image_embeddings needs to match the number of non-negative
                indices in image_patch_input_indices for that batch element.
            image_patch_input_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
                Tensor of indices of the image patches in the input_ids tensor.
        r   z7Batch sizes must match! Got len(continuous_embeddings)=z and word_embeddings.shape[0]=T)as_tuplezGNumber of continuous embeddings continuous_embeddings[batch_idx].shape=zA does not match number of continuous token ids src_indices.shape=z in batch element .)	shapelen
ValueErrorclonerangetorchnonzerotodevice)r&   rU   rV   rW   output_embeddings	batch_idxdst_indicessrc_indicess           r(   gather_continuous_embeddingsz&FuyuModel.gather_continuous_embeddingsY   sZ   (  %%a(C0E,FFJs3H/I.KKjQ`QfQfghQiPkl  ,11344Q78 	I  --(A)(LPQ(Q\`abcdK 4I>{KK  #&;I&F&L&LQ&OO ^7LY7W7]7]6_ `I6A6G6G5II[\e[ffgi  9Ni8XYd8e8h8h!((9i45	  ! r*   pixel_valuesc                     |D cg c]O  }| j                  |j                  | j                   j                  j                              j	                  d      Q }}|S c c}w )a$  
        Encodes images into continuous embeddings that can be forwarded to the language model.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
        r   )rH   rc   r   dtypesqueeze)r&   rj   patchpatch_embeddingss       r(   get_image_featureszFuyuModel.get_image_features   sa     &
 $$UXXd.F.F.M.M.S.S%TU]]^_`
 
  	
s   AA	input_idsimage_patchesimage_patches_indicesattention_maskposition_idsr   inputs_embeds	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictc                    |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||t        d      ||j                  \  }}n||j                  \  }}}nt        d      |}d}||d   d   j                  d   }||z   }|U||j                  n|j                  }t        j                  |||z   t        j                  |      }|j                  d      }|I | j                  j                         |      }|'|%| j                  |      }| j                  |||      } | j                  d|||||	|
||d|}|S )	a  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.
        zDYou cannot specify both input_ids and inputs_embeds at the same timez4You have to specify either input_is or inputs_embedsr      )rl   rd   )rU   rV   rW   )rv   rt   ru   r   rx   ry   rw   rz   r7   )r   rx   ry   rw   use_return_dictr^   r\   rd   ra   arangelong	unsqueezer=   rO   rp   ri   )r&   rq   rr   rs   rt   ru   r   rv   rw   rx   ry   rz   kwargs
batch_size
seq_length_seq_length_with_pastpast_key_values_lengthrd   ro   outputss                        r(   forwardzFuyuModel.forward   s   . 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B] ]%>cdd"%.__"J
&(5(;(;%J
ASTT)!"&%4Q%7%:%@%@%C"#7:P#P )2)>Y%%MDXDXF <<&
5K(KSXS]S]flL (11!4L FD//DDFyQM(_-D#'#:#:=#I  $ A A$1*:.C !B ! &$%% 

')%+/!5#

 

 r*   )NNNNNNNNNNN)r+   r,   r-   _checkpoint_conversion_mappingr   r@   rO   rR   ra   Tensorr   ri   FloatTensorrp   r   
LongTensorr   boolr   r   r   r   __classcell__rL   s   @r(   r<   r<   >   s    '=>N%O"z :8*!*!  $ELL1*! $)<<	*!
 
*!X u/@/@    '+&*.21537=A59$(,0/3&*K##K ||K  %||	K
 !.K u//0K "$u'8'8"9:K   1 12K D>K $D>K 'tnK d^K 
u,,	-K Kr*   r<   zz
    Fuyu Model with a language modeling head on top for causal language model conditioned on image patches and text.
    c            !           e Zd ZddddZdgZdef fdZd Zd	 Zd
 Z	d Z
d Zd Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                   dej"                  dej"                  deej"                     deej                      deeej(                        deej(                     dee   deej"                     dee   dee   dee   dee   deeef   fd              Z	 	 	 	 	 d  fd	Zed        Z xZS )!FuyuForCausalLMzmodel.language_modelzmodel.vision_embed_tokenslm_head)z^language_model.modelz^vision_embed_tokensz^language_model.lm_headzlm_head.weightr   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NF)r"   )r?   r@   r<   modelr   r   rB   rG   rC   r   rJ   rK   s     r(   r@   zFuyuForCausalLM.__init__   sS     v&
yy!3!3!?!?ASASA^A^ejkr*   c                 6    | j                   j                         S rN   )r   rO   rP   s    r(   rO   z$FuyuForCausalLM.get_input_embeddings   s    zz..00r*   c                 :    | j                   j                  |       y rN   )r   rR   rS   s     r(   rR   z$FuyuForCausalLM.set_input_embeddings   s    

''.r*   c                     | j                   S rN   r   rP   s    r(   get_output_embeddingsz%FuyuForCausalLM.get_output_embeddings   s    ||r*   c                     || _         y rN   r   )r&   new_embeddingss     r(   set_output_embeddingsz%FuyuForCausalLM.set_output_embeddings   s	    %r*   c                 :    | j                   j                  |       y rN   )r   set_decoder)r&   decoders     r(   r   zFuyuForCausalLM.set_decoder  s    

w'r*   c                 6    | j                   j                         S rN   )r   get_decoderrP   s    r(   r   zFuyuForCausalLM.get_decoder  s    zz%%''r*   rq   rr   rs   rt   ru   r   rv   rw   labelsrx   ry   rz   logits_to_keeprX   c                 T   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  ||||||||
||d      }|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                  |j                         S )a@  
        image_patches (`torch.FloatTensor` of shape `(batch_size, num_total_patches, patch_size_ x patch_size x num_channels)`, *optional*):
            Image patches to be used as continuous embeddings. The patches are flattened and then projected to the
            hidden size of the model.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
        image_patches_indices (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Tensor of indices of the image patches in the input_ids tensor.

        Examples:

        ```python
        >>> from transformers import FuyuProcessor, FuyuForCausalLM
        >>> from PIL import Image
        >>> import requests

        >>> processor = FuyuProcessor.from_pretrained("adept/fuyu-8b")
        >>> model = FuyuForCausalLM.from_pretrained("adept/fuyu-8b")

        >>> url = "https://huggingface.co/datasets/hf-internal-testing/fixtures-captioning/resolve/main/bus.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> prompt = "Generate a coco-style caption.\n"

        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=7)
        >>> generation_text = processor.batch_decode(generated_ids[:, -7:], skip_special_tokens=True)
        >>> print(generation_text[0])
        A blue bus parked on the side of a road.
        ```NT)rq   rr   rs   rv   rt   ru   r   rx   ry   rw   rz   r   )logitsr   rC   )lossr   r   hidden_states
attentionsr7   )r   rx   ry   rw   r}   r   r   intslicer   loss_functionrB   rC   r   r   r   r   )r&   rq   rr   rs   rt   ru   r   rv   rw   r   rx   ry   rz   r   r   r   r   slice_indicesr   r   s                       r(   r   zFuyuForCausalLM.forward  sN   j 2C1N-TXT_T_TqTq$8$D $++JjJj 	 "+!6IDKK<Q<Q	%0%<k$++B]B]**'"7')%+/!5  
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r*   c           	      L    t        	|   |f|||||d|}|
d |d<   d |d<   |S )N)r   rt   rv   rr   rs   rs   rr   )r?   prepare_inputs_for_generation)
r&   rq   r   rt   rv   rr   rs   r   model_inputsrL   s
            r(   r   z-FuyuForCausalLM.prepare_inputs_for_generationf  sX     w<
+)''"7
 
 &48L01,0L)r*   c                 J    d}| D ]  }|t        fd|D              fz  } |S )Nr7   c              3   t   K   | ]/  }|j                  d j                  |j                               1 yw)r   N)index_selectrc   rd   ).0
past_statebeam_idxs     r(   	<genexpr>z1FuyuForCausalLM._reorder_cache.<locals>.<genexpr>  s.     nU_j--aZ=N=N1OPns   58)tuple)r   r   reordered_past
layer_pasts    `  r(   _reorder_cachezFuyuForCausalLM._reorder_cache  s=    ) 	Jncmnn N	 r*   )NNNNNNNNNNNNr   )NNNNN)r+   r,   r-   r   _tied_weights_keysr   r@   rO   rR   r   r   r   r   r   r   ra   r   r   r   r   r   r   r   r   r   r   r   r   staticmethodr   r   r   s   @r(   r   r      s    "8 ;#,&"
 ++z 1/&((  '+&*.21537=A59$()-,0/3&*()[
##[
 ||[
  %||	[

 !.[
 u//0[
 "$u'8'8"9:[
   1 12[
 D>[
 &[
 $D>[
 'tn[
 d^[
 ![
  
u,,	-![
  [
@ "8  r*   r   )r   r   r<   )"__doc__typingr   r   r   r   ra   torch.utils.checkpointr   
generationr	   modeling_flash_attention_utilsr
   modeling_outputsr   modeling_utilsr   models.auto.modeling_autor   utilsr   r   r   r   configuration_fuyur   
get_loggerr+   loggerr   r9   r<   r   __all__r7   r*   r(   <module>r      s     / /    ) B 6 - 2 J J * 
		H	% ?/ ? ?. ?,j > 
\# \
\~ 
b)? b
bJ Br*   