
    Uhbb                        d dl mZ d dlmZmZmZmZ d dlZd dlmZ ddl	m
Z
 ddlmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ e G d de             Ze G d de             Z G d dej8                        Ze G d de             Z ed       G d de             Z ed       G d dee             Z g dZ!y)    )	dataclass)ListOptionalTupleUnionN)nn   )ACT2FN)GenerationMixin)BaseModelOutputWithPastModelOutput)PreTrainedModel)auto_docstringcan_return_tupleis_torchdynamo_compiling   )	AutoModel   )VipLlavaConfigc                   :    e Zd ZU dZdZeej                     ed<   y)VipLlavaModelOutputWithPasta  
    Base class for VipLlava outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/vipllava/modeling_vipllava.pyr   r   %   s    8 8<%"3"34;r!   r   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	VipLlavaCausalLMOutputWithPasta  
    Base class for VipLlava causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r   r   r   r%   r   r   r   r   r&   r'   r   r(   r   r)   r   r    r!   r"   r$   r$   F   s    < )-D(5$$
%,*.FHU&&'.9=OXd5#4#456=8<M8E%"3"345<59Ju001297;%"3"34;r!   r$   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsTbias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r
   projector_hidden_actactlinear_2)selfr,   num_feature_layers	__class__s      r"   r2   z$VipLlavaMultiModalProjector.__init__o   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr!   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)r;   r>   r@   rA   )rB   r(   s     r"   forwardz#VipLlavaMultiModalProjector.forward~   sB    00?m4/m4r!   )r   r   r   r   r2   rG   __classcell__rD   s   @r"   r+   r+   n   s    m~ mr!   r+   c                   :    e Zd ZeZdZdZdZdZdZ	dZ
dZdZdZd Zy)VipLlavaPreTrainedModel Tr'   c                 $   t        | j                  d| j                  j                         j                        }t	        |t
        j                        rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y t	        |t
        j                        rJ|j                  j                  j                  d       |j                  j                  j                          y y )Ninitializer_rangeg        )meanstdg      ?)getattrr,   get_text_configrN   r3   r   r<   weightdatanormal_r0   zero_r7   fill_)rB   modulerP   s      r"   _init_weightsz%VipLlavaPreTrainedModel._init_weights   s     dkk#68S8S8U8g8ghfbii(MM&&CS&9{{&  &&( '-MM$$S)KK""$ .r!   N)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointing_skip_keys_device_placement_supports_cache_class_supports_flash_attn_2_supports_sdpa_supports_quantized_cache_supports_static_cache_supports_attention_backendrY   r    r!   r"   rK   rK      sA    !L&*#"3 !N $!"&%r!   rK   zx
    The VipLlava model which consists of a vision backbone and a language model, without a language modeling head.
    )custom_introc                       e Zd ZddiZdef fdZd Zd Zdej                  de
eee   f   fd	Ze	 	 	 	 	 	 	 	 	 	 	 	 dd
ej                  dej                  deej"                     deej                     deeej                        deej                     dee
eee   f      dee   dee   dee   dee   deej                     de
eef   fd       Z xZS )VipLlavaModelzlanguage_model.modellanguage_modelr,   c                     t         |   |       t        j                  |j                        | _        t        |      | _        t        j                  |j                        | _	        | j                          y rF   )r1   r2   r   from_configr8   vision_towerr+   multi_modal_projectorr=   rg   	post_initrB   r,   rD   s     r"   r2   zVipLlavaModel.__init__   sY     %11&2F2FG%@%H"'33F4F4FGr!   c                 6    | j                   j                         S rF   )rg   get_input_embeddingsrB   s    r"   ro   z"VipLlavaModel.get_input_embeddings   s    ""7799r!   c                 :    | j                   j                  |       y rF   )rg   set_input_embeddingsrB   values     r"   rr   z"VipLlavaModel.set_input_embeddings   s    007r!   pixel_valuesr4   c                 "   | j                  |d      }t        |t              r|j                  |   ddddf   }n<|D cg c]  }|j                  |   ddddf    }}t	        j
                  |d      }| j                  |      }|S c c}w )aW  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, List[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        T)output_hidden_statesNr   )dim)rj   r3   r5   r(   r   catrk   )rB   ru   r4   image_outputsimage_featuresindexs         r"   get_image_featuresz VipLlavaModel.get_image_features   s     )),T)R +S1*889NOPQSTSUPUVN VkkEm99%@ABGkNk"YY~2>N33NC ls   B	input_idsattention_maskposition_idsr'   inputs_embeds	use_cacheoutput_attentionsrw   return_dictcache_positionreturnc                 j   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      ||t        d      | | j                         |      }|"| j                  ||      }|| j                   j                  k(  j                  d      }|j                  |      j                  |j                        }t               s{||   j                         |j                         k7  rW|| j                   j                  k(  j                         }|j                   d   |j                   d   z  }t        d| d	|       |j                  |j                  |j"                        }|j%                  ||      } | j&                  d||||||	|
d
|d	|}t)        |j*                  |j,                  |j.                  |j0                  |nd      }|r|S |j3                         S )z
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)ru   r4   rx   r   r   z6Image features and image tokens do not match: tokens: z, features T)	r   r   r'   r   r   r   rw   r   r   )last_hidden_stater'   r(   r)   r   r    )r,   r   rw   use_return_dictr4   
ValueErrorro   r~   image_token_id	unsqueeze	expand_astodevicer   numelsumshapedtypemasked_scatterrg   r   r   r'   r(   r)   to_tuple)rB   r   ru   r   r   r'   r   r4   r   r   rw   r   r   	lm_kwargsr|   special_image_maskn_image_tokensn_image_featuresoutputsoutputs                       r"   rG   zVipLlavaModel.forward   sq   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 -t";<YZZ#(Av   7D557	BM#!44)AV 5 N #,t{{/I/I"I!T!TUW!X!3!=!=m!L!O!OP]PdPd!e+--@R2S2Y2Y2[_m_s_s_u2u"+t{{/I/I"I!N!N!P#1#7#7#:^=Q=QRS=T#T  L^L\\ghxgyz  ,..}/C/C]EXEXYN)889K^\M%$%% 
)%+'/!5)
 
 -%77#33!//))2>2JPT
 %v;&//*;;r!   )NNNNNNNNNNNN)r   r   r   _checkpoint_conversion_mappingr   r2   ro   rr   r   r   r   r5   r   r~   r   
LongTensorr   Tensorboolr   r   rG   rH   rI   s   @r"   rf   rf      s    '=>N%O"~ :8u/@/@ Y^_bdhildm_mYn 4  '+*.1537=A59AE$(,0/3&*59M<##M< ''M< !.	M<
 u//0M< "$u'8'8"9:M<   1 12M<  (c49n(=>M< D>M< $D>M< 'tnM< d^M< !!1!12M< 
u11	2M< M<r!   rf   zV
    The VIPLLAVA model which consists of a vision backbone and a language model.
    c            #           e Zd ZdddddZdgZdef fdZd	 Zd
 Zde	j                  fdZd Zed        Zed        Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d&dej(                  dej*                  deej.                     deej(                     deeej*                        deej*                     deeeee   f      deej(                     dee   dee   dee   dee   deej(                     deeej.                  f   deeef   fd              Z	 	 	 	 	 	 d' fd 	Ze dej.                  d!ed"ed#ejB                  dej.                  d$efd%       Z" xZ#S )( VipLlavaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr,   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr/   )r1   r2   rf   modelr   r<   r=   r9   
vocab_sizer   rl   rm   s     r"   r2   z)VipLlavaForConditionalGeneration.__init__1  sS     "6*
yy!3!3!?!?ASASA^A^ejkr!   c                 6    | j                   j                         S rF   )r   ro   rp   s    r"   ro   z5VipLlavaForConditionalGeneration.get_input_embeddings7  s    zz..00r!   c                 :    | j                   j                  |       y rF   )r   rr   rs   s     r"   rr   z5VipLlavaForConditionalGeneration.set_input_embeddings:  s    

''.r!   r   c                     | j                   S rF   r   rp   s    r"   get_output_embeddingsz6VipLlavaForConditionalGeneration.get_output_embeddings=  s    ||r!   c                     || _         y rF   r   )rB   new_embeddingss     r"   set_output_embeddingsz6VipLlavaForConditionalGeneration.set_output_embeddings@  s	    %r!   c                 .    | j                   j                  S rF   )r   rg   rp   s    r"   rg   z/VipLlavaForConditionalGeneration.language_modelD  s    zz(((r!   c                 .    | j                   j                  S rF   )r   rj   rp   s    r"   rj   z-VipLlavaForConditionalGeneration.vision_towerH  s    zz&&&r!   c                 .    | j                   j                  S rF   )r   rk   rp   s    r"   rk   z6VipLlavaForConditionalGeneration.multi_modal_projectorL  s    zz///r!   r   ru   r   r   r'   r   r4   labelsr   r   rw   r   r   logits_to_keepc                 l   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j
                  d|||||||	||
|d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                   |j"                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)r   ru   r   r   r'   r   r   r4   r   rw   r   r   r   )r&   r   r   )r%   r&   r'   r(   r)   r   r    )r,   r   rw   r   r4   r   r3   r5   slicer   loss_functionr=   r   r$   r'   r(   r)   r   )rB   r   ru   r   r   r'   r   r4   r   r   r   rw   r   r   r   r   r   r(   slice_indicesr&   r%   s                        r"   rG   z(VipLlavaForConditionalGeneration.forwardP  s[   l 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 $** 
%)%+'"7/!5)
 
   
8B>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD-#33!//)) ' ; ;
 	
r!   c           	      N    t        
|   |f|||||d|}	|d   dk(  r||	d<   |	S )N)r'   r   r   r   r   r   ru   )r1   prepare_inputs_for_generation)rB   r   r'   r   ru   r   r   r   kwargsmodel_inputsrD   s             r"   r   z>VipLlavaForConditionalGeneration.prepare_inputs_for_generation  sV     w<
+')))
 
 !! ,8L(r!   sequence_lengthtarget_lengthr   
batch_sizec                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        N   )
fill_valuer   r   r   )diagonal)r   rx   r   )ry   r   finfominfullr   triuarangereshapeexpandcloner   r   masked_fill)r   r   r   r   r   r   r   causal_mask	min_dtypemask_lengthpadding_masks              r"   5_prepare_4d_causal_attention_mask_with_cache_positionzVVipLlavaForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position  s   < %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r!   )NNNNNNNNNNNNNr   )NNNNNN)$r   r   r   r   _tied_weights_keysr   r2   ro   rr   r   Moduler   r   propertyrg   rj   rk   r   r   r   r   r   r   r   r   r   r5   r   r   r$   rG   r   staticmethodr   r   rH   rI   s   @r"   r   r   #  sv    "8-"?#,	&" ++~ 1/ryy & ) ) ' ' 0 0  '+*.1537=A59AE-1$(,0/3&*5934]
##]
 '']
 !.	]

 u//0]
 "$u'8'8"9:]
   1 12]
  (c49n(=>]
 ))*]
 D>]
 $D>]
 'tn]
 d^]
 !!1!12]
 c5<</0]
" 
u44	5#]
  ]
D < 444 4 {{	4
 4 4 4r!   r   )rf   r   rK   )"dataclassesr   typingr   r   r   r   r   r   activationsr
   
generationr   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   autor   configuration_vipllavar   r   r$   r   r+   rK   rf   r   __all__r    r!   r"   <module>r      s   , " / /   ! ) D - O O  2 <"9 < <@ $<[ $< $<N")) 0 %o % %6 
y<+ y<
y<x 
\'> \
\~ [r!   