
    UhO1                     <   d dl mZmZmZmZ d dlZd dlmZ d dlmZm	Z	m
Z
mZmZ ddlmZ ddlmZmZmZ dd	lmZ  ej*                  e      Z G d
 de      Z G d de      Z G d dej4                        Z G d de      Z G d de
      Z G d de	      Zg dZy)    )ListOptionalTupleUnionN)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)auto_docstringis_torchdynamo_compilinglogging   )VipLlavaConfigc                       e Zd Zy)VipLlavaModelOutputWithPastN__name__
__module____qualname__     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   %       r   r   c                       e Zd Zy)VipLlavaCausalLMOutputWithPastNr   r   r   r   r   r   )   r   r   r   c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r   projector_hidden_actactlinear_2)selfr"   num_feature_layers	__class__s      r   r'   z$VipLlavaMultiModalProjector.__init__.   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S )N)r0   r3   r5   r6   )r7   hidden_statess     r   forwardz#VipLlavaMultiModalProjector.forward=   sB    00?m4/m4r   )r   r   r   r   r'   r<   __classcell__)r9   s   @r   r!   r!   -   s    m~ mr   r!   c                       e Zd Zy)VipLlavaPreTrainedModelNr   r   r   r   r?   r?   E   r   r   r?   c                      e Zd Zdej                  deeee   f   fdZe		 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej                  deej                     deej                     deeej                        d	eej                     deeeee   f      d
ee   dee   dee   dee   deej                     deeef   fd       Zy)VipLlavaModelpixel_valuesr)   c                 "   | j                  |d      }t        |t              r|j                  |   ddddf   }n<|D cg c]  }|j                  |   ddddf    }}t	        j
                  |d      }| j                  |      }|S c c}w )aW  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
            vision_feature_layers (`Union[int, List[int]]`):
                The vision feature layer, or the list of indexes of the layers to select
                the vision feature.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        T)output_hidden_statesNr   )dim)vision_towerr(   r*   r;   torchcatmulti_modal_projector)r7   rB   r)   image_outputsimage_featuresindexs         r   get_image_featuresz VipLlavaModel.get_image_featuresJ   s     )),T)R +S1*889NOPQSTSUPUVN VkkEm99%@ABGkNk"YY~2>N33NC ls   BN	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cacheoutput_attentionsrD   return_dictcache_positionreturnc                 j   |	|	n| j                   j                  }	|
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }|du |duz  rt        d      ||t        d      | | j                         |      }|"| j                  ||      }|| j                   j                  k(  j                  d      }|j                  |      j                  |j                        }t               s{||   j                         |j                         k7  rW|| j                   j                  k(  j                         }|j                   d   |j                   d   z  }t        d| d	|       |j                  |j                  |j"                        }|j%                  ||      } | j&                  d||||||	|
d
|d	|}t)        |j*                  |j,                  |j.                  |j0                  |nd      }|r|S |j3                         S )z
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedszdYou cannot specify both pixel_values and inputs_embeds at the same time, and must specify either one)rB   r)   rE   r   r   z6Image features and image tokens do not match: tokens: z, features T)	rP   rQ   rR   rS   rT   rU   rD   rV   rW   )last_hidden_staterR   r;   
attentionsimage_hidden_statesr   )r"   rU   rD   use_return_dictr)   
ValueErrorget_input_embeddingsrN   image_token_id	unsqueeze	expand_astodevicer   numelsumshapedtypemasked_scatterlanguage_modelr   rZ   rR   r;   r[   to_tuple)r7   rO   rB   rP   rQ   rR   rS   r)   rT   rU   rD   rV   rW   	lm_kwargsrL   special_image_maskn_image_tokensn_image_featuresoutputsoutputs                       r   r<   zVipLlavaModel.forwardd   sq   , 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 -t";<YZZ#(Av   7D557	BM#!44)AV 5 N #,t{{/I/I"I!T!TUW!X!3!=!=m!L!O!OP]PdPd!e+--@R2S2Y2Y2[_m_s_s_u2u"+t{{/I/I"I!N!N!P#1#7#7#:^=Q=QRS=T#T  L^L\\ghxgyz  ,..}/C/C]EXEXYN)889K^\M%$%% 
)%+'/!5)
 
 -%77#33!//))2>2JPT
 %v;&//*;;r   )NNNNNNNNNNNN)r   r   r   rH   FloatTensorr   r*   r   rN   r   
LongTensorr   Tensorboolr   r   r<   r   r   r   rA   rA   I   sj   u/@/@ Y^_bdhildm_mYn 4  '+*.1537=A59AE$(,0/3&*59M<##M< ''M< !.	M<
 u//0M< "$u'8'8"9:M<   1 12M<  (c49n(=>M< D>M< $D>M< 'tnM< d^M< !!1!12M< 
u11	2M< M<r   rA   c            !          e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  dej
                  deej                     deej                     deeej
                        deej
                     dee	e
ee
   f      d	eej                     d
ee   dee   dee   dee   deej                     de	e
ej                  f   de	eef   fdZy) VipLlavaForConditionalGenerationNrO   rB   rP   rQ   rR   rS   r)   labelsrT   rU   rD   rV   rW   logits_to_keeprX   c                 l   |
|
n| j                   j                  }
||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j
                  d|||||||	||
|d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                   |j"                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        vision_feature_layers (`Union[int, List[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", torch_dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```NT)rO   rB   rP   rQ   rR   rS   rT   r)   rU   rD   rV   rW   r   )logitsrx   
vocab_size)lossr{   rR   r;   r[   r\   r   )r"   rU   rD   r]   r)   modelr(   r*   slicelm_headloss_functionr2   r|   r   rR   r;   r[   r\   )r7   rO   rB   rP   rQ   rR   rS   r)   rx   rT   rU   rD   rV   rW   ry   rl   rp   r;   slice_indicesr{   r}   s                        r   r<   z(VipLlavaForConditionalGeneration.forward   s[   h 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]%:%F!DKKLmLm 	 $** 
%)%+'"7/!5)
 
   
8B>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD-#33!//)) ' ; ;
 	
r   )NNNNNNNNNNNNNr   )r   r   r   rH   rs   rr   r   rt   r   r   r*   ru   r   r   r<   r   r   r   rw   rw      sc    '+*.1537=A59AE-1$(,0/3&*5934]
##]
 '']
 !.	]

 u//0]
 "$u'8'8"9:]
   1 12]
  (c49n(=>]
 ))*]
 D>]
 $D>]
 'tn]
 d^]
 !!1!12]
 c5<</0]
" 
u44	5#]
r   rw   )rA   rw   r?   ) typingr   r   r   r   rH   r   (transformers.models.llava.modeling_llavar   r	   r
   r   r   activationsr   utilsr   r   r   configuration_vipllavar   
get_loggerr   loggerr   r   Moduler!   r?   rA   rw   __all__r   r   r   <module>r      s     0 /    " F F 2 
		H	%	": 		%@ 	")) 0	2 	i<J i<X^
'D ^
B [r   