
    Uh_r                        d Z ddlmZ ddlmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#  ejH                  e%      Z&e G d de             Z'e G d de             Z( G d de
jR                        Z*e G d de             Z+ ed       G d de+             Z, G d dee      Z- ed        G d! d"e+e             Z.g d#Z/y)$zPyTorch PaliGemmamodel.    )	dataclass)ListOptionalTupleUnionN)nn   )CacheHybridCacheStaticCache)GenerationMixin)FlashAttentionKwargs)BaseModelOutputWithPast)PreTrainedModel)Unpack)
LossKwargsModelOutputauto_docstringcan_return_tupleis_torchdynamo_compilinglogging   )	AutoModel   )PaliGemmaConfigc                   :    e Zd ZU dZdZeej                     ed<   y)PaligemmaModelOutputWithPasta  
    Base class for Paligemma outputs, with hidden states and attentions.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)	__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__     /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   &   s    8 8<%"3"34;r'   r   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeeej                     ef      ed<   dZeeej                        ed<   dZeeej                        ed<   dZeej                     ed<   y)	PaliGemmaCausalLMOutputWithPasta  
    Base class for PaliGemma causal language model (or autoregressive) outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Language modeling loss (for next-token prediction).
        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
            `(batch_size, num_heads, sequence_length, embed_size_per_head)`)

            Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
            `past_key_values` input) to speed up sequential decoding.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`torch.FloatTensor`, *optional*):
            A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r   r    r!   r"   r+   r   r#   r$   r%   r,   r-   r   r   r
   r.   r   r/   r   r&   r'   r(   r*   r*   G   s    < )-D(5$$
%,*.FHU&&'.GKOXeD):):$;U$BCDK8<M8E%"3"345<59Ju001297;%"3"34;r'   r*   c                   *     e Zd Zdef fdZd Z xZS )PaliGemmaMultiModalProjectorconfigc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr2   	__class__s     r(   r7   z%PaliGemmaMultiModalProjector.__init__p   s;    ii 4 4 @ @&BVBVBeBelpqr'   c                 (    | j                  |      }|S N)r<   )r>   image_featuresr.   s      r(   forwardz$PaliGemmaMultiModalProjector.forwardt   s    N3r'   )r   r    r!   r   r7   rC   __classcell__r?   s   @r(   r1   r1   o   s    r rr'   r1   c                   @    e Zd ZeZdZdZdgZdZdZ	dZ
dZdZdZdZd Zy)PaliGemmaPreTrainedModel Tr1   r-   c                 \   t        | j                  d| j                  j                         j                        }t	        |t
        j                        rY|j                  j                  j                  d|       |j                  %|j                  j                  j                          y y y )Ninitializer_range        )meanstd)getattrr2   get_text_configrJ   
isinstancer   r8   weightdatanormal_r5   zero_)r>   modulerM   s      r(   _init_weightsz&PaliGemmaPreTrainedModel._init_weights   s     dkk#68S8S8U8g8ghfbii(MM&&CS&9{{&  &&( ' )r'   N)r   r    r!   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_cache_class_supports_quantized_cache_supports_static_cache_supports_flash_attn_2_supports_sdpa_supports_attention_backendrV   r&   r'   r(   rG   rG   z   sJ    "L&*#78"3  $!!N"&)r'   rG   z{
    The Base Paligemma model which consists of a vision backbone and a language model withou language modeling head.,
    )custom_introc            #            e Zd ZddiZdef fdZd Zd Z	 	 	 	 	 ddee	   fdZ
d	ej                  fd
Zee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                   d	ej                  deej"                     deej                      deeeej                     ef      deej                      deej                      deej                     deej                      dee	   dee	   dee	   dee	   dee   deeef   fd              Z xZS )PaliGemmaModelzlanguage_model.modellanguage_modelr2   c                    t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                  j                  | j                  j                  nd| _        | j                          y )N)r2   )r6   r7   r   from_configr9   vision_towerr1   multi_modal_projectortext_config
vocab_sizere   r2   pad_token_id	post_init)r>   r2   re   r?   s      r(   r7   zPaliGemmaModel.__init__   s     %119M9MN%A&%I" ,,77"..f6H6HI,8<8P8P8\DKK44bdr'   c                 6    | j                   j                         S rA   )re   get_input_embeddingsr>   s    r(   rp   z#PaliGemmaModel.get_input_embeddings   s    ""7799r'   c                 :    | j                   j                  |       y rA   )re   set_input_embeddingsr>   values     r(   rs   z#PaliGemmaModel.set_input_embeddings   s    007r'   is_trainingc                 `   | j                   j                  j                  dk(  r	|d|v r|S y ||n| j                  }t	        |t
              }t        j                  | j                        j                  }||}|j                  d d \  }	}
|r|j                         }nUt	        |t              r|j                         }n4t	        |t        j                        r|j                  d   n
|d   |
z   dz   }||j                         dk(  r|S t        j                  |
|f|| j                  |j                         }|
dk7  r%|rt        j"                  |d	      }nd|d d d |
f<   |t        j$                  ||j                   
      |j'                  dd      kD  z  }|d d d d d d f   j)                  |	ddd      }||j+                         }|j                  d   }|rd|t-        d      |d d d d d d d |f   j/                  |d d d d d d f   j1                  |j                         dk(  d      |d d d d d d d |f<   |d d d d d d d |f   |d d d d d d f   j1                  |j                         z   }|dk(  }|d d d d d d d |f   j/                  ||      |d d d d d d d |f<   |S )Nflash_attention_2rK   r   rg   r   r      
fill_valuedtypedevicediagonalr}   z/Token type ids must be provided during training)r2   rk   _attn_implementationtrainingrP   r   r#   finfor|   minshapeget_max_cache_shaper   Tensordimfullr}   triuarangereshapeexpandclone
ValueErrormasked_fillto)r>   attention_masktoken_type_idsr-   cache_positioninput_tensorrv   using_static_cache	min_dtypeinputs_lead_dimsequence_lengthtarget_lengthcausal_maskmask_lengthpadding_masks                  r(   _update_causal_maskz"PaliGemmaModel._update_causal_mask   s    ;;""77;NN)c^.C%%%0%<k$--'EKK

+//	)L+7+=+=bq+A(+??AM5+??AM nell; $$R(#A&81<  %.*<*<*>!*C!!jjm,$**]k]r]r
 a#jjqA36A///0u||M.:O:OPSaSiSijlnoSppp!$a"23::?ArSUV%%++-K(..r2K !)$%VWW5@Aq,;,AV5W5c5c"1dD!#34778J8JKqPRS6Aq!\k\12
 'q!Q'<=qRVX\^_O_@`@c@cdodvdv@wwL'1,L1<Q1l{l=R1S1_1_i2K1a+-. r'   pixel_valuesc                     | j                  |      }|j                  }| j                  |      }|| j                  j                  j
                  dz  z  }|S )a  
        Obtains image last hidden states from the vision tower and apply multimodal projection.

        Args:
            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
               The tensors corresponding to the input images.
        Returns:
            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
        g      ?)ri   last_hidden_staterj   r2   rk   r:   )r>   r   image_outputsselected_image_featurerB   s        r(   get_image_featuresz!PaliGemmaModel.get_image_features   sW     )),7!.!@!@334JK'4;;+B+B+N+NPS+STr'   	input_idsr   position_idsr-   r   r   inputs_embedslabels	use_cacheoutput_attentionsoutput_hidden_statesreturn_dictkwargsreturnc                 T   |du |duz  rt        d      ||n| j                  j                  }||n| j                  j                  }||n| j                  j                  }|duxr |	du}|R| j                  j
                  | j                  k\  r/|| j                  j
                  k(  }|j                         }d||<   n|}| | j                         |      }|F||j                         nd}t        j                  |||j                  d   z   |j                        }||j                  d      dz   }|{| j                  |      }|\| | j                         t        j                   | j                  j
                  t        j"                  |j                              k(  }nR|| j                  j
                  k(  j                  d      }|j%                  |      j'                  |j                        }t)               sx||   j+                         |j+                         k7  rT|j-                  d      j-                  d      d   }t        d	| d
|j                  d   |j                  d   z   d      |j'                  |j                  |j.                        }|j1                  ||      }| j3                  ||||||      } | j4                  d|||||
||d|d	|}t7        |j8                  |j:                  |j<                  |j>                  |      S d      S )i  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   r   )r|   r}   rg   )r   zVNumber of images does not match number of special image tokens in the input text. Got z image tokens in the text but z tokens from image embeddings.T)	r   r   r-   r   r   r   r   r   r   )r   r-   r.   r/   r   r&   ) r   r2   r   r   use_return_dictimage_token_idrl   r   rp   get_seq_lengthr#   r   r   r}   	unsqueezer   tensorlong	expand_asr   r   numelsumr|   masked_scatterr   re   r   r   r-   r.   r/   )r>   r   r   r   r   r-   r   r   r   r   r   r   r   r   r   rv   special_image_maskllm_input_idspast_seen_tokensrB   image_tokens_in_textr   outputss                          r(   rC   zPaliGemmaModel.forward  sy   ^ -t";<YZZ1B1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$D0GV45G  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FM!CRC^==?de"\\ "2]5H5H5K"KTaThThN )33A6:L #!44\BN %26Qd6O6O6QLL!;!;5::VcVjVjk7 &" '04;;3M3M&M%X%XY[%\"%7%A%A-%P%S%STaThTh%i"+--@R2S2Y2Y2[_m_s_s_u2u(:'?'?A'?'F'J'Jq'J'QRS'T$ /00N~OcOcdeOfiwi}i}~  jA  PA  OB B44 
 ,..}/C/C]EXEXYN)889K^\M..NO^]\g
 &$%% 
&%+'/!5)
 
 ,%77#33!//))2>2J
 	

 QU
 	
r'   )NNNNN)NNNNNNNNNNNNN)r   r    r!   _checkpoint_conversion_mappingr   r7   rp   rs   r   boolr   r#   r$   r   r   r   
LongTensorr   r   r   r
   r   r   r   r   rC   rD   rE   s   @r(   rd   rd      s    '=>N%O"
 
:8 &*B d^BHu/@/@    '+*.1537KO595959-1$(,0/3&*x
##x
 ''x
 !.	x

 u//0x
 "%U->->(?(F"GHx
 !!1!12x
 !!1!12x
   1 12x
 ))*x
 D>x
 $D>x
 'tnx
 d^x
 -.x
  
u22	3!x
  x
r'   rd   c                       e Zd Zy)KwargsForCausalLMN)r   r    r!   r&   r'   r(   r   r     s    r'   r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c            %           e Zd ZdddddZdgZdef fdZd	 Zd
 Zd Z	d Z
ed        Zed        Zed        Zee	 	 	 	 	 	 	 	 	 	 	 	 	 	 d'dej$                  dej&                  deej*                     deej$                     deeeej&                     ef      deej$                     deej$                     deej&                     deej$                     dee   dee   dee   dee   deeej*                  f   dee   deeef   f d               Z	 	 	 	 	 	 	 	 	 	 d( fd!	Z e!dej*                  d"ed#ed$ejD                  dej*                  d%efd&       Z# xZ$S ))!PaliGemmaForConditionalGenerationzmodel.language_modelzmodel.vision_towerzmodel.multi_modal_projectorlm_head)z^language_model.modelz^vision_towerz^multi_modal_projectorz^language_model.lm_headzlm_head.weightr2   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr4   )r6   r7   rd   modelr   r8   rk   r:   rl   r   rn   r=   s     r(   r7   z*PaliGemmaForConditionalGeneration.__init__  sS     #F+
yy!3!3!?!?ASASA^A^ejkr'   c                 6    | j                   j                         S rA   )r   rp   rq   s    r(   rp   z6PaliGemmaForConditionalGeneration.get_input_embeddings  s    zz..00r'   c                 :    | j                   j                  |       y rA   )r   rs   rt   s     r(   rs   z6PaliGemmaForConditionalGeneration.set_input_embeddings  s    

''.r'   c                     | j                   S rA   r   rq   s    r(   get_output_embeddingsz7PaliGemmaForConditionalGeneration.get_output_embeddings  s    ||r'   c                     || _         y rA   r   )r>   new_embeddingss     r(   set_output_embeddingsz7PaliGemmaForConditionalGeneration.set_output_embeddings  s	    %r'   c                 .    | j                   j                  S rA   )r   re   rq   s    r(   re   z0PaliGemmaForConditionalGeneration.language_model  s    zz(((r'   c                 .    | j                   j                  S rA   )r   ri   rq   s    r(   ri   z.PaliGemmaForConditionalGeneration.vision_tower  s    zz&&&r'   c                 .    | j                   j                  S rA   )r   rj   rq   s    r(   rj   z7PaliGemmaForConditionalGeneration.multi_modal_projector  s    zz///r'   r   r   r   r   r-   r   r   r   r   r   r   r   r   logits_to_keepr   r   c                 >   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  } | j                  d||||||||
|	||d|d|}|d   }t        |t              rt        | d      n|}| j                  |dd|ddf         }d}|	4 | j                  d||	| j                   j                  j                  d|}t        |||j                  |j                  |j                  |j                         S )r   NT)r   r   r   r   r   r-   r   r   r   r   r   r   r   r   )r,   r   rl   )r+   r,   r-   r.   r/   r   r&   )r2   r   r   r   r   rP   intslicer   loss_functionrk   rl   r*   r-   r.   r/   r   )r>   r   r   r   r   r-   r   r   r   r   r   r   r   r   r   r   r   r.   slice_indicesr,   r+   s                        r(   rC   z)PaliGemmaForConditionalGeneration.forward  sS   ^ 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]$** 
%))%+'/!5)
 
"  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD /#33!//)) ' ; ;
 	
r'   c                 *   t        |   |f||||||	|
|d|}|j                  d      |dxx   dz  cc<   |d   dk(  r||d<   |d uxr |d u}|d   dk(  r;t        |t              r+||n|}| j
                  j                  ||||||      }||d<   |S )N)r-   r   r   r   r   r   r   r   r   r   r   r   r   )r6   prepare_inputs_for_generationgetrP   r   r   r   )r>   r   r-   r   r   r   r   r   r   r   r   r   r   model_inputsrv   r   r   r?   s                    r(   r   z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation
  s      w<
+')%)))
 
 N+7(A-( !!+7L($D0GV45G!!j+&N,9,E=9L**88Q]_jK .9L)*r'   r   r   r|   
batch_sizec                    | | j                         dk(  r| }|S t        j                  |      j                  }t        j                  ||f|||j
                        }|dk7  rt        j                  |d      }|t        j                  ||j
                        |j                  dd      kD  z  }|ddddddf   j                  |ddd      }| |j                         }| j                  d   }	|ddddddd|	f   | ddddddf   j                  |j
                        z   }
|
dk(  }
|ddddddd|	f   j                  |
|      |ddddddd|	f<   |S )	aM  
        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.

        Args:
            attention_mask (`torch.Tensor`):
                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
                `(batch_size, 1, query_length, key_value_length)`.
            sequence_length (`int`):
                The sequence length being processed.
            target_length (`int`):
                The target length: when generating with static cache, the mask should be as long as the static cache,
                to account for the 0 padding, the part of the cache that is not filled yet.
            dtype (`torch.dtype`):
                The dtype to use for the 4D attention mask.
            cache_position (`torch.Tensor`):
                Indices depicting the position of the input sequence tokens in the sequence.
            batch_size (`torch.Tensor`):
                Batch size.
        Nry   rz   r   r~   r   rg   r   )r   r#   r   r   r   r}   r   r   r   r   r   r   r   r   )r   r   r   r|   r   r   r   r   r   r   r   s              r(   5_prepare_4d_causal_attention_mask_with_cache_positionzWPaliGemmaForConditionalGeneration._prepare_4d_causal_attention_mask_with_cache_position8  s   > %.*<*<*>!*C(K* ' E*..I** -0Ye\j\q\qK !##jjqA5<<n>S>STWeWmWmnprsWtttK%dD!Q&67>>z1bRTUK))//1,2226*1aL[L+@ANSTVZ\`bcScDdDgDg&&E    ,q05@Aq,;,AV5W5c5c )6Aq!\k\12 r'   )NNNNNNNNNNNNNr   )
NNNNNNNTNN)%r   r    r!   r   _tied_weights_keysr   r7   rp   rs   r   r   propertyre   ri   rj   r   r   r#   r   r$   r   r   r   r   r
   r   r   r   r   r   r*   rC   r   staticmethodr|   r   rD   rE   s   @r(   r   r     s    "8-"?#,	&" ++ 1/& ) ) ' ' 0 0  '+*.1537KO595959-1$(,0/3&*34V
##V
 ''V
 !.	V

 u//0V
 "%U->->(?(F"GHV
 !!1!12V
 !!1!12V
   1 12V
 ))*V
 D>V
 $D>V
 'tnV
 d^V
 c5<</0V
  *+!V
" 
u55	6#V
  V
v ,\ 444 4 {{	4
 4 4 4r'   r   )r   rG   rd   )0r"   dataclassesr   typingr   r   r   r   r#   torch.utils.checkpointr   cache_utilsr
   r   r   
generationr   modeling_flash_attention_utilsr   modeling_outputsr   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   autor   configuration_paligemmar   
get_loggerr   loggerr   r*   Moduler1   rG   rd   r   r   __all__r&   r'   r(   <module>r      s     ! / /    : : ) B 7 - & q q  4 
		H	% <#: < <@ $<k $< $<N299  ) ) )0 
e
- e

e
P ?,j > 
f(@/ f
fR ^r'   