
    UhW6                    ,   d Z ddlmZ ddlmZ ddlmZmZmZm	Z	m
Z
 ddlZddlZddlmZmZ ddlmZ dd	lmZmZ dd
lmZmZ ddlmZ ddlmZmZmZmZmZ ddlm Z m!Z!m"Z"  e       rddl#m$Z$  ejJ                  e&      Z'dej                  dej                  fdZ(dej                  dej                  fdZ)e G d de             Z*dedefdZ+dedefdZ,d Z-d Z.e G d de             Z/e G d d e             Z0 G d! d"ejb                        Z2 G d# d$ejb                        Z3 G d% d&ejb                        Z4 G d' d(ejb                        Z5 G d) d*ejb                        Z6e G d+ d,e             Z7 G d- d.ejb                        Z8 G d/ d0ejb                        Z9 G d1 d2e7      Z: G d3 d4ejb                        Z; G d5 d6e7      Z<e G d7 d8e7             Z= G d9 d:ejb                        Z> G d; d<ejb                        Z? G d= d>e7      Z@g d?ZAy)@zPyTorch OWLv2 model.    )	dataclass)	lru_cache)AnyDictOptionalTupleUnionN)Tensornn   )ACT2FN) _create_4d_causal_attention_mask_prepare_4d_attention_mask)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)ModelOutputauto_docstringis_vision_availablelogging	torch_int   )Owlv2ConfigOwlv2TextConfigOwlv2VisionConfig)center_to_corners_formatlogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr!   )r   s    z/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/owlv2/modeling_owlv2.pycontrastive_lossr(   ,   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r(   t)r*   caption_loss
image_losss      r'   
owlv2_lossr/   1   s,    #J/L!*,,.1J:%,,r)   c                      e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed<   dZeed	<   d
ee   fdZy)Owlv2Outputa!  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Contrastive loss for image-text similarity.
        logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
            similarity scores.
        logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
            similarity scores.
        text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
            The image embeddings obtained by applying the projection layer to the pooled output of
            [`Owlv2VisionModel`].
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`Owlv2TextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Owlv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)r7   r8   Ngetattrto_tuple.0kselfs     r'   	<genexpr>z'Owlv2Output.to_tuple.<locals>.<genexpr>W   =      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysrB   s   `r'   r>   zOwlv2Output.to_tupleV   #     
YY[
 
 	
r)   )__name__
__module____qualname____doc__r2   r   r$   FloatTensor__annotations__r3   r4   r5   r6   r7   r   r8   r   r   r>    r)   r'   r1   r1   7   s    * )-D(5$$
%,48hu001837OXe//07/3K%++,304L(5,,-448186:3:
%* 
r)   r1   r,   c                    | j                         r>| j                  t        j                  t        j                  fv r| S | j                         S | j                  t        j                  t        j                  fv r| S | j                         S N)	is_floating_pointdtyper$   float32float64floatint32int64int)r,   s    r'   _upcastr\   ^   s`    GGu}}==qL1779LGGU[[99qFquuwFr)   boxesc                 f    t        |       } | dddf   | dddf   z
  | dddf   | dddf   z
  z  S )a  
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    N   r   r   r   )r\   )r]   s    r'   box_arear`   g   sB     ENE!Q$K%1+%%1+ad*CDDr)   c                 ^   t        |       }t        |      }t        j                  | d d d d df   |d d d df         }t        j                  | d d d dd f   |d d dd f         }||z
  j	                  d      }|d d d d df   |d d d d df   z  }|d d d f   |z   |z
  }||z  }	|	|fS )Nr_   r   minr   )r`   r$   maxrc   clamp)
boxes1boxes2area1area2left_topright_bottomwidth_heightinterunionious
             r'   box_iourp   x   s    VEVEyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29LAq!LAq$99E!T'NU"U*E
%-C:r)   c                    | ddddf   | ddddf   k\  j                         st        d|        |ddddf   |ddddf   k\  j                         st        d|       t        | |      \  }}t        j                  | dddddf   |ddddf         }t        j
                  | dddddf   |ddddf         }||z
  j                  d      }|dddddf   |dddddf   z  }|||z
  |z  z
  S )z
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    Nr_   z<boxes1 must be in [x0, y0, x1, y1] (corner) format, but got z<boxes2 must be in [x0, y0, x1, y1] (corner) format, but got r   rb   r   )all
ValueErrorrp   r$   rc   rd   re   )rf   rg   ro   rn   top_leftbottom_rightrl   areas           r'   generalized_box_iourw      s*    1ab5MVArrE]*//1WX^W_`aa1ab5MVArrE]*//1WX^W_`aa(JCyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29L1a <1a#88D$,$&&&r)   c                   l   e Zd ZU dZdZeej                     ed<   dZ	ee
   ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed	<   dZeej                     ed
<   dZeed<   dZeed<   dee   fdZy)Owlv2ObjectDetectionOutputa	  
    Output type of [`Owlv2ForObjectDetection`].

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
            scale-invariant IoU loss.
        loss_dict (`Dict`, *optional*):
            A dictionary containing the individual losses. Useful for logging.
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
            The objectness logits of all image patches. OWL-ViT represents images as a set of image patches where the
            total number of patches is (image_size / patch_size)**2.
        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
            possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
            unnormalized bounding boxes.
        text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
            The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
            embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`Owlv2TextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Owlv2VisionModel`].
    Nr2   	loss_dictr   objectness_logits
pred_boxesr5   r6   class_embedsr7   r8   r   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr;   r<   r?   s     r'   rC   z6Owlv2ObjectDetectionOutput.to_tuple.<locals>.<genexpr>   rD   rE   rF   rI   s   `r'   r>   z#Owlv2ObjectDetectionOutput.to_tuple   rJ   r)   )rK   rL   rM   rN   r2   r   r$   rO   rP   rz   r   r   r{   r|   r5   r6   r}   r7   r   r8   r   r   r>   rQ   r)   r'   ry   ry      s     D )-D(5$$
%, $Ix~$*.FHU&&'.59x 1 129.2J**+2/3K%++,304L(5,,-404L(5,,-448186:3:
%* 
r)   ry   c                   0   e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeej                     ed<   dZeed	<   dZeed
<   dee   fdZy)%Owlv2ImageGuidedObjectDetectionOutputa  
    Output type of [`Owlv2ForObjectDetection.image_guided_detection`].

    Args:
        logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
            Classification logits (including no-object) for all queries.
        target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual target image in the batch
            (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
            values are normalized in [0, 1], relative to the size of each individual query image in the batch
            (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
            retrieve the unnormalized bounding boxes.
        image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
            image embeddings for each patch.
        query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
            Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
            image embeddings for each patch.
        class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
            Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
            number of patches is (image_size / patch_size)**2.
        text_model_output (Tuple[`BaseModelOutputWithPooling`]):
            The output of the [`Owlv2TextModel`].
        vision_model_output (`BaseModelOutputWithPooling`):
            The output of the [`Owlv2VisionModel`].
    Nr   r6   query_image_embedstarget_pred_boxesquery_pred_boxesr}   r7   r8   r   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr;   r<   r?   s     r'   rC   zAOwlv2ImageGuidedObjectDetectionOutput.to_tuple.<locals>.<genexpr>  rD   rE   rF   rI   s   `r'   r>   z.Owlv2ImageGuidedObjectDetectionOutput.to_tuple  rJ   r)   )rK   rL   rM   rN   r   r   r$   rO   rP   r6   r   r   r   r}   r7   r   r8   r   r   r>   rQ   r)   r'   r   r      s    > +/FHU&&'.04L(5,,-46:!2!23:59x 1 12948hu001804L(5,,-448186:3:
%* 
r)   r   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e
dej                  fd
Z xZS )Owlv2VisionEmbeddingsconfigc                    t         |           |j                  | _        || _        |j                  | _        t        j                  t        j                  |j                              | _
        t        j                  |j                  | j
                  |j                  |j                  d      | _        |j                  |j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j
                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasr_   r   position_idsr   
persistent)super__init__
patch_sizer   hidden_size	embed_dimr   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embedding
image_sizenum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr%   expandrB   r   	__class__s     r'   r   zOwlv2VisionEmbeddings.__init__  s    ++++!||EKK8J8J,KL!yy++))$$ 
 #--1B1BBqH!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr)   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr   g      ?r   r_   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer$   jit
is_tracingr   r   r   reshapepermuter   r"   interpolateviewcat)rB   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r'   interpolate_pos_encodingz.Owlv2VisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr)   pixel_valuesr   c                 h   |j                   \  }}}}| j                  |      }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )Nr_   r   r   r   )r   r   flatten	transposer   r   r$   r   r   r   r   )
rB   r   r   
batch_size_r   r   patch_embedsr}   r   s
             r'   forwardzOwlv2VisionEmbeddings.forwardI  s    '3'9'9$
Avu++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr)   F)rK   rL   rM   r   r   r$   r
   r[   r   rO   boolr   __classcell__r   s   @r'   r   r   
  sm    q0 q*'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 QU bgbnbn r)   r   c            	            e Zd Zdef fdZ	 	 	 ddeej                     deej                     deej                     dej                  fdZ
 xZS )	Owlv2TextEmbeddingsr   c                 ^   t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        | j                  dt        j                  |j                        j                  d      d       y )Nr   r   Fr   )r   r   r   r   
vocab_sizer   token_embeddingmax_position_embeddingsr   r   r$   r%   r   r   s     r'   r   zOwlv2TextEmbeddings.__init__Y  s    !||F,=,=v?Q?QR"$,,v/M/MvOaOa"b 	ELL)G)GHOOPWXej 	 	
r)   	input_idsr   inputs_embedsr   c                     ||j                   d   n|j                   d   }|| j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr   )r   r   r   r   )rB   r   r   r   
seq_lengthposition_embeddingsr   s          r'   r   zOwlv2TextEmbeddings.forwardc  s{     -6,AY__R(}GZGZ[]G^
,,Q^<L  00;M"55lC"%88
r)   )NNN)rK   rL   rM   r   r   r   r$   
LongTensorrO   r
   r   r   r   s   @r'   r   r   X  sj    
 
 153759	E,,- u//0   1 12	
 
r)   r   c                       e Zd ZdZ fdZdej                  dedefdZ	 	 	 ddej                  de	ej                     d	e	ej                     d
e	e
   deej                  e	ej                     e	eej                        f   f
dZ xZS )Owlv2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                 
   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).      )r   r   r   r   r   num_attention_heads	num_headshead_dimrs   scaleattention_dropoutdropoutr   Lineark_projv_projq_projout_projr   s     r'   r   zOwlv2Attention.__init__{  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar)   tensorseq_lenbszc                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   r_   )r   r   r   r   
contiguous)rB   r   r   r   s       r'   _shapezOwlv2Attention._shape  s7    {{3GQQRSUVWbbddr)   hidden_statesattention_maskcausal_attention_maskoutput_attentionsr   c                    |j                         \  }}}| j                  |      | j                  z  }| j                  | j	                  |      d|      }	| j                  | j                  |      d|      }
|| j                  z  d| j                  f} | j                  |||      j                  | } |	j                  | }	 |
j                  | }
|	j                  d      }t        j                  ||	j                  dd            }|j                         || j                  z  ||fk7  r/t        d|| j                  z  ||f d|j                                |{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }|{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }t        j                  j                  |d      }|r?|j                  || j                  ||      }|j                  || j                  z  ||      }nd}t        j                  j!                  || j                   | j"                  	      }|j%                  |
j&                        }t        j                  ||
      }|j                         || j                  z  || j                  fk7  r7t        d
|| j                  || j                  f d|j                                |j                  || j                  || j                        }|j                  dd      }|j)                  |||      }| j+                  |      }||fS )z#Input shape: Batch x Time x Channelr   r   r_   z$Attention weights should be of size z	, but is Nz!Attention mask should be of size r   )ptrainingz `attn_output` should be of size )r   r   r   r   r   r   r   r   r   r$   bmmr   rs   r   r"   softmaxr   r   torU   r   r   )rB   r   r   r   r   r   tgt_lenr   query_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                    r'   r   zOwlv2Attention.forward  s    #0"4"4"6Wi {{=1DJJ>[[]!;RE
{{4;;}#=r3GDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(*  !,$))+Q/II 7a'8R7S T-22457  (,,S$..'7SVkkL',,S4>>-A7GTL%""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B
 %1$5$5c4>>7T[$\!055cDNN6JGU\]L$(!]]**<4<<RVR_R_*`
  ]]<#5#56
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1!))#w	BmmK0111r)   NNF)rK   rL   rM   rN   r   r$   r
   r[   r   r   r   r   r   r   r   s   @r'   r   r   x  s    GB&eU\\ eC ec e 268<,1O2||O2 !.O2  (5	O2
 $D>O2 
u||Xell3XeELL>Q5RR	SO2r)   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Owlv2MLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y rS   )r   r   r   r   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r'   r   zOwlv2MLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr)   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rS   )r  r
  r  )rB   r   s     r'   r   zOwlv2MLP.forward  s4    /**=9/r)   )rK   rL   rM   r   r$   r
   r   r   r   s   @r'   r  r    s$    KU\\ ell r)   r  c                        e Zd Zdef fdZ	 d	dej                  dej                  dej                  dee   de	ej                     f
dZ xZS )
Owlv2EncoderLayerr   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2r   s     r'   r   zOwlv2EncoderLayer.__init__  sm    ++'/<<F<Q<QRF#<<F<Q<QRr)   r   r   r   r   r   c                     |}| j                  |      }| j                  ||||      \  }}||z   }|}| j                  |      }| j                  |      }||z   }|f}|r||fz  }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        )r   r   r   r   )r  r  r  r  )rB   r   r   r   r   residualr  outputss           r'   r   zOwlv2EncoderLayer.forward  s    " !((7&*nn')"7/	 '5 '
#| !=0 ((7/ =0 "&Gr)   r   )rK   rL   rM   r   r   r$   r
   r   r   r   rO   r   r   r   s   @r'   r  r    sf    S{ S -2&||& &  %||	&
 $D>& 
u  	!&r)   r  c                   $    e Zd ZeZdZdZdgZd Zy)Owlv2PreTrainedModelowlv2Tr  c                 L
   | j                   j                  }t        |t              rj|j                  j
                  j                  j                  d|dz         |j                  j
                  j                  j                  d|dz         nt        |t              r| j                   j                  }t        j                  j                  |j                  d|j                  dz  |z         t        j                  j                  |j                  j
                  |j                   j                  |z         t        j                  j                  |j                  j
                  |j                   j                  |z         nt        |t               r-| j                   j                  }|j                  dz  d|j                   j"                  z  dz  z  |z  }|j                  dz  |z  }t        j                  j                  |j$                  j
                  |       t        j                  j                  |j&                  j
                  |       t        j                  j                  |j(                  j
                  |       t        j                  j                  |j*                  j
                  |       nt        |t,              r| j                   j                  }|j                   j.                  dz  d|j                   j"                  z  dz  z  |z  }d|j                   j.                  z  dz  |z  }t        j                  j                  |j0                  j
                  |       t        j                  j                  |j2                  j
                  |       nt        |t4              rt        j                  j                  |j6                  j
                  |j8                  dz  | j                   j                  z         t        j                  j                  |j:                  j
                  |j<                  dz  | j                   j                  z         t        |t        j>                        rI|j@                  j                  jC                          |j
                  j                  jE                  d       t        |t        jF                        r2|j@                  %|j@                  j                  jC                          yyy)	zInitialize the weights        g{Gz?)meanstdr   )r$  r_         ?N)$r   initializer_factor
isinstancer   r   r   datanormal_r   r   r   initr   r   r   initializer_ranger   num_hidden_layersr   r   r   r   r  r   r  r  
Owlv2Modeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr  r   zero_fill_r   )rB   modulefactorin_proj_stdout_proj_stdfc_stds         r'   _init_weightsz"Owlv2PreTrainedModel._init_weights.  s   //f12""))..66CVd]6S%%,,1199sQU9V 56[[33FGGOOF22&BRBRTXBX[aBaObGGOOF2299v}}?^?^ag?gOhGGOOF55<<&--BaBadjBjOk/[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LGGOOFMM00kOBGGOOFMM00kOBGGOOFMM00kOBGGOOFOO22OE)[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFGGOOFJJ--6O:GGOOFJJ--;O?
+GGOO&&--))4/$++2P2PP   GGOO((//++T1DKK4R4RR   fbll+KK""$MM$$S)fbii(V[[-DKK""$ .E(r)   N)	rK   rL   rM   r   config_classbase_model_prefixsupports_gradient_checkpointing_no_split_modulesr9  rQ   r)   r'   r  r  &  s#     L&*#,-&%r)   r  c                        e Zd ZdZdef fdZ	 	 	 	 	 ddeej                     deej                     dee	   dee	   dee	   d	e
eef   fd
Z xZS )Owlv2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Owlv2EncoderLayer`].

    Args:
        config: Owlv2Config
    r   c                     t         |           t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        d| _        y c c}w )NF)	r   r   r   
ModuleListranger,  r  layersgradient_checkpointing)rB   r   r   r   s      r'   r   zOwlv2Encoder.__init__a  sH    mmfNfNfHg$h1%6v%>$hi&+# %is   Ar   r   r   output_hidden_statesreturn_dictr   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd}|rdnd}|}	| j                  D ]_  }
|r||	fz   }| j
                  r,| j                  r | j                  |
j                  |	|||      }n |
|	|||      }|d   }	|sW||d   fz   }a |r||	fz   }|st        d |	||fD              S t        |	||      S )a  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`).
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Causal mask for the text model. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            output_hidden_states (`bool`, *optional*):
                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
                for more detail.
            return_dict (`bool`, *optional*):
                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
        NrQ   )r   r   r   c              3   &   K   | ]	  }||  y wrS   rQ   )r@   vs     r'   rC   z'Owlv2Encoder.forward.<locals>.<genexpr>  s     eqWXWde   )last_hidden_stater   
attentions)r   r   rE  use_return_dictrC  rD  r   _gradient_checkpointing_func__call__rG   r   )rB   r   r   r   r   rE  rF  encoder_statesall_attentionsr   encoder_layerlayer_outputss               r'   r   zOwlv2Encoder.forwardf  s=   > 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]30d%![[ 	FM#!/=2B!B**t}} $ A A!**!")%! !.!")&7	! *!,M !/=3C2E!E-	F0  +}.>>Ne]NN$Seee+>Vd
 	
r)   NNNNN)rK   rL   rM   rN   r   r   r   r$   r
   r   r	   r   r   r   r   r   s   @r'   r?  r?  X  s    ,{ , 268<,0/3&*H
 !.H
  (5	H

 $D>H
 'tnH
 d^H
 
uo%	&H
r)   r?  c                        e Zd Zdef fdZe	 	 	 	 	 ddej                  deej                     deej                     dee	   dee	   dee	   d	e
eef   fd
       Z xZS )Owlv2TextTransformerr   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        y r  )r   r   r   r   r   r   r?  encoderr   r  r  final_layer_norm)rB   r   r   r   s      r'   r   zOwlv2TextTransformer.__init__  sQ    &&	-f5#F+ "YF<Q<Q Rr)   r   r   r   r   rE  rF  r   c                 ,   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|j	                         }|j                  d|d         }| j                  ||      }t        ||j                  |j                        }	|t        ||j                        }| j                  |||	|||      }
|
d   }| j                  |      }|t        j                  |j                  d   |j                        |j!                  t        j"                        j%                  d      j!                  |j                        f   }|s
||f|
dd z   S t'        |||
j(                  |
j*                  	      S )
a|  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        Nr   )r   r   r    )r   r   r   r   rE  rF  r   r   r   rK  pooler_outputr   rL  )r   r   rE  rM  r   r   r   r   rU   r!   r   rX  rY  r$   r%   r   r   r[   argmaxr   r   rL  )rB   r   r   r   r   rE  rF  input_shaper   r   encoder_outputsrK  pooled_outputs                r'   r   zOwlv2TextTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]nn&NN2{27	),W
 !A,,]5I5I!
 %7H[H[\N,,')"7/!5# ' 
 ,A. 112CD *LL*003<M<T<TULL#**r*2556G6N6NOQ

 %}58KKK)/')77&11	
 	
r)   rT  )rK   rL   rM   r   r   r   r$   r
   r   r   r	   r   r   r   r   r   s   @r'   rV  rV    s    S S  26/3,0/3&*?
<<?
 !.?
 u||,	?

 $D>?
 'tn?
 d^?
 
u00	1?
 ?
r)   rV  c                        e Zd ZeZdef fdZdej                  fdZd Z	e
	 	 	 	 ddej                  deej                     dee   d	ee   d
ee   deeef   fd       Z xZS )Owlv2TextModelr   c                 d    t         |   |       t        |      | _        | j	                          y rS   )r   r   rV  
text_model	post_initr   s     r'   r   zOwlv2TextModel.__init__  s&     .v6r)   r   c                 B    | j                   j                  j                  S rS   rd  r   r   rI   s    r'   get_input_embeddingsz#Owlv2TextModel.get_input_embeddings  s    ))999r)   c                 :    || j                   j                  _        y rS   rg  )rB   values     r'   set_input_embeddingsz#Owlv2TextModel.set_input_embeddings  s    5:""2r)   r   r   r   rE  rF  c                 .    | j                  |||||      S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> from transformers import AutoProcessor, Owlv2TextModel

        >>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   r   rE  rF  )rd  )rB   r   r   r   rE  rF  s         r'   r   zOwlv2TextModel.forward  s)    < )/!5#  
 	
r)   )NNNN)rK   rL   rM   r   r:  r   r   Modulerh  rk  r   r$   r
   r   r   r	   r   r   r   r   r   s   @r'   rb  rb    s    "L :bii :;  26,0/3&*#
<<#
 !.#
 $D>	#

 'tn#
 d^#
 
u00	1#
 #
r)   rb  c                        e Zd Zdef fdZe	 	 	 	 d
dej                  dee	   dee	   dee	   dee	   de
eef   fd	       Z xZS )Owlv2VisionTransformerr   c                 0   t         |           || _        t        |      | _        t        j                  |j                  |j                        | _	        t        |      | _        t        j                  |j                  |j                        | _        y r  )r   r   r   r   r   r   r  r   r  pre_layernormr?  rX  post_layernormr   s     r'   r   zOwlv2VisionTransformer.__init__7  sk    /7\\&*<*<&BWBWX#F+ ll6+=+=6CXCXYr)   r   r   rE  r   rF  r   c                 (   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j                  j
                  j                  j                  }|j                  |      }| j	                  ||      }| j                  |      }| j                  ||||      }|d   }	|	d d dd d f   }
| j                  |
      }
|s
|	|
f|dd  z   S t        |	|
|j                  |j                        S )N)r   )r   r   rE  rF  r   r   r[  )r   r   rE  rM  r   r   r   rU   r   rr  rX  rs  r   r   rL  )rB   r   r   rE  r   rF  expected_input_dtyper   r_  rK  r`  s              r'   r   zOwlv2VisionTransformer.forward@  s-    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]  $>>EEKK#';<Ogh**=9,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r)   )NNFN)rK   rL   rM   r   r   r   r$   rO   r   r   r	   r   r   r   r   r   s   @r'   rp  rp  6  s    Z0 Z  -1/338&*)
'')
 $D>)
 'tn	)

 #+4.)
 d^)
 
u00	1)
 )
r)   rp  c                        e Zd ZeZdZdef fdZdej                  fdZ	e
	 	 	 	 	 ddeej                     dee   dee   ded	ee   deeef   fd
       Z xZS )Owlv2VisionModelr   r   c                 d    t         |   |       t        |      | _        | j	                          y rS   )r   r   rp  vision_modelre  r   s     r'   r   zOwlv2VisionModel.__init__r  s'     26:r)   r   c                 B    | j                   j                  j                  S rS   )ry  r   r   rI   s    r'   rh  z%Owlv2VisionModel.get_input_embeddingsx  s      ++;;;r)   r   rE  r   rF  c                 .    | j                  |||||      S )a  
        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Owlv2VisionModel

        >>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rE  r   rF  )ry  )rB   r   r   rE  r   rF  s         r'   r   zOwlv2VisionModel.forward{  s+    6   %/!5%=# ! 
 	
r)   NNNFN)rK   rL   rM   r   r:  main_input_namer   r   rn  rh  r   r   r$   rO   r   r	   r   r   r   r   r   s   @r'   rw  rw  n  s    $L$O0 <bii <  59,0/3).&* 
u001 
 $D> 
 'tn	 

 #' 
 d^ 
 
u00	1 
  
r)   rw  c                       e Zd ZeZdef fdZe	 	 	 	 	 ddeej                     deej                     dee
   dee
   dee
   dej                  fd	       Ze	 	 	 	 	 dd
eej                     dee
   dee
   de
dee
   dej                  fd       Ze	 	 	 	 	 	 	 	 	 ddeej                     d
eej                     deej                     dee
   dee
   dee
   de
dee
   dee
   deeef   fd       Z xZS )r-  r   c                 <   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _        |j                  | _        t        |      | _        t        |      | _        t#        j$                  | j                  | j                  d      | _        t#        j$                  | j                  | j                  d      | _        t#        j*                  t-        j.                  |j0                              | _        | j5                          y )NzLconfig.text_config is expected to be of type Owlv2TextConfig but is of type .zPconfig.vision_config is expected to be of type Owlv2VisionConfig but is of type F)r   )r   r   r'  text_configr   	TypeErrortypevision_configr   projection_dimr   r/  r1  rV  rd  rp  ry  r   r   r0  r.  r   r$   r   logit_scale_init_valuelogit_scalere  )rB   r   r  r  r   s       r'   r   zOwlv2Model.__init__  sR    &,,o>++,-Q0 
 &..0AB--./q2 
 ((,,$33)55 - 9 9.{;2=A!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<V5R5R(ST 	r)   r   r   r   rE  rF  r   c                     ||n| j                   j                  }| j                  |||      }|d   }| j                  |      }|S )aL  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Returns:
            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
            applying the projection layer to the pooled output of [`Owlv2TextModel`].

        Examples:
        ```python
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> text_features = model.get_text_features(**inputs)
        ```)r   r   rF  r   )r   rM  rd  r.  )	rB   r   r   r   rE  rF  text_outputr`  text_featuress	            r'   get_text_featureszOwlv2Model.get_text_features  sP    > &1%<k$++B]B] oo	.fqor#A,,];r)   r   r   c                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  |||||      }|d   }| j                  |      }|S )aO  
        Returns:
            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
            applying the projection layer to the pooled output of [`Owlv2VisionModel`].

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(images=image, return_tensors="pt")
        >>> image_features = model.get_image_features(**inputs)
        ```r|  r   )r   r   rE  rM  ry  r0  )	rB   r   r   rE  r   rF  vision_outputsr`  image_featuress	            r'   get_image_featureszOwlv2Model.get_image_features  s    8 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 'q)//>r)   return_lossreturn_base_image_embedsc
           	      (   ||n| j                   j                  }||n| j                   j                  }|	|	n| j                   j                  }	| j	                  |||||	      }
| j                  |||||	      }|d   }| j                  |      }|
d   }| j                  |      }|t        j                  j                  |ddd      z  }|t        j                  j                  |ddd      z  }| j                  j                         j                  |j                        }t        j                  ||j!                               |z  }|j!                         }d}|rt#        |      }|}|	s||||||
f}||f|z   S |S t%        |||||||
	      S )
a4  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        return_base_image_embeds (`bool`, *optional*):
            Whether or not to return the base image embeddings.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import requests
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```Nr|  rm  r   r_   r   T)ordr   keepdim)r2   r3   r4   r5   r6   r7   r8   )r   r   rE  rM  ry  rd  r.  r0  r$   linalgnormr  expr   r!   matmulr,   r/   r1   )rB   r   r   r   r  r   rE  r   r  rF  r  text_outputsr5   r6   text_embeds_normr  r4   r3   r2   outputs                       r'   r   zOwlv2Model.forward  s   F 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++B]B]**%/!5%=# + 
 )/!5# ' 
 #1o**;7%a(--l; $ell&7&7!QS]a&7&bb&):):;ASU_c):)dd &&**,//0C0CD,,'79IJ[X*,,.o.D&&lT`bpqF)-)9TGf$EvE-+#%* .
 	
r)   rT  r}  )	NNNNNNFNN)rK   rL   rM   r   r:  r   r   r   r$   r
   r   rO   r  r  r   r	   r   r1   r   r   r   s   @r'   r-  r-    s    L{ @  -115,0/3&*%ELL)% !.% $D>	%
 'tn% d^% 
		% %N  59,0/3).&*,u001, $D>, 'tn	,
 #', d^, 
		, ,\  154815&*,0/3).37&*Z
E,,-Z
 u001Z
 !.	Z

 d^Z
 $D>Z
 'tnZ
 #'Z
 #+4.Z
 d^Z
 
uk!	"Z
 Z
r)   r-  c                   b     e Zd Zddedef fdZdej                  dej                  fdZ	 xZ
S )Owlv2BoxPredictionHeadr   out_dimc                 "   t         |           |j                  j                  }t	        j
                  ||      | _        t	        j
                  ||      | _        t	        j                         | _	        t	        j
                  ||      | _
        y rS   )r   r   r  r   r   r   dense0dense1GELUgeludense2)rB   r   r  r   r   s       r'   r   zOwlv2BoxPredictionHead.__init__{  sb    $$00iiu-iiu-GGI	iiw/r)   r  r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rS   )r  r  r  r  )rB   r  r  s      r'   r   zOwlv2BoxPredictionHead.forward  sM    ^,6"V$6"V$r)   )   )rK   rL   rM   r   r[   r   r$   r
   rO   r   r   r   s   @r'   r  r  z  s3    0{ 0S 0ell u7H7H r)   r  c            	            e Zd Zdef fdZdej                  deej                     deej                     de	ej                     fdZ
 xZS )Owlv2ClassPredictionHeadr   c                    t         |           |j                  j                  }|j                  j                  | _        t        j                  | j
                  |      | _        t        j                  | j
                  d      | _	        t        j                  | j
                  d      | _
        t        j                         | _        y )Nr   )r   r   r  r   r  	query_dimr   r   r  logit_shiftr  ELUelu)rB   r   r  r   s      r'   r   z!Owlv2ClassPredictionHead.__init__  s    $$00--99ii899T^^Q799T^^Q7668r)   r6   query_embeds
query_maskr   c                 0   | j                  |      }|S|j                  }|j                  d d \  }}t        j                  ||| j
                  f      j                  |      }||fS |t        j                  j                  |dd      dz   z  }|t        j                  j                  |dd      dz   z  }t        j                  d||      }| j                  |      }	| j                  |      }
| j                  |
      dz   }
||	z   |
z  }||j                  dkD  rt        j                  |d	      }t        j                  |d
k(  t        j                   |j"                        j$                  |      }|j                  t        j&                        }||fS )Nr_   r   T)r   r  gư>z...pd,...qd->...pqr   r   r   r   )r  r!   r   r$   zerosr  r   r  r  einsumr  r  r  ndimr   wherefinforU   rc   rV   )rB   r6   r  r  image_class_embedsr!   r   r   pred_logitsr  r  s              r'   r   z Owlv2ClassPredictionHead.forward  s    "[[6'..F&8&>&>r&B#J++z;&OPSSTZ[K!344 05<<3D3DEW]_im3D3nqu3uv#u||'8'82W['8'\_c'cd ll#79K\Z &&|4&&|4hh{+a/"[0K?!""__ZR@
++jAou{{;CTCT7U7Y7Y[fgK%..7K/00r)   )rK   rL   rM   r   r   r$   rO   r   r
   r   r   r   r   s   @r'   r  r    s_    	{ 	!1''!1 u001!1 U\\*	!1
 
u  	!!1r)   r  c                       e Zd ZeZdef fdZedededej                  fd       Z
dej                  dej                  fdZ ed	
      	 d!dededeej                     dej                  fd       Z	 d"dej                  dej                  dedej                  fdZ	 	 d#dej                  deej                     deej                     deej                     fdZ	 	 	 d$dej                  dej                  dej                  dee   dee   dedeej                     fdZ	 	 	 d$dej                  dee   dee   dedeej                     f
dZ	 d"dej                  dej                  dedej                  fdZe	 	 	 	 	 d%dej                  deej                     dee   dee   dedee   defd       Ze	 	 	 	 	 d%dej                  dej                  deej                     dee   dee   dedee   defd        Z xZS )&Owlv2ForObjectDetectionr   c                    t         |   |       t        |      | _        t	        |      | _        t        |      | _        t        |d      | _        t        j                  |j                  j                  |j                  j                        | _        t        j                         | _        || _        | j"                  j                  j$                  | j"                  j                  j&                  z  | _        | j"                  j                  j$                  | j"                  j                  j&                  z  | _        | j-                  | j(                  | j*                        | _        y )Nr   )r  r  )r   r   r-  r   r  
class_headr  box_headobjectness_headr   r  r  r   r  
layer_normSigmoidsigmoidr   r   r   num_patches_heightnum_patches_widthcompute_box_biasbox_biasr   s     r'   r   z Owlv2ForObjectDetection.__init__  s     '
26:.v65faH,,v';';'G'GVMaMaMpMpqzz|"&++";";"F"F$++JcJcJnJn"n!%!:!:!E!EIbIbImIm!m--d.E.EtG]G]^r)   r  r  r   c                 j   t        j                  d|dz   t         j                        }t        j                  d| dz   t         j                        }t        j                  ||d      \  }}t        j                  ||fd      }|dxx   |z  cc<   |dxx   | z  cc<   |j                  dd	      }|S )
Nr   )rU   xy)indexingr   r   .r   .r   r_   )r$   r%   rV   meshgridstackr   )r  r  x_coordinatesy_coordinatesxxyybox_coordinatess          r'   !normalize_grid_corner_coordinatesz9Owlv2ForObjectDetection.normalize_grid_corner_coordinates  s     Q(9A(=U]]SQ(:Q(>emmT}tLB  ++r2hB7#44#55 *..r15r)   r  c                 R    |j                         }| j                  |      }|d   }|S )a#  Predicts the probability that each image feature token is an object.

        Args:
            image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`)):
                Features extracted from the image.
        Returns:
            Objectness scores.
        r  )detachr  )rB   r  r{   s      r'   objectness_predictorz,Owlv2ForObjectDetection.objectness_predictor  s4     (..0 00@-f5  r)   r_   )maxsizefeature_mapc                    |t        d      | j                  ||      }t        j                  |dd      }t        j                  |dz         t        j
                  | dz         z
  }t        j                  |d      }|dxx   |z  cc<   |dxx   |z  cc<   t        j                  |dz         t        j
                  | dz         z
  }t        j                  ||gd      }|S )	NzOfeature_map has been deprecated as an input. Please pass in num_patches insteadr"  r%  g-C6?r  r  r   r   )rs   r  r$   cliploglog1p	full_liker   )	rB   r  r  r  r  box_coord_biasbox_sizebox_size_biasr  s	            r'   r  z(Owlv2ForObjectDetection.compute_box_bias  s    
 "noo@@ASUfg**_c3? ?T#9:U[[/IY\`I`=aa ??>37--..		(T/2U[[(TAQ5RR 99nm<"Er)   image_featsr   c                     | j                  |      }|r$|j                  \  }}}}| j                  ||      }n| j                  }|j	                  |j
                        }||z  }| j                  |      }|S )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
            interpolate_pos_encoding:
                Whether to interpolate the pre-trained position encodings.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        )r  r   r  r  r   r!   r  )	rB   r  r  r   r|   r   r  r  r  s	            r'   box_predictorz%Owlv2ForObjectDetection.box_predictor	  s|    & ]];/
 $:E:K:K7A!#4a,,-?ARSH}}H;;{112h
\\*-
r)   r  r  c                 6    | j                  |||      \  }}||fS )a8  
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        )r  )rB   r  r  r  r  r  s         r'   class_predictorz'Owlv2ForObjectDetection.class_predictor+  s)     -1OOKWa,b)(/00r)   r   r   r   r   rE  c           	         | j                  ||||||d      }|rX|j                  \  }}}	}
|	| j                  j                  j                  z  }|
| j                  j                  j                  z  }n| j
                  }| j                  }|j                  d   }| j                   j                  j                  |      }t        j                  |d d d dd d f   |d d d df   j                        }|d d dd d d f   |z  }| j                  |      }|j                  d   |||j                  d   f}|j                  |      }|d   }|||fS )NT)r   r   r   r   rE  r   rF  r   r   r   )r   r   r   r  r   r  r  r8   ry  rs  r$   broadcast_tor  r   )rB   r   r   r   r   rE  r   r  r   r   r   r  r  rK  r6   class_token_outnew_sizer5   s                     r'   image_text_embedderz+Owlv2ForObjectDetection.image_text_embedder?  sv    **%)/!5%=  
 $"."4"4Aq&%!'4;;+D+D+O+O!O %)B)B)M)M M!%!8!8 $ 6 6 $77:zz..==>OP  ,,\!RaR(-C\RSUXVXUXRXEYE_E_` $Aqr1H-?|4 q!r"	
 $++H5bk\733r)   c                    | j                   j                  ||d      }|rX|j                  \  }}}}|| j                  j                  j
                  z  }	|| j                  j                  j
                  z  }
n| j                  }	| j                  }
|d   }| j                   j                  j                  |      }t        j                  |d d d dd d f   |d d d df   j                        }|d d dd d d f   |z  }| j                  |      }|j                  d   |	|
|j                  d   f}|j                  |      }||fS )NT)r   r   rF  r   r   r   )r   ry  r   r   r  r   r  r  rs  r$   r  r  r   )rB   r   r   rE  r   r  r   r   r   r  r  rK  r6   r  r  s                  r'   image_embedderz&Owlv2ForObjectDetection.image_embedders  s_    00%@Xfj 1 
 $"."4"4Aq&%!'4;;+D+D+O+O!O %)B)B)M)M M!%!8!8 $ 6 6 +1-zz..==>OP  ,,\!RaR(-C\RSUXVXUXRXEYE_E_` $Aqr1H-?|4 q!r"	
 $++H5n--r)   query_image_featuresquery_feature_mapc                 j   | j                  |      \  }}| j                  |||      }t        |      }g }g }	|j                  }
t	        |j
                  d         D ]  }t        j                  g dg|
      }||   }t        ||      \  }}t        j                  |d   dk(        rt        ||      }t        j                  |      dz  }|d   |k\  j                         }|j                         s||   |j                  d         }t        j                  ||   d      }t        j                   d||      }|t        j"                  |         }|j%                  ||   |          |	j%                  |       " |r+t        j&                  |      }t        j&                  |	      }nd	\  }}|||fS )
Nr   )r   r   r   r   r    r"  g?r   )axiszd,id->iNN)r  r  r   r!   rB  r   r$   r   rp   rr   rw   rd   nonzeronumelsqueezer#  r  argminappendr  )rB   r  r  r   r   r}   r|   pred_boxes_as_cornersbest_class_embedsbest_box_indicespred_boxes_deviceieach_query_boxeach_query_pred_boxesiousiou_thresholdselected_indsselected_embeddingsmean_embedsmean_simbest_box_indr  box_indicess                          r'   embed_image_queryz)Owlv2ForObjectDetection.embed_image_query  s    ../CD<''(<>OQij
 8 D 188+11!45 	6A"\\<.ARSN$9!$<!n.CDGD! yyaC(*>;PQ "IIdOc1M!!W5>>@M""$&21om6K6KA6N&O##jjaqA <<	;@ST,U\\(-CD!((a)FG ''5'	6*  ;;'89L++&67K(2%L+[*44r)   query_pixel_valuesrF  c           
         ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      d   }| j	                  ||||      \  }}	|j
                  \  }
}}}t        j                  ||
||z  |f      }|j
                  \  }
}}}t        j                  ||
||z  |f      }| j                  |||      \  }}}| j                  ||      \  }}| j                  |||      }|s+|||||||	j                         f}t        d |D              }|S t        ||||||d|	      S )a  
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.

        Examples:
        ```python
        >>> import requests
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2ForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
        >>> query_image = Image.open(requests.get(query_url, stream=True).raw)
        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")

        >>> # forward pass
        >>> with torch.no_grad():
        ...     outputs = model.image_guided_detection(**inputs)

        >>> target_sizes = torch.Tensor([image.size[::-1]])

        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_image_guided_detection(
        ...     outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
        ... )
        >>> i = 0  # Retrieve predictions for the first image
        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
        >>> for box, score in zip(boxes, scores):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
        Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
        Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
        Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.8]
        Detected similar object with confidence 0.985 at location [176.98, -29.45, 672.69, 182.83]
        Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
        Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
        Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
        Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
        Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
        Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
        Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
        Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
        Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
        ```N)r   r   r   )r   r   rE  r   )r  r  c              3   &   K   | ]	  }||  y wrS   rQ   r@   xs     r'   rC   zAOwlv2ForObjectDetection.image_guided_detection.<locals>.<genexpr>2       >1>rJ  )r6   r   r   r   r   r}   r7   r8   )r   r   rE  rF  r  r   r$   r   r  r  r  r>   rG   r   )rB   r   r  r   rE  r   rF  r  r  r  r   r  r  
hidden_dimr  query_image_featsr  r  r   r  r}   r   r  s                          r'   image_guided_detectionz.Owlv2ForObjectDetection.image_guided_detection  s   v 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY !//+F^ 0 

 '+&9&9%/!5%=	 ': '
#^ ITHYHYE
&(9:mmK*>PSd>dfp1qrHYH_H_E
&(9:!MM
,>AR,RT^_
 <@;Q;Q02J<
8&(8
 '+&:&:{am&:&n#l !..{KIab!! '')F >f>>FM4$0/-%" .	
 		
r)   c           
      $   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||||||      \  }}	}
|
j
                  }|
j                  }|	j                  \  }}}}t        j                  |	|||z  |f      }|j                  d   |z  }|j                  |||j                  d         }|j                  |||j                  d         }|d   dkD  }| j                  |||      \  }}| j                  |      }| j                  ||	|      }|s:|||||	||j                         |j                         f}t        d |D              }|S t        |	|||||||      S )a	  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).
        output_hidden_states (`bool`, *optional*):
            Whether or not to return the last hidden state. See `text_model_last_hidden_state` and
            `vision_model_last_hidden_state` under returned tensors for more detail.

        Examples:
        ```python
        >>> import requests
        >>> from PIL import Image
        >>> import torch

        >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

        >>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)
        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.tensor([(image.height, image.width)])
        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_grounded_object_detection(
        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
        ... )
        >>> # Retrieve predictions for the first image for the corresponding text queries
        >>> result = results[0]
        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
        >>> for box, score, text_label in zip(boxes, scores, text_labels):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
        Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
        ```)r   r   r   r   rE  r   r   r   r  c              3   &   K   | ]	  }||  y wrS   rQ   r
  s     r'   rC   z2Owlv2ForObjectDetection.forward.<locals>.<genexpr>  r  rJ  )r6   r5   r|   r   r{   r}   r7   r8   )r   r   rE  rF  r  r7   r8   r   r$   r   r  r  r  r>   rG   ry   )rB   r   r   r   r   rE  r   rF  r  r  r  r  r  r   r  r  r  r  max_text_queriesr  r  r}   r{   r|   r  s                            r'   r   zOwlv2ForObjectDetection.forward@  s   h 2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY .2-E-E%)/!5%= .F .
*k7 00 44HSHYHYE
&(9:mmK*>PSd>dfp1qr %??1-;#++J8H,J\J\]_J`a %%j2BIOOTVDWX	v&*
 '+&:&:;V`&a#l !55kB ''[BZ[
!%%''')	F >f>>FM)$$!/%* .	
 		
r)   rS   r   r  r  r}  )rK   rL   rM   r   r:  r   staticmethodr[   r$   r
   r  rO   r  r   r   r  r   r  r   r  r  r  r  r   r   r  ry   r   r   r   s   @r'   r  r    s   L_{ _ c VY ^c^j^j   !53D3D !IZIZ ! q ko"%:=LTUZUfUfLg	 6 */	&& && #'	
 
		J 59-1	1&&1 u0011 U\\*	1
 
u  	!12 -1/3).14<<14 ''14 	14
 $D>14 'tn14 #'14 
u  	!14n -1/3).(.''(. $D>(. 'tn	(.
 #'(. 
u  	!(.^ */	*5#//*5 !,,*5 #'	*5
 
		*5X  ;?,0/3).&*s
''s
 %U%6%67s
 $D>	s

 'tns
 #'s
 d^s
 
/s
 s
j 
 26,0/3).&*r
<<r
 ''r
 !.	r

 $D>r
 'tnr
 #'r
 d^r
 
$r
 r
r)   r  )r-  r  rb  rw  r  )BrN   dataclassesr   	functoolsr   typingr   r   r   r   r	   r$   torch.utils.checkpointr
   r   activationsr   modeling_attn_mask_utilsr   r   modeling_outputsr   r   modeling_utilsr   utilsr   r   r   r   r   configuration_owlv2r   r   r   transformers.image_transformsr   
get_loggerrK   loggerr(   r/   r1   r\   r`   rp   rw   ry   r   rn  r   r   r   r  r  r  r?  rV  rb  rp  rw  r-  r  r  r  __all__rQ   r)   r'   <module>r"     s9    !  4 4    ! d K - Y Y P P F 
		H	%`U\\ `ell `
-5<< -ELL - "
+ "
 "
LGv G& GEF Ev E""'0 2
 2
 2
j -
K -
 -
bJBII J\")) @h2RYY h2Xryy  /		 /d ,%? ,% ,%`V
299 V
tI
299 I
Z3
) 3
n4
RYY 4
p.
+ .
b U
% U
 U
rRYY (-1ryy -1`u
2 u
p rr)   