
    Uh                        d Z ddlZddlmZ ddlmZmZ ddlZddlZddlm	Z	 ddl
mZ ddlmZmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ  ej6                  e      Ze G d de             Z G d de	j>                        Z  G d de	j>                        Z! G d de	j>                        Z" G d de	j>                        Z# G d de	j>                        Z$ G d de	j>                        Z% G d de	j>                        Z& G d d e	j>                        Z' G d! d"e	j>                        Z( G d# d$e	j>                        Z)e G d% d&e             Z* G d' d(e	j>                        Z+ G d) d*e	j>                        Z,e+e,d+Z- ed,-       G d. d/e*             Z. G d0 d1e	j>                        Z/ ed2-       G d3 d4e*             Z0g d5Z1y)6zPyTorch TVP Model    N)	dataclass)OptionalTuple)nn   )ACT2FN)BaseModelOutputBaseModelOutputWithPoolingModelOutput)PreTrainedModel)prune_linear_layer)auto_docstringlogging)load_backbone   )	TvpConfigc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                  df      ed<   dZeeej                  df      ed<   y)TvpVideoGroundingOutputa  
    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
            Temporal-Distance IoU loss for video grounding.
        logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
            Contains start_time/duration and end_time/duration. It is the time slot of the videos corresponding to the
            input texts.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of
            the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
    Nlosslogits.hidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r        v/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/tvp/modeling_tvp.pyr   r   %   sq      )-D(5$$
%,*.FHU&&'.=AM8E%"3"3S"89:A:>Ju00#567>r!   r   c                   :     e Zd ZdZ fdZd Zd Zd Zd Z xZ	S )TvpLossa~  
    This class computes the losses for `TvpForVideoGrounding`. The process happens in two steps: 1) we compute
    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
    ground-truth / prediction (supervise class and box).

    Args:
        losses (`List[str]`):
            List of all the losses to be applied.
    c                     t         |           | j                  | j                  | j                  d| _        |D ]  }|| j
                  vst        d| d       || _        y )NioudistancedurationzLoss z not supported)super__init__loss_iouloss_distanceloss_durationloss_map
ValueErrorlosses)selfr1   r   	__class__s      r"   r+   zTvpLoss.__init__H   sj    ==****

  	?D4==( 5n!=>>	? r!   c                     t        j                  ||      t        j                  ||      z
  }t        j                  ||      t        j                  ||      z
  }d|j                  d      |z  z
  }|S )z6
        Measure the intersection over union.
        r   r   min)r   r6   maxclamp)	r2   
start_timeend_timecandidates_start_timecandidates_end_timer)   interunionr'   s	            r"   r,   zTvpLoss.loss_iouU   si     		-x8599EZ\f;gg		-x8599EZ\f;gg%++!+$u,,
r!   c                 P   t        j                  t        j                  ||      d      }t        j                  t        j                  ||      d      }t        j                  t        j                  ||      t        j                  ||      z
  |      j                  d      }|S )z5
        Measure the distance of mid points.
        g       @g?r5   )r   divaddr7   r6   r8   )	r2   r9   r:   r;   r<   r)   mid_candidatesmid_groundtruthdistance_diffs	            r"   r-   zTvpLoss.loss_distance_   s     599-BDW#XZ]^))EIIj($CSI		IIno6>Sb9ccem

%C%. 	 r!   c                     t        j                  ||      }t        j                  ||      }t        j                  t        j                  t        j                  ||      |            }|j	                  d      }|S )z5
        Measure the difference of duration.
        g?r5   )r   subsquarer@   r8   )	r2   r9   r:   r;   r<   r)   duration_candidatesduration_groundtruthduration_diffs	            r"   r.   zTvpLoss.loss_durationk   sh     $ii(;=RS$yy:>UYYuyy9LNb/cem%no%+++4r!   c                    |\  }}}t        j                  ||      }|dddf   j                         |dddf   j                         }}i }	| j                  D ],  }
|	j	                  |
 | j
                  |
   |||||      i       . |	S )am  
        This performs the loss computation.

        Args:
            logits (`torch.FloatTensor`):
                The output logits of head module.
            labels (`List[torch.FloatTensor]`):
                List of tensors ([start, end, duration]), which contains start time, end time of the video corresponding to the text, and also the duration.
        Nr   r   )r   mulfloatr1   updater/   )r2   r   labelsr)   r9   r:   
candidatesr;   r<   losses_dictr   s              r"   forwardzTvpLoss.forwardv   s     *0&*hYYvx0
5?15E5K5K5MzZ[]^Z^O_OeOeOg2KK 	D*t}}T*:xAVXkmuvw	
 r!   )
r   r   r   r   r+   r,   r-   r.   rR   __classcell__r3   s   @r"   r$   r$   =   s!    
	r!   r$   c                   $     e Zd Z fdZd Z xZS )TvpVisionModelc           	      \   t         |           t        |      | _        |j                  |j                  j
                  d   }nt        | j                  d      rDt        | j                  j                  d      r$| j                  j                  j
                  d   }nbt        | j                  d      rAt        | j                  j                  d      r!| j                  j                  j                  }nt        d      t        j                  ||j                  ddddd	      | _        y )
Nconfighidden_sizeshidden_sizezBackbone config not foundr   r   F)kernel_sizestridepaddinggroupsbias)r*   r+   r   backbonebackbone_configrZ   hasattrrY   r[   r0   r   Conv2dgrid_encoder_conv)r2   rY   in_channelsr3   s      r"   r+   zTvpVisionModel.__init__   s    %f-!!- 00==bAKT]]H-'$--:N:NP^2_--..;;B?KT]]H-'$--:N:NP]2^--..::K899!#"
r!   c                    |j                   \  }}}}}|j                  ||z  |||      }| j                  |      d   d   }| j                  |      }t        j
                  j                  |dd      }t        j
                  j                  |d      }|j                   dd  \  }	}
}|j                  |||	|
|      }|j                  ddd	d
d      }|S )Nfeature_mapsr      )r\   r]   T)inplacer   r      )	shapeviewra   re   r   
functional
max_pool2drelupermute)r2   pixel_values
batch_size
num_framesnum_channelsheightwidthgrid_feat_outputsgridnew_channel
new_height	new_widths               r"   rR   zTvpVisionModel.forward   s    >J>P>P;
Jfe#((j)@,PVX]^ MM,7GJ%%&78}}''!A'F}}!!$!5-1ZZ_*ZyyZj)T||Aq!Q*r!   r   r   r   r+   rR   rS   rT   s   @r"   rV   rV      s    
.r!   rV   c                   ~     e Zd ZdZ fdZdej                  dededej                  fdZdde	fd	Z
dde	fd
Z xZS )TvpVisualInputEmbeddingz;
    Takes input of both image and video (multi-frame)
    c                 r   t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _
        t        j                  d|j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                         | _        |j                  | _        |j                  | _	        y )Nr   eps)r*   r+   r   	Embeddingmax_position_embeddingsr[   position_embeddings max_grid_row_position_embeddingsrow_position_embeddings max_grid_col_position_embeddingscol_position_embeddingstoken_type_embeddings	LayerNormlayer_norm_eps
layer_normDropouthidden_dropout_probdropoutr2   rY   r3   s     r"   r+   z TvpVisualInputEmbedding.__init__   s    #%<<0N0NPVPbPb#c ')||F4[4[]c]o]o'p$')||F4[4[]c]o]o'p$%'\\!V5G5G%H",,v'9'9v?T?TUzz&"<"<=060W0W-060W0W-r!   	embeddingrw   rx   returnc                     dx}}|| j                   kD  r|| j                   z  }|| j                  kD  r|| j                  z  }|j                  dddd      }t        j                  j                  |||fdd      }|j                  dddd      }|S )z
        This method allows to interpolate the pre-trained pad weights , to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   r   r   ri   bicubicFscale_factormodealign_corners)r   r   rr   r   ro   interpolate)r2   r   rw   rx   h0w0s         r"   interpolate_pos_encodingz0TvpVisualInputEmbedding.interpolate_pos_encoding   s     RD999$???B4888>>>B%%aAq1	MM--b	 . 
	 %%aAq1	r!   r   c                    |j                   \  }}}}t        | j                  |      }t        j                  |t        j
                  |j                        }| j                  |      }	dt        |j                         dz
  z  |d|fz   }
 |	j                  |
 }	t        | j                  |      }t        j                  |t        j
                  |j                        }| j                  |      }|d||f} |j                  | }|	|z   }|r6|| j                  kD  s|| j                  kD  r|| j                  |||      z   }|S ||z   }|S )af  
        Args:
            grid: (batch_size, height, width, hidden_dim)
            interpolate_pos_encoding: (`bool`, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.
        Returns:
            grid + col_position_embeddings.view(*col_shape): (batch_size, *, height, width, hidden_dim)
        dtypedevice)r   r   r   )rm   r6   r   r   arangelongr   r   lenrn   r   r   r   )r2   rz   r   rt   rw   rx   
hidden_dim
row_heightrow_position_idsr   	row_shape	row_widthcol_position_idsr   	col_shapepositional_embeddingss                   r"   add_2d_positional_embeddingsz4TvpVisualInputEmbedding.add_2d_positional_embeddings   sQ    15

-
FE: >>G
 <<
%**T[[Y"&">">?O"PC

Oa/0J:3NN	">"9">">	"J ==uE	 <<	DKKX"&">">?O"PIz:	">"9">">	"J 7:Q Q $T:::edFkFk>k$778MvW\]]D  //Dr!   c                    |j                   \  }}}}}|j                  d      }| j                  ||      }|j                  |d|      }|j                   dd }	|j                  }
t        j                  |	t
        j                  |
      }| j                  |      }||z   }| j                  |      }| j                  |      }|S )a  
        Args:
            grid: Array of shape (batch_size, num_frames, height, width, num_channels).
                It contains processed frames extracted from videos, and is generated by Tvp image preprocessor. Note,
                num_frames can be 1
            interpolate_pos_encoding: (bool, *optional*, defaults to `False`):
                Whether to interpolate the pre-trained position encodings.

        Returns:
            embeddings: The embedding of grid with size (batch_size, height*width, num_channels)

        r   r   rX   Nr   )rm   meanr   rn   r   r   zerosr   r   r   r   )r2   rz   r   rt   ru   rw   rx   rv   visual_tokensvisual_tokens_shaper   token_type_idsr   
embeddingss                 r"   rR   zTvpVisualInputEmbedding.forward  s     ?Cjj;
J|yy|00Ph0i		*b,?+11#26%% %8

SYZ $ : :> J"%::
__Z0
\\*-
r!   F)r   r   r   r   r+   r   Tensorintr   boolr   rR   rS   rT   s   @r"   r   r      sT    
X%,,  TW \a\h\h .'4 'Rd r!   r   c                   *     e Zd ZdZ fdZddZ xZS )TvpTextInputEmbeddingszGConstruct the embeddings from word, position and token_type embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                         | _        y )N)padding_idxr   )r*   r+   r   r   
vocab_sizer[   pad_token_idword_embeddingsr   r   type_vocab_sizer   r   r   r   r   r   r   r   s     r"   r+   zTvpTextInputEmbeddings.__init__)  s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c %'\\&2H2H&J\J\%]",,v'9'9v?T?TUzz&"<"<=r!   c                 .   ||j                         }n|j                         d d }|d   }||j                  n|j                  }|Ft        j                  |t        j                  |      }|j                  d      j                  |      }|&t        j                  |t        j                  |      }|| j                  |      }| j                  |      }| j                  |      }	||z   |	z   }
| j                  |
      }
| j                  |
      }
|
S )NrX   r   r   r   )sizer   r   r   r   	unsqueezeexpandr   r   r   r   r   r   )r2   	input_idsr   position_idsinputs_embedsinput_shape
seq_lengthr   r   r   r   s              r"   rR   zTvpTextInputEmbeddings.forward1  s    #..*K',,.s3K ^
%.%:!!@T@T <<
%**VTL'11!4;;KHL!"[[EJJvVN  00;M"66|D $ : :> J"%88;PP
__Z0
\\*-
r!   )NNNNr   r   r   r   r+   rR   rS   rT   s   @r"   r   r   &  s    Q>r!   r   c                   f     e Zd Z fdZd Zdej                  dedefdZ	 	 	 d	de	e
   fdZ xZS )
TvpAttentionc                    t         |           |j                  |j                  z  dk7  r1t	        |d      s%t        d|j                   d|j                         |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                  | j                        | _        t        j                  |j                        | _        t        j                  |j                  |j                        | _        t        j$                  |j                  |j&                        | _        t        j                  |j*                        | _        t/               | _        y )Nr   embedding_sizezThe hidden size z4 is not a multiple of the number of attention heads r   )r*   r+   r[   num_attention_headsrc   r0   r   attention_head_sizeall_head_sizer   Linearquerykeyvaluer   attention_probs_dropout_probattn_dropoutdenser   r   r   r   r   setpruned_headsr   s     r"   r+   zTvpAttention.__init__K  s    : ::a?PVXhHi"6#5#5"66jkq  lF  lF  kG  H  $*#=#= #&v'9'9F<V<V'V#W !558P8PPYYv1143E3EF
99V//1C1CDYYv1143E3EF
JJv'J'JKYYv1163E3EF
,,v'9'9v?T?TUzz&"<"<=Er!   c                 N   t        |      dk(  ry t        j                  | j                  | j                        }t        |      | j                  z
  }|D ](  t        fd| j                  D              z
  d|<   * |j                  d      j                         j                  d      }t        j                  t        |            |   j                         }t        | j                  |      | _        t        | j                  |      | _        t        | j                   |      | _        t        | j"                  |d      | _        | j                  t        |      z
  | _        | j                  | j                  z  | _        | j                  j'                  |      | _        y )Nr   c              3   0   K   | ]  }|k  rd nd  yw)r   r   Nr    ).0hheads     r"   	<genexpr>z+TvpAttention.prune_heads.<locals>.<genexpr>g  s     Nq1t8a2Ns   rX   r   dim)r   r   onesr   r   r   r   sumrn   
contiguouseqr   r   r   r   r   r   r   r   r>   )r2   headsmaskindexr   s       @r"   prune_headszTvpAttention.prune_heads`  sN   u:?zz$22D4L4LME
T... 	D#ND<M<MNNNDDJ	 yy}''),,Q/SY'-224 (

E:
%dhh6'

E:
'

EqA
 $(#;#;c%j#H !558P8PP --33E:r!   tensorsequence_lengthrt   c                     |j                  ||| j                  | j                        j                  dd      j	                         S )Nr   ri   )rn   r   r   	transposer   )r2   r   r   rt   s       r"   _reshapezTvpAttention._reshapew  s7    KK
OT5M5MtOgOghYq!_Z\	
r!   output_attentionsc                 :   |j                   d d \  }}| j                  |      }| j                  |      }| j                  |      }	| j	                  |||      }
| j	                  |||      }| j	                  |	||      }t        j                  |
|j                  dd            }|t        j                  | j                        z  }|||z   }t        j                  j                  |d      }| j                  |      }|||z  }t        j                  ||      }|j                  dd      j                         }|j!                  ||| j"                        }| j%                  |      }| j'                  |      }| j)                  ||z         }|r||f}|S |f}|S )Nri   rX   r   r   )rm   r   r   r   r   r   matmulr   mathsqrtr   r   ro   softmaxr   r   reshaper   r   r   r   )r2   r   attention_mask	head_maskr   rt   r   mixed_query_layermixed_key_layermixed_value_layerquery_layer	key_layervalue_layerattention_scoresattention_probsattn_outputoutputss                    r"   rR   zTvpAttention.forward~  s    '4&9&9"1&=#
O JJ}5((=1 JJ}5mm$5
SMM/?JO	mm$5
S !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ++O<  -	9Oll?K@!++Aq1<<>!))*otGYGYZjj-ll;/ookM&AB4E;0 MX>r!   NNN)r   r   r   r+   r   r   r   r   r   r   r   rR   rS   rT   s   @r"   r   r   J  sI    "*;.
u|| 
c 
s 
 ,0+
 $D>+r!   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )TvpIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y N)r*   r+   r   r   r[   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r"   r+   zTvpIntermediate.__init__  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r!   r   r   c                 J    | j                  |      }| j                  |      }|S r
  )r   r  )r2   r   s     r"   rR   zTvpIntermediate.forward  s&    

=100?r!   r   r   r   r+   r   r   rR   rS   rT   s   @r"   r  r    s#    9U\\ ell r!   r  c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )TvpOutputLayerc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _	        t        j                  |j                        | _        y )Nr   )r*   r+   r   r   r  r[   r   r   r   r   r   r   r   r   s     r"   r+   zTvpOutputLayer.__init__  s`    YYv779K9KL
,,v'9'9v?T?TUzz&"<"<=r!   r   input_tensorr   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r
  )r   r   r   )r2   r   r  s      r"   rR   zTvpOutputLayer.forward  s7    

=1]3(DEr!   r  rT   s   @r"   r  r    s1    >U\\  RWR^R^ r!   r  c                   8     e Zd Z fdZ	 	 	 ddee   fdZ xZS )TvpEncodeLayerc                     t         |           t        |      | _        t	        |      | _        t        |      | _        y r
  )r*   r+   r   	attentionr  intermediater  outputr   s     r"   r+   zTvpEncodeLayer.__init__  s3    %f-+F3$V,r!   r   c                     | j                  ||||      }|d   }|dd  }| j                  |      }| j                  ||      }	|	f|z   }|S )N)r   r   r   )r  r  r  )
r2   r   r   r   r   self_attention_outputsattention_outputr  intermediate_outputlayer_outputs
             r"   rR   zTvpEncodeLayer.forward  sr     "&/	 "0 "
 2!4(,"//0@A{{#68HI/G+r!   r  )r   r   r   r+   r   r   rR   rS   rT   s   @r"   r  r    s&    - ,0
 $D>r!   r  c            
       n     e Zd Z fdZ	 	 	 	 	 ddeej                     dee   dee   dee   fdZ xZ	S )
TvpEncoderc                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
r*   r+   rY   r   
ModuleListrangenum_hidden_layersr  layergradient_checkpointing)r2   rY   _r3   s      r"   r+   zTvpEncoder.__init__  sN    ]]E&JbJbDc#dqN6$:#de
&+# $es   A#r   r   output_hidden_statesreturn_dictc                     ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}d}t	        | j
                        D ]k  \  }	}
|r||fz   }| j                  r3| j                  r'| j                  |
j                  |||||	   nd |      }n |
||||	   |      }|d   }|sc||d   fz   }m |r||fz   }|s|f}|r||fz   }|r||fz   }|S t        ||r|nd |r|      S d       S )Nr    r   r   )last_hidden_stater   r   )rY   r,  r   r+  	enumerater(  r)  training_gradient_checkpointing_func__call__r	   )r2   r   r   r   r   r+  r,  all_hidden_statesall_attentionsilayer_modulelayer_outputsr  s                r"   rR   zTvpEncoder.forward  ss    &1%<k$++BYBY1B1N-TXT_T_TqTq$8$D $++JjJj 	 (4 	FOA|#$58H$H!**t}} $ A A ))!"%.%:Yq\%! !-]NIVWLZk l)!,M !/=3C2E!E#	F(   1]4D D$&G#!%6$88 !^$55N+/C+):~
 	
 AE
 	
r!   )NNNNN)
r   r   r   r+   r   r   r   r   rR   rS   rT   s   @r"   r#  r#    s]    , 15,0/3&*4
 E--.	4

 $D>4
 'tn4
 d^4
r!   r#  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )	TvpPoolerc                     t         |           t        j                  |j                  |j                        | _        t        j                         | _        y r
  )r*   r+   r   r   r[   r   Tanh
activationr   s     r"   r+   zTvpPooler.__init__&  s9    YYv1163E3EF
'')r!   r   r   c                 \    |d d df   }| j                  |      }| j                  |      }|S )Nr   )r   r<  )r2   r   first_token_tensorpooled_outputs       r"   rR   zTvpPooler.forward+  s6     +1a40

#566r!   r  rT   s   @r"   r9  r9  %  s#    $
U\\ ell r!   r9  c                       e Zd ZeZdZdZd Zy)TvpPreTrainedModelmodelTc                 &   t        |t        j                  t        j                  f      r<|j                  j
                  j                  d| j                  j                         nct        |t        j                        rI|j                  j
                  j                          |j                  j
                  j                  d       t        |t        j                        r0|j                  $|j                  j
                  j                          t        |t        j                        rdt        j                  j                  |j                  dd       |j                  +t        j                  j!                  |j                  d       yyy)	zInitialize the weights        )r   stdg      ?Nfan_outrq   )r   nonlinearityr   )r  r   r   r   weightdatanormal_rY   initializer_ranger   r`   zero_fill_rd   initkaiming_normal_	constant_)r2   modules     r"   _init_weightsz TvpPreTrainedModel._init_weights:  s   fryy",,78 MM&&CT[[5R5R&S-KK""$MM$$S)fbii(V[[-DKK""$fbii(GG##FMM	PV#W{{&!!&++q1 ' )r!   N)r   r   r   r   config_classbase_model_prefixsupports_gradient_checkpointingrR  r    r!   r"   rA  rA  4  s    L&*#2r!   rA  c                   (     e Zd ZdZ fdZd Z xZS )TvpFrameDownPadPrompterz>
    Pad frames extracted from videos only at the bottom.
    c           	      |   |j                   dvrt        d      t        |           |j                  | _        |j
                  | _        |j                  | _        |j                   | _         t        j                  t        j                  d|j
                  d|j                  |j                  g            | _        y )NrA   replaceremove9`visual_prompter_apply` must be in (add, replace, remove)r   r   )visual_prompter_applyr0   r*   r+   visual_prompt_size	frame_nummax_img_sizer   	Parameterr   randnpad_downr   s     r"   r+   z TvpFrameDownPadPrompter.__init__R  s    ''/KKXYY"(";";))"//%+%A%A"KKF,,a1J1JFL_L_`a
r!   c                    | j                   dk7  rst        j                  | j                  | j                  g|j                  |j
                        }d|| j                  | j                  z
  | j                  d d f<   ||z  }| j                   dk7  rt        j                  |j                  d   |j                  d   d| j                  | j                  g|j
                        }| j                  | j                  z
  }| j                  |d d d d d d || j                  d d f<   ||j                  |j                        z  }|S )	NrA   r   rD  r[  r   r   r   r   )r]  r   r   r`  r   r   r^  r   rm   rc  to)r2   rs   visual_prompt_maskpromptstart_points        r"   rR   zTvpFrameDownPadPrompter.forward`  s1   %%.!&""D$5$56l>P>PYeYlYl" fit0043J3JJTM^M^^`aab..L%%1[[##A&(:(:1(=q$BSBSUYUfUfg#**F ++d.E.EEKBF--F1aK$*;*;;Q>?FIIl&8&899Lr!   r   rT   s   @r"   rW  rW  M  s    
r!   rW  c                   p     e Zd ZdZ fdZdej                  dededej                  fdZd
de	fd	Z
 xZS )TvpFramePadPrompterz?
    Pad frames extracted from videos in the surroundings.
    c           
         |j                   dvrt        d      t        |           |j                  | _        |j
                  | _        |j                   | _         |j
                  |j                  dz  z
  | _        t        j                  t        j                  d|j                  d|j                  |j
                  g            | _        t        j                  t        j                  d|j                  d|j                  |j
                  g            | _        t        j                  t        j                  d|j                  d|j
                  |j                  dz  z
  |j                  g            | _        t        j                  t        j                  d|j                  d|j
                  |j                  dz  z
  |j                  g            | _        y )NrY  r\  ri   r   r   )r]  r0   r*   r+   ru   r`  r^  	base_sizer   ra  r   rb  pad_uprc  pad_left	pad_rightr   s     r"   r+   zTvpFramePadPrompter.__init__w  s   ''/KKXYY ++"//%+%A%A",,v/H/H1/LLllKKF--q&2K2KVM`M`ab
 KKF--q&2K2KVM`M`ab
 KK%%''&*C*Ca*GG--

 KK%%''&*C*Ca*GG--

r!   rh  rw   rx   r   c                    || j                   z  || j                   z  }}|j                  \  }}}}	}
|j                  ||z  ||	|
      }t        j                  j                  |||fdd      }|j                  |||||      }|S )z
        This method allows to interpolate the pre-trained pad weights, to be able to use the model on collection of high
        resolution images (high resolution videos).

        r   Fr   )r`  rm   r   r   ro   r   )r2   rh  rw   rx   r   r   batchru   channelsprompt_heightprompt_widths              r"   interpolate_pad_encodingz,TvpFramePadPrompter.interpolate_pad_encoding  s     $+++UT5F5F-FBCI<<@z8]L 
 2Hm\Z**b	 + 
 z8VUKr!   rv  c                 Z   |r|j                   d   |j                   d   fn| j                  | j                  f\  }}| j                  dvrt        d| j                         | j                  dv r3t	        j
                  ||g|j                  |j                        }||z  }| j                  dv rt	        j                  d| j                  d	| j                  | j                  |j                  
      }t	        j                  | j                  || j                  gd      }t	        j                  | j                  || j                  gd	      }t	        j                  |j!                  d      |gz        }|r| j#                  |||      }||j%                  |j                        z   }|S )Nr   rX   )rA   r[  rZ  z$Invalid visual_prompter_apply value )rZ  r[  r   )rZ  rA   r   r   re  rl   r   r   )rm   r`  r]  r0   r   r   r   r   r   ru   rm  catro  rp  rn  rc  r   rv  rf  )r2   rs   rv  rw   rx   rg  baserh  s           r"   rR   zTvpFramePadPrompter.forward  s{    ( #\%7%7%;<##T%6%67 	
 %%-IICDD^D^C_`aa%%)>>!&VUO<CUCU^j^q^q!r..L%%);;;;q$//1dnndnn]i]p]pqDYYtT^^D!LFYYVT]]CKFYY|003vh>?F'66vvuM'&))L4F4F*GGLr!   r   )r   r   r   r   r+   r   r   r   rv  r   rR   rS   rT   s   @r"   rk  rk  r  sG    $
Lu|| S QT Y^YeYe 0d r!   rk  )framedownpadframepadzw
    The bare Tvp Model transformer outputting BaseModelOutputWithPooling object without any specific head on top.
    )custom_introc                        e Zd Z fdZd Zd Zd Ze	 	 	 	 	 	 	 	 ddee	j                     dee	j                     dee	j                     dee	j                     d	ee   d
ee   dee   defd       Z xZS )TvpModelc                 "   t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        t        |      | _
        t        |      | _        t        j                  t        j                   dd|j"                  g            | _        t        j&                  |j(                        | _        |j,                  t.        vrt1        d      t/        |j,                     |      | _        | j5                          y )Nr   
   z:`visual_prompter_type` must be in (framedownpad, framepad))r*   r+   rY   rV   vision_modelr   r   r   visual_embeddingsr#  encoderr9  poolerr   ra  r   rb  r[   text_promptr   r   r   visual_prompter_typeTVP_PROMPTER_CLASSES_MAPPINGr0   visual_prompter	post_initr   s     r"   r+   zTvpModel.__init__  s     *6208!8!@!&)'<<QF<N<N4O(PQzz&"<"<=&&.JJYZZ;F<W<WXY_`r!   c                 .    | j                   j                  S r
  r   r   )r2   s    r"   get_input_embeddingszTvpModel.get_input_embeddings  s    ...r!   c                 &    || j                   _        y r
  r  )r2   r   s     r"   set_input_embeddingszTvpModel.set_input_embeddings  s    */'r!   c                     |j                         D ]7  \  }}| j                  j                  |   j                  j	                  |       9 y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr  r(  r  r   )r2   heads_to_pruner(  r   s       r"   _prune_headszTvpModel._prune_heads  sE     +002 	CLE5LLu%//;;EB	Cr!   r   rs   r   r   r   r+  r,  r   c	                 "   ||n| j                   j                  }| j                  | j                  ||            }| j	                  |      }	| j                  ||      }
||j                  |
j                  dd       }t        j                  |j                  d   d      j                  |j                  |j                        }t        j                  |||gd	
      }| j                  ||j                               j                  |j                        }| j                   j#                  |	j                  d   d	d	      }t        j                  ||	|
gd
      }| j%                  ||| j'                  || j                   j(                        |||      }|r|j*                  n|d   }| j-                  |      }| j/                  |      }| j/                  |      }|s
||f|dd z   S t1        |||j2                  |j4                        S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpModel

        >>> model = TvpModel.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)rv  )r   r   ri   r   r  )r   r   rX   r   r   )r   r   r   r+  r,  )r.  pooler_outputr   r   )rY   r,  r  r  r   r  new_onesrm   r   r   rf  r   r   rx  get_extended_attention_maskr   r  r   r  get_head_maskr'  r.  r  r   r
   r   r   )r2   r   rs   r   r   r   r+  r,  r   text_embedding_outputvisual_embedding_outputvisual_attention_maskpt_maskr  embedding_outputencoder_outputsr.  r?  s                     r"   rR   zTvpModel.forward  s    4 &1%<k$++BYBY((  H` a
 !%) D"&"8"83K #9 #
 %$2$;$;<S<Y<YZ\[\<]$^!jj!5!5a!8"=@@%,,N4H4H A G #YYAV'W]_`N "==ninnN^_bbclcscstN&&--.C.I.I!.LbRTU 99k3HJa%bhij,,)((DKK4Q4QR/!5# ' 
 BMO==RabcRd$56 LL):;]3%}58KKK)/')77&11	
 	
r!   )NNNNNNNF)r   r   r   r+   r  r  r  r   r   r   
LongTensorr   r   rR   rS   rT   s   @r"   r~  r~    s     /0C  15485915,0/3&*).F
E,,-F
 u001F
 !!1!12	F

 E--.F
 $D>F
 'tnF
 d^F
 #'F
 F
r!   r~  c                   $     e Zd Z fdZd Z xZS )TvpVideoGroundingHeadc                 :   t         |           t        j                  |j                  |j                  dz        | _        t        j                  |j                  dz  d      | _        t        j                         | _        t        j                         | _
        y )Nri   )r*   r+   r   r   r[   layer_0layer_1ReLUactivation_0Sigmoidactivation_1r   s     r"   r+   zTvpVideoGroundingHead.__init__@  sj    yy!3!3V5G5G!5KLyy!3!3a!7;GGIJJLr!   c                     | j                  | j                  |            }| j                  | j                  |            }|S r
  )r  r  r  r  )r2   r  r   s      r"   rR   zTvpVideoGroundingHead.forwardG  s9    ""4<<#>?""4<<#78r!   r~   rT   s   @r"   r  r  ?  s    )r!   r  zb
    Tvp Model with a video grounding head on top computing IoU, distance, and duration loss.
    c                       e Zd Z fdZe	 	 	 	 	 	 	 	 	 ddeej                     deej                     deej                     dee	ej                        deej                     dee   dee   d	ee   d
efd       Z xZS )TvpForVideoGroundingc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y r
  )r*   r+   rY   r~  rB  r  video_grounding_headr  r   s     r"   r+   zTvpForVideoGrounding.__init__S  s:     f%
$9&$A!r!   r   rs   r   rO   r   r   r+  r,  r   c
           
         ||n| j                   j                  }| j                  ||||||||	      }
|
d   }| j                  |      }d}|pt	        g d      }|j                  | j                          |||      }|d   | j                   j                  |d   z  z   | j                   j                  |d   z  z   }|s|f|
dd z   }
||f|
z   }
|
S t        |||
j                  |
j                  	      S )
a  
        labels (`torch.FloatTensor` of shape `(batch_size, 3)`, *optional*):
            The labels contains duration, start time, and end time of the video corresponding to the text.

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoConfig, AutoTokenizer, TvpForVideoGrounding

        >>> model = TvpForVideoGrounding.from_pretrained("Jiqing/tiny-random-tvp")

        >>> tokenizer = AutoTokenizer.from_pretrained("Jiqing/tiny-random-tvp")

        >>> pixel_values = torch.rand(1, 1, 3, 448, 448)
        >>> text_inputs = tokenizer("This is an example input", return_tensors="pt")
        >>> output = model(text_inputs.input_ids, pixel_values, text_inputs.attention_mask)
        ```N)r   r   r+  r,  r   r   r&   r'   r(   r)   ri   )r   r   r   r   )rY   r,  rB  r  r$   rf  r   distance_loss_weightduration_loss_weightr   r   r   )r2   r   rs   r   rO   r   r   r+  r,  r   r  r  r   r   	criterion	loss_dicts                   r"   rR   zTvpForVideoGrounding.forward[  s,   < &1%<k$++BYBY**/!5#%=  	
  
**=9 ?@ILL%!&&1I% ++22Yz5JJK++22Yz5JJK 
 i'!"+-G'G+N&!//))	
 	
r!   )	NNNNNNNNF)r   r   r   r+   r   r   r   r  r   r   r   r   rR   rS   rT   s   @r"   r  r  M  s      1548590415,0/3&*).@
E,,-@
 u001@
 !!1!12	@

 u||,-@
 E--.@
 $D>@
 'tn@
 d^@
 #'@
 @
r!   r  )r~  rA  r  )2r   r   dataclassesr   typingr   r   r   torch.utils.checkpointr   activationsr   modeling_outputsr	   r
   r   modeling_utilsr   pytorch_utilsr   utilsr   r   utils.backbone_utilsr   configuration_tvpr   
get_loggerr   loggerr   Moduler$   rV   r   r   r   r  r  r  r#  r9  rA  rW  rk  r  r~  r  r  __all__r    r!   r"   <module>r     s     ! "    ! X X - / , 1 ( 
		H	% ?k ? ?.Mbii M`%RYY %Pnbii nb!RYY !H_299 _Fbii RYY RYY 8;
 ;
~		  2 2 20"bii "JW")) Wv ,#   
e
! e

e
PBII  
J
- J

J
Z Er!   