
    Uh              	          d Z ddlZddlmZmZmZmZmZ ddlZddl	Zddlm
Z
 ddlmZmZmZ ddlmZ ddlmZmZmZmZ dd	lmZ dd
lmZmZ ddlmZmZmZ ddlm Z   ejB                  e"      Z#d=de$de$dee$   de$fdZ% G d de
jL                        Z' G d de
jL                        Z( G d de
jL                        Z) G d de
jL                        Z* G d de
jL                        Z+ G d de
jL                        Z, G d d e
jL                        Z- G d! d"e
jL                        Z. G d# d$e
jL                        Z/ G d% d&e
jL                        Z0 G d' d(e
jL                        Z1 G d) d*e
jL                        Z2e G d+ d,e             Z3e G d- d.e3             Z4 ed/0       G d1 d2e3             Z5 G d3 d4e
jL                        Z6 G d5 d6e
jL                        Z7 G d7 d8e
jL                        Z8 ed90       G d: d;e3             Z9g d<Z:y)>zPyTorch MobileViT model.    N)DictOptionalSetTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BaseModelOutputWithNoAttention(BaseModelOutputWithPoolingAndNoAttention$ImageClassifierOutputWithNoAttentionSemanticSegmenterOutput)PreTrainedModel) find_pruneable_heads_and_indicesprune_linear_layer)auto_docstringlogging	torch_int   )MobileViTConfigvaluedivisor	min_valuereturnc                 |    ||}t        |t        | |dz  z         |z  |z        }|d| z  k  r||z  }t        |      S )a  
    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
    original TensorFlow repo. It can be seen here:
    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
       g?)maxint)r   r   r   	new_values       /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/mobilevit/modeling_mobilevit.pymake_divisibler$   +   sS     	Is57Q;#677BWLMI3;W	y>    c                        e Zd Z	 	 	 	 	 	 ddedededededededed	ed
eeef   ddf fdZde	j                  de	j                  fdZ xZS )MobileViTConvLayerconfigin_channelsout_channelskernel_sizestridegroupsbiasdilationuse_normalizationuse_activationr   Nc                 $   t         |           t        |dz
  dz        |z  }||z  dk7  rt        d| d| d      ||z  dk7  rt        d| d| d      t	        j
                  ||||||||d		      | _        |	r t	        j                  |d
ddd      | _        nd | _        |
rdt        |
t              rt        |
   | _        y t        |j                  t              rt        |j                     | _        y |j                  | _        y d | _        y )Nr   r   r   zInput channels (z) are not divisible by z groups.zOutput channels (zeros)	r)   r*   r+   r,   paddingr/   r-   r.   padding_modegh㈵>g?T)num_featuresepsmomentumaffinetrack_running_stats)super__init__r!   
ValueErrorr   Conv2dconvolutionBatchNorm2dnormalization
isinstancestrr   
activation
hidden_act)selfr(   r)   r*   r+   r,   r-   r.   r/   r0   r1   r4   	__class__s               r#   r<   zMobileViTConvLayer.__init__;   s*    	{Q!+,x71$/}<STZS[[cdee& A%0>UV\U]]efgg99#%# 

 !#)$("D "&D.#."("8F--s3"():):";"("3"3"DOr%   featuresc                     | j                  |      }| j                  | j                  |      }| j                  | j                  |      }|S N)r?   rA   rD   )rF   rH   s     r#   forwardzMobileViTConvLayer.forwardq   sK    ##H-)))(3H??&x0Hr%   )r   r   Fr   TT)__name__
__module____qualname__r   r!   boolr   rC   r<   torchTensorrK   __classcell__rG   s   @r#   r'   r'   :   s     "&+/4#4# 4# 	4#
 4# 4# 4# 4# 4#  4# dCi(4# 
4#l  r%   r'   c                   x     e Zd ZdZ	 ddedededededdf fd	Zd
ej                  dej                  fdZ	 xZ
S )MobileViTInvertedResidualzQ
    Inverted residual block (MobileNetv2): https://arxiv.org/abs/1801.04381
    r(   r)   r*   r,   r/   r   Nc           	      @   t         |           t        t        t	        ||j
                  z              d      }|dvrt        d| d      |dk(  xr ||k(  | _        t        |||d      | _	        t        |||d|||      | _
        t        |||dd	
      | _        y )N   )r   r   zInvalid stride .r   r)   r*   r+   r   )r)   r*   r+   r,   r-   r/   Fr)   r*   r+   r1   )r;   r<   r$   r!   roundexpand_ratior=   use_residualr'   
expand_1x1conv_3x3
reduce_1x1)rF   r(   r)   r*   r,   r/   expanded_channelsrG   s          r#   r<   z"MobileViTInvertedResidual.__init__   s     	*3u[6CVCV5V/W+XZ[\vha899#q[K{l/J,:KYZ
 +)*$
 -)% 
r%   rH   c                     |}| j                  |      }| j                  |      }| j                  |      }| j                  r||z   S |S rJ   )r^   r_   r`   r]   )rF   rH   residuals      r#   rK   z!MobileViTInvertedResidual.forward   sI    ??8,==*??8,&*&7&7x("EXEr%   r   )rL   rM   rN   __doc__r   r!   r<   rP   rQ   rK   rR   rS   s   @r#   rU   rU   z   sc    
 jk
%
47
GJ
TW
cf
	
BF F Fr%   rU   c                   t     e Zd Z	 ddedededededdf fdZd	ej                  dej                  fd
Z xZ	S )MobileViTMobileNetLayerr(   r)   r*   r,   
num_stagesr   Nc                     t         |           t        j                         | _        t        |      D ]5  }t        ||||dk(  r|nd      }| j                  j                  |       |}7 y )Nr   r   )r)   r*   r,   )r;   r<   r   
ModuleListlayerrangerU   append)	rF   r(   r)   r*   r,   rh   irk   rG   s	           r#   r<   z MobileViTMobileNetLayer.__init__   sh     	]]_
z" 	'A-')!"avQ	E JJe$&K	'r%   rH   c                 8    | j                   D ]
  } ||      } |S rJ   rk   )rF   rH   layer_modules      r#   rK   zMobileViTMobileNetLayer.forward   s$     JJ 	.L#H-H	.r%   )r   r   
rL   rM   rN   r   r!   r<   rP   rQ   rK   rR   rS   s   @r#   rg   rg      sV    op'%'47'GJ'TW'il'	'   r%   rg   c                        e Zd Zdededdf fdZdej                  dej                  fdZdej                  dej                  fd	Z	 xZ
S )
MobileViTSelfAttentionr(   hidden_sizer   Nc                    t         |           ||j                  z  dk7  rt        d| d|j                   d      |j                  | _        t	        ||j                  z        | _        | j                  | j
                  z  | _        t        j                  || j                  |j                        | _
        t        j                  || j                  |j                        | _        t        j                  || j                  |j                        | _        t        j                  |j                        | _        y )Nr   zThe hidden size z4 is not a multiple of the number of attention heads rX   )r.   )r;   r<   num_attention_headsr=   r!   attention_head_sizeall_head_sizer   Linearqkv_biasquerykeyr   Dropoutattention_probs_dropout_probdropoutrF   r(   ru   rG   s      r#   r<   zMobileViTSelfAttention.__init__   s    333q8";- 0334A7 
 $*#=#= #&{V5O5O'O#P !558P8PPYY{D,>,>V__U
99[$*<*<6??SYY{D,>,>V__U
zz&"E"EFr%   xc                     |j                         d d | j                  | j                  fz   } |j                  | }|j	                  dddd      S )Nr   r   r   r   )sizerw   rx   viewpermute)rF   r   new_x_shapes      r#   transpose_for_scoresz+MobileViTSelfAttention.transpose_for_scores   sN    ffhsmt'?'?AYAY&ZZAFFK yyAq!$$r%   hidden_statesc                    | j                  |      }| j                  | j                  |            }| j                  | j                  |            }| j                  |      }t	        j
                  ||j                  dd            }|t        j                  | j                        z  }t        j                  j                  |d      }| j                  |      }t	        j
                  ||      }|j                  dddd      j                         }|j!                         d d | j"                  fz   }	 |j$                  |	 }|S )Nr   dimr   r   r   r   )r|   r   r}   r   rP   matmul	transposemathsqrtrx   r   
functionalsoftmaxr   r   
contiguousr   ry   r   )
rF   r   mixed_query_layer	key_layervalue_layerquery_layerattention_scoresattention_probscontext_layernew_context_layer_shapes
             r#   rK   zMobileViTSelfAttention.forward   s&    JJ}5--dhh}.EF	//

=0IJ//0AB !<<Y5H5HR5PQ+dii8P8P.QQ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S***,CDr%   )rL   rM   rN   r   r!   r<   rP   rQ   r   rK   rR   rS   s   @r#   rt   rt      sW    G GS GT G&%ell %u|| %
U\\ ell r%   rt   c                   d     e Zd Zdededdf fdZdej                  dej                  fdZ xZ	S )MobileViTSelfOutputr(   ru   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rJ   r;   r<   r   rz   denser~   hidden_dropout_probr   r   s      r#   r<   zMobileViTSelfOutput.__init__   s6    YY{K8
zz&"<"<=r%   r   c                 J    | j                  |      }| j                  |      }|S rJ   r   r   rF   r   s     r#   rK   zMobileViTSelfOutput.forward   s$    

=1]3r%   rr   rS   s   @r#   r   r      s8    > >S >T >
U\\ ell r%   r   c                   z     e Zd Zdededdf fdZdee   ddfdZdej                  dej                  fd	Z
 xZS )
MobileViTAttentionr(   ru   r   Nc                     t         |           t        ||      | _        t	        ||      | _        t               | _        y rJ   )r;   r<   rt   	attentionr   outputsetpruned_headsr   s      r#   r<   zMobileViTAttention.__init__  s4    /D)&+>Er%   headsc                 >   t        |      dk(  ry t        || j                  j                  | j                  j                  | j
                        \  }}t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _        t        | j                  j                  |      | j                  _	        t        | j                  j                  |d      | j                  _        | j                  j                  t        |      z
  | j                  _        | j                  j                  | j                  j                  z  | j                  _        | j
                  j                  |      | _        y )Nr   r   r   )lenr   r   rw   rx   r   r   r|   r}   r   r   r   ry   union)rF   r   indexs      r#   prune_headszMobileViTAttention.prune_heads  s   u:?74>>55t~~7Y7Y[_[l[l
u
  2$..2F2FN/0B0BEJ1$..2F2FN.t{{/@/@%QO .2^^-O-ORUV[R\-\*'+~~'I'IDNNLnLn'n$ --33E:r%   r   c                 J    | j                  |      }| j                  |      }|S rJ   )r   r   )rF   r   self_outputsattention_outputs       r#   rK   zMobileViTAttention.forward  s%    ~~m4;;|4r%   )rL   rM   rN   r   r!   r<   r   r   rP   rQ   rK   rR   rS   s   @r#   r   r      sO    " "S "T ";S ;d ;$ U\\  ell  r%   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTIntermediater(   ru   intermediate_sizer   Nc                     t         |           t        j                  ||      | _        t        |j                  t              rt        |j                     | _	        y |j                  | _	        y rJ   )
r;   r<   r   rz   r   rB   rE   rC   r   intermediate_act_fnrF   r(   ru   r   rG   s       r#   r<   zMobileViTIntermediate.__init__   sR    YY{,=>
f''-'-f.?.?'@D$'-'8'8D$r%   r   c                 J    | j                  |      }| j                  |      }|S rJ   )r   r   r   s     r#   rK   zMobileViTIntermediate.forward(  s&    

=100?r%   rr   rS   s   @r#   r   r     sA    9 9S 9UX 9]a 9U\\ ell r%   r   c                        e Zd Zdedededdf fdZdej                  dej                  dej                  fd	Z xZ	S )
MobileViTOutputr(   ru   r   r   Nc                     t         |           t        j                  ||      | _        t        j
                  |j                        | _        y rJ   r   r   s       r#   r<   zMobileViTOutput.__init__/  s7    YY0+>
zz&"<"<=r%   r   input_tensorc                 T    | j                  |      }| j                  |      }||z   }|S rJ   r   )rF   r   r   s      r#   rK   zMobileViTOutput.forward4  s.    

=1]3%4r%   rr   rS   s   @r#   r   r   .  sO    > >S >UX >]a >
U\\  RWR^R^ r%   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerLayerr(   ru   r   r   Nc                 $   t         |           t        ||      | _        t	        |||      | _        t        |||      | _        t        j                  ||j                        | _        t        j                  ||j                        | _        y )Nr7   )r;   r<   r   r   r   intermediater   r   r   	LayerNormlayer_norm_epslayernorm_beforelayernorm_afterr   s       r#   r<   z"MobileViTTransformerLayer.__init__<  sq    +FK@1&+GXY%fk;LM "[f>S>S T!||KV=R=RSr%   r   c                     | j                  | j                  |            }||z   }| j                  |      }| j                  |      }| j	                  ||      }|S rJ   )r   r   r   r   r   )rF   r   r   layer_outputs       r#   rK   z!MobileViTTransformerLayer.forwardD  s\    >>$*?*?*NO(=8++M:((6{{<?r%   rr   rS   s   @r#   r   r   ;  sF    T TS TUX T]a TU\\ ell r%   r   c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTTransformerr(   ru   rh   r   Nc           	          t         |           t        j                         | _        t        |      D ]A  }t        ||t        ||j                  z              }| j                  j                  |       C y )N)ru   r   )
r;   r<   r   rj   rk   rl   r   r!   	mlp_ratiorm   )rF   r(   ru   rh   _transformer_layerrG   s         r#   r<   zMobileViTTransformer.__init__O  sh    ]]_
z" 	1A 9'"%kF4D4D&D"E!
 JJ/0	1r%   r   c                 8    | j                   D ]
  } ||      } |S rJ   rp   )rF   r   rq   s      r#   rK   zMobileViTTransformer.forward[  s%     JJ 	8L(7M	8r%   rr   rS   s   @r#   r   r   N  s@    
1 
1S 
1c 
1VZ 
1U\\ ell r%   r   c                        e Zd ZdZ	 ddededededededed	d
f fdZdej                  d	e	ej                  e
f   fdZdej                  de
d	ej                  fdZdej                  d	ej                  fdZ xZS )MobileViTLayerz;
    MobileViT block: https://arxiv.org/abs/2110.02178
    r(   r)   r*   r,   ru   rh   r/   r   Nc                    t         |           |j                  | _        |j                  | _        |dk(  r)t        ||||dk(  r|nd|dkD  r|dz  nd      | _        |}nd | _        t        ||||j                        | _	        t        |||ddd      | _
        t        |||      | _        t        j                  ||j                        | _        t        |||d      | _        t        |d|z  ||j                        | _        y )	Nr   r   )r)   r*   r,   r/   rY   F)r)   r*   r+   r0   r1   )ru   rh   r   )r;   r<   
patch_sizepatch_widthpatch_heightrU   downsampling_layerr'   conv_kernel_sizeconv_kxkconv_1x1r   transformerr   r   r   	layernormconv_projectionfusion)	rF   r(   r)   r*   r,   ru   rh   r/   rG   s	           r#   r<   zMobileViTLayer.__init__f  s    	!,,"--Q;&?')!)QvA*2Q,QA'D# 'K&*D#*#$//	
 +#$# 
 0#!
 kv7L7LM1+ST 
 )KkW]WnWn
r%   rH   c                 |   | j                   | j                  }}t        ||z        }|j                  \  }}}}t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }	t        j
                  j                         r$t        t	        j                  ||z        |z        n#t        t        j                  ||z        |z        }
d}|
|k7  s|	|k7  r't        j                  j                  ||	|
fdd      }d}|
|z  }|	|z  }||z  }|j                  ||z  |z  |||      }|j                  dd      }|j                  ||||      }|j                  dd      }|j                  ||z  |d      }||f||||||d	}||fS )
NFbilinearr   modealign_cornersTr   r   r   r   )	orig_size
batch_sizechannelsinterpolatenum_patchesnum_patches_widthnum_patches_height)r   r   r!   shaperP   jit
is_tracingr   ceilr   r   r   r   reshaper   )rF   rH   r   r   
patch_arear   r   orig_height
orig_width
new_height	new_widthr   num_patch_widthnum_patch_heightr   patches	info_dicts                    r#   	unfoldingzMobileViTLayer.unfolding  s   $($4$4d6G6G\|34
8@5
Hk: yy##% ejj|!;<|KLTYY{\9:\IJ 	 yy##% ejjk!9:[HITYYzK78;FG 	 
"jK&?}}00
I6ZW\ 1 H K ${2%5&8 ""!$44lOU`
 ##Aq)//*hZP##Aq)//*z"9;K &z2$ &&!0"2
	 	!!r%   r   r   c                    | j                   | j                  }}t        ||z        }|d   }|d   }|d   }|d   }	|d   }
|j                         j	                  |||d      }|j                  dd      }|j                  ||z  |	z  |
||      }|j                  dd	      }|j                  |||	|z  |
|z        }|d
   r&t        j                  j                  ||d   dd      }|S )Nr   r   r   r   r   r   r   r   r   r   r   r   Fr   )
r   r   r!   r   r   r   r   r   r   r   )rF   r   r   r   r   r   r   r   r   r   r   rH   s               r#   foldingzMobileViTLayer.folding  s&   $($4$4d6G6G\|34
|,
Z(.$%9:#$78 %%',,Z[RTU%%a+##!$44o|U`
 %%a+##"2\"A?U`C`
 ]#}}00y5JV[ 1 H r%   c                    | j                   r| j                  |      }|}| j                  |      }| j                  |      }| j                  |      \  }}| j	                  |      }| j                  |      }| j                  ||      }| j                  |      }| j                  t        j                  ||fd            }|S Nr   r   )r   r   r   r   r   r   r   r   r   rP   cat)rF   rH   rc   r   r   s        r#   rK   zMobileViTLayer.forward  s    ""..x8H ==*==* "^^H5 ""7+..) <<3''1;;uyy(H)=1EFr%   rd   )rL   rM   rN   re   r   r!   r<   rP   rQ   r   r   r   r   rK   rR   rS   s   @r#   r   r   a  s     8
8
 8
 	8

 8
 8
 8
 8
 
8
t1"%,, 1"5t9K3L 1"fu||   :  r%   r   c                   d     e Zd Zdeddf fdZ	 	 d	dej                  dededee	e
f   fdZ xZS )
MobileViTEncoderr(   r   Nc           	         t         
|           || _        t        j                         | _        d| _        dx}}|j                  dk(  rd}d}n|j                  dk(  rd}d}t        ||j                  d   |j                  d   dd      }| j
                  j                  |       t        ||j                  d   |j                  d   dd	      }| j
                  j                  |       t        ||j                  d   |j                  d	   d|j                  d   d
      }| j
                  j                  |       |r|dz  }t        ||j                  d	   |j                  d   d|j                  d   d|      }| j
                  j                  |       |r|dz  }t        ||j                  d   |j                  d   d|j                  d   d	|      }	| j
                  j                  |	       y )NFrW   T   r   r   )r)   r*   r,   rh   r   r   )r)   r*   r,   ru   rh      )r)   r*   r,   ru   rh   r/      )r;   r<   r(   r   rj   rk   gradient_checkpointingoutput_striderg   neck_hidden_sizesrm   r   hidden_sizes)rF   r(   dilate_layer_4dilate_layer_5r/   layer_1layer_2layer_3layer_4layer_5rG   s             r#   r<   zMobileViTEncoder.__init__  s   ]]_
&+# +0/1$!N!N!!R'!N)00311!4
 	

'")00311!4
 	

'" 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"MH 00311!4++A.
 	

'"r%   r   output_hidden_statesreturn_dictc                    |rdnd }t        | j                        D ]K  \  }}| j                  r)| j                  r| j	                  |j
                  |      }n ||      }|sF||fz   }M |st        d ||fD              S t        ||      S )N c              3   &   K   | ]	  }||  y wrJ   r  ).0vs     r#   	<genexpr>z+MobileViTEncoder.forward.<locals>.<genexpr>j  s     Xq!-Xs   )last_hidden_stater   )	enumeraterk   r	  training_gradient_checkpointing_func__call__tupler   )rF   r   r  r  all_hidden_statesrn   rq   s          r#   rK   zMobileViTEncoder.forwardU  s     #7BD(4 
	IOA|**t}} $ A A ))!!
 !-] ;#$58H$H!
	I X]4E$FXXX-]noor%   )FT)rL   rM   rN   r   r<   rP   rQ   rO   r   r!  r   rK   rR   rS   s   @r#   r  r  
  sa    H# H#4 H#Z &+ 	p||p #p 	p
 
u44	5pr%   r  c                   z    e Zd ZeZdZdZdZdgZde	e
j                  e
j                  e
j                  f   ddfdZy)	MobileViTPreTrainedModel	mobilevitpixel_valuesTr   moduler   Nc                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yy)zInitialize the weightsg        )meanstdNg      ?)rB   r   rz   r>   weightdatanormal_r(   initializer_ranger.   zero_r   fill_)rF   r'  s     r#   _init_weightsz&MobileViTPreTrainedModel._init_weightsw  s    fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) .r%   )rL   rM   rN   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr   r   rz   r>   r   r1  r  r%   r#   r$  r$  o  sM    "L#$O&*#)*
*E"))RYY*L$M 
*RV 
*r%   r$  c                        e Zd Zd
dedef fdZd Ze	 	 	 ddee	j                     dee   dee   deeef   fd	       Z xZS )MobileViTModelr(   expand_outputc                 L   t         |   |       || _        || _        t	        ||j
                  |j                  d   dd      | _        t        |      | _	        | j                  r.t	        ||j                  d   |j                  d   d      | _
        | j                          y	)
aE  
        expand_output (`bool`, *optional*, defaults to `True`):
            Whether to expand the output of the model using a 1x1 convolution. If `True`, the model will apply an additional
            1x1 convolution to expand the output channels from `config.neck_hidden_sizes[5]` to `config.neck_hidden_sizes[6]`.
        r   r   r   )r)   r*   r+   r,   r     r   rY   N)r;   r<   r(   r9  r'   num_channelsr  	conv_stemr  encoderconv_1x1_exp	post_init)rF   r(   r9  rG   s      r#   r<   zMobileViTModel.__init__  s     	 *+++11!4
 (/ 2"44Q7#55a8	!D 	r%   c                     |j                         D ]e  \  }}| j                  j                  |   }t        |t              s0|j
                  j                  D ]  }|j                  j                  |        g y)zPrunes heads of the model.
        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
        N)itemsr>  rk   rB   r   r   r   r   )rF   heads_to_prunelayer_indexr   mobilevit_layerr   s         r#   _prune_headszMobileViTModel._prune_heads  ss     #1"6"6"8 	CK"ll00=O/>:)8)D)D)J)J C%%//;;EBC	Cr%   r&  r  r  r   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  |      }| j                  |||      }| j                  r/| j                  |d         }t        j                  |ddgd      }n|d   }d }|s|||fn|f}||dd  z   S t        |||j                  	      S )
Nz You have to specify pixel_valuesr  r  r   r   r   F)r   keepdimr   )r  pooler_outputr   )r(   r  use_return_dictr=   r=  r>  r9  r?  rP   r)  r   r   )	rF   r&  r  r  embedding_outputencoder_outputsr  pooled_outputr   s	            r#   rK   zMobileViTModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@>>,7,,!5# ' 
  $ 1 1/!2D E "JJ'8r2hPUVM / 2 M;H;T'7[lZnFOAB///7/')77
 	
r%   )T)NNN)rL   rM   rN   r   rO   r<   rF  r   r   rP   rQ   r   r!  r   rK   rR   rS   s   @r#   r8  r8    s}     t >C  04/3&*	'
u||,'
 'tn'
 d^	'

 
u>>	?'
 '
r%   r8  z
    MobileViT model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
    ImageNet.
    )custom_introc                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     dee	   deej                     dee	   de
eef   f
d	       Z xZS )MobileViTForImageClassificationr(   r   Nc                 |   t         |   |       |j                  | _        t        |      | _        t        j                  |j                  d      | _        |j                  dkD  r-t        j                  |j                  d   |j                        nt        j                         | _        | j                          y )NT)inplacer   r   )r;   r<   
num_labelsr8  r%  r   r~   classifier_dropout_probr   rz   r  Identity
classifierr@  rF   r(   rG   s     r#   r<   z(MobileViTForImageClassification.__init__  s      ++'/ zz&"@"@$OJPJ[J[^_J_BIIf..r2F4E4EFegepeper 	
 	r%   r&  r  labelsr  c                 6   ||n| j                   j                  }| j                  |||      }|r|j                  n|d   }| j	                  | j                  |            }d}|| j                   j                  | j                  dk(  rd| j                   _        nl| j                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }	| j                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j                  dk(  r=t               }	 |	|j                  d| j                        |j                  d            }n,| j                   j                  dk(  rt!               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t#        |||j$                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        NrH  r   
regressionsingle_label_classificationmulti_label_classificationr   r   )losslogitsr   )r(   rK  r%  rJ  rW  r   problem_typerT  dtyperP   longr!   r   squeezer
   r   r	   r   r   )rF   r&  r  rY  r  outputsrN  r_  r^  loss_fctr   s              r#   rK   z'MobileViTForImageClassification.forward  s    &1%<k$++B]B]..DXfq.r1<--'!*m!<={{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE3!//
 	
r%   NNNN)rL   rM   rN   r   r<   r   r   rP   rQ   rO   r   r!  r   rK   rR   rS   s   @r#   rQ  rQ    s     4   04/3)-&*4
u||,4
 'tn4
 &	4

 d^4
 
u::	;4
 4
r%   rQ  c                   h     e Zd Zdedededdf fdZdej                  dej                  fdZ xZ	S )	MobileViTASPPPoolingr(   r)   r*   r   Nc           	          t         |           t        j                  d      | _        t        |||dddd      | _        y )Nr   )output_sizeTrelu)r)   r*   r+   r,   r0   r1   )r;   r<   r   AdaptiveAvgPool2dglobal_poolr'   r   )rF   r(   r)   r*   rG   s       r#   r<   zMobileViTASPPPooling.__init__)  sB    //A>*#%"!
r%   rH   c                     |j                   dd  }| j                  |      }| j                  |      }t        j                  j                  ||dd      }|S )Nr   r   Fr   )r   rm  r   r   r   r   )rF   rH   spatial_sizes      r#   rK   zMobileViTASPPPooling.forward8  sS    ~~bc*##H-==*==,,XLzin,or%   rr   rS   s   @r#   rh  rh  (  sA    
 
S 
PS 
X\ 
  r%   rh  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTASPPzs
    ASPP module defined in DeepLab papers: https://arxiv.org/abs/1606.00915, https://arxiv.org/abs/1706.05587
    r(   r   Nc                 ~   t         |           |j                  d   }|j                  }t	        |j
                        dk7  rt        d      t        j                         | _	        t        |||dd      }| j                  j                  |       | j                  j                  |j
                  D cg c]  }t        |||d|d       c}       t        |||      }| j                  j                  |       t        |d|z  |dd      | _        t        j                  |j                   	      | _        y c c}w )
Nr   r   z"Expected 3 values for atrous_ratesr   rk  rZ   )r)   r*   r+   r/   r1   r  )p)r;   r<   r  aspp_out_channelsr   atrous_ratesr=   r   rj   convsr'   rm   extendrh  projectr~   aspp_dropout_probr   )rF   r(   r)   r*   in_projectionrate
pool_layerrG   s          r#   r<   zMobileViTASPP.__init__E  s(   ..r2//v""#q(ABB]]_
*#%!
 	

-(

 #//
  # +!- !!#)
	
 *&+|L


*%)L 0|YZkq
 zzF$<$<=)
s   5D:rH   c                     g }| j                   D ]  }|j                   ||              t        j                  |d      }| j	                  |      }| j                  |      }|S r  )rv  rm   rP   r  rx  r   )rF   rH   pyramidconvpooled_featuress        r#   rK   zMobileViTASPP.forwardp  s\    JJ 	+DNN4>*	+))G+,,w/,,7r%   
rL   rM   rN   re   r   r<   rP   rQ   rK   rR   rS   s   @r#   rq  rq  @  s7    )> )>4 )>V  r%   rq  c                   d     e Zd ZdZdeddf fdZdej                  dej                  fdZ xZ	S )MobileViTDeepLabV3zB
    DeepLabv3 architecture: https://arxiv.org/abs/1706.05587
    r(   r   Nc           	          t         |           t        |      | _        t	        j
                  |j                        | _        t        ||j                  |j                  dddd      | _        y )Nr   FT)r)   r*   r+   r0   r1   r.   )r;   r<   rq  asppr   	Dropout2drU  r   r'   rt  rT  rW  rX  s     r#   r<   zMobileViTDeepLabV3.__init__  s]    !&)	||F$B$BC,00**# 
r%   r   c                 r    | j                  |d         }| j                  |      }| j                  |      }|S )Nr   )r  r   rW  )rF   r   rH   s      r#   rK   zMobileViTDeepLabV3.forward  s6    99]2./<<)??8,r%   r  rS   s   @r#   r  r  {  s6    
 
4 
 U\\ ell r%   r  zX
    MobileViT model with a semantic segmentation head on top, e.g. for Pascal VOC.
    c                        e Zd Zdeddf fdZe	 	 	 	 d
deej                     deej                     dee	   dee	   de
eef   f
d	       Z xZS ) MobileViTForSemanticSegmentationr(   r   Nc                     t         |   |       |j                  | _        t        |d      | _        t        |      | _        | j                          y )NF)r9  )r;   r<   rT  r8  r%  r  segmentation_headr@  rX  s     r#   r<   z)MobileViTForSemanticSegmentation.__init__  sD      ++'eD!3F!; 	r%   r&  rY  r  r  c                 h   ||n| j                   j                  }||n| j                   j                  }|$| j                   j                  dk(  rt	        d      | j                  |d|      }|r|j                  n|d   }| j                  |      }d}|Yt        j                  j                  ||j                  dd dd	      }	t        | j                   j                  
      }
 |
|	|      }|s|r
|f|dd z   }n	|f|dd z   }||f|z   S |S t        |||r|j                  d      S dd      S )a{  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import requests
        >>> import torch
        >>> from PIL import Image
        >>> from transformers import AutoImageProcessor, MobileViTForSemanticSegmentation

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("apple/deeplabv3-mobilevit-small")
        >>> model = MobileViTForSemanticSegmentation.from_pretrained("apple/deeplabv3-mobilevit-small")

        >>> inputs = image_processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs)

        >>> # logits are of shape (batch_size, num_labels, height, width)
        >>> logits = outputs.logits
        ```Nr   z/The number of labels should be greater than oneTrH  r   r   Fr   )ignore_indexr   )r^  r_  r   
attentions)r(   r  rK  rT  r=   r%  r   r  r   r   r   r   r
   semantic_loss_ignore_indexr   )rF   r&  rY  r  r  rd  encoder_hidden_statesr_  r^  upsampled_logitsre  r   s               r#   rK   z(MobileViTForSemanticSegmentation.forward  sq   H %9$D $++JjJj 	 &1%<k$++B]B]$++"8"8A"=NOO..!%# ! 
 :E 5 5'RS*''(=>!}}88V\\"#.Zu  9   (T[[5[5[\H,f5D# WQR[0 WQR[0)-)9TGf$EvE&3G'//	
 	
 NR	
 	
r%   rf  )rL   rM   rN   r   r<   r   r   rP   rQ   rO   r   r!  r   rK   rR   rS   s   @r#   r  r    s     4   04)-/3&*I
u||,I
 &I
 'tn	I

 d^I
 
u--	.I
 I
r%   r  )rQ  r  r8  r$  )rW   N);re   r   typingr   r   r   r   r   rP   torch.utils.checkpointr   torch.nnr	   r
   r   activationsr   modeling_outputsr   r   r   r   modeling_utilsr   pytorch_utilsr   r   utilsr   r   r   configuration_mobilevitr   
get_loggerrL   loggerr!   r$   Moduler'   rU   rg   rt   r   r   r   r   r   r   r   r  r$  r8  rQ  rh  rq  r  r  __all__r  r%   r#   <module>r     s  "   4 4    A A !  . Q 7 7 4 
		H	%#  HSM UX = =@-F		 -F`bii .0RYY 0f	")) 	   >BII 
bii 
		 &299 &fRYY fRbpryy bpJ * * *( R
- R
 R
j E
&> E
E
P299 08BII 8v 8 
U
'? U

U
pr%   