
    Uh              	          d Z ddlZddlZddlmZ ddlmZmZm	Z	 ddl
Z
ddlZ
ddl
mZ ddlmZmZmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZ ddlmZ ddlmZ  ej>                  e       Z!e G d de             Z"e G d de             Z#e G d de             Z$e G d de             Z% G d dejL                        Z' G d dejL                        Z(d;de
jR                  de*de+de
jR                  fdZ, G d  d!ejL                        Z- G d" d#ejL                        Z. G d$ d%ejL                        Z/ G d& d'ejL                        Z0 G d( d)ejL                        Z1 G d* d+ejL                        Z2e G d, d-e             Z3e G d. d/e3             Z4 ed01       G d2 d3e3             Z5 ed41       G d5 d6e3             Z6 ed71       G d8 d9e3e             Z7g d:Z8y)<zPyTorch FocalNet model.    N)	dataclass)OptionalTupleUnion)nn)BCEWithLogitsLossCrossEntropyLossMSELoss   )ACT2FN)BackboneOutput)PreTrainedModel)ModelOutputauto_docstringlogging)BackboneMixin   )FocalNetConfigc                       e Zd ZU dZdZeej                     ed<   dZ	ee
ej                        ed<   dZee
ej                        ed<   y)FocalNetEncoderOutputa  
    FocalNet encoder's outputs, with potential hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.

        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlast_hidden_statehidden_statesreshaped_hidden_states)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r        /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/focalnet/modeling_focalnet.pyr   r   &   sT    ( 6:x 1 1298<M8E%"3"345<AEHU5+<+<%=>Er"   r   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)FocalNetModelOutputa  
    FocalNet model's outputs that also contains a pooling of the last hidden states.

    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
            Average pooling of the last layer hidden-state.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr   pooler_outputr   r   )r   r   r   r   r   r   r   r   r    r&   r   r   r   r!   r"   r#   r%   r%   A   si    * 6:x 1 12915M8E--.58<M8E%"3"345<AEHU5+<+<%=>Er"   r%   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)!FocalNetMaskedImageModelingOutputa  
    FocalNet masked image model outputs.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `bool_masked_pos` is provided):
            Masked image modeling (MLM) loss.
        reconstruction (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Reconstructed pixel values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nlossreconstructionr   r   )r   r   r   r   r)   r   r   r   r    r*   r   r   r   r!   r"   r#   r(   r(   ^   sh    * )-D(5$$
%,26NHU../68<M8E%"3"345<AEHU5+<+<%=>Er"   r(   c                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)FocalNetImageClassifierOutputaS  
    FocalNet outputs for image classification.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Classification (or regression if config.num_labels==1) loss.
        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
            Classification (or regression if config.num_labels==1) scores (before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
            shape `(batch_size, hidden_size, height, width)`.

            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
            include the spatial dimensions.
    Nr)   logitsr   r   )r   r   r   r   r)   r   r   r   r    r-   r   r   r   r!   r"   r#   r,   r,   {   sh    * )-D(5$$
%,*.FHU&&'.8<M8E%"3"345<AEHU5+<+<%=>Er"   r,   c                        e Zd ZdZd fd	Z	 ddeej                     deej                     de	ej                     fdZ xZS )	FocalNetEmbeddingszX
    Construct the patch embeddings and layernorm. Optionally, also the mask token.
    c           	         t         |           t        ||j                  |j                  |j
                  |j                  |j                  d      | _        | j                  j                  | _
        |r4t        j                  t        j                  dd|j                              nd | _        t        j                   |j                  |j"                        | _        t        j&                  |j(                        | _        y )NT)config
image_size
patch_sizenum_channels	embed_dimuse_conv_embedis_stemr   eps)super__init__FocalNetPatchEmbeddingsr2   r3   r4   r5   r6   patch_embeddings	grid_size
patch_gridr   	Parameterr   zeros
mask_token	LayerNormlayer_norm_epsnormDropouthidden_dropout_probdropout)selfr1   use_mask_token	__class__s      r#   r;   zFocalNetEmbeddings.__init__   s     7((((,,&&!00!
 //99O]",,u{{1a9I9I'JKcgLL!1!1v7L7LM	zz&"<"<=r"   pixel_valuesbool_masked_posreturnc                 8   | j                  |      \  }}| j                  |      }|j                         \  }}}|K| j                  j	                  ||d      }|j                  d      j                  |      }	|d|	z
  z  ||	z  z   }| j                  |      }||fS )N      ?)r=   rE   sizerB   expand	unsqueezetype_asrH   )
rI   rL   rM   
embeddingsoutput_dimensions
batch_sizeseq_len_mask_tokensmasks
             r#   forwardzFocalNetEmbeddings.forward   s     )-(=(=l(K%
%YYz*
!+!2
GQ&//00WbIK",,R088ED#sTz2[45GGJ\\*-
,,,r"   )FN)r   r   r   r   r;   r   r   r   
BoolTensorr   Tensorr]   __classcell__rK   s   @r#   r/   r/      sQ    >& hl-$U%6%67-JRSXScScJd-	u||	-r"   r/   c                   z     e Zd Z	 	 	 d fd	Zd Zdeej                     deej                  ee
   f   fdZ xZS )r<   c	                 d   t         |           t        |t        j                  j
                        r|n||f}t        |t        j                  j
                        r|n||f}|d   |d   z  |d   |d   z  z  }	|| _        || _        || _        |	| _	        |d   |d   z  |d   |d   z  f| _
        |r/|rd}
d}d}nd}
d}d}t        j                  |||
||      | _        nt        j                  ||||      | _        |r't        j                  ||j                  	      | _        y d | _        y )
Nr   r            r   )kernel_sizestridepadding)rh   ri   r8   )r:   r;   
isinstancecollectionsabcIterabler2   r3   r4   num_patchesr>   r   Conv2d
projectionrC   rD   rE   )rI   r1   r2   r3   r4   r5   add_normr6   r7   ro   rh   rj   ri   rK   s                r#   r;   z FocalNetPatchEmbeddings.__init__   s7    	#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$(&$Q-:a=8*Q-:VW=:XY iii[Y`DO !iiiZ`jkDOYF4I4IJDIDIr"   c                 n   || j                   d   z  dk7  rDd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|| j                   d   z  dk7  rFddd| j                   d   || j                   d   z  z
  f}t        j                  j                  ||      }|S )Nr   r   )r3   r   
functionalpad)rI   rL   heightwidth
pad_valuess        r#   	maybe_padz!FocalNetPatchEmbeddings.maybe_pad   s    4??1%%*T__Q/%$//!:L2LLMJ==,,\:FLDOOA&&!+Q4??1#5QRAS8S#STJ==,,\:FLr"   rL   rN   c                 N   |j                   \  }}}}|| j                  k7  rt        d      | j                  |||      }| j	                  |      }|j                   \  }}}}||f}|j                  d      j                  dd      }| j                  | j                  |      }||fS )NzeMake sure that the channel dimension of the pixel values match with the one set in the configuration.rf   r   )shaper4   
ValueErrorry   rq   flatten	transposerE   )rI   rL   rZ   r4   rv   rw   rV   rW   s           r#   r]   zFocalNetPatchEmbeddings.forward   s    )5););&<4,,,w  ~~lFEB__\2
(..1fe#UO''*44Q:
99 :.J,,,r"   )FFF)r   r   r   r;   ry   r   r   r   r   r`   intr]   ra   rb   s   @r#   r<   r<      sL     (T-HU->->$? -E%,,X]^aXbJbDc -r"   r<   input	drop_probtrainingrN   c                    |dk(  s|s| S d|z
  }| j                   d   fd| j                  dz
  z  z   }|t        j                  || j                  | j
                        z   }|j                          | j                  |      |z  }|S )aF  
    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).

    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
    argument.
            r   r   )r   )dtypedevice)r{   ndimr   randr   r   floor_div)r   r   r   	keep_probr{   random_tensoroutputs          r#   	drop_pathr     s     CxII[[^

Q 77E

5ELL YYMYYy!M1FMr"   c                   x     e Zd ZdZd	dee   ddf fdZdej                  dej                  fdZ	de
fdZ xZS )
FocalNetDropPathzXDrop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).Nr   rN   c                 0    t         |           || _        y r^   )r:   r;   r   )rI   r   rK   s     r#   r;   zFocalNetDropPath.__init__   s    "r"   r   c                 D    t        || j                  | j                        S r^   )r   r   r   )rI   r   s     r#   r]   zFocalNetDropPath.forward$  s    FFr"   c                 8    dj                  | j                        S )Nzp={})formatr   rI   s    r#   
extra_reprzFocalNetDropPath.extra_repr'  s    }}T^^,,r"   r^   )r   r   r   r   r   floatr;   r   r`   r]   strr   ra   rb   s   @r#   r   r     sG    b#(5/ #T #GU\\ Gell G-C -r"   r   c                   &     e Zd Zd fd	Zd Z xZS )FocalNetModulationc                    t         	|           || _        |j                  |   | _        |j
                  |   | _        || _        |j                  | _        |j                  | _	        t        j                  |d|z  | j                  dz   z   |      | _        t        j                  ||dd|      | _        t        j                         | _        t        j                  ||      | _        t        j$                  |      | _        t        j(                         | _        g | _        t/        | j                        D ]  }| j                  |z  | j                  z   }| j*                  j1                  t        j2                  t        j                  |||d||dz  d      t        j                                      | j,                  j1                  |        | j                  r't        j4                  ||j6                        | _        y y )Nrf   r   )bias)rh   ri   r   F)rh   ri   groupsrj   r   r8   )r:   r;   dimfocal_windowsfocal_windowfocal_levelsfocal_levelfocal_factor use_post_layernorm_in_modulationnormalize_modulatorr   Linearprojection_inrp   projection_contextGELU
activationprojection_outrF   projection_dropout
ModuleListfocal_layerskernel_sizesrangeappend
SequentialrC   rD   	layernorm)
rI   r1   indexr   r   r   r   krh   rK   s
            r#   r;   zFocalNetModulation.__init__,  s   "007!..u5(060W0W-#)#=#= YYsAGt7G7G!7K,LSWX"$))C!ATX"Y'') iiS1"$**-?"@MMOt''( 
	2A++a/$2C2CCK$$IISk!CYdhiYipu GGI	 $$[1
	2 00\\#63H3HIDN 1r"   c                 ,   |j                   d   }| j                  |      j                  dddd      j                         }t	        j
                  |||| j                  dz   fd      \  }}}d}t        | j                        D ]+  } | j                  |   |      }|||dd||dz   f   z  z   }- | j                  |j                  dd      j                  dd            }	||	|dd| j                  df   z  z   }| j                  r|| j                  dz   z  }| j                  |      }
||
z  }|j                  dddd      j                         }| j                  r| j                  |      }| j                  |      }| j!                  |      }|S )	z
        Args:
            hidden_state:
                Input features with shape of (batch_size, height, width, num_channels)
        rP   r   r   r   rf   NT)keepdim)r{   r   permute
contiguousr   splitr   r   r   r   meanr   r   r   r   r   r   )rI   hidden_stater4   xqctxgatesctx_alllevel
ctx_global	modulatorx_outs               r#   r]   zFocalNetModulation.forwardM  s    $))"- |,44Q1a@KKMAlDDTDTWXDX'Y[\]3 4++, 	BE*$##E*3/CeAuuqy/@,@&A AAG	B __SXXaX%>%C%CAt%C%TU
Jq$2B2B2D/D)EEE ##!1!1A!56G ++G4	IaAq)44600NN5)E ##E*''.r"   )rf   Tr   r   r   r   r;   r]   ra   rb   s   @r#   r   r   +  s    JB"r"   r   c                   &     e Zd Zd fd	Zd Z xZS )FocalNetMlpc                 
   t         |           |xs |}|xs |}t        j                  ||      | _        t
        |j                     | _        t        j                  ||      | _        t        j                  |      | _
        y r^   )r:   r;   r   r   fc1r   
hidden_actr   fc2rF   drop)rI   r1   in_featureshidden_featuresout_featuresr   rK   s         r#   r;   zFocalNetMlp.__init__s  sh    #2{)8[99[/: !2!2399_l;JJt$	r"   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S r^   )r   r   r   r   )rI   r   s     r#   r]   zFocalNetMlp.forward|  sN    xx-|4yy.xx-yy.r"   )NNr   r   rb   s   @r#   r   r   r  s    %r"   r   c                   *     e Zd ZdZd fd	Zd Z xZS )FocalNetLayera  Focal Modulation Network layer (block).

    Args:
        config (`FocalNetConfig`):
            Model config.
        index (`int`):
            Layer index.
        dim (`int`):
            Number of input channels.
        input_resolution (`Tuple[int]`):
            Input resolution.
        drop_path (`float`, *optional*, defaults to 0.0):
            Stochastic depth rate.
    c                 H   t         |           || _        || _        || _        |j
                  | _        |j                  | _        t        j                  ||j                        | _        t        |||| j                        | _        |dkD  rt        |      nt        j                         | _        t        j                  ||j                        | _        t%        ||j&                  z        }t)        |||| j                        | _        d| _        d| _        |j0                  ryt        j2                  |j4                  t7        j8                  |      z  d      | _        t        j2                  |j4                  t7        j8                  |      z  d      | _        y y )Nr8   )r1   r   r   r   r   )r1   r   r   r   rQ   T)requires_grad)r:   r;   r1   r   input_resolutionrG   r   use_post_layernormr   rC   rD   norm1r   
modulationr   Identityr   norm2r   	mlp_ratior   mlpgamma_1gamma_2use_layerscaler@   layerscale_valuer   ones)rI   r1   r   r   r   r   mlp_hidden_dimrK   s          r#   r;   zFocalNetLayer.__init__  sE     0 ..	"(";";\\#6+@+@A
,#yy	
 9BC))4R[[]\\#6+@+@A
S6#3#334f#~dhdmdmn  <<(?(?%**cBS(ScghDL<<(?(?%**cBS(ScghDL !r"   c           	      :   |\  }}|j                   \  }}}|}| j                  r|n| j                  |      }|j                  ||||      }| j	                  |      j                  |||z  |      }| j                  s|n| j                  |      }|| j                  | j                  |z        z   }|| j                  | j                  | j                  r | j                  | j                  |            n| j                  | j                  |            z        z   }|S r^   )
r{   r   r   viewr   r   r   r   r   r   )	rI   r   input_dimensionsrv   rw   rX   rZ   r4   shortcuts	            r#   r]   zFocalNetLayer.forward  s   (&2&8&8#
A| (,'>'>|DJJ|D\#((VULQ|499*funVbc+/+B+B|

S_H`  $..1L"MM#dnnLL595L5Ltzz$((<01RVRZRZ[_[e[efr[sRtv'
 

 r"   )r   )r   r   r   r   r;   r]   ra   rb   s   @r#   r   r     s    i@r"   r   c                   j     e Zd Z fdZdej
                  deeef   deej
                     fdZ xZ	S )FocalNetStagec                    t         |           || _        t        |j                        | _        t        | j
                        D cg c]  }|j                  d|z  z   }}||   }|| j
                  dz
  k  r||dz      nd }|| j
                  dz
  k  rt        nd }t        j                  d|j                  t        |j                        d      D 	cg c]  }	|	j                          }
}	|
t        |j                  d |       t        |j                  d |dz           }t        j                  t        |j                  |         D cg c]'  }t!        ||||t#        |t$              r||   n|      ) c}      | _        |' |||d||d|j(                  d	      | _        d| _        y d | _        d| _        y c c}w c c}	w c c}w )
Nrf   r   r   cpu)r   )r1   r   r   r   r   TF)r1   r2   r3   r4   r5   rr   r6   r7   )r:   r;   r1   lendepths
num_stagesr   r5   r<   r   linspacedrop_path_ratesumitemr   r   r   rk   listlayersr6   
downsamplepointing)rI   r1   r   r   ir5   r   out_dimr   r   dprr   rK   s               r#   r;   zFocalNetStage.__init__  s   fmm,8=doo8NO1V%%A.O	O+04??Q3F+F)EAI&T1619L1L,SW
 "'63H3H#fmmJ\ej!klAqvvxllFMM&512S{QR9S5TU	mm v}}U34	  !%5.8D.Iily	
 !(+ !%44	DO  #DOI P m	s   F<G,Gr   r   rN   c                    |\  }}| j                   D ]  } |||      } |}| j                  K|\  }}|j                  dd      j                  |j                  d   d||      }| j                  |      \  }}n||||f}|||f}|S )Nr   rf   r   rP   )r   r   r~   reshaper{   )	rI   r   r   rv   rw   layer_module!hidden_states_before_downsamplingrW   stage_outputss	            r#   r]   zFocalNetStage.forward  s    ( KK 	JL(8HIM	J -:)??&,MFE)33Aq9AA177:BM 04}/M,M, "( >&(IK\]r"   )
r   r   r   r;   r   r`   r   r   r]   ra   rb   s   @r#   r   r     s=    *XU\\ U3PS8_ Y^_d_k_kYl r"   r   c                        e Zd Z fdZ	 	 	 d	dej
                  deeef   dee	   dee	   dee	   de
eef   fdZ xZS )
FocalNetEncoderc                 2   t         |           t        |j                        | _        || _        t        j                  t        | j                        D cg c]$  }t        |||d   d|z  z  |d   d|z  z  f      & c}      | _
        d| _        y c c}w )Nr   rf   r   )r1   r   r   F)r:   r;   r   r   r   r1   r   r   r   r   stagesgradient_checkpointing)rI   r1   r>   i_layerrK   s       r#   r;   zFocalNetEncoder.__init__  s    fmm,mm  %T__5  !!&/lq'z&BIaLUVX_U_D`%a	
 ',#s   )Br   r   output_hidden_states(output_hidden_states_before_downsamplingreturn_dictrN   c                    |rdnd }|rdnd }|rE|j                   \  }}	}
 |j                  |g||
 }|j                  dddd      }||fz  }||fz  }t        | j                        D ]  \  }}| j
                  r*| j                  r| j                  |j                  ||      }n	 |||      }|d   }|d   }|d   }|d   |d   f}|rP|rN|j                   \  }}	}
 |j                  |g|d   |d   f|
 }|j                  dddd      }||fz  }||fz  }|s|r|j                   \  }}	}
 |j                  |g||
 }|j                  dddd      }||fz  }||fz  } |st        d ||fD              S t        |||	      S )
Nr!   r   r   r   rf   rP   c              3   &   K   | ]	  }||  y wr^   r!   ).0vs     r#   	<genexpr>z*FocalNetEncoder.forward.<locals>.<genexpr>U  s     Xq!-Xs   )r   r   r   )r{   r   r   	enumerater  r  r   _gradient_checkpointing_func__call__tupler   )rI   r   r   r  r  r  all_hidden_statesall_reshaped_hidden_statesrX   rZ   hidden_sizereshaped_hidden_stater   stage_moduler   r   rW   s                    r#   r]   zFocalNetEncoder.forward  s9    #7BD+?RT")6)<)<&J;$6M$6$6z$bDT$bVa$b!$9$A$A!Q1$M!-!11&+@*BB&(5  	GOA|**t}} $ A A ))!$! !-]<L M)!,M0=a0@- -a 0 1" 57H7LM#(P-N-T-T*
A{ )O(I(N(N)"3A"68I!8L!M)OZ)% )>(E(EaAq(Q%!&G%II!*/D.FF*%.V-:-@-@*
A{(:(:(::(fHX(fZe(f%(=(E(EaAq(Q%!m%55!*/D.FF*A 	GD X]4E$FXXX$++#=
 	
r"   )FFT)r   r   r   r;   r   r`   r   r   r   boolr   r   r]   ra   rb   s   @r#   r   r     su    ,, 05CH&*<
||<
  S/<
 'tn	<

 3;4.<
 d^<
 
u++	,<
r"   r   c                   (    e Zd ZeZdZdZdZdgZd Z	y)FocalNetPreTrainedModelfocalnetrL   Tr   c                    t        |t        j                  t        j                  f      rm|j                  j
                  j                  d| j                  j                         |j                  %|j                  j
                  j                          yyt        |t        j                        rJ|j                  j
                  j                          |j                  j
                  j                  d       yt        |t              r2|j                  %|j                  j
                  j                          yyt        |t              r| j                  j                   rs|j"                  j
                  j                  | j                  j$                         |j&                  j
                  j                  | j                  j$                         yyy)zInitialize the weightsr   )r   stdNrQ   )rk   r   r   rp   weightdatanormal_r1   initializer_ranger   zero_rC   fill_r/   rB   r   r   r   r   r   )rI   modules     r#   _init_weightsz%FocalNetPreTrainedModel._init_weightsf  s<   fryy"))45 MM&&CT[[5R5R&S{{&  &&( '-KK""$MM$$S) 23  ,!!&&,,. -.{{))##))$++*F*FG##))$++*F*FG * /r"   N)
r   r   r   r   config_classbase_model_prefixmain_input_namesupports_gradient_checkpointing_no_split_modulesr$  r!   r"   r#   r  r  ^  s'    !L"$O&*#()Hr"   r  c                        e Zd Zd	 fd	Zd Ze	 	 	 	 d
deej                     deej                     dee
   dee
   deeef   f
d       Z xZS )FocalNetModelc                    t         |   |       || _        t        |j                        | _        t        |j                  d| j
                  dz
  z  z        | _        t        ||      | _
        t        || j                  j                        | _        t        j                  | j                  |j                         | _        |rt        j$                  d      nd| _        | j)                          y)z
        add_pooling_layer (bool, *optional*, defaults to `True`):
            Whether to add a pooling layer
        use_mask_token (`bool`, *optional*, defaults to `False`):
            Whether to use a mask token for masked image modeling.
        rf   r   )rJ   r8   N)r:   r;   r1   r   r   r   r   r5   num_featuresr/   rV   r   r?   encoderr   rC   rD   r   AdaptiveAvgPool1dpooler	post_init)rI   r1   add_pooling_layerrJ   rK   s       r#   r;   zFocalNetModel.__init__|  s     	 fmm, 0 0119L3M MN,VNS&vt/I/IJd&7&7V=R=RS1Bb**1- 	r"   c                 .    | j                   j                  S r^   )rV   r=   r   s    r#   get_input_embeddingsz"FocalNetModel.get_input_embeddings  s    ///r"   rL   rM   r  r  rN   c                    ||n| j                   j                  }||n| j                   j                  }|t        d      | j	                  ||      \  }}| j                  ||||      }|d   }| j                  |      }d}	| j                  7| j                  |j                  dd            }	t        j                  |	d      }	|s||	f|dd z   }
|
S t        ||	|j                  |j                        S )	z
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
        Nz You have to specify pixel_values)rM   r  r  r   r   rf   )r   r&   r   r   )r1   r  use_return_dictr|   rV   r.  r   r0  r~   r   r}   r%   r   r   )rI   rL   rM   r  r  embedding_outputr   encoder_outputssequence_outputpooled_outputr   s              r#   r]   zFocalNetModel.forward  s    %9$D $++JjJj 	 &1%<k$++B]B]?@@-1__\[j_-k**,,!5#	 ' 
 *!,..9;;" KK(A(A!Q(GHM!MM-;M%}58KKFM"-')77#2#I#I	
 	
r"   )TFNNNN)r   r   r   r;   r4  r   r   r   r   r_   r  r   r   r%   r]   ra   rb   s   @r#   r+  r+  z  s    *0  596:/3&*.
u001.
 "%"2"23.
 'tn	.

 d^.
 
u))	*.
 .
r"   r+  a  
    FocalNet Model with a decoder on top for masked image modeling.

    This follows the same implementation as in [SimMIM](https://arxiv.org/abs/2111.09886).

    <Tip>

    Note that we provide a script to pre-train this model on custom data in our [examples
    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).

    </Tip>
    )custom_introc                        e Zd Z fdZe	 	 	 	 ddeej                     deej                     dee	   dee	   de
eef   f
d       Z xZS )	FocalNetForMaskedImageModelingc                    t         |   |       t        |dd      | _        t	        |j
                        | _        t        |j                  d| j                  dz
  z  z        }t        j                  t        j                  ||j                  dz  |j                  z  d      t        j                  |j                              | _        | j!                          y )NFT)r2  rJ   rf   r   )in_channelsout_channelsrh   )r:   r;   r+  r  r   r   r   r   r5   r   r   rp   encoder_strider4   PixelShuffledecoderr1  )rI   r1   r-  rK   s      r#   r;   z'FocalNetForMaskedImageModeling.__init__  s     %fVZ[fmm,6++aDOOa4G.HHI}}II(v7L7La7ORXReRe7est OOF112	
 	r"   rL   rM   r  r  rN   c                    ||n| j                   j                  }| j                  ||||      }|d   }|j                  dd      }|j                  \  }}}	t        j                  |	dz        x}
}|j                  |||
|      }| j                  |      }d}|| j                   j                  | j                   j                  z  }|j                  d||      }|j                  | j                   j                  d      j                  | j                   j                  d      j                  d      j                         }t        j                  j!                  ||d	      }||z  j#                         |j#                         d
z   z  | j                   j$                  z  }|s|f|dd z   }||f|z   S |S t'        |||j(                  |j*                        S )a?  
        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`):
            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).

        Examples:
        ```python
        >>> from transformers import AutoImageProcessor, FocalNetConfig, FocalNetForMaskedImageModeling
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-base-simmim-window6-192")
        >>> config = FocalNetConfig()
        >>> model = FocalNetForMaskedImageModeling(config)

        >>> num_patches = (model.config.image_size // model.config.patch_size) ** 2
        >>> pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
        >>> # create random boolean mask of shape (batch_size, num_patches)
        >>> bool_masked_pos = torch.randint(low=0, high=2, size=(1, num_patches)).bool()

        >>> outputs = model(pixel_values, bool_masked_pos=bool_masked_pos)
        >>> loss, reconstructed_pixel_values = outputs.loss, outputs.logits
        >>> list(reconstructed_pixel_values.shape)
        [1, 3, 192, 192]
        ```N)rM   r  r  r   r   rf   g      ?rP   none)	reductiongh㈵>)r)   r*   r   r   )r1   r7  r  r~   r{   mathfloorr   rE  r2   r3   repeat_interleaverT   r   r   rt   l1_lossr   r4   r(   r   r   )rI   rL   rM   r  r  outputsr:  rX   r4   sequence_lengthrv   rw   reconstructed_pixel_valuesmasked_im_lossrR   r\   reconstruction_lossr   s                     r#   r]   z&FocalNetForMaskedImageModeling.forward  s   H &1%<k$++B]B]--+!5#	   
 "!*)33Aq94C4I4I1
L/OS$899)11*lFTYZ &*\\/%B"&;;))T[[-C-CCD-55b$EO11$++2H2H!L""4;;#9#91=1	  #%--"7"7F`lr"7"s1D8==?488:PTCTUX\XcXcXpXppN02WQR[@F3A3M^%.YSYY05!//#*#A#A	
 	
r"   r<  )r   r   r   r;   r   r   r   r   r_   r  r   r   r(   r]   ra   rb   s   @r#   r?  r?    s    "  596:/3&*L
u001L
 "%"2"23L
 'tn	L

 d^L
 
u77	8L
 L
r"   r?  z
    FocalNet Model with an image classification head on top (a linear layer on top of the pooled output) e.g. for
    ImageNet.
    c                        e Zd Z fdZe	 	 	 	 ddeej                     deej                     dee	   dee	   de
eef   f
d       Z xZS )	FocalNetForImageClassificationc                 >   t         |   |       |j                  | _        t        |      | _        |j                  dkD  r4t        j                  | j                  j                  |j                        nt        j                         | _	        | j                          y )Nr   )r:   r;   
num_labelsr+  r  r   r   r-  r   
classifierr1  rI   r1   rK   s     r#   r;   z'FocalNetForImageClassification.__init__>  sx      ++%f- IOHYHY\]H]BIIdmm00&2C2CDcecncncp 	
 	r"   rL   labelsr  r  rN   c                    ||n| j                   j                  }| j                  |||      }|d   }| j                  |      }d}|| j                   j                  | j
                  dk(  rd| j                   _        nl| j
                  dkD  rL|j                  t        j                  k(  s|j                  t        j                  k(  rd| j                   _        nd| j                   _        | j                   j                  dk(  rIt               }	| j
                  dk(  r& |	|j                         |j                               }n |	||      }n| j                   j                  dk(  r=t               }	 |	|j                  d| j
                        |j                  d            }n,| j                   j                  dk(  rt               }	 |	||      }|s|f|dd z   }
||f|
z   S |
S t        |||j                   |j"                  	      S )
a  
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
        Nr6  r   
regressionsingle_label_classificationmulti_label_classificationrP   rf   )r)   r-   r   r   )r1   r7  r  rV  problem_typerU  r   r   longr   r
   squeezer	   r   r   r,   r   r   )rI   rL   rX  r  r  rM  r;  r-   r)   loss_fctr   s              r#   r]   z&FocalNetForImageClassification.forwardL  s    &1%<k$++B]B]--!5#   
  
/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./Y,F)-)9TGf$EvE,!//#*#A#A	
 	
r"   r<  )r   r   r   r;   r   r   r   r   
LongTensorr  r   r   r,   r]   ra   rb   s   @r#   rS  rS  6  s      59-1/3&*9
u0019
 ))*9
 'tn	9

 d^9
 
u33	49
 9
r"   rS  zG
    FocalNet backbone, to be used with frameworks like X-Decoder.
    c            
       l     e Zd Zdef fdZe	 	 ddej                  dee	   dee	   de
fd       Z xZS )	FocalNetBackboner1   c                     t         |   |       t         | 	  |       |j                  g|j                  z   | _        t        |      | _        | j                          y r^   )	r:   r;   _init_backboner5   hidden_sizesr-  r+  r  r1  rW  s     r#   r;   zFocalNetBackbone.__init__  sQ     v&#--.1D1DD%f- 	r"   rL   r  r  rN   c                    ||n| j                   j                  }||n| j                   j                  }| j                  |dd      }|j                  }d}t        | j                        D ]  \  }}|| j                  v s|||   fz  } |s|f}	|r|	|j                  fz  }	|	S t        ||r|j                  d      S dd      S )aj  
        Examples:

        ```python
        >>> from transformers import AutoImageProcessor, AutoBackbone
        >>> import torch
        >>> from PIL import Image
        >>> import requests

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = Image.open(requests.get(url, stream=True).raw)

        >>> processor = AutoImageProcessor.from_pretrained("microsoft/focalnet-tiny-lrf")
        >>> model = AutoBackbone.from_pretrained("microsoft/focalnet-tiny-lrf")

        >>> inputs = processor(image, return_tensors="pt")
        >>> outputs = model(**inputs)
        ```NTr6  r!   )feature_mapsr   
attentions)
r1   r7  r  r  r   r  stage_namesr   r   r   )
rI   rL   r  r  rM  r   rh  idxstager   s
             r#   r]   zFocalNetBackbone.forward  s    2 &1%<k$++B]B]$8$D $++JjJj 	 --4UY-Z66#D$4$45 	6JC)))s!3 55	6 "_F#70022M%3G'//
 	
MQ
 	
r"   )NN)r   r   r   r   r;   r   r   r`   r   r  r   r]   ra   rb   s   @r#   rc  rc    s]    ~   04&*	0
ll0
 'tn0
 d^	0

 
0
 0
r"   rc  )rS  r?  rc  r+  r  )r   F)9r   collections.abcrl   rI  dataclassesr   typingr   r   r   r   torch.utils.checkpointr   torch.nnr   r	   r
   activationsr   modeling_outputsr   modeling_utilsr   utilsr   r   r   utils.backbone_utilsr   configuration_focalnetr   
get_loggerr   loggerr   r%   r(   r,   Moduler/   r<   r`   r   r  r   r   r   r   r   r   r   r  r+  r?  rS  rc  __all__r!   r"   r#   <module>r|     sE      ! ) )    A A ! . - 9 9 1 2 
		H	% FK F F4 F+ F F8 F F F8 FK F F8%- %-PD-bii D-PU\\ e T V[VbVb *-ryy -D DN")) &BBII BJ?BII ?DO
bii O
d Ho H H6 H
+ H
 H
V _
%< _
_
D J
%< J
J
Z 
<
. <

<
~r"   