
    Uh+                        d Z ddlmZ ddlmZmZ ddlZddlmZ ddlm	Z	 ddl
mZmZ dd	lmZ d
dlmZ e G d de             Ze G d de	             Z G d dej&                        Z G d dej&                        Z G d dej&                        Z G d dej&                        Z G d dej&                        Z ed       G d de             ZddgZy)zPyTorch ViTMatte model.    )	dataclass)OptionalTupleN)nn   )PreTrainedModel)ModelOutputauto_docstring)load_backbone   )VitMatteConfigc                       e Zd ZU dZdZeej                     ed<   dZ	eej                     ed<   dZ
eeej                        ed<   dZeeej                        ed<   y)ImageMattingOutputa  
    Class for outputs of image matting models.

    Args:
        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
            Loss.
        alphas (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
           Estimated alpha values.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each stage) of shape `(batch_size, sequence_length, hidden_size)`. Hidden-states
            (also called feature maps) of the model at the output of each stage.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, patch_size,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nlossalphashidden_states
attentions)__name__
__module____qualname____doc__r   r   torchFloatTensor__annotations__r   r   r   r        /var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/models/vitmatte/modeling_vitmatte.pyr   r      sg    ( )-D(5$$
%,*.FHU&&'.8<M8E%"3"345<59Ju00129r   r   c                   "    e Zd ZeZdZdZg Zd Zy)VitMattePreTrainedModelpixel_valuesTc                    t        |t        j                        rm|j                  j                  j                  d| j                  j                         |j                  %|j                  j                  j                          y y y )Ng        )meanstd)

isinstancer   Conv2dweightdatanormal_configinitializer_rangebiaszero_)selfmodules     r   _init_weightsz%VitMattePreTrainedModel._init_weights@   sa    fbii(MM&&CT[[5R5R&S{{&  &&( ' )r   N)	r   r   r   r   config_classmain_input_namesupports_gradient_checkpointing_no_split_modulesr/   r   r   r   r   r   9   s    !L$O&*#)r   r   c                   *     e Zd ZdZd fd	Zd Z xZS )VitMatteBasicConv3x3zP
    Basic convolution layers including: Conv3x3, BatchNorm2d, ReLU layers.
    c                     t         |           t        j                  ||d||d      | _        t        j
                  ||j                        | _        t        j                         | _	        y )Nr   F)in_channelsout_channelskernel_sizestridepaddingr+   )eps)
super__init__r   r%   convBatchNorm2dbatch_norm_eps
batch_normReLUrelu)r-   r)   r7   r8   r:   r;   	__class__s         r   r>   zVitMatteBasicConv3x3.__init__L   sW    II#%
	 ..6;P;PQGGI	r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S N)r?   rB   rD   r-   hidden_states     r   forwardzVitMatteBasicConv3x3.forwardY   s2    yy.|4yy.r   )   r   r   r   r   r   r>   rJ   __classcell__rE   s   @r   r5   r5   G   s    r   r5   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteConvStreamzc
    Simple ConvStream containing a series of basic conv3x3 layers to extract detail features.
    c                    t         |           d}|j                  |j                  j                  }|j                  }t        j                         | _        |g|z   | _        t        t        | j                        dz
        D ]I  }| j                  |   }| j                  |dz      }| j                  j                  t        |||             K y )N   r   )r=   r>   backbone_confignum_channelsconvstream_hidden_sizesr   
ModuleListconvs
conv_chansrangelenappendr5   )r-   r)   r7   r8   iin_chan_	out_chan_rE   s          r   r>   zVitMatteConvStream.__init__f   s     !!- 00==K55]]_
&-,6s4??+a/0 	QAq)HA.IJJ268YOP	Qr   c                     d|i}|}t        t        | j                              D ]-  } | j                  |   |      }dt        |dz         z   }|||<   / |S )Ndetailed_feature_map_0detailed_feature_map_r   )rY   rZ   rW   str)r-   r    out_dict
embeddingsr\   name_s         r   rJ   zVitMatteConvStream.forwardy   sc    ,l;!
s4::' 	)A&Az2J+c!a%j8E(HUO	)
 r   rL   rN   s   @r   rP   rP   a   s    Q&r   rP   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteFusionBlockz\
    Simple fusion block to fuse features from ConvStream and Plain Vision Transformer.
    c                 L    t         |           t        |||dd      | _        y )Nr   )r:   r;   )r=   r>   r5   r?   )r-   r)   r7   r8   rE   s       r   r>   zVitMatteFusionBlock.__init__   s$    (lST^_`	r   c                     t         j                  j                  |ddd      }t        j                  ||gd      }| j                  |      }|S )NrK   bilinearF)scale_factormodealign_cornersr   )dim)r   
functionalinterpolater   catr?   )r-   featuresdetailed_feature_mapupscaled_featuresouts        r   rJ   zVitMatteFusionBlock.forward   sK    MM55hQU_ot5uii-/@AqIiin
r   rL   rN   s   @r   rg   rg      s    ar   rg   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteHeadzJ
    Simple Matting Head, containing only conv3x3 and conv1x1 layers.
    c                 *   t         |           |j                  d   }d}t        j                  t        j
                  ||ddd      t        j                  |      t        j                  d      t        j
                  |dddd            | _        y )N   r   r   )r9   r:   r;   Tr   )	r=   r>   fusion_hidden_sizesr   
Sequentialr%   r@   rC   matting_convs)r-   r)   r7   mid_channelsrE   s       r   r>   zVitMatteHead.__init__   st    004]]IIk<QqRSTNN<(GGDMIIlA1QJ	
r   c                 (    | j                  |      }|S rG   )r}   rH   s     r   rJ   zVitMatteHead.forward   s    )),7r   rL   rN   s   @r   rw   rw      s    
r   rw   c                   (     e Zd ZdZ fdZd Z xZS )VitMatteDetailCaptureModulezG
    Simple and lightweight Detail Capture Module for ViT Matting.
    c           
         t         |           t        |j                        t        |j                        dz   k7  rt        d      || _        t        |      | _        | j                  j                  | _	        t        j                         | _        |j                  g|j                  z   | _        t        t        | j                        dz
        D ]Z  }| j                  j!                  t#        || j                  |   | j                  |dz       z   | j                  |dz                   \ t%        |      | _        y )Nr   z_The length of fusion_hidden_sizes should be equal to the length of convstream_hidden_sizes + 1.)r)   r7   r8   )r=   r>   rZ   r{   rU   
ValueErrorr)   rP   
convstreamrX   r   rV   fusion_blockshidden_sizefusion_channelsrY   r[   rg   rw   matting_head)r-   r)   r\   rE   s      r   r>   z$VitMatteDetailCaptureModule.__init__   s   v))*c&2P2P.QTU.UUq  ,V4//44]]_ & 2 23f6P6PPs4//0145 	A%%#! $ 4 4Q 7$//APQE(:S S!%!5!5a!e!<	 )0r   c                 6   | j                  |      }t        t        | j                              D ]B  }dt	        t        | j                        |z
  dz
        z   } | j                  |   |||         }D t        j                  | j                  |            }|S )Nra   r   )r   rY   rZ   r   rb   r   sigmoidr   )r-   rr   r    detail_featuresr\   detailed_feature_map_namer   s          r   rJ   z#VitMatteDetailCaptureModule.forward   s    //,7s4--./ 	cA(?#c$J\J\F]`aFadeFeBf(f%,t))!,XG`7abH	c t00:;r   rL   rN   s   @r   r   r      s    12r   r   zX
    ViTMatte framework leveraging any vision backbone e.g. for ADE20k, CityScapes.
    )custom_introc                        e Zd Z fdZe	 	 	 	 	 ddeej                     dee   dee   deej                     dee   f
d       Z	 xZ
S )	VitMatteForImageMattingc                     t         |   |       || _        t        |      | _        t        |      | _        | j                          y rG   )r=   r>   r)   r   backboner   decoder	post_init)r-   r)   rE   s     r   r>   z VitMatteForImageMatting.__init__   s;     %f-26: 	r   r    output_attentionsoutput_hidden_stateslabelsreturn_dictc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }d}|t	        d      | j
                  j                  |||      }|j                  d   }| j                  ||      }	|s|	f|dd z   }
||f|
z   S |
S t        ||	|j                  |j                        S )a8  
        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
            Ground truth image matting for computing the loss.

        Examples:

        ```python
        >>> from transformers import VitMatteImageProcessor, VitMatteForImageMatting
        >>> import torch
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download

        >>> processor = VitMatteImageProcessor.from_pretrained("hustvl/vitmatte-small-composition-1k")
        >>> model = VitMatteForImageMatting.from_pretrained("hustvl/vitmatte-small-composition-1k")

        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="image.png", repo_type="dataset"
        ... )
        >>> image = Image.open(filepath).convert("RGB")
        >>> filepath = hf_hub_download(
        ...     repo_id="hf-internal-testing/image-matting-fixtures", filename="trimap.png", repo_type="dataset"
        ... )
        >>> trimap = Image.open(filepath).convert("L")

        >>> # prepare image + trimap for the model
        >>> inputs = processor(images=image, trimaps=trimap, return_tensors="pt")

        >>> with torch.no_grad():
        ...     alphas = model(**inputs).alphas
        >>> print(alphas.shape)
        torch.Size([1, 1, 640, 960])
        ```NzTraining is not yet supported)r   r   ry   r   )r   r   r   r   )r)   use_return_dictr   r   NotImplementedErrorr   forward_with_filtered_kwargsfeature_mapsr   r   r   r   )r-   r    r   r   r   r   r   outputsrr   r   outputs              r   rJ   zVitMatteForImageMatting.forward   s    R &1%<k$++B]B]$8$D $++JjJj 	 2C1N-TXT_T_TqTq%&EFF--<</CWh = 
 ''+h5Y,F)-)9TGf$EvE!!//))	
 	
r   )NNNNN)r   r   r   r>   r
   r   r   TensorboolrJ   rM   rN   s   @r   r   r      s      04,0/3)-&*B
u||,B
 $D>B
 'tn	B

 &B
 d^B
 B
r   r   )r   dataclassesr   typingr   r   r   r   modeling_utilsr   utilsr	   r
   utils.backbone_utilsr   configuration_vitmatter   r   r   Moduler5   rP   rg   rw   r   r   __all__r   r   r   <module>r      s     ! "   - 0 1 2 : : :6 
)o 
) 
)299 4   F")) "299 0&")) &R 
N
5 N

N
b %&?
@r   