
    Uh&                        d dl mZmZ ddlmZ ddlmZmZmZm	Z	m
Z
mZ ddlmZmZ  e	       rd dlmZ ddlmZ  e       rdd	lmZ  e       r
d d
lZddlmZ  e
j0                  e      Z e edd             G d de             Zy
)    )ListUnion   )GenerationConfig)add_end_docstringsis_tf_availableis_torch_availableis_vision_availableloggingrequires_backends   )Pipelinebuild_pipeline_init_args)Image)
load_image)'TF_MODEL_FOR_VISION_2_SEQ_MAPPING_NAMESN)$MODEL_FOR_VISION_2_SEQ_MAPPING_NAMEST)has_tokenizerhas_image_processorc                        e Zd ZdZdZ ed      Z fdZddZdde	e
ee
   ded   f   f fd	Zdd
Zd Zd Z xZS )ImageToTextPipelinea  
    Image To Text pipeline using a `AutoModelForVision2Seq`. This pipeline predicts a caption for a given image.

    Unless the model you're using explicitly sets these generation parameters in its configuration files
    (`generation_config.json`), the following default values will be used:
    - max_new_tokens: 256

    Example:

    ```python
    >>> from transformers import pipeline

    >>> captioner = pipeline(model="ydshieh/vit-gpt2-coco-en")
    >>> captioner("https://huggingface.co/datasets/Narsil/image_dummy/raw/main/parrots.png")
    [{'generated_text': 'two birds are standing next to each other '}]
    ```

    Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)

    This image to text pipeline can currently be loaded from pipeline() using the following task identifier:
    "image-to-text".

    See the list of available models on
    [huggingface.co/models](https://huggingface.co/models?pipeline_tag=image-to-text).
    T   )max_new_tokensc                     t        |   |i | t        | d       | j                  | j                  dk(  rt
               y t               y )Nvisiontf)super__init__r   check_model_type	frameworkr   r   )selfargskwargs	__class__s      v/var/www/catia.catastroantioquia-mas.com/valormas/lib/python3.12/site-packages/transformers/pipelines/image_to_text.pyr   zImageToTextPipeline.__init__P   sD    $)&)$)7;~~7M3	
Sw	
    c                    i }i }|||d<   |||d<   |||d<   |"|d|v rt        d      |j                  |       | j                  | j                  |d<   | j                  | j                  |d<   | j                  |d<   ||i fS )Nprompttimeoutr   zp`max_new_tokens` is defined both as an argument and inside `generate_kwargs` argument, please use only 1 versionassistant_model	tokenizerassistant_tokenizer)
ValueErrorupdater*   r,   r+   )r!   r   generate_kwargsr(   r)   forward_paramspreprocess_paramss          r%   _sanitize_parametersz(ImageToTextPipeline._sanitize_parametersW   s    *0h'+2i(%/=N+,&).>/.Q &  !!/2+040D0DN,-##/*...N;'484L4LN01 ."44r&   inputszImage.Imagec                 h    d|v r|j                  d      }|t        d      t        |   |fi |S )a  
        Assign labels to the image(s) passed as inputs.

        Args:
            inputs (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
                The pipeline handles three types of images:

                - A string containing a HTTP(s) link pointing to an image
                - A string containing a local path to an image
                - An image loaded in PIL directly

                The pipeline accepts either a single image or a batch of images.

            max_new_tokens (`int`, *optional*):
                The amount of maximum tokens to generate. By default it will use `generate` default.

            generate_kwargs (`Dict`, *optional*):
                Pass it to send all of these arguments directly to `generate` allowing full control of this function.

            timeout (`float`, *optional*, defaults to None):
                The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
                the call may block forever.

        Return:
            A list or a list of list of `dict`: Each result comes as a dictionary with the following key:

            - **generated_text** (`str`) -- The generated text.
        imageszBCannot call the image-to-text pipeline without an inputs argument!)popr-   r   __call__)r!   r3   r#   r$   s      r%   r7   zImageToTextPipeline.__call__r   sA    < vZZ)F>abbw1&11r&   c                    t        ||      }|t        j                  d       t        |t              st        dt        |       d      | j                  j                  j                  }|dk(  r| j                  || j                        }| j                  dk(  r|j                  | j                        }| j                  |d	      j                  }| j                  j                   g|z   }t#        j$                  |      j'                  d
      }|j)                  d|i       n|dk(  rI| j                  ||| j                        }| j                  dk(  r|j                  | j                        }n|dk7  rv| j                  || j                        }| j                  dk(  r|j                  | j                        }| j                  || j                        }|j)                  |       nVt        d| d      | j                  || j                        }| j                  dk(  r|j                  | j                        }| j                  j                  j                  dk(  r|d |d<   |S )N)r)   u   Passing `prompt` to the `image-to-text` pipeline is deprecated and will be removed in version 4.48 of 🤗 Transformers. Use the `image-text-to-text` pipeline insteadz&Received an invalid text input, got - zy - but expected a single string. Note also that one single text can be provided for conditional image to text generation.git)r5   return_tensorsptF)textadd_special_tokensr   	input_ids
pix2struct)r5   header_textr:   zvision-encoder-decoder)r:   zModel type z- does not support conditional text generation)r   loggerwarning_once
isinstancestrr-   typemodelconfig
model_typeimage_processorr    totorch_dtyper+   r>   cls_token_idtorchtensor	unsqueezer.   )r!   imager(   r)   rH   model_inputsr>   text_inputss           r%   
preprocesszImageToTextPipeline.preprocess   s<   5'2W fc* <T&\N Ko o 
 **55JU"#335QUQ_Q_3`>>T)#/??43C3C#DL NN5NQ[[	!^^889IE	!LL3==a@	##[)$<=|+#335feieses3t>>T)#/??43C3C#DL77#335QUQ_Q_3`>>T)#/??43C3C#DL"nnVDNNnS##K0 !;zl:g!hii  //uT^^/\L~~%+t/?/?@::''50V^(,L%r&   c                    d|v r-t        |d   t              rt        d |d   D              rd |d<   d|vr| j                  |d<   |j	                  | j
                  j                        } | j
                  j                  |fi ||}|S )Nr>   c              3   $   K   | ]  }|d u  
 y wN ).0xs     r%   	<genexpr>z/ImageToTextPipeline._forward.<locals>.<genexpr>   s     A!AIAs   generation_config)rC   listallr[   r6   rF   main_input_namegenerate)r!   rQ   r/   r3   model_outputss        r%   _forwardzImageToTextPipeline._forward   s     <'<4d;A|K'@AA(,L% o5373I3IO/0 !!$**"<"<=+

++FVlVoVr&   c                 x    g }|D ]2  }d| j                   j                  |d      i}|j                  |       4 |S )Ngenerated_textT)skip_special_tokens)r+   decodeappend)r!   r`   records
output_idsrecords        r%   postprocesszImageToTextPipeline.postprocess   sQ    ' 	#J $.."7"7(, #8 #F NN6"	# r&   )NNNNrV   )NN)__name__
__module____qualname____doc___pipeline_calls_generater   _default_generation_configr   r2   r   rD   r   r7   rS   ra   rj   __classcell__)r$   s   @r%   r   r   .   s\    4  $!1"
56"2uS$s)]DDW%WX "2H1f,
r&   r   )typingr   r   
generationr   utilsr   r   r	   r
   r   r   baser   r   PILr   image_utilsr   models.auto.modeling_tf_autor   rM   models.auto.modeling_autor   
get_loggerrk   rA   r   rW   r&   r%   <module>r{      s~      )  5 (VP			H	% ,4UYZ[z( z \zr&   