Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Image + text + audio uniform processors #30511

Open
wants to merge 54 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
42ecf48
expand kwargs from align
molbap Jan 25, 2024
ccb2147
remove kwargs from altclip processor
molbap Jan 25, 2024
f999e0c
add explicit args for donut processor
molbap Jan 25, 2024
8fb3a6b
add explicit call to current processor for in context manager
molbap Jan 25, 2024
a90c766
format
molbap Jan 25, 2024
49cb6cc
remove unused kwargs
molbap Jan 25, 2024
3ac1c7e
move conditions for encodings
molbap Jan 25, 2024
7a819fd
improve flow over text/image
molbap Jan 25, 2024
9cc38b7
[breaking] pass explicit args to bridgetower
molbap Jan 25, 2024
ff6a950
Merge branch 'main' into improve_multimodal_processors
molbap Jan 26, 2024
7db64a0
add default kwargs for BC
molbap Jan 26, 2024
41674d9
fix bridgetower
molbap Jan 26, 2024
618a687
debug bridgetower image proc
molbap Jan 26, 2024
f39cdc1
format
molbap Jan 26, 2024
9a6f97d
move kwargs message to info level
molbap Jan 26, 2024
380f82f
add debug messages
molbap Jan 26, 2024
75f15d3
fix arguments not being passed in bridgetower
molbap Jan 26, 2024
3df5faa
keep backwards compat for processing + modify testing args dict
molbap Feb 1, 2024
5ad0694
Merge branch 'main' into improve_multimodal_processors
molbap Feb 1, 2024
69e5a2d
fix quality
molbap Feb 1, 2024
68c2f40
log kwargs mismatch to info level
molbap Feb 1, 2024
e1e4084
fix quality
molbap Feb 1, 2024
bfa81e5
Merge branch 'main' into improve_multimodal_processors
molbap Feb 15, 2024
4b557b0
address comments
molbap Feb 15, 2024
b7fc377
fix typo
molbap Feb 15, 2024
270bb9e
fix expected tests for bridgetower
molbap Feb 16, 2024
94a1b75
fix conflicts
molbap Feb 26, 2024
6603bf0
Merge branch 'main' into improve_multimodal_processors
molbap Feb 26, 2024
004c961
fix valid processor keys
molbap Feb 26, 2024
c2e49f5
remove unused arg list
molbap Feb 26, 2024
79958b5
quality
molbap Feb 26, 2024
a36f524
Merge branch 'main' into improve_multimodal_processors
molbap Apr 17, 2024
3238dd3
skeleton draft - uniform processor call
molbap Apr 18, 2024
3afde22
fix quality
molbap Apr 18, 2024
eb99e29
add broken wav2vec audio processing
molbap Apr 25, 2024
c6afd63
Merge branch 'main' into improve_multimodal_processors
molbap Apr 25, 2024
3b824b5
uniformize processor calls
molbap Apr 26, 2024
d8c2a6e
fix default
molbap Apr 28, 2024
78433a1
protect image types
molbap Apr 28, 2024
0cd3d66
Merge branch 'main' into image_text_audio_processors
molbap May 2, 2024
d9c51c1
Merge branch 'main' into image_text_audio_processors
molbap May 7, 2024
43ba3bd
improve audio kwargs typing
molbap May 7, 2024
dcfa8db
create audio input type
molbap May 7, 2024
7048fee
fix docstrings for common kwargs
molbap May 7, 2024
25ba7ba
add deprecation warning for dual kwargs
molbap May 7, 2024
36ba3cc
fix imports
molbap May 7, 2024
05fea5d
fix quality
molbap May 7, 2024
e05b14b
fix quality + copies
molbap May 7, 2024
46a1fe8
fix quality on setup :eyes:
molbap May 7, 2024
13a1909
protect import
molbap May 7, 2024
7bf3615
fix quality
molbap May 7, 2024
a285284
fix conflicts
molbap May 24, 2024
1fe9eb5
add type hinting expansion
molbap May 27, 2024
931f68c
fixup
molbap May 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
119 changes: 97 additions & 22 deletions src/transformers/models/align/processing_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,56 @@
Image/Text processor class for ALIGN
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from typing import List, Union

# TODO fix forward referencing of costly modules to import as below
from ...image_utils import ImageInput
from ...processing_utils import (
CommonKwargs,
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
TextKwargs,
add_expanded_type_hints,
)
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class AlignProcessorKwargs(ProcessingKwargs):
    """
    Per-modality default keyword arguments for the ALIGN processor.

    `text_kwargs` mirror the tokenizer call signature, `images_kwargs` mirror
    the image-processor call signature, and `common_kwargs` are intended to be
    forwarded to both modalities.

    NOTE(review): `__total__` is assigned directly as a class attribute rather
    than via `class AlignProcessorKwargs(ProcessingKwargs, total=False)`;
    whether `ProcessingKwargs` honors a plain `__total__` assignment depends on
    its implementation — TODO confirm.
    """

    __total__ = False

    # Tokenizer defaults. ALIGN pads/truncates text to a fixed 64-token
    # sequence by default ("padding": "max_length" with "max_length": 64).
    text_kwargs: TextKwargs = {
        "add_special_tokens": True,
        "padding": "max_length",
        "truncation": True,
        "max_length": 64,
        "stride": 0,
        "is_split_into_words": False,
        "pad_to_multiple_of": None,
        "return_token_type_ids": None,
        "return_attention_mask": None,
        "return_overflowing_tokens": False,
        "return_special_tokens_mask": False,
        "return_offsets_mapping": False,
        "return_length": False,
        "verbose": True,
    }

    # Image-processor defaults. `None` values defer to the image processor's
    # own configured defaults; only the channel layout is pinned here.
    images_kwargs: ImagesKwargs = {
        "do_resize": None,
        "size": None,
        "crop_size": None,
        "resample": None,
        "do_rescale": None,
        "rescale_factor": None,
        "do_normalize": None,
        "image_mean": None,
        "image_std": None,
        "do_center_crop": None,
        "data_format": "channels_first",
        "input_data_format": None,
    }

    # Kwargs shared by both modalities (e.g. the output tensor framework).
    common_kwargs: CommonKwargs = {"return_tensors": None}


class AlignProcessor(ProcessorMixin):
Expand All @@ -32,20 +80,37 @@ class AlignProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`BertTokenizer`, `BertTokenizerFast`]):
The tokenizer is a required input.

"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "EfficientNetImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

def __init__(self, image_processor, tokenizer):
    """
    Build an ALIGN processor from a required image processor and tokenizer.

    Args:
        image_processor (`EfficientNetImageProcessor`):
            The image processor, a required input.
        tokenizer (`BertTokenizer` or `BertTokenizerFast`):
            The tokenizer, a required input.
    """
    # Instantiate the per-modality default kwargs once so that `__call__`
    # can merge caller-supplied kwargs on top of these defaults.
    self.base_kwargs = AlignProcessorKwargs()
    super().__init__(image_processor, tokenizer)

def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs):
@add_expanded_type_hints(
text_kwargs=TextKwargs,
images_kwargs=ImagesKwargs,
common_kwargs=CommonKwargs,
)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
audio=None,
videos=None,
text_kwargs: TextKwargs = None,
images_kwargs: ImagesKwargs = None,
common_kwargs: CommonKwargs = None,
**kwargs: AlignProcessorKwargs,
) -> BatchEncoding:
"""
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` arguments to
EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
to the docstring of the above two methods for more information.

Expand All @@ -57,19 +122,12 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`,
`'max_length'`, `False` or `'do_not_pad'`]
max_length (`int`, *optional*, defaults to `max_length`):
Maximum padding value to use to pad the input text during tokenization.

return_tensors (`str` or [`~utils.TensorType`], *optional*):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return_tensors is the only one I'm not sure we want to remove from the docstring, as it's something users have to specify if they want to be able to use the outputs for a model

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a good point, in general anything that is common could be here

If set, will return tensors of a particular framework. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
Expand All @@ -81,15 +139,32 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")

raise ValueError("You must specify either text or images.")

text_kwargs = kwargs.pop("text_kwargs", {})
images_kwargs = kwargs.pop("images_kwargs", {})
common_kwargs = kwargs.pop("common_kwargs", {})

for key, value in kwargs.items():
if key in AlignProcessorKwargs.text_kwargs:
text_kwargs[key] = value
elif key in AlignProcessorKwargs.images_kwargs:
images_kwargs[key] = value
else:
# remaining kwargs go to the "common" key
common_kwargs[key] = value
common_kwargs = {**AlignProcessorKwargs.common_kwargs, **common_kwargs}
text_kwargs = {**AlignProcessorKwargs.text_kwargs, **text_kwargs, **common_kwargs}
images_kwargs = {**AlignProcessorKwargs.images_kwargs, **images_kwargs, **common_kwargs}
if text is not None:
encoding = self.tokenizer(
text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs
)
encoding = self.tokenizer(text, **text_kwargs)

if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(images, **images_kwargs)

# BC for explicit return_tensors
if "return_tensors" in common_kwargs:
return_tensors = common_kwargs.pop("return_tensors", None)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
Expand Down
106 changes: 72 additions & 34 deletions src/transformers/models/altclip/processing_altclip.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
"""

import warnings
from typing import List, Union

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...image_utils import ChannelDimension, ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class AltCLIPProcessor(ProcessorMixin):
Expand All @@ -35,31 +37,71 @@ class AltCLIPProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`XLMRobertaTokenizerFast`], *optional*):
The tokenizer is a required input.
feature_extractor ([`CLIPFeatureExtractor`], *optional*):
The feature extractor is a deprecated input.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None):
    """
    Construct an AltCLIP processor from an image processor and a tokenizer.

    Args:
        image_processor (`CLIPImageProcessor`, *optional*):
            The image processor. Required unless the deprecated
            `feature_extractor` argument is supplied instead.
        tokenizer (`XLMRobertaTokenizer` or `XLMRobertaTokenizerFast`, *optional*):
            The tokenizer. Required.
        feature_extractor (`CLIPFeatureExtractor`, *optional*):
            Deprecated alias for `image_processor`; will be removed in v5.

    Raises:
        ValueError: If neither `image_processor` nor `feature_extractor` is
            given, or if `tokenizer` is not given.
    """
    # Bug fix: the previous check was `if "feature_extractor":`, which is a
    # non-empty string literal and therefore always truthy — the deprecation
    # warning fired on every instantiation, even when the argument was never
    # passed. Warn only when the deprecated argument is actually supplied.
    if feature_extractor is not None:
        warnings.warn(
            "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
            " instead.",
            FutureWarning,
        )

    # Fall back to the deprecated argument for backward compatibility.
    image_processor = image_processor if image_processor is not None else feature_extractor
    if image_processor is None:
        raise ValueError("You need to specify an `image_processor`.")
    if tokenizer is None:
        raise ValueError("You need to specify a `tokenizer`.")

    super().__init__(image_processor, tokenizer)

    # Per-modality default kwargs, merged with caller-supplied kwargs in
    # `__call__`. `None` values defer to the tokenizer's / image processor's
    # own configured defaults.
    self.processing_kwargs: ProcessingKwargs = {
        "common_kwargs": {"return_tensors": None},
        "text_kwargs": {
            "add_special_tokens": True,
            "padding": False,
            "truncation": None,
            "max_length": None,
            "stride": 0,
            "is_split_into_words": False,
            "pad_to_multiple_of": None,
            "return_token_type_ids": None,
            "return_attention_mask": None,
            "return_overflowing_tokens": False,
            "return_special_tokens_mask": False,
            "return_offsets_mapping": False,
            "return_length": False,
            "verbose": True,
        },
        "images_kwargs": {
            "do_resize": None,
            "size": None,
            "resample": None,
            "do_thumbnail": None,
            "do_align_long_axis": None,
            "do_pad": None,
            "do_rescale": None,
            "rescale_factor": None,
            "do_normalize": None,
            "image_mean": None,
            "image_std": None,
            "data_format": ChannelDimension.FIRST,
            "input_data_format": None,
        },
    }

def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs,
) -> BatchEncoding:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
Expand All @@ -68,22 +110,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
of the above two methods for more information.

Args:
text (`str`, `List[str]`, `List[List[str]]`):

images (`ImageInput`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.

return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

Expand All @@ -94,22 +134,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""

if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")

if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
text_kwargs = {**self.processing_kwargs["text_kwargs"], **self.processing_kwargs["common_kwargs"], **kwargs}
images_kwargs = {
**self.processing_kwargs["images_kwargs"],
**self.processing_kwargs["common_kwargs"],
**kwargs,
}

if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text:
encoding = self.tokenizer(text, **text_kwargs)
if images:
image_features = self.image_processor(images, **images_kwargs)
encoding.update(image_features)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
return encoding

def batch_decode(self, *args, **kwargs):
"""
Expand Down