huggingface · molbap · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/setup.py b/setup.py
@@ -260,7 +260,15 @@ def run(self):
 extras["sklearn"] = deps_list("scikit-learn")
 
 extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
-extras["tf-cpu"] = deps_list("keras", "tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp", "tensorflow-probability")
+extras["tf-cpu"] = deps_list(
+ "keras",
+ "tensorflow-cpu",
+ "onnxconverter-common",
+ "tf2onnx",
+ "tensorflow-text",
+ "keras-nlp",
+ "tensorflow-probability",
+)
 
 extras["torch"] = deps_list("torch", "accelerate")
 extras["accelerate"] = deps_list("accelerate")
@@ -380,12 +388,7 @@ def run(self):
  + extras["tf-speech"]
 )
 extras["dev"] = (
- extras["all"]
- + extras["testing"]
- + extras["quality"]
- + extras["ja"]
- + extras["sklearn"]
- + extras["modelcreation"]
+ extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["sklearn"] + extras["modelcreation"]
 )
 
 extras["torchhub"] = deps_list(
@@ -470,4 +473,4 @@ def run(self):
 extras["tests_examples_tf"] = deps_list()
 extras["tests_custom_tokenizers"] = deps_list()
 extras["tests_exotic_models"] = deps_list()
-extras["consistency"] = deps_list()
+extras["consistency"] = deps_list()
diff --git a/src/transformers/models/align/processing_align.py b/src/transformers/models/align/processing_align.py
@@ -17,8 +17,11 @@
 """
 
 
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding
+from typing import List, Union
+
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
 
 
 class AlignProcessor(ProcessorMixin):
@@ -33,6 +36,7 @@ class AlignProcessor(ProcessorMixin):
  The image processor is a required input.
  tokenizer ([`BertTokenizer`, `BertTokenizerFast`]):
  The tokenizer is a required input.
+
  """
 
  attributes = ["image_processor", "tokenizer"]
@@ -41,12 +45,59 @@ class AlignProcessor(ProcessorMixin):
 
  def __init__(self, image_processor, tokenizer):
  super().__init__(image_processor, tokenizer)
-
- def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs):
+ self.processing_kwargs: ProcessingKwargs = {
+ "common_kwargs": {"return_tensors": None},
+ "text_kwargs": {
+ "text_pair": None,
+ "text_target": None,
+ "text_pair_target": None,
+ "add_special_tokens": True,
+ "padding": "max_length",
+ "truncation": True,
+ "max_length": 64,
+ "stride": 0,
+ "is_split_into_words": False,
+ "pad_to_multiple_of": None,
+ "return_token_type_ids": None,
+ "return_attention_mask": None,
+ "return_overflowing_tokens": False,
+ "return_special_tokens_mask": False,
+ "return_offsets_mapping": False,
+ "return_length": False,
+ "verbose": True,
+ },
+ "images_kwargs": {
+ "do_crop_margin": None,
+ "do_resize": None,
+ "size": None,
+ "resample": None,
+ "do_thumbnail": None,
+ "do_align_long_axis": None,
+ "do_pad": None,
+ "do_rescale": None,
+ "rescale_factor": None,
+ "do_normalize": None,
+ "image_mean": None,
+ "image_std": None,
+ "data_format": "channels_first",
+ "input_data_format": None,
+ },
+ "audio_kwargs": {},
+ "videos_kwargs": {},
+ }
+
+ def __call__(
+ self,
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+ images: ImageInput = None,
+ audio=None,
+ videos=None,
+ **kwargs,
+ ) -> BatchEncoding:
  """
  Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
- and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
- the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+ and keyword arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
+ the text. To prepare the image(s), this method forwards the `images` and keyword arguments to
  EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
  to the doctsring of the above two methods for more information.
 
@@ -58,19 +109,12 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
  images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
  The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
  tensor. Both channels-first and channels-last formats are supported.
- padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
- Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`,
- `'max_length'`, `False` or `'do_not_pad'`]
- max_length (`int`, *optional*, defaults to `max_length`):
- Maximum padding value to use to pad the input text during tokenization.
-
  return_tensors (`str` or [`~utils.TensorType`], *optional*):
  If set, will return tensors of a particular framework. Acceptable values are:
-
- - `'tf'`: Return TensorFlow `tf.constant` objects.
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return NumPy `np.ndarray` objects.
- - `'jax'`: Return JAX `jnp.ndarray` objects.
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
 
  Returns:
  [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
@@ -82,15 +126,28 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
  - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
  """
  if text is None and images is None:
- raise ValueError("You have to specify either text or images. Both cannot be none.")
+ raise ValueError("You must specify either text or images.")
 
  if text is not None:
- encoding = self.tokenizer(
- text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs
- )
+ text_kwargs = {
+ **self.processing_kwargs["text_kwargs"],
+ **self.processing_kwargs["common_kwargs"],
+ **kwargs,
+ }
+ encoding = self.tokenizer(text, **text_kwargs)
 
  if images is not None:
- image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+ images_kwargs = {
+ **self.processing_kwargs["images_kwargs"],
+ **self.processing_kwargs["common_kwargs"],
+ **kwargs,
+ }
+ image_features = self.image_processor(images, **images_kwargs)
+
+ # BC for explicit return_tensors
+ # Resolve it unconditionally so the name is always bound; caller kwargs override the stored default.
+ common_kwargs = {**self.processing_kwargs["common_kwargs"], **kwargs}
+ return_tensors = common_kwargs.get("return_tensors")
 
  if text is not None and images is not None:
  encoding["pixel_values"] = image_features.pixel_values

diff --git a/src/transformers/models/altclip/processing_altclip.py b/src/transformers/models/altclip/processing_altclip.py
@@ -16,9 +16,11 @@
 Image/Text processor class for AltCLIP
 """
 import warnings
+from typing import List, Union
 
-from ...processing_utils import ProcessorMixin
-from ...tokenization_utils_base import BatchEncoding
+from ...image_utils import ChannelDimension, ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
 
 
 class AltCLIPProcessor(ProcessorMixin):
@@ -34,31 +36,71 @@ class AltCLIPProcessor(ProcessorMixin):
  The image processor is a required input.
  tokenizer ([`XLMRobertaTokenizerFast`], *optional*):
  The tokenizer is a required input.
+ feature_extractor ([`CLIPFeatureExtractor`], *optional*):
+ The feature extractor is a deprecated input.
  """
 
  attributes = ["image_processor", "tokenizer"]
  image_processor_class = "CLIPImageProcessor"
  tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")
 
- def __init__(self, image_processor=None, tokenizer=None, **kwargs):
- feature_extractor = None
- if "feature_extractor" in kwargs:
+ def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None):
+ if feature_extractor is not None:  # warn only when the deprecated argument is actually supplied
  warnings.warn(
  "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
  " instead.",
  FutureWarning,
  )
- feature_extractor = kwargs.pop("feature_extractor")
-
  image_processor = image_processor if image_processor is not None else feature_extractor
  if image_processor is None:
  raise ValueError("You need to specify an `image_processor`.")
  if tokenizer is None:
  raise ValueError("You need to specify a `tokenizer`.")
 
  super().__init__(image_processor, tokenizer)
-
- def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+ self.processing_kwargs: ProcessingKwargs = {
+ "common_kwargs": {"return_tensors": None},
+ "text_kwargs": {
+ "add_special_tokens": True,
+ "padding": False,
+ "truncation": None,
+ "max_length": None,
+ "stride": 0,
+ "is_split_into_words": False,
+ "pad_to_multiple_of": None,
+ "return_token_type_ids": None,
+ "return_attention_mask": None,
+ "return_overflowing_tokens": False,
+ "return_special_tokens_mask": False,
+ "return_offsets_mapping": False,
+ "return_length": False,
+ "verbose": True,
+ },
+ "images_kwargs": {
+ "do_resize": None,
+ "size": None,
+ "resample": None,
+ "do_thumbnail": None,
+ "do_align_long_axis": None,
+ "do_pad": None,
+ "do_rescale": None,
+ "rescale_factor": None,
+ "do_normalize": None,
+ "image_mean": None,
+ "image_std": None,
+ "data_format": ChannelDimension.FIRST,
+ "input_data_format": None,
+ },
+ }
+
+ def __call__(
+ self,
+ images: ImageInput = None,
+ text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
+ audio=None,
+ videos=None,
+ **kwargs,
+ ) -> BatchEncoding:
  """
  Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
  and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
@@ -67,22 +109,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
  of the above two methods for more information.
 
  Args:
- text (`str`, `List[str]`, `List[List[str]]`):
+
+ images (`ImageInput`):
+ The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+ tensor. Both channels-first and channels-last formats are supported.
+ text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
  The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
  (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
  `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
- images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
- The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
- tensor. Both channels-first and channels-last formats are supported.
-
  return_tensors (`str` or [`~utils.TensorType`], *optional*):
  If set, will return tensors of a particular framework. Acceptable values are:
-
- - `'tf'`: Return TensorFlow `tf.constant` objects.
- - `'pt'`: Return PyTorch `torch.Tensor` objects.
- - `'np'`: Return NumPy `np.ndarray` objects.
- - `'jax'`: Return JAX `jnp.ndarray` objects.
-
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return NumPy `np.ndarray` objects.
+ - `'jax'`: Return JAX `jnp.ndarray` objects.
  Returns:
  [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
 
@@ -93,22 +133,30 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
  - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
  """
 
  if text is None and images is None:
  raise ValueError("You have to specify either text or images. Both cannot be none.")
 
- if text is not None:
- encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
+ # Per-modality defaults first, then shared defaults, then caller overrides (later keys win).
+ text_kwargs = {**self.processing_kwargs["text_kwargs"], **self.processing_kwargs["common_kwargs"], **kwargs}
+ images_kwargs = {
+ **self.processing_kwargs["images_kwargs"],
+ **self.processing_kwargs["common_kwargs"],
+ **kwargs,
+ }
 
- if images is not None:
- image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+ # Compare against None explicitly: array/tensor inputs have no unambiguous truth value.
+ if text is not None:
+ encoding = self.tokenizer(text, **text_kwargs)
+ if images is not None:
+ image_features = self.image_processor(images, **images_kwargs)
 
  if text is not None and images is not None:
  encoding["pixel_values"] = image_features.pixel_values
  return encoding
  elif text is not None:
  return encoding
  else:
- return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
+ return BatchEncoding(data=dict(**image_features), tensor_type=images_kwargs.get("return_tensors"))
 
  def batch_decode(self, *args, **kwargs):
  """