Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Image + text + audio uniform processors #30511

Open
wants to merge 54 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
42ecf48
expand kwargs from align
molbap Jan 25, 2024
ccb2147
remove kwargs from altclip processor
molbap Jan 25, 2024
f999e0c
add explicit args for donut processor
molbap Jan 25, 2024
8fb3a6b
add explicit call to current processor for in context manager
molbap Jan 25, 2024
a90c766
format
molbap Jan 25, 2024
49cb6cc
remove unused kwargs
molbap Jan 25, 2024
3ac1c7e
move conditions for encodings
molbap Jan 25, 2024
7a819fd
improve flow over text/image
molbap Jan 25, 2024
9cc38b7
[breaking] pass explicit args to bridgetower
molbap Jan 25, 2024
ff6a950
Merge branch 'main' into improve_multimodal_processors
molbap Jan 26, 2024
7db64a0
add default kwargs for BC
molbap Jan 26, 2024
41674d9
fix bridgetower
molbap Jan 26, 2024
618a687
debug bridgetower image proc
molbap Jan 26, 2024
f39cdc1
format
molbap Jan 26, 2024
9a6f97d
move kwargs message to info level
molbap Jan 26, 2024
380f82f
add debug messages
molbap Jan 26, 2024
75f15d3
fix arguments not being passed in bridgetower
molbap Jan 26, 2024
3df5faa
keep backwards compat for processing + modify testing args dict
molbap Feb 1, 2024
5ad0694
Merge branch 'main' into improve_multimodal_processors
molbap Feb 1, 2024
69e5a2d
fix quality
molbap Feb 1, 2024
68c2f40
log kwargs mismatch to info level
molbap Feb 1, 2024
e1e4084
fix quality
molbap Feb 1, 2024
bfa81e5
Merge branch 'main' into improve_multimodal_processors
molbap Feb 15, 2024
4b557b0
address comments
molbap Feb 15, 2024
b7fc377
fix typo
molbap Feb 15, 2024
270bb9e
fix expected tests for bridgetower
molbap Feb 16, 2024
94a1b75
fix conflicts
molbap Feb 26, 2024
6603bf0
Merge branch 'main' into improve_multimodal_processors
molbap Feb 26, 2024
004c961
fix valid processor keys
molbap Feb 26, 2024
c2e49f5
remove unused arg list
molbap Feb 26, 2024
79958b5
quality
molbap Feb 26, 2024
a36f524
Merge branch 'main' into improve_multimodal_processors
molbap Apr 17, 2024
3238dd3
skeleton draft - uniform processor call
molbap Apr 18, 2024
3afde22
fix quality
molbap Apr 18, 2024
eb99e29
add broken wav2vec audio processing
molbap Apr 25, 2024
c6afd63
Merge branch 'main' into improve_multimodal_processors
molbap Apr 25, 2024
3b824b5
uniformize processor calls
molbap Apr 26, 2024
d8c2a6e
fix default
molbap Apr 28, 2024
78433a1
protect image types
molbap Apr 28, 2024
0cd3d66
Merge branch 'main' into image_text_audio_processors
molbap May 2, 2024
d9c51c1
Merge branch 'main' into image_text_audio_processors
molbap May 7, 2024
43ba3bd
improve audio kwargs typing
molbap May 7, 2024
dcfa8db
create audio input type
molbap May 7, 2024
7048fee
fix docstrings for common kwargs
molbap May 7, 2024
25ba7ba
add deprecation warning for dual kwargs
molbap May 7, 2024
36ba3cc
fix imports
molbap May 7, 2024
05fea5d
fix quality
molbap May 7, 2024
e05b14b
fix quality + copies
molbap May 7, 2024
46a1fe8
fix quality on setup :eyes:
molbap May 7, 2024
13a1909
protect import
molbap May 7, 2024
7bf3615
fix quality
molbap May 7, 2024
a285284
fix conflicts
molbap May 24, 2024
1fe9eb5
add type hinting expansion
molbap May 27, 2024
931f68c
fixup
molbap May 27, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
119 changes: 97 additions & 22 deletions src/transformers/models/align/processing_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,56 @@
Image/Text processor class for ALIGN
"""

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from typing import List, Union

# TODO fix forward referencing of costly modules to import as below
from ...image_utils import ImageInput
from ...processing_utils import (
CommonKwargs,
ImagesKwargs,
ProcessingKwargs,
ProcessorMixin,
TextKwargs,
add_expanded_type_hints,
)
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class AlignProcessorKwargs(ProcessingKwargs):
    """
    Per-modality default keyword arguments for the ALIGN processor.

    `text_kwargs` mirror the tokenizer call signature, `images_kwargs` mirror
    the image-processor call signature, and `common_kwargs` are intended to be
    forwarded to both modalities.

    NOTE(review): `__total__` is assigned directly as a class attribute rather
    than via `class AlignProcessorKwargs(ProcessingKwargs, total=False)`;
    whether `ProcessingKwargs` honors a plain `__total__` assignment depends on
    its implementation — TODO confirm.
    """

    __total__ = False

    # Tokenizer defaults. ALIGN pads/truncates text to a fixed 64-token
    # sequence by default ("padding": "max_length" with "max_length": 64).
    text_kwargs: TextKwargs = {
        "add_special_tokens": True,
        "padding": "max_length",
        "truncation": True,
        "max_length": 64,
        "stride": 0,
        "is_split_into_words": False,
        "pad_to_multiple_of": None,
        "return_token_type_ids": None,
        "return_attention_mask": None,
        "return_overflowing_tokens": False,
        "return_special_tokens_mask": False,
        "return_offsets_mapping": False,
        "return_length": False,
        "verbose": True,
    }

    # Image-processor defaults. `None` values defer to the image processor's
    # own configured defaults; only the channel layout is pinned here.
    images_kwargs: ImagesKwargs = {
        "do_resize": None,
        "size": None,
        "crop_size": None,
        "resample": None,
        "do_rescale": None,
        "rescale_factor": None,
        "do_normalize": None,
        "image_mean": None,
        "image_std": None,
        "do_center_crop": None,
        "data_format": "channels_first",
        "input_data_format": None,
    }

    # Kwargs shared by both modalities (e.g. the output tensor framework).
    common_kwargs: CommonKwargs = {"return_tensors": None}


class AlignProcessor(ProcessorMixin):
Expand All @@ -32,20 +80,37 @@ class AlignProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`BertTokenizer`, `BertTokenizerFast`]):
The tokenizer is a required input.

"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "EfficientNetImageProcessor"
tokenizer_class = ("BertTokenizer", "BertTokenizerFast")

def __init__(self, image_processor, tokenizer):
    """
    Build an ALIGN processor from a required image processor and tokenizer.

    Args:
        image_processor (`EfficientNetImageProcessor`):
            The image processor, a required input.
        tokenizer (`BertTokenizer` or `BertTokenizerFast`):
            The tokenizer, a required input.
    """
    # Instantiate the per-modality default kwargs once so that `__call__`
    # can merge caller-supplied kwargs on top of these defaults.
    self.base_kwargs = AlignProcessorKwargs()
    super().__init__(image_processor, tokenizer)

def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs):
@add_expanded_type_hints(
text_kwargs=TextKwargs,
images_kwargs=ImagesKwargs,
common_kwargs=CommonKwargs,
)
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
audio=None,
videos=None,
text_kwargs: TextKwargs = None,
images_kwargs: ImagesKwargs = None,
common_kwargs: CommonKwargs = None,
**kwargs: AlignProcessorKwargs,
) -> BatchEncoding:
"""
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` arguments to
EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
to the docstring of the above two methods for more information.

Expand All @@ -57,19 +122,12 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`,
`'max_length'`, `False` or `'do_not_pad'`]
max_length (`int`, *optional*, defaults to `max_length`):
Maximum padding value to use to pad the input text during tokenization.

return_tensors (`str` or [`~utils.TensorType`], *optional*):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

return_tensors is the only one I'm not sure we want to remove from the docstring, as it's something users have to specify if they want to be able to use the outputs for a model

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

that's a good point, in general anything that is common could be here

If set, will return tensors of a particular framework. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.

Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
Expand All @@ -81,15 +139,32 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")

raise ValueError("You must specify either text or images.")

text_kwargs = kwargs.pop("text_kwargs", {})
images_kwargs = kwargs.pop("images_kwargs", {})
common_kwargs = kwargs.pop("common_kwargs", {})

for key, value in kwargs.items():
if key in AlignProcessorKwargs.text_kwargs:
text_kwargs[key] = value
elif key in AlignProcessorKwargs.images_kwargs:
images_kwargs[key] = value
else:
# remaining kwargs go to the "common" key
common_kwargs[key] = value
common_kwargs = {**AlignProcessorKwargs.common_kwargs, **common_kwargs}
text_kwargs = {**AlignProcessorKwargs.text_kwargs, **text_kwargs, **common_kwargs}
images_kwargs = {**AlignProcessorKwargs.images_kwargs, **images_kwargs, **common_kwargs}
if text is not None:
encoding = self.tokenizer(
text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs
)
encoding = self.tokenizer(text, **text_kwargs)

if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
image_features = self.image_processor(images, **images_kwargs)

# BC for explicit return_tensors
if "return_tensors" in common_kwargs:
return_tensors = common_kwargs.pop("return_tensors", None)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
Expand Down
106 changes: 72 additions & 34 deletions src/transformers/models/altclip/processing_altclip.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,9 +17,11 @@
"""

import warnings
from typing import List, Union

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...image_utils import ChannelDimension, ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class AltCLIPProcessor(ProcessorMixin):
Expand All @@ -35,31 +37,71 @@ class AltCLIPProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`XLMRobertaTokenizerFast`], *optional*):
The tokenizer is a required input.
feature_extractor ([`CLIPFeatureExtractor`], *optional*):
The feature extractor is a deprecated input.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None):
    """
    Construct an AltCLIP processor from an image processor and a tokenizer.

    Args:
        image_processor (`CLIPImageProcessor`, *optional*):
            The image processor. Required unless the deprecated
            `feature_extractor` argument is supplied instead.
        tokenizer (`XLMRobertaTokenizer` or `XLMRobertaTokenizerFast`, *optional*):
            The tokenizer. Required.
        feature_extractor (`CLIPFeatureExtractor`, *optional*):
            Deprecated alias for `image_processor`; will be removed in v5.

    Raises:
        ValueError: If neither `image_processor` nor `feature_extractor` is
            given, or if `tokenizer` is not given.
    """
    # Bug fix: the previous check was `if "feature_extractor":`, which is a
    # non-empty string literal and therefore always truthy — the deprecation
    # warning fired on every instantiation, even when the argument was never
    # passed. Warn only when the deprecated argument is actually supplied.
    if feature_extractor is not None:
        warnings.warn(
            "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
            " instead.",
            FutureWarning,
        )

    # Fall back to the deprecated argument for backward compatibility.
    image_processor = image_processor if image_processor is not None else feature_extractor
    if image_processor is None:
        raise ValueError("You need to specify an `image_processor`.")
    if tokenizer is None:
        raise ValueError("You need to specify a `tokenizer`.")

    super().__init__(image_processor, tokenizer)

    # Per-modality default kwargs, merged with caller-supplied kwargs in
    # `__call__`. `None` values defer to the tokenizer's / image processor's
    # own configured defaults.
    self.processing_kwargs: ProcessingKwargs = {
        "common_kwargs": {"return_tensors": None},
        "text_kwargs": {
            "add_special_tokens": True,
            "padding": False,
            "truncation": None,
            "max_length": None,
            "stride": 0,
            "is_split_into_words": False,
            "pad_to_multiple_of": None,
            "return_token_type_ids": None,
            "return_attention_mask": None,
            "return_overflowing_tokens": False,
            "return_special_tokens_mask": False,
            "return_offsets_mapping": False,
            "return_length": False,
            "verbose": True,
        },
        "images_kwargs": {
            "do_resize": None,
            "size": None,
            "resample": None,
            "do_thumbnail": None,
            "do_align_long_axis": None,
            "do_pad": None,
            "do_rescale": None,
            "rescale_factor": None,
            "do_normalize": None,
            "image_mean": None,
            "image_std": None,
            "data_format": ChannelDimension.FIRST,
            "input_data_format": None,
        },
    }

def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs,
) -> BatchEncoding:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
Expand All @@ -68,22 +110,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
of the above two methods for more information.

Args:
text (`str`, `List[str]`, `List[List[str]]`):

images (`ImageInput`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.

return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

Expand All @@ -94,22 +134,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""

if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")

if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
text_kwargs = {**self.processing_kwargs["text_kwargs"], **self.processing_kwargs["common_kwargs"], **kwargs}
images_kwargs = {
**self.processing_kwargs["images_kwargs"],
**self.processing_kwargs["common_kwargs"],
**kwargs,
}

if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text:
encoding = self.tokenizer(text, **text_kwargs)
if images:
image_features = self.image_processor(images, **images_kwargs)
encoding.update(image_features)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
return encoding

def batch_decode(self, *args, **kwargs):
"""
Expand Down