Image + text + audio uniform processors #30511

Open · wants to merge 54 commits into base: main

Changes from 51 commits

Commits (54)
42ecf48
expand kwargs from align
molbap Jan 25, 2024
ccb2147
remove kwargs from altclip processor
molbap Jan 25, 2024
f999e0c
add explicit args for donut processor
molbap Jan 25, 2024
8fb3a6b
add explicit call to current processor for in context manager
molbap Jan 25, 2024
a90c766
format
molbap Jan 25, 2024
49cb6cc
remove unused kwargs
molbap Jan 25, 2024
3ac1c7e
move conditions for encodings
molbap Jan 25, 2024
7a819fd
improve flow over text/image
molbap Jan 25, 2024
9cc38b7
[breaking] pass explicit args to bridgetower
molbap Jan 25, 2024
ff6a950
Merge branch 'main' into improve_multimodal_processors
molbap Jan 26, 2024
7db64a0
add default kwargs for BC
molbap Jan 26, 2024
41674d9
fix bridgetower
molbap Jan 26, 2024
618a687
debug bridgetower image proc
molbap Jan 26, 2024
f39cdc1
format
molbap Jan 26, 2024
9a6f97d
move kwargs message to info level
molbap Jan 26, 2024
380f82f
add debug messages
molbap Jan 26, 2024
75f15d3
fix arguments not being passed in bridgetower
molbap Jan 26, 2024
3df5faa
keep backwards compat for processing + modify testing args dict
molbap Feb 1, 2024
5ad0694
Merge branch 'main' into improve_multimodal_processors
molbap Feb 1, 2024
69e5a2d
fix quality
molbap Feb 1, 2024
68c2f40
log kwargs mismatch to info level
molbap Feb 1, 2024
e1e4084
fix quality
molbap Feb 1, 2024
bfa81e5
Merge branch 'main' into improve_multimodal_processors
molbap Feb 15, 2024
4b557b0
address comments
molbap Feb 15, 2024
b7fc377
fix typo
molbap Feb 15, 2024
270bb9e
fix expected tests for bridgetower
molbap Feb 16, 2024
94a1b75
fix conflicts
molbap Feb 26, 2024
6603bf0
Merge branch 'main' into improve_multimodal_processors
molbap Feb 26, 2024
004c961
fix valid processor keys
molbap Feb 26, 2024
c2e49f5
remove unused arg list
molbap Feb 26, 2024
79958b5
quality
molbap Feb 26, 2024
a36f524
Merge branch 'main' into improve_multimodal_processors
molbap Apr 17, 2024
3238dd3
skeleton draft - uniform processor call
molbap Apr 18, 2024
3afde22
fix quality
molbap Apr 18, 2024
eb99e29
add broken wav2vec audio processing
molbap Apr 25, 2024
c6afd63
Merge branch 'main' into improve_multimodal_processors
molbap Apr 25, 2024
3b824b5
uniformize processor calls
molbap Apr 26, 2024
d8c2a6e
fix default
molbap Apr 28, 2024
78433a1
protect image types
molbap Apr 28, 2024
0cd3d66
Merge branch 'main' into image_text_audio_processors
molbap May 2, 2024
d9c51c1
Merge branch 'main' into image_text_audio_processors
molbap May 7, 2024
43ba3bd
improve audio kwargs typing
molbap May 7, 2024
dcfa8db
create audio input type
molbap May 7, 2024
7048fee
fix docstrings for common kwargs
molbap May 7, 2024
25ba7ba
add deprecation warning for dual kwargs
molbap May 7, 2024
36ba3cc
fix imports
molbap May 7, 2024
05fea5d
fix quality
molbap May 7, 2024
e05b14b
fix quality + copies
molbap May 7, 2024
46a1fe8
fix quality on setup :eyes:
molbap May 7, 2024
13a1909
protect import
molbap May 7, 2024
7bf3615
fix quality
molbap May 7, 2024
a285284
fix conflicts
molbap May 24, 2024
1fe9eb5
add type hinting expansion
molbap May 27, 2024
931f68c
fixup
molbap May 27, 2024
19 changes: 11 additions & 8 deletions setup.py
@@ -260,7 +260,15 @@ def run(self):
extras["sklearn"] = deps_list("scikit-learn")

extras["tf"] = deps_list("tensorflow", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp")
extras["tf-cpu"] = deps_list("keras", "tensorflow-cpu", "onnxconverter-common", "tf2onnx", "tensorflow-text", "keras-nlp", "tensorflow-probability")
extras["tf-cpu"] = deps_list(
"keras",
"tensorflow-cpu",
"onnxconverter-common",
"tf2onnx",
"tensorflow-text",
"keras-nlp",
"tensorflow-probability",
)

extras["torch"] = deps_list("torch", "accelerate")
extras["accelerate"] = deps_list("accelerate")
@@ -380,12 +388,7 @@ def run(self):
+ extras["tf-speech"]
)
extras["dev"] = (
extras["all"]
+ extras["testing"]
+ extras["quality"]
+ extras["ja"]
+ extras["sklearn"]
+ extras["modelcreation"]
extras["all"] + extras["testing"] + extras["quality"] + extras["ja"] + extras["sklearn"] + extras["modelcreation"]
)

extras["torchhub"] = deps_list(
@@ -470,4 +473,4 @@ def run(self):
extras["tests_examples_tf"] = deps_list()
extras["tests_custom_tokenizers"] = deps_list()
extras["tests_exotic_models"] = deps_list()
extras["consistency"] = deps_list()
101 changes: 79 additions & 22 deletions src/transformers/models/align/processing_align.py
@@ -17,8 +17,11 @@
"""


from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from typing import List, Union

from ...image_utils import ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class AlignProcessor(ProcessorMixin):
@@ -33,6 +36,7 @@ class AlignProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`BertTokenizer`, `BertTokenizerFast`]):
The tokenizer is a required input.
"""

attributes = ["image_processor", "tokenizer"]
@@ -41,12 +45,59 @@ class AlignProcessor(ProcessorMixin):

def __init__(self, image_processor, tokenizer):
super().__init__(image_processor, tokenizer)

def __call__(self, text=None, images=None, padding="max_length", max_length=64, return_tensors=None, **kwargs):
self.processing_kwargs: ProcessingKwargs = {
"common_kwargs": {"return_tensors": None},
"text_kwargs": {
"text_pair": None,
"text_target": None,
"text_pair_target": None,
"add_special_tokens": True,
"padding": "max_length",
"truncation": True,
"max_length": 64,
"stride": 0,
"is_split_into_words": False,
"pad_to_multiple_of": None,
"return_token_type_ids": None,
"return_attention_mask": None,
"return_overflowing_tokens": False,
"return_special_tokens_mask": False,
"return_offsets_mapping": False,
"return_length": False,
"verbose": True,
},
"images_kwargs": {
"do_crop_margin": None,
"do_resize": None,
"size": None,
"resample": None,
"do_thumbnail": None,
"do_align_long_axis": None,
"do_pad": None,
"do_rescale": None,
"rescale_factor": None,
"do_normalize": None,
"image_mean": None,
"image_std": None,
"data_format": "channels_first",
"input_data_format": None,
},
"audio_kwargs": {},
"videos_kwargs": {},
}
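
The dictionaries above are only defaults: the `__call__` below merges them with caller kwargs through dict unpacking, where the rightmost source wins. A minimal sketch of that precedence (illustrative values, not the full set of defaults):

# Dict unpacking: later sources override earlier ones, so caller kwargs
# take precedence over common defaults, which take precedence over the
# per-modality defaults.
text_defaults = {"padding": "max_length", "max_length": 64}
common_defaults = {"return_tensors": None}
caller_kwargs = {"max_length": 128, "return_tensors": "pt"}

merged = {**text_defaults, **common_defaults, **caller_kwargs}
print(merged)
# {'padding': 'max_length', 'max_length': 128, 'return_tensors': 'pt'}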
Collaborator:
The only thing I'd say about the current design is the default behaviour of a processor isn't configurable. As users who use multimodal models just want to use the processor, and processors might share the default processing classes they bundle together e.g. CLIP, we might want to be able to override this in the init, such that I can save and load processors with custom behaviour. For example:

from transformers import AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
# I have to pass in max_length in for every call
output = processor(images=images, text=text, max_length=128)

processor = AlignProcessor.from_pretrained("kakaobrain/align-base", max_length=128)
processor.save_pretrained("my_new_model")

processor = AlignProcessor.from_pretrained("my_new_model")
# The processor automatically defaults to the max value I want when calling
output = processor(images=images, text=text)

The correct way might be to say that the behaviour of, e.g., the tokenizer should be configured instead. At the moment, I think if I change the tokenizer and build the processor, it will use the processor's max_length default rather than the tokenizer's.

from transformers import AlignProcessor, AutoTokenizer, AutoImageProcessor

tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base", max_length=128)
image_processor = AutoImageProcessor.from_pretrained("kakaobrain/align-base")
processor = AlignProcessor(tokenizer, image_processor)

# Is max_length 128 or 64 used here? 
outputs = processor(text=text, images=images)

Contributor Author:
That's a good point - I tried the second example with model_max_length and it seems to work fine:
[screenshot omitted]
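
A plausible reconstruction of the check in the lost screenshot (hypothetical snippet, not the code that was actually shown):

from PIL import Image
from transformers import AlignProcessor, AutoImageProcessor, AutoTokenizer

# Configure the bundled tokenizer rather than relying on the processor defaults.
tokenizer = AutoTokenizer.from_pretrained("kakaobrain/align-base", model_max_length=128)
image_processor = AutoImageProcessor.from_pretrained("kakaobrain/align-base")
processor = AlignProcessor(image_processor=image_processor, tokenizer=tokenizer)

image = Image.new("RGB", (224, 224))
outputs = processor(text="a photo of a cat", images=image, return_tensors="pt")
# Per the author's report, the tokenizer-level setting is respected here.
print(outputs.input_ids.shape)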


def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
audio=None,
videos=None,
**kwargs,
Collaborator:
My main question is how are type hints and signature scan done for this? Is there a way to say that the kwargs include text kwargs and image kwargs + benefit from having the doc as well?
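
One possible direction, sketched here with TypedDict plus Unpack from PEP 692 (available through typing_extensions on older Pythons). This is an assumption about how the typing could work, not what the PR implements:

from typing import Optional, Union

from typing_extensions import TypedDict, Unpack

class TextKwargs(TypedDict, total=False):
    padding: Union[bool, str]
    truncation: Union[bool, str]
    max_length: Optional[int]

class ImagesKwargs(TypedDict, total=False):
    do_resize: Optional[bool]
    do_normalize: Optional[bool]

class AlignProcessorKwargs(TextKwargs, ImagesKwargs, total=False):
    return_tensors: Optional[str]

class AlignProcessor:
    # Type checkers and IDEs can now surface the merged kwargs by name.
    def __call__(self, text=None, images=None, **kwargs: Unpack[AlignProcessorKwargs]):
        ...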

) -> BatchEncoding:
"""
Main method to prepare text(s) and image(s) to be fed as input to the model. This method forwards the `text`
and `kwargs` arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
arguments to BertTokenizerFast's [`~BertTokenizerFast.__call__`] if `text` is not `None` to encode
the text. To prepare the image(s), this method forwards the `images` arguments to
EfficientNetImageProcessor's [`~EfficientNetImageProcessor.__call__`] if `images` is not `None`. Please refer
to the docstring of the above two methods for more information.
@@ -58,19 +109,12 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `max_length`):
Activates and controls padding for tokenization of input text. Choose between [`True` or `'longest'`,
`'max_length'`, `False` or `'do_not_pad'`]
max_length (`int`, *optional*, defaults to `max_length`):
Maximum padding value to use to pad the input text during tokenization.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
Collaborator:
return_tensors is the only one I'm not sure we want to remove from the docstring, as it's something users have to specify if they want to be able to use the outputs for a model

Contributor Author:
that's a good point, in general anything that is common could be here

If set, will return tensors of a particular framework. Acceptable values are:
- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
@@ -82,15 +126,28 @@ def __call__(self, text=None, images=None, padding="max_length", max_length=64,
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")
raise ValueError("You must specify either text or images.")

if text is not None:
encoding = self.tokenizer(
text, padding=padding, max_length=max_length, return_tensors=return_tensors, **kwargs
)
text_kwargs = {
Collaborator:
🤩

**self.processing_kwargs["text_kwargs"],
Collaborator:
So this is basically creating a dictionary with the defaults, which is then updated with the kwargs.

  1. It's very nice
  2. It's a tad bit not explicit enough for me (for newbies, it would be hard to guess that this is what's happening IMO)
    Thus I am wondering if there is a way to leverage the ProcessingKwargs class rather than use the class's keys.
    What I mean is something along the lines of self.processing_kwargs.update_with_kwargs("text_kwargs", **kwargs), which could be more understandable? (I don't know)
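
A sketch of what such a helper could look like (hypothetical name and behavior, following the suggestion above):

class ProcessingKwargs(dict):
    def merged(self, modality: str, **kwargs) -> dict:
        # Make the defaults/common/caller merge explicit in one place.
        return {**self[modality], **self["common_kwargs"], **kwargs}

# __call__ could then read:
#   text_kwargs = self.processing_kwargs.merged("text_kwargs", **kwargs)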

**self.processing_kwargs["common_kwargs"],
**kwargs,
}
encoding = self.tokenizer(text, **text_kwargs)

if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
images_kwargs = {
**self.processing_kwargs["images_kwargs"],
**self.processing_kwargs["common_kwargs"],
**kwargs,
}
image_features = self.image_processor(images, **images_kwargs)

# BC for explicit return_tensors
common_kwargs = {**self.processing_kwargs["common_kwargs"], **kwargs}
Collaborator:
Why do we prepare common_kwargs here and not use them? Is it for consistency in the processor patterns?

Contributor Author:
will correct this part in align :)

if "return_tensors" in common_kwargs:
return_tensors = common_kwargs.pop("return_tensors", None)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
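
A minimal usage sketch of the uniform call, assuming the checkpoint from the review thread above; the backwards-compatibility branch keeps explicit return_tensors working:

from PIL import Image
from transformers import AlignProcessor

processor = AlignProcessor.from_pretrained("kakaobrain/align-base")
image = Image.new("RGB", (224, 224))

# Old-style call sites keep working: return_tensors is routed through the
# common kwargs and popped by the BC branch above.
batch = processor(text="a photo of a cat", images=image, return_tensors="pt")
print(batch.input_ids.shape, batch.pixel_values.shape)

# Text-only calls pad to the default max_length=64 unless overridden at call time.
tokens = processor(text="a photo of a cat", max_length=32, return_tensors="pt")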
106 changes: 72 additions & 34 deletions src/transformers/models/altclip/processing_altclip.py
@@ -16,9 +16,11 @@
Image/Text processor class for AltCLIP
"""
import warnings
from typing import List, Union

from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import BatchEncoding
from ...image_utils import ChannelDimension, ImageInput
from ...processing_utils import ProcessingKwargs, ProcessorMixin
from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput


class AltCLIPProcessor(ProcessorMixin):
@@ -34,31 +36,71 @@ class AltCLIPProcessor(ProcessorMixin):
The image processor is a required input.
tokenizer ([`XLMRobertaTokenizerFast`], *optional*):
The tokenizer is a required input.
feature_extractor ([`CLIPFeatureExtractor`], *optional*):
The feature extractor is a deprecated input.
"""

attributes = ["image_processor", "tokenizer"]
image_processor_class = "CLIPImageProcessor"
tokenizer_class = ("XLMRobertaTokenizer", "XLMRobertaTokenizerFast")

def __init__(self, image_processor=None, tokenizer=None, **kwargs):
feature_extractor = None
if "feature_extractor" in kwargs:
def __init__(self, image_processor=None, tokenizer=None, feature_extractor=None):
if feature_extractor is not None:
warnings.warn(
"The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
" instead.",
FutureWarning,
)
feature_extractor = kwargs.pop("feature_extractor")

image_processor = image_processor if image_processor is not None else feature_extractor
if image_processor is None:
raise ValueError("You need to specify an `image_processor`.")
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")

super().__init__(image_processor, tokenizer)

def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
self.processing_kwargs: ProcessingKwargs = {
"common_kwargs": {"return_tensors": None},
"text_kwargs": {
"add_special_tokens": True,
"padding": False,
"truncation": None,
"max_length": None,
"stride": 0,
"is_split_into_words": False,
"pad_to_multiple_of": None,
"return_token_type_ids": None,
"return_attention_mask": None,
"return_overflowing_tokens": False,
"return_special_tokens_mask": False,
"return_offsets_mapping": False,
"return_length": False,
"verbose": True,
},
"images_kwargs": {
"do_resize": None,
"size": None,
"resample": None,
"do_thumbnail": None,
"do_align_long_axis": None,
"do_pad": None,
"do_rescale": None,
"rescale_factor": None,
"do_normalize": None,
"image_mean": None,
"image_std": None,
"data_format": ChannelDimension.FIRST,
"input_data_format": None,
},
}
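
For illustration, the deprecated feature_extractor path in the __init__ above still works but warns; a sketch assuming the BAAI/AltCLIP checkpoint:

from transformers import AltCLIPProcessor, AutoImageProcessor, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("BAAI/AltCLIP")
image_processor = AutoImageProcessor.from_pretrained("BAAI/AltCLIP")

# Legacy keyword: emits a FutureWarning, then falls back to using the
# feature extractor as the image processor.
processor = AltCLIPProcessor(tokenizer=tokenizer, feature_extractor=image_processor)

# Preferred going forward:
processor = AltCLIPProcessor(image_processor=image_processor, tokenizer=tokenizer)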

def __call__(
self,
images: ImageInput = None,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
audio=None,
videos=None,
**kwargs,
) -> BatchEncoding:
"""
Main method to prepare for the model one or several sequence(s) and image(s). This method forwards the `text`
and `kwargs` arguments to XLMRobertaTokenizerFast's [`~XLMRobertaTokenizerFast.__call__`] if `text` is not
@@ -67,22 +109,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
of the above two methods for more information.

Args:
text (`str`, `List[str]`, `List[List[str]]`):

images (`ImageInput`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`TextInput`, `PreTokenizedInput`, `List[TextInput]`, `List[PreTokenizedInput]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.

return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:

- `'tf'`: Return TensorFlow `tf.constant` objects.
- `'pt'`: Return PyTorch `torch.Tensor` objects.
- `'np'`: Return NumPy `np.ndarray` objects.
- `'jax'`: Return JAX `jnp.ndarray` objects.
Returns:
[`BatchEncoding`]: A [`BatchEncoding`] with the following fields:

@@ -93,22 +133,20 @@ def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""

if text is None and images is None:
raise ValueError("You have to specify either text or images. Both cannot be none.")

if text is not None:
encoding = self.tokenizer(text, return_tensors=return_tensors, **kwargs)
text_kwargs = {**self.processing_kwargs["text_kwargs"], **self.processing_kwargs["common_kwargs"], **kwargs}
images_kwargs = {
**self.processing_kwargs["images_kwargs"],
**self.processing_kwargs["common_kwargs"],
**kwargs,
}

if images is not None:
image_features = self.image_processor(images, return_tensors=return_tensors, **kwargs)
if text:
encoding = self.tokenizer(text, **text_kwargs)
if images:
image_features = self.image_processor(images, **images_kwargs)
encoding.update(image_features)

if text is not None and images is not None:
encoding["pixel_values"] = image_features.pixel_values
return encoding
elif text is not None:
return encoding
else:
return BatchEncoding(data=dict(**image_features), tensor_type=return_tensors)
return encoding
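
One subtlety in the branch above: `if text:` and `if images:` test truthiness rather than `is not None`, so empty inputs are now skipped instead of being passed through. Illustrated:

# An empty list is falsy: `if text:` skips it, while the previous
# `if text is not None:` check would have passed it to the tokenizer.
text = []
if text:
    print("tokenized")           # not reached for []
if text is not None:
    print("would be tokenized")  # reached for []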

def batch_decode(self, *args, **kwargs):
"""