@@ -14,6 +14,7 @@
# limitations under the License.
"""Testing suite for the PyTorch chameleon model."""

+import copy
import unittest

import requests
@@ -30,7 +31,7 @@

from ...generation.test_utils import GenerationTesterMixin
from ...test_configuration_common import ConfigTester
-from ...test_modeling_common import ModelTesterMixin, ids_tensor
+from ...test_modeling_common import ModelTesterMixin, floats_tensor, ids_tensor
from ...test_pipeline_mixin import PipelineTesterMixin


@@ -52,12 +53,12 @@ def __init__(
        self,
        parent,
        batch_size=13,
-        seq_length=7,
+        seq_length=35,
        is_training=False,
        use_input_mask=True,
        use_labels=True,
        vocab_size=99,
-        image_token_id=98,
+        image_token_id=4,
        hidden_size=32,
        num_hidden_layers=2,
        num_attention_heads=2,
@@ -73,9 +74,9 @@ def __init__(
        num_labels=3,
        num_choices=4,
        pad_token_id=0,
-        vq_num_embeds=12,
-        vq_embed_dim=12,
-        vq_channel_multiplier=[1, 2],
+        vq_num_embeds=5,
+        vq_embed_dim=5,
+        vq_channel_multiplier=[1, 4],
        vq_img_token_start_id=10,  # has to be less than vocab size when added with vq_num_embeds
        scope=None,
    ):
@@ -138,7 +139,9 @@ def get_config(self):
        start = self.vq_img_token_start_id
        end = self.vq_img_token_start_id + self.vq_num_embeds
        for i in range(start, end):
-            vocab_map[i] = f"IMGIMGBS{i}"  # dummy str for each token, anything starting with IMGIMG
+            image_token_infix = "".join(chr(ord("A") + int(c)) for c in str(i))
+            # dummy str for each image token, anything starting with IMGIMG
+            vocab_map[i] = f"IMGIMG{image_token_infix}Z"

        return ChameleonConfig(
            vocab_size=self.vocab_size,
@@ -275,7 +278,6 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
        {
            "feature-extraction": ChameleonModel,
            "text-generation": ChameleonForConditionalGeneration,
-            "image-text-to-text": ChameleonForConditionalGeneration,
        }
        if is_torch_available()
        else {}
@@ -330,6 +332,149 @@ def test_model_rope_scaling(self, scaling_type):
    def test_batching_equivalence(self):
        pass

+    @unittest.skip("Chameleon VQ model cannot be squished more due to hardcoded layer params in model code")
+    def test_model_is_small(self):
+        pass
+
+
+class ChameleonVision2SeqModelTester(ChameleonModelTester):
+    def __init__(self, parent, image_size=10, **kwargs):
+        super().__init__(parent, **kwargs)
+        self.image_size = image_size
+        self.image_seq_length = 25
+
+    def prepare_config_and_inputs(self):
+        input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+        input_ids[input_ids == self.image_token_id] = self.pad_token_id
+        input_ids[:, : self.image_seq_length] = self.image_token_id
+        attention_mask = torch.tril(torch.ones_like(input_ids).to(torch_device))
+        pixel_values = floats_tensor([self.batch_size, 3, self.image_size, self.image_size])
+
+        config = self.get_config()
+
+        return config, input_ids, attention_mask, pixel_values
+
+    def prepare_config_and_inputs_for_common(self):
+        config_and_inputs = self.prepare_config_and_inputs()
+        config, input_ids, attention_mask, pixel_values = config_and_inputs
+        inputs_dict = {"input_ids": input_ids, "attention_mask": attention_mask, "pixel_values": pixel_values}
+        return config, inputs_dict
+
+
+@require_torch
+class ChameleonVision2SeqModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
+    all_model_classes = (ChameleonModel, ChameleonForConditionalGeneration) if is_torch_available() else ()
+    pipeline_model_mapping = (
+        {
+            "image-text-to-text": ChameleonForConditionalGeneration,
+        }
+        if is_torch_available()
+        else {}
+    )
+    test_headmasking = False
+    test_pruning = False
+    fx_compatible = False
+
+    def setUp(self):
+        self.model_tester = ChameleonVision2SeqModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=ChameleonConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    @unittest.skip("Chameleon forces some token ids to be -inf!")
+    def test_batching_equivalence(self):
+        pass
+
+    @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward")
+    def test_cpu_offload(self):
+        pass
+
+    @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward")
+    def test_disk_offload_bin(self):
+        pass
+
+    @unittest.skip("Chameleon cannot do offload because it uses `self.linear.weight` in forward")
+    def test_disk_offload_safetensors(self):
+        pass
+
+    @unittest.skip("Chameleon VQ model cannot be squished more due to hardcoded layer params in model code")
+    def test_model_is_small(self):
+        pass
+
+    def test_mismatching_num_image_tokens(self):
+        """
+        Tests that VLMs throw an error with an explicit message saying what is wrong
+        when the number of images doesn't match the number of image tokens in the text.
+        Also we need to test multi-image cases when one prompt has multiple image tokens.
+        """
+        config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
+        for model_class in self.all_model_classes:
+            model = model_class(config).to(torch_device)
+            curr_input_dict = copy.deepcopy(input_dict)  # the below tests modify dict in-place
+            _ = model(**curr_input_dict)  # successful forward with no modifications
+
+            # remove one image but leave the image token in text
+            curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
+            with self.assertRaises(ValueError):
+                _ = model(**curr_input_dict)
+
+            # simulate multi-image case by concatenating inputs where each has exactly one image/image-token
+            input_ids = curr_input_dict["input_ids"][:1]
+            pixel_values = curr_input_dict["pixel_values"][:1]
+            input_ids = torch.cat([input_ids, input_ids], dim=0)
+
+            # one image and two image tokens raise an error
+            with self.assertRaises(ValueError):
+                _ = model(input_ids=input_ids, pixel_values=pixel_values)
+
+            # two images and two image tokens don't raise an error
+            pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
+            _ = model(input_ids=input_ids, pixel_values=pixel_values)
+
+    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
+    def test_inputs_embeds(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+
+            input_ids = inputs["input_ids"]
+            del inputs["input_ids"]
+            del inputs["pixel_values"]
+
+            wte = model.get_input_embeddings()
+            inputs["inputs_embeds"] = wte(input_ids)
+
+            with torch.no_grad():
+                model(**inputs)
+
+    # overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
+    # while some other models require pixel_values to be present
+    def test_inputs_embeds_matches_input_ids(self):
+        config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
+
+        for model_class in self.all_model_classes:
+            model = model_class(config)
+            model.to(torch_device)
+            model.eval()
+
+            inputs = self._prepare_for_class(inputs_dict, model_class)
+            input_ids = inputs["input_ids"]
+            del inputs["input_ids"]
+            del inputs["pixel_values"]
+
+            inputs_embeds = model.get_input_embeddings()(input_ids)
+
+            with torch.no_grad():
+                out_ids = model(input_ids=input_ids, **inputs)[0]
+                out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
+            torch.testing.assert_close(out_embeds, out_ids)
+

@require_torch
class ChameleonIntegrationTest(unittest.TestCase):