Commit cfe527c: fix qwen2_5-omni (#3716)

1 parent 99df181

9 files changed: +100 -8 lines

docs/source/BestPractices/GRPO多模态训练.md

Lines changed: 1 addition & 1 deletion

@@ -36,7 +36,7 @@ register_dataset(
 ```json
 {
 'images': [{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\xe0\x00\x00\x01@\x08\x06\x00\x00\x00d\xc8\xafB`\x82 ...', 'path': 'CLEVR_trainA_000000.png'}],
-'messages': [{'role': 'user', 'content': 'How many items are there in the image? Output the thinking process in <think> </think> and\n final answer (number) in <answer> </answer> tags.'}, {'role': 'assistant', 'content': '<answer> 3 </answer>'}],
+'messages': [{'role': 'user', 'content': 'How many items are there in the image? Output the thinking process in <think> </think> and\n final answer (number) in <answer> </answer> tags.'}],
 'solution': '<answer> 3 </answer>'
 }
 ```

docs/source_en/BestPractices/GRPO-Multi-Modal-Training.md

Lines changed: 1 addition & 1 deletion

@@ -40,7 +40,7 @@ The purpose of redefining the dataset preprocessor here is to modify the query.
 ```json
 {
 'images': [{'bytes': b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x01\xe0\x00\x00\x01@\x08\x06\x00\x00\x00d\xc8\xafB`\x82 ...', 'path': 'CLEVR_trainA_000000.png'}],
-'messages': [{'role': 'user', 'content': 'How many items are there in the image? Output the thinking process in <think> </think> and\n final answer (number) in <answer> </answer> tags.'}, {'role': 'assistant', 'content': '<answer> 3 </answer>'}],
+'messages': [{'role': 'user', 'content': 'How many items are there in the image? Output the thinking process in <think> </think> and\n final answer (number) in <answer> </answer> tags.'}],
 'solution': '<answer> 3 </answer>'
 }
 ```
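The change is the same in both docs: for GRPO the ground-truth answer belongs in the `solution` field (which the accuracy reward typically reads), and `messages` must end on the user turn rather than a pre-filled assistant reply. A minimal stand-alone sketch of that transform (`to_grpo_row` is an illustrative helper, not the repo's actual preprocessor class):

```python
# Sketch only: per-row transform producing the corrected dataset format above.
def to_grpo_row(row: dict) -> dict:
    messages = row['messages']
    if messages and messages[-1]['role'] == 'assistant':
        # Strip the assistant turn from the prompt; keep its content as the
        # ground truth that the reward function compares completions against.
        return dict(row, messages=messages[:-1], solution=messages[-1]['content'])
    return row
```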
Lines changed: 9 additions & 0 deletions

@@ -0,0 +1,9 @@
+CUDA_VISIBLE_DEVICES=0 \
+VIDEO_MAX_PIXELS=50176 \
+FPS_MAX_FRAMES=12 \
+MAX_PIXELS=1003520 \
+swift infer \
+    --adapters output/vx-xxx/checkpoint-xxx \
+    --stream true \
+    --load_data_args true \
+    --max_new_tokens 2048
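The three environment variables cap the vision token budget rather than the raw media size. A rough back-of-the-envelope, assuming Qwen2-VL-style vision tokenization (one visual token per 28×28 pixels after 14×14 patching and 2×2 merging); exact counts also depend on temporal frame merging, so treat this as an estimate:

```python
# Illustrative budget math under the 28x28-pixels-per-token assumption.
PIXELS_PER_TOKEN = 28 * 28                   # 784

print(1003520 // PIXELS_PER_TOKEN)           # MAX_PIXELS -> 1280 tokens per image
print(50176 // PIXELS_PER_TOKEN)             # VIDEO_MAX_PIXELS -> 64 tokens per frame
print(12 * (50176 // PIXELS_PER_TOKEN))      # FPS_MAX_FRAMES=12 -> at most 768 frame tokens
```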

examples/train/multimodal/omni/sft.sh

Lines changed: 35 additions & 0 deletions

@@ -0,0 +1,35 @@
+# 4*25GB
+# A demo for four modalities that can be run directly
+nproc_per_node=4
+
+CUDA_VISIBLE_DEVICES=0,1,2,3 \
+NPROC_PER_NODE=$nproc_per_node \
+VIDEO_MAX_PIXELS=50176 \
+FPS_MAX_FRAMES=12 \
+MAX_PIXELS=1003520 \
+swift sft \
+    --model Qwen/Qwen2.5-Omni-7B \
+    --dataset 'AI-ModelScope/alpaca-gpt4-data-zh#500' \
+        'AI-ModelScope/LaTeX_OCR:human_handwrite#2000' \
+        'speech_asr/speech_asr_aishell1_trainsets:validation#2000' \
+        'swift/VideoChatGPT:all#2000' \
+    --train_type lora \
+    --torch_dtype bfloat16 \
+    --num_train_epochs 1 \
+    --per_device_train_batch_size 1 \
+    --per_device_eval_batch_size 1 \
+    --learning_rate 1e-4 \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --target_modules all-linear \
+    --freeze_vit true \
+    --gradient_accumulation_steps $(expr 16 / $nproc_per_node) \
+    --eval_steps 50 \
+    --save_steps 50 \
+    --save_total_limit 5 \
+    --logging_steps 5 \
+    --max_length 2048 \
+    --output_dir output \
+    --warmup_ratio 0.05 \
+    --dataloader_num_workers 4 \
+    --deepspeed zero2
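One detail worth noting: `--gradient_accumulation_steps $(expr 16 / $nproc_per_node)` pins the effective global batch size to per_device_batch × nproc × accumulation = 1 × 4 × 4 = 16, so changing the GPU count does not change the optimization schedule. A quick check of that arithmetic:

```python
# Mirrors $(expr 16 / $nproc_per_node) from the script above.
per_device_batch = 1
for nproc in (1, 2, 4, 8):
    grad_accum = 16 // nproc
    print(nproc, grad_accum, per_device_batch * nproc * grad_accum)  # global batch is always 16
```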

swift/llm/model/model/qwen.py

Lines changed: 1 addition & 0 deletions

@@ -644,6 +644,7 @@ def get_model_tokenizer_qwen2_5_omni(model_dir, *args, **kwargs):
         requires=['transformers>=4.50', 'soundfile', 'qwen_omni_utils', 'decord'],
         tags=['vision', 'video', 'audio'],
         additional_saved_files=['spk_dict.pt'],
+        ignore_patterns=[],
     ))
swift/llm/model/utils.py

Lines changed: 4 additions & 5 deletions

@@ -237,11 +237,10 @@ def safe_snapshot_download(model_id_or_path: str,
         logger.info(f'Loading the model using local model_dir: {model_dir}')
         return model_dir
     if ignore_patterns is None:
-        ignore_patterns = []
-    ignore_patterns += [
-        '*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx',
-        '*.ot', '*.h5'
-    ]
+        ignore_patterns = [
+            '*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*', 'onnx/*', '*.safetensors.md', '*.msgpack', '*.onnx',
+            '*.ot', '*.h5'
+        ]
     if not download_model:
         ignore_patterns += ['*.bin', '*.safetensors']
     hub = get_hub(use_hf)
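This appears to be the core of the fix: previously the default patterns were appended even when a caller passed an explicit `ignore_patterns`, so `spk_dict.pt` (the Omni speaker dictionary listed in `additional_saved_files` above) was always filtered by `'*.pt'`. Now an explicit list, such as the `ignore_patterns=[]` added in qwen.py, is used verbatim and defaults apply only when nothing is passed. A small stand-alone check of the pattern behavior, illustrated with fnmatch-style globs:

```python
from fnmatch import fnmatch

default_patterns = ['*.zip', '*.gguf', '*.pth', '*.pt', 'consolidated*']

# Old behavior: defaults were appended unconditionally, so spk_dict.pt was skipped.
print(any(fnmatch('spk_dict.pt', p) for p in default_patterns))  # True -> file not downloaded

# New behavior: an explicit ignore_patterns=[] leaves nothing to match.
print(any(fnmatch('spk_dict.pt', p) for p in []))                # False -> file downloaded
```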

swift/llm/template/template/qwen.py

Lines changed: 47 additions & 0 deletions

@@ -341,6 +341,7 @@ class Qwen2_5VLTemplate(Qwen2VLTemplate):


 class Qwen2_5OmniTemplate(Template):
+    placeholder_tokens = ['<|IMAGE|>', '<|AUDIO|>', '<|VIDEO|>']

     def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
                     inputs: StdTemplateInputs) -> List[Context]:
@@ -376,15 +377,61 @@ def _encode(self, inputs: StdTemplateInputs) -> Dict[str, Any]:
         media_inputs.pop('input_ids')
         media_inputs.pop('attention_mask')
         media_inputs = to_float_dtype(media_inputs, self.model_info.torch_dtype)
+        input_ids = encoded['input_ids']
+        labels = encoded['labels']
+        for media_type in ['image', 'video']:
+            token = f'<|{media_type.upper()}|>'
+            token_id = self._tokenize(token)
+            idx_list = findall(input_ids, token_id)
+            if idx_list:
+                merge_length = self.processor.omni_processor.merge_size**2
+                media_grid_thw = media_inputs.get(f'{media_type}_grid_thw')
+
+                def _get_new_tokens(i):
+                    token_len = (media_grid_thw[i].prod() // merge_length)
+                    return [token_id] * token_len
+
+                _, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
+        # audio
+        feature_attention_mask = media_inputs.get('feature_attention_mask')
+        if feature_attention_mask is not None:
+            audio_feature_lengths = torch.sum(feature_attention_mask, dim=1).tolist()
+            token_id = self._tokenize('<|AUDIO|>')
+            idx_list = findall(input_ids, token_id)
+
+            def _get_new_tokens(i):
+                place_num = ((audio_feature_lengths[i] - 1) // 2 + 1 - 2) // 2 + 1
+                return [token_id] * place_num
+
+            _, labels = self._extend_tokens(input_ids, labels, idx_list, _get_new_tokens)
+
+        encoded['labels'] = labels
         encoded.update(media_inputs)
         return encoded

     def _post_encode(self, model, inputs: Dict[str, Any]) -> Dict[str, Any]:
         if self.is_training:
             feature_attention_mask = inputs.get('feature_attention_mask')
             if feature_attention_mask is not None:
+                audio_feature_lengths = torch.sum(feature_attention_mask, dim=1)
                 inputs['input_features'] = inputs['input_features'].permute(0, 2, 1)[feature_attention_mask.bool()]
                 inputs['input_features'] = inputs['input_features'].permute(1, 0)
+            else:
+                audio_feature_lengths = None
+            use_audio_in_video = get_env_args('use_audio_in_video', bool, False)
+            video_second_per_grid = inputs.pop('video_second_per_grid', None)
+            position_ids, _, input_ids, attention_mask = model.thinker.get_rope_index(
+                inputs.get('input_ids'),
+                inputs.get('image_grid_thw'),
+                inputs.get('video_grid_thw'),
+                inputs.get('attention_mask'),
+                use_audio_in_video,
+                audio_feature_lengths,
+                video_second_per_grid,
+            )
+            inputs['input_ids'] = input_ids
+            inputs['attention_mask'] = attention_mask
+            inputs['position_ids'] = position_ids
         return inputs

     def _data_collator(self, batch: List[Dict[str, Any]], *, padding_to: Optional[int] = None) -> Dict[str, Any]:
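The placeholder expansion above follows two formulas: image/video placeholders expand to `grid_thw.prod() // merge_size**2` tokens, and the audio count `((L - 1) // 2 + 1 - 2) // 2 + 1` appears to fold two stride-2 downsampling stages in the audio encoder (roughly one token per 4 mel frames). A quick sanity check of the audio formula, as a sketch:

```python
# Sketch: how many <|AUDIO|> placeholders a clip expands to, matching
# _get_new_tokens above for L mel frames.
def audio_place_num(num_frames: int) -> int:
    after_conv = (num_frames - 1) // 2 + 1   # first stride-2 stage
    return (after_conv - 2) // 2 + 1         # second stage (kernel 2, stride 2)

for n in (100, 400, 3000):
    print(n, audio_place_num(n))  # 100 -> 25, 400 -> 100, 3000 -> 750
```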

tests/test_align/test_template/test_audio.py

Lines changed: 2 additions & 1 deletion

@@ -13,7 +13,8 @@ def _infer_model(pt_engine, system=None, messages=None, audios=None):
         messages += [{'role': 'user', 'content': '你好'}]
         resp = pt_engine.infer([{'messages': messages}], request_config=request_config)
         response = resp[0].choices[0].message.content
-        messages += [{'role': 'assistant', 'content': response}, {'role': 'user', 'content': '<audio>这段语音说了什么'}]
+        messages += [{'role': 'assistant', 'content': response}]
+        messages += [{'role': 'user', 'content': '<audio>这段语音说了什么'}]
     else:
         messages = messages.copy()
     if audios is None:
