
Commit

Merge pull request #29 from tanganke/lora-clip
Update documentation and add support for LoRA CLIP vision models.
tanganke authored Nov 14, 2024
2 parents b894b2b + da5fec4 commit 69ea2da
Showing 3 changed files with 45 additions and 18 deletions.
@@ -0,0 +1,14 @@
+_target_: fusion_bench.modelpool.CLIPVisionModelPool
+
+models:
+  sun397:
+    _target_: fusion_bench.models.linearized.vision_model.load_lora_vision_model_hf
+    base_model_name: openai/clip-vit-base-patch16
+    peft_name: tanganke/clip-vit-base-patch16_sun397_lora-16
+
+processor:
+  _target_: transformers.CLIPProcessor.from_pretrained
+  pretrained_model_name_or_path: openai/clip-vit-base-patch16
+
+train_datasets: null
+test_datasets: null
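The `_target_` keys above are resolved hydra-style into importable callables when the pool is instantiated. As a rough sketch (not the pool's actual instantiation code), the two entries amount to the following plain-Python calls, assuming the Hub checkpoints referenced above:

```python
from transformers import CLIPProcessor

from fusion_bench.models.linearized.vision_model import load_lora_vision_model_hf

# equivalent of the `models.sun397` entry: CLIP vision transformer + LoRA adapter
sun397_model = load_lora_vision_model_hf(
    base_model_name="openai/clip-vit-base-patch16",
    peft_name="tanganke/clip-vit-base-patch16_sun397_lora-16",
)

# equivalent of the `processor` entry
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch16")
```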
23 changes: 7 additions & 16 deletions docs/modelpool/clip_vit.md
@@ -154,26 +154,14 @@ You can find the script for fine-tuning the models at `examples/clip_finetune/cl
- [CLIP-ViT-B/16 on the eight image classification tasks (LoRA)](https://huggingface.co/collections/tanganke/clip-vit-b-16-on-the-eight-image-classification-tasks-lora-66cd554ee7829e9dbb236c29)
- [CLIP-ViT-B/16 on eight image classification tasks (L-LoRA)](https://huggingface.co/collections/tanganke/clip-vit-b-16-on-eight-image-classification-tasks-l-lora-66cd5b0e332ce5c7468d1bc6)

-Load LoRA models:
+Load LoRA models (see [load_lora_vision_model_hf][fusion_bench.models.linearized.vision_model.load_lora_vision_model_hf]):

 ```python
-base_model = CLIPVisionModel.from_pretrained('openai/clip-vit-base-patch16')
+base_model = CLIPVisionModel.from_pretrained('openai/clip-vit-base-patch16').vision_model
 model = PeftModel.from_pretrained(base_model, peft_model_id)
 ```

-Load L-LoRA models:
-
-```python
-from transformers import CLIPVisionModel
-from peft import PeftModel
-from fusion_bench.method.classification.clip_finetune import linearize_lora_model_
-
-base_model = CLIPVisionModel.from_pretrained("openai/clip-vit-base-patch16")
-peft_model_id = "openai/clip-vit-base-patch16_eurosat_l-lora-16"
-model = PeftModel.from_pretrained(base_model, peft_model_id)
-
-model = linearize_lora_model_(model)
-```
+To load L-LoRA models, refer to [load_l_lora_vision_model_hf][fusion_bench.models.linearized.vision_model.load_l_lora_vision_model_hf].

=== "Performance of the fine-tuned CLIP-ViT-B/16 models (LoRA-16)"

@@ -721,4 +709,7 @@ Table: Results of the robustness experiments ($\lambda=0.3$).

 ## References

-[^1]: Dan Hendrycks and Thomas Dietterich. Benchmarking neural network robustness to common corruptions and perturbations. Proceedings of the International Conference on Learning Representations, 2019.
+::: fusion_bench.modelpool.clip_vision
+::: fusion_bench.models.linearized.vision_model
+
+[^1]: Dan Hendrycks and Thomas Dietterich. Benchmarking neural network robustness to common corruptions and perturbations. Proceedings of the International Conference on Learning Representations, 2019.
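Since the updated docs now only reference the L-LoRA helper, here is a hedged usage sketch; the signature comes from `vision_model.py` below, and the adapter id is a placeholder carried over from the removed snippet (its Hub namespace may differ):

```python
from fusion_bench.models.linearized.vision_model import load_l_lora_vision_model_hf

# placeholder adapter id from the removed snippet; substitute a real one from
# the L-LoRA collection linked above
model = load_l_lora_vision_model_hf(
    base_model_name="openai/clip-vit-base-patch16",
    peft_name="openai/clip-vit-base-patch16_eurosat_l-lora-16",
)
```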
26 changes: 24 additions & 2 deletions fusion_bench/models/linearized/vision_model.py
@@ -48,9 +48,31 @@ def load_fft_vision_model_hf(model_name: str) -> CLIPVisionTransformer:
     return CLIPVisionModel.from_pretrained(model_name).vision_model


-def load_lora_vision_model_hf(base_model_name: str, peft_name: str):
+def load_lora_vision_model_hf(
+    base_model_name: str,
+    peft_name: str,
+    merge_and_unload: bool = False,
+):
+    """
+    Load a LoRA (Low-Rank Adaptation) vision model from Hugging Face.
+
+    This function loads a vision model and applies a LoRA adaptation to it.
+    The model can optionally be merged and unloaded.
+
+    Parameters:
+        base_model_name (str): The name of the base vision model to load from Hugging Face.
+        peft_name (str): The name of the LoRA adaptation to apply to the base model.
+        merge_and_unload (bool, optional): If True, the LoRA adaptation is merged into the base model and the LoRA layers are removed. Defaults to False.
+
+    Returns:
+        PeftModel: The adapted vision model, optionally merged and unloaded.
+    """
     # note that we apply lora on type `CLIPVisionTransformer` instead of `CLIPVisionModel`
     model = CLIPVisionModel.from_pretrained(base_model_name).vision_model
-    return PeftModel.from_pretrained(model, peft_name, is_trainable=True)
+    peft_model = PeftModel.from_pretrained(model, peft_name, is_trainable=True)
+    if merge_and_unload:
+        return peft_model.merge_and_unload()
+    else:
+        return peft_model


 def load_l_lora_vision_model_hf(base_model_name: str, peft_name: str):
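A short usage sketch of the new `merge_and_unload` flag, reusing the adapter id from the config added in this commit:

```python
from fusion_bench.models.linearized.vision_model import load_lora_vision_model_hf

# default: the adapter stays as trainable LoRA layers wrapped in a PeftModel
peft_model = load_lora_vision_model_hf(
    "openai/clip-vit-base-patch16",
    "tanganke/clip-vit-base-patch16_sun397_lora-16",
)

# merge_and_unload=True folds the low-rank updates into the base weights and
# strips the LoRA layers, returning a plain vision transformer
merged = load_lora_vision_model_hf(
    "openai/clip-vit-base-patch16",
    "tanganke/clip-vit-base-patch16_sun397_lora-16",
    merge_and_unload=True,
)
```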
