From 96bf381b9a14a6d87b92869e3e4ae9359dc24f0b Mon Sep 17 00:00:00 2001 From: lixiang007666 <88304454@qq.com> Date: Tue, 2 Jul 2024 15:28:54 +0800 Subject: [PATCH 1/4] Add lora quant exp --- .../examples/quant_lora/README.md | 17 ++ .../examples/quant_lora/test.py | 164 ++++++++++++++++++ 2 files changed, 181 insertions(+) create mode 100644 onediff_diffusers_extensions/examples/quant_lora/README.md create mode 100644 onediff_diffusers_extensions/examples/quant_lora/test.py diff --git a/onediff_diffusers_extensions/examples/quant_lora/README.md b/onediff_diffusers_extensions/examples/quant_lora/README.md new file mode 100644 index 000000000..e8cdd1817 --- /dev/null +++ b/onediff_diffusers_extensions/examples/quant_lora/README.md @@ -0,0 +1,17 @@ +python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ + --saved-image sd.png + +python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ + --saved-image sd_lora.png \ + --use_lora + +python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ + --compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}' \ + --saved-image sd_lora_compile.png \ + --use_lora + +python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ + --compiler-config '{"mode": "quant:max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}' \ + --quantize-config '{"quant_type": "fp8_e4m3_e4m3_dynamic_per_tensor"}' \ + --saved-image sd_lora_fp8.png \ + --use_lora diff --git a/onediff_diffusers_extensions/examples/quant_lora/test.py b/onediff_diffusers_extensions/examples/quant_lora/test.py new file mode 100644 index 000000000..9986ab69f --- /dev/null +++ b/onediff_diffusers_extensions/examples/quant_lora/test.py @@ -0,0 +1,164 @@ +import argparse +import json +import time + +from diffusers import AutoPipelineForText2Image as pipeline_cls +from onediffx import compile_pipe, quantize_pipe +import torch + + +def parse_args(): + parser = 
argparse.ArgumentParser( + description="Use onediff (nexfort) to accelerate image generation with Stable Diffusion + LoRA" + ) + parser.add_argument( + "--model", + type=str, + default="stabilityai/stable-diffusion-xl-base-1.0", + help="Model path or identifier.", + ) + parser.add_argument( + "--compiler-config", type=str, help="JSON string for compiler config." + ) + parser.add_argument( + "--quantize-config", type=str, help="JSON string for quantization config." + ) + parser.add_argument( + "--prompt", + type=str, + default="A girl smiling", + help="Prompt for the image generation.", + ) + parser.add_argument( + "--height", type=int, default=1024, help="Height of the generated image." + ) + parser.add_argument( + "--width", type=int, default=1024, help="Width of the generated image." + ) + parser.add_argument( + "--guidance_scale", + type=float, + default=7.5, + help="The scale factor for the guidance.", + ) + parser.add_argument( + "--num-inference-steps", type=int, default=30, help="Number of inference steps." + ) + parser.add_argument( + "--saved-image", + type=str, + default="./sd.png", + help="Path to save the generated image.", + ) + parser.add_argument( + "--seed", type=int, default=3, help="Seed for random number generation." 
+ ) + parser.add_argument( + "--warmup-iterations", + type=int, + default=1, + help="Number of warm-up iterations before actual inference.", + ) + parser.add_argument( + "--use_lora", action="store_true", help="Use LoRA weights for the generation" + ) + parser.add_argument( + "--cross-attention-kwargs", + type=str, + default='{"scale": 0.5}', + help="JSON string for cross-attention configuration.", + ) + return parser.parse_args() + + +args = parse_args() + +device = torch.device("cuda") + + +class SDGenerator: + def __init__(self, model, compiler_config=None, quantize_config=None): + self.pipe = pipeline_cls.from_pretrained(model, torch_dtype=torch.float16) + + if args.use_lora: + print("Use LoRA...") + LORA_MODEL_ID = "ostris/watercolor_style_lora_sdxl" + LORA_FILENAME = "watercolor_v1_sdxl.safetensors" + self.pipe.load_lora_weights(LORA_MODEL_ID, weight_name=LORA_FILENAME) + # self.pipe.fuse_lora() + + self.pipe.to(device) + + if compiler_config: + print("compile...") + self.pipe = self.compile_pipe(self.pipe, compiler_config) + + if quantize_config: + print("quant...") + self.pipe = self.quantize_pipe(self.pipe, quantize_config) + + def warmup(self, gen_args, warmup_iterations): + warmup_args = gen_args.copy() + + warmup_args["generator"] = torch.Generator(device=device).manual_seed(0) + + print("Starting warmup...") + start_time = time.time() + for _ in range(warmup_iterations): + self.pipe(**warmup_args) + end_time = time.time() + print("Warmup complete.") + print(f"Warmup time: {end_time - start_time:.2f} seconds") + + def generate(self, gen_args): + gen_args["generator"] = torch.Generator(device=device).manual_seed(args.seed) + + # Run the model + start_time = time.time() + images = self.pipe(**gen_args).images + end_time = time.time() + + images[0].save(args.saved_image) + + return images[0], end_time - start_time + + def compile_pipe(self, pipe, compiler_config): + options = compiler_config + pipe = compile_pipe( + pipe, backend="nexfort", 
options=options, fuse_qkv_projections=True + ) + return pipe + + def quantize_pipe(self, pipe, quantize_config): + pipe = quantize_pipe(pipe, ignores=[], **quantize_config) + return pipe + + +def main(): + compiler_config = json.loads(args.compiler_config) if args.compiler_config else None + quantize_config = json.loads(args.quantize_config) if args.quantize_config else None + cross_attention_kwargs = json.loads(args.cross_attention_kwargs) + + sd = SDGenerator(args.model, compiler_config, quantize_config) + + gen_args = { + "prompt": args.prompt, + "num_inference_steps": args.num_inference_steps, + "height": args.height, + "width": args.width, + "guidance_scale": args.guidance_scale, + "cross_attention_kwargs": cross_attention_kwargs, + } + + sd.warmup(gen_args, args.warmup_iterations) + + image, inference_time = sd.generate(gen_args) + print( + f"Generated image saved to {args.saved_image} in {inference_time:.2f} seconds." + ) + cuda_mem_after_used = torch.cuda.max_memory_allocated() / (1024**3) + print(f"Max used CUDA memory : {cuda_mem_after_used:.3f}GiB") + + +if __name__ == "__main__": + main() From e8ce873bd0c7ba59fcf05b25a78b8fd818eaef96 Mon Sep 17 00:00:00 2001 From: lixiang007666 <88304454@qq.com> Date: Tue, 2 Jul 2024 15:35:26 +0800 Subject: [PATCH 2/4] Refine --- .../examples/quant_lora/test.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/onediff_diffusers_extensions/examples/quant_lora/test.py b/onediff_diffusers_extensions/examples/quant_lora/test.py index 9986ab69f..e3cfdf520 100644 --- a/onediff_diffusers_extensions/examples/quant_lora/test.py +++ b/onediff_diffusers_extensions/examples/quant_lora/test.py @@ -17,6 +17,18 @@ def parse_args(): default="stabilityai/stable-diffusion-xl-base-1.0", help="Model path or identifier.", ) + parser.add_argument( + "--lora-model-id", + type=str, + default="ostris/watercolor_style_lora_sdxl", + help="LoRA model identifier for fine-tuning weights." 
+ ) + parser.add_argument( + "--lora-filename", + type=str, + default="watercolor_v1_sdxl.safetensors", + help="Filename for LoRA weights." + ) parser.add_argument( "--compiler-config", type=str, help="JSON string for compiler config." ) @@ -82,9 +94,7 @@ def __init__(self, model, compiler_config=None, quantize_config=None): if args.use_lora: print("Use LoRA...") - LORA_MODEL_ID = "ostris/watercolor_style_lora_sdxl" - LORA_FILENAME = "watercolor_v1_sdxl.safetensors" - self.pipe.load_lora_weights(LORA_MODEL_ID, weight_name=LORA_FILENAME) + self.pipe.load_lora_weights(args.lora_model_id, weight_name=args.lora_filename) # self.pipe.fuse_lora() self.pipe.to(device) From bcb013440a95d3b9c41540f53c11a4a01dce69cc Mon Sep 17 00:00:00 2001 From: lixiang007666 <88304454@qq.com> Date: Tue, 2 Jul 2024 17:04:07 +0800 Subject: [PATCH 3/4] README --- .../examples/quant_lora/README.md | 47 ++++++++++++++++++- 1 file changed, 45 insertions(+), 2 deletions(-) diff --git a/onediff_diffusers_extensions/examples/quant_lora/README.md b/onediff_diffusers_extensions/examples/quant_lora/README.md index e8cdd1817..d8fe859cc 100644 --- a/onediff_diffusers_extensions/examples/quant_lora/README.md +++ b/onediff_diffusers_extensions/examples/quant_lora/README.md @@ -1,17 +1,60 @@ +### Default Configuration without LoRA + +Run: +``` python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ --saved-image sd.png +``` + +Performance: +- Iterations per second: 7.03 it/s +- Time taken: 4.65 seconds +- Max used CUDA memory: 10.467 GiB + +### Using LoRA + +Run: +``` python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ --saved-image sd_lora.png \ --use_lora +``` + +Performance: +- Iterations per second: 6.28 it/s +- Time taken: 5.16 seconds +- Max used CUDA memory: 10.481 GiB + +### Compile + +Run: +``` python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ --compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": 
"channels_last"}' \ --saved-image sd_lora_compile.png \ --use_lora +``` + +Performance: +- Iterations per second: 13.29 it/s +- Time taken: 2.61 seconds +- Max used CUDA memory: 11.477 GiB + +### Compiled with Quantization + +Run: +``` python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ --compiler-config '{"mode": "quant:max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}' \ - --quantize-config '{"quant_type": "fp8_e4m3_e4m3_dynamic_per_tensor"}' \ - --saved-image sd_lora_fp8.png \ + --quantize-config '{"quant_type": "int8_dynamic"}' \ + --saved-image sd_lora_int8.png \ --use_lora +``` + +Performance: +- Iterations per second: 15.55 it/s +- Time taken: 2.22 seconds +- Max used CUDA memory: 8.804 GiB From 535c62ec3dbb958a68ebc29505aed2cda21e736b Mon Sep 17 00:00:00 2001 From: lixiang007666 <88304454@qq.com> Date: Wed, 31 Jul 2024 10:02:01 +0800 Subject: [PATCH 4/4] Fix script --- .../examples/quant_lora/README.md | 31 ++++++++++--------- .../examples/quant_lora/test.py | 30 ++++++++---------- 2 files changed, 30 insertions(+), 31 deletions(-) diff --git a/onediff_diffusers_extensions/examples/quant_lora/README.md b/onediff_diffusers_extensions/examples/quant_lora/README.md index d8fe859cc..4f82ba14a 100644 --- a/onediff_diffusers_extensions/examples/quant_lora/README.md +++ b/onediff_diffusers_extensions/examples/quant_lora/README.md @@ -7,9 +7,10 @@ python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ ``` Performance: -- Iterations per second: 7.03 it/s -- Time taken: 4.65 seconds -- Max used CUDA memory: 10.467 GiB +- Iterations per second: 8.49 it/s +- Time taken: 3.92 seconds +- Max used CUDA memory: 10.465GiB + ### Using LoRA @@ -22,9 +23,10 @@ python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ ``` Performance: -- Iterations per second: 6.28 it/s -- Time taken: 5.16 seconds -- Max used CUDA memory: 10.481 GiB +- Iterations per second: 8.53 it/s +- Time taken: 3.91 seconds +- Max used CUDA 
memory: 10.477GiB + ### Compile @@ -32,15 +34,16 @@ Performance: Run: ``` python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ - --compiler-config '{"mode": "max-optimize:max-autotune:low-precision:cache-all", "memory_format": "channels_last"}' \ + --compiler-config '{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last", "options": {"triton.fuse_attention_allow_fp16_reduction": false}}' \ --saved-image sd_lora_compile.png \ --use_lora ``` Performance: -- Iterations per second: 13.29 it/s -- Time taken: 2.61 seconds -- Max used CUDA memory: 11.477 GiB +- Iterations per second: 14.94 it/s +- Time taken: 2.29 seconds +- Max used CUDA memory: 11.475GiB + ### Compiled with Quantization @@ -48,13 +51,13 @@ Performance: Run: ``` python3 onediff_diffusers_extensions/examples/quant_lora/test.py \ - --compiler-config '{"mode": "quant:max-optimize:max-autotune:low-precision", "memory_format": "channels_last"}' \ + --compiler-config '{"mode": "max-optimize:max-autotune:low-precision", "memory_format": "channels_last", "options": {"triton.fuse_attention_allow_fp16_reduction": false}}' \ --quantize-config '{"quant_type": "int8_dynamic"}' \ --saved-image sd_lora_int8.png \ --use_lora ``` Performance: -- Iterations per second: 15.55 it/s -- Time taken: 2.22 seconds -- Max used CUDA memory: 8.804 GiB +- Iterations per second: 17.00 it/s +- Time taken: 2.04 seconds +- Max used CUDA memory: 8.808GiB diff --git a/onediff_diffusers_extensions/examples/quant_lora/test.py b/onediff_diffusers_extensions/examples/quant_lora/test.py index e3cfdf520..6b3c78e78 100644 --- a/onediff_diffusers_extensions/examples/quant_lora/test.py +++ b/onediff_diffusers_extensions/examples/quant_lora/test.py @@ -2,9 +2,10 @@ import json import time +import torch + from diffusers import AutoPipelineForText2Image as pipeline_cls from onediffx import compile_pipe, quantize_pipe -import torch def parse_args(): @@ -20,14 +21,14 @@ def parse_args(): parser.add_argument( 
"--lora-model-id", type=str, - default="ostris/watercolor_style_lora_sdxl", - help="LoRA model identifier for fine-tuning weights." + default="minimaxir/sdxl-wrong-lora", + help="LoRA model identifier for fine-tuning weights.", ) parser.add_argument( "--lora-filename", type=str, - default="watercolor_v1_sdxl.safetensors", - help="Filename for LoRA weights." + default="pytorch_lora_weights.bin", + help="Filename for LoRA weights.", ) parser.add_argument( "--compiler-config", type=str, help="JSON string for compiler config." @@ -38,7 +39,7 @@ def parse_args(): parser.add_argument( "--prompt", type=str, - default="A girl smiling", + default="anime style, 1 girl, indoors, sitting on the sofa, living room, pink hair, white sock, blue eyes, from back, from above, face towards viewer, playing video games, holding controller, black silk, parted lips.", help="Prompt for the image generation.", ) parser.add_argument( @@ -63,7 +64,7 @@ def parse_args(): help="Path to save the generated image.", ) parser.add_argument( - "--seed", type=int, default=3, help="Seed for random number generation." + "--seed", type=int, default=888, help="Seed for random number generation." 
) parser.add_argument( "--warmup-iterations", @@ -74,12 +75,6 @@ def parse_args(): parser.add_argument( "--use_lora", action="store_true", help="Use LoRA weights for the generation" ) - parser.add_argument( - "--cross-attention-kwargs", - type=str, - default='{"scale": 0.5}', - help="JSON string for cross-attention configuration.", - ) return parser.parse_args() @@ -94,8 +89,10 @@ def __init__(self, model, compiler_config=None, quantize_config=None): if args.use_lora: print("Use LoRA...") - self.pipe.load_lora_weights(args.lora_model_id, weight_name=args.lora_filename) - # self.pipe.fuse_lora() + self.pipe.load_lora_weights( + args.lora_model_id, weight_name=args.lora_filename + ) + self.pipe.fuse_lora() self.pipe.to(device) @@ -147,17 +144,16 @@ def quantize_pipe(self, pipe, quantize_config): def main(): compiler_config = json.loads(args.compiler_config) if args.compiler_config else None quantize_config = json.loads(args.quantize_config) if args.quantize_config else None - cross_attention_kwargs = json.loads(args.cross_attention_kwargs) sd = SDGenerator(args.model, compiler_config, quantize_config) gen_args = { "prompt": args.prompt, + "negative_prompt": "wrong", "num_inference_steps": args.num_inference_steps, "height": args.height, "width": args.width, "guidance_scale": args.guidance_scale, - "cross_attention_kwargs": cross_attention_kwargs, } sd.warmup(gen_args, args.warmup_iterations)