diff --git a/docs/source/usage.md b/docs/source/usage.md
index 62a4cdb..aff2716 100644
--- a/docs/source/usage.md
+++ b/docs/source/usage.md
@@ -38,19 +38,33 @@ optimized_model = torch.compile(model, backend="npu")
 
 In windows torch.compile is not supported yet. So you might want to use the explicit function `intel_npu_acceleration_library.compile`. This is true also if you use a `pytorch` version < 2.0.0
 
+To do this, you just need to call the `compile` function with your model and the compiler configuration `CompilerConfig` to compile and optimize the model for the NPU.
 ```python
 import intel_npu_acceleration_library
-optimized_model = intel_npu_acceleration_library.compile(model, dtype=torch.int8)
+from intel_npu_acceleration_library.compiler import CompilerConfig
+compiler_conf = CompilerConfig(dtype=torch.int8)
+optimized_model = intel_npu_acceleration_library.compile(model, compiler_conf)
 
 # Use the model as usual
 
 ```
 
+To compile and optimize a single layer of a model to be pushed to the NPU as one block, you can set `use_to=True` in the compiler configuration `CompilerConfig`.
+```python
+import intel_npu_acceleration_library
+from intel_npu_acceleration_library.compiler import CompilerConfig
+compiler_conf = CompilerConfig(use_to=True, dtype=torch.int8)
+optimized_block = intel_npu_acceleration_library.compile(single_block, compiler_conf)
+
+```
+
 ## Training (**Experimental!**)
 
 It is possible to use Intel® NPU Acceleration Library to train a model. As before you just need to call the `compile` function, this time with `training=True`. This allows to use the same training script you use in other device with a very minimal modifications.
 
 ```python
 import intel_npu_acceleration_library
-compiled_model = intel_npu_acceleration_library.compile(model, dtype=torch.float32, training=True)
+from intel_npu_acceleration_library.compiler import CompilerConfig
+compiler_conf = CompilerConfig(dtype=torch.float32, training=True)
+compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf)
 ```
diff --git a/examples/compile_model.py b/examples/compile_model.py
index 2146fcd..afe51ce 100644
--- a/examples/compile_model.py
+++ b/examples/compile_model.py
@@ -5,6 +5,7 @@
 
 from intel_npu_acceleration_library import compile
+from intel_npu_acceleration_library.compiler import CompilerConfig
 from sklearn.metrics import r2_score
 import intel_npu_acceleration_library
 import pytest
@@ -41,7 +42,8 @@ def forward(self, x):
         print(
             "Windows do not support torch.compile, fallback to intel_npu_acceleration_library.compile"
         )
-        compiled_model = intel_npu_acceleration_library.compile(model)
+        compiler_conf = CompilerConfig()
+        compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf)
     else:
         compiled_model = torch.compile(model, backend="npu")
 
@@ -49,4 +51,4 @@ def forward(self, x):
     with torch.no_grad():
         y = compiled_model(x)
 
-    print(f"Reference vs actual R2 score: {r2_score(y_ref, y):.2f}")
+    print(f"Reference vs actual R2 score: {r2_score(y_ref.numpy(), y.numpy()):.2f}")
diff --git a/examples/llava.py b/examples/llava.py
index a8e5545..dafa22d 100644
--- a/examples/llava.py
+++ b/examples/llava.py
@@ -12,6 +12,7 @@
     TextStreamer,
 )
 from transformers.feature_extraction_utils import BatchFeature
+from intel_npu_acceleration_library.compiler import CompilerConfig
 import intel_npu_acceleration_library
 import torch
 
@@ -21,7 +22,8 @@
 
 # Load model
 model = LlavaForConditionalGeneration.from_pretrained(checkpoint)
-model = intel_npu_acceleration_library.compile(model)
+compiler_conf = CompilerConfig()
+model = intel_npu_acceleration_library.compile(model, compiler_conf)
 
 image_processor = CLIPImageProcessor.from_pretrained(checkpoint)
 tokenizer = AutoTokenizer.from_pretrained(checkpoint)
diff --git a/examples/tiny_llama_chat.py b/examples/tiny_llama_chat.py
index 13f595c..82a699e 100644
--- a/examples/tiny_llama_chat.py
+++ b/examples/tiny_llama_chat.py
@@ -4,6 +4,7 @@
 #
 
 from transformers import pipeline, TextStreamer, set_seed
+from intel_npu_acceleration_library.compiler import CompilerConfig
 import intel_npu_acceleration_library
 import torch
 import os
@@ -15,7 +16,8 @@
     "text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto"
 )
 print("Compiling the model for NPU...")
-pipe.model = intel_npu_acceleration_library.compile(pipe.model, dtype=torch.int8)
+compiler_conf = CompilerConfig(dtype=torch.int8)
+pipe.model = intel_npu_acceleration_library.compile(pipe.model, compiler_conf)
 
 streamer = TextStreamer(pipe.tokenizer, skip_special_tokens=True, skip_prompt=True)
diff --git a/examples/train_mnist.py b/examples/train_mnist.py
index 972eb81..6e14a22 100644
--- a/examples/train_mnist.py
+++ b/examples/train_mnist.py
@@ -7,6 +7,7 @@
 import torch
 from torch import nn
 import intel_npu_acceleration_library
+from intel_npu_acceleration_library.compiler import CompilerConfig
 from torch.utils.data import DataLoader
 from torchvision import datasets
 from torchvision.transforms import ToTensor
@@ -90,8 +91,8 @@ def test_loop(dataloader, model, loss_fn):
 
 model = NeuralNetwork()
 
-
-model = intel_npu_acceleration_library.compile(model, torch.float32, training=True)
+compiler_conf = CompilerConfig(dtype=torch.float32, training=True)
+model = intel_npu_acceleration_library.compile(model, compiler_conf)
 
 learning_rate = 1e-3
 batch_size = 64
diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py
index 6b12dea..88183b6 100644
--- a/intel_npu_acceleration_library/compiler.py
+++ b/intel_npu_acceleration_library/compiler.py
@@ -288,14 +288,12 @@ def forward(self, input):
 @register_backend
 def npu(
     gm: Union[torch.nn.Module, torch.fx.GraphModule],
-    config: CompilerConfig,
     example_inputs: List[torch.Tensor],
 ) -> Union[torch.nn.Module, torch.fx.GraphModule]:
     """Implement the custom torch 2.0 compile backend for the NPU.
 
     Args:
         gm (Union[torch.nn.Module, torch.fx.GraphModule]): The torch fx Module
-        config (CompilerConfig): The compiler configuration
         example_inputs (List[torch.Tensor]): A list of example inputs
 
     Returns:
@@ -305,4 +303,5 @@ def npu(
     gm = horizontal_fusion_linear(gm)
 
     # For now compile in fp16
+    config = CompilerConfig()
     return compile(gm, config)
diff --git a/script/export.py b/script/export.py
index 892711e..4f63f71 100644
--- a/script/export.py
+++ b/script/export.py
@@ -5,6 +5,8 @@
 
 from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
 from intel_npu_acceleration_library.compiler import compile
+from intel_npu_acceleration_library.compiler import CompilerConfig
+from intel_npu_acceleration_library.dtypes import int8, int4
 import argparse
 import torch
 import os
@@ -41,15 +43,19 @@ def export(model_id, dtype, output):
 
     if dtype == "fp16":
         print(f"Compiling model {model_id}")
-        torch_dtype = torch.float16
+        dtype = torch.float16
     elif dtype == "int8":
         print(f"Quantizing & Compiling model {model_id}")
-        torch_dtype = torch.int8
+        dtype = int8
+    elif dtype == "int4":
+        print(f"Quantizing & Compiling model {model_id}")
+        dtype = int4
    else:
         raise RuntimeError(f"Invalid dtype {dtype}")
 
     with torch.no_grad():
-        compile(model, dtype=torch_dtype)
+        compiler_conf = CompilerConfig(dtype=dtype)
+        compile(model, compiler_conf)
 
     filename = os.path.join(PATH, "model.pth")
     os.makedirs(PATH, exist_ok=True)
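For reference, a minimal sketch of the call pattern this patch introduces: the `dtype` (and `training`) keywords move off `compile()` and into a `CompilerConfig` object passed as the second argument. The toy model and input shapes below are illustrative only; just the `CompilerConfig` usage follows the diff.

```python
import torch
from torch import nn

import intel_npu_acceleration_library
from intel_npu_acceleration_library.compiler import CompilerConfig

# Illustrative model; any torch.nn.Module is handled the same way.
model = nn.Sequential(nn.Linear(256, 512), nn.ReLU(), nn.Linear(512, 10))

# dtype (and training=True for the training path) now travel inside the config object.
compiler_conf = CompilerConfig(dtype=torch.int8)
optimized_model = intel_npu_acceleration_library.compile(model, compiler_conf)

with torch.no_grad():
    # fp16 inputs, as in the library's own examples for quantized models
    out = optimized_model(torch.rand(1, 256, dtype=torch.float16))
```

Note that the `npu` backend for `torch.compile` no longer accepts a config argument; it builds a default `CompilerConfig()` internally, so `torch.compile(model, backend="npu")` keeps working unchanged.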
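The `script/export.py` hunk also adds an int4 export path through the library's `dtypes` module. A sketch of that flow, under stated assumptions: the checkpoint name is a placeholder and the final save call is not part of this diff.

```python
import torch
from transformers import AutoModelForCausalLM

from intel_npu_acceleration_library.compiler import compile, CompilerConfig
from intel_npu_acceleration_library.dtypes import int4

# Placeholder checkpoint; substitute the model you want to export.
model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

with torch.no_grad():
    # As in the diff, the target dtype is carried by CompilerConfig rather than a dtype kwarg.
    compiler_conf = CompilerConfig(dtype=int4)
    compile(model, compiler_conf)

# Persisting the result is assumed here; export.py writes a model.pth in its own output folder.
torch.save(model.state_dict(), "model.pth")
```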