From 22c627cc74c67f738cb2f235f3d7f66f1c326089 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 1 Jul 2024 16:56:00 +0100 Subject: [PATCH 01/18] Add support for phi-3 MLP layer --- intel_npu_acceleration_library/compiler.py | 19 +++++++ intel_npu_acceleration_library/nn/__init__.py | 4 +- intel_npu_acceleration_library/nn/llm.py | 53 +++++++++++++++++++ test/python/test_llm.py | 28 ++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 4e80d04..4132cfe 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -6,6 +6,7 @@ from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention from transformers.models.gemma.modeling_gemma import GemmaMLP, GemmaAttention +from transformers.models.phi3.modeling_phi3 import Phi3MLP from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from intel_npu_acceleration_library.quantization import quantize_model from intel_npu_acceleration_library.dtypes import int8, int4 @@ -174,6 +175,24 @@ def optimize_llama_attention( return None +@module_optimization +def optimize_phi3_MLP( + name: str, layer: torch.nn.Module +) -> Union[torch.nn.Module, None]: + """Optimize Phi-3 MLP block. + + Args: + name (str): Module name + layer (torch.nn.Module): Original Module + + Returns: + Union[torch.nn.Module, None]: optimized Phi-3 module + """ + if isinstance(layer, Phi3MLP): + return nn.Phi3MLP.fromTorch(layer) + return None + + @register_backend def npu( gm: Union[torch.nn.Module, torch.fx.GraphModule], example_inputs: List[torch.Tensor] diff --git a/intel_npu_acceleration_library/nn/__init__.py b/intel_npu_acceleration_library/nn/__init__.py index 408d1b2..58033bb 100644 --- a/intel_npu_acceleration_library/nn/__init__.py +++ b/intel_npu_acceleration_library/nn/__init__.py @@ -9,9 +9,9 @@ from .module import Module # noqa try: - from .llm import LlamaAttention, PhiMLP # noqa + from .llm import LlamaAttention, PhiMLP, Phi3MLP # noqa - llm_modules = ["LlamaAttention", "PhiMLP"] + llm_modules = ["LlamaAttention", "PhiMLP", "Phi3MLP"] except ModuleNotFoundError: # Transformer library is not installed llm_modules = [] diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 8cf6cd3..17a93d6 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -9,6 +9,7 @@ LlamaConfig, ) from transformers import AutoTokenizer +from intel_npu_acceleration_library.backend.tensor import Tensor from intel_npu_acceleration_library.nn import Linear from intel_npu_acceleration_library.backend import run_factory, MLP from functools import partial @@ -72,6 +73,58 @@ def fromTorch( return new_layer +class Phi3MLP(torch.nn.Module): + """Phi-3 MLP operation NPU backend.""" + + def __init__(self, config): + """Initialize Phi-3 MLP operation. + + Args: + config (Phi3Config): Phi-3 MLP configuration + """ + super().__init__() + self.config = config + self.op_id = str(uuid.uuid4()) + + def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: + """NPU module forward method. + + Args: + hidden_states (Tensor): The input tensor. + gate_up_proj_w (Tensor): The gate up projection input weight tensor. + down_proj_w (Tensor): The down projection input weight tensor. 
+ kwargs: Additional arguments + + Returns: + Tensor: The output tensor + """ + gate_up_states = torch.nn.functional.linear(hidden_states, gate_up_proj_w) + + gate = gate_up_states[:, : self.config.intermediate_size] + up_states = gate_up_states[:, self.config.intermediate_size :] + + up_states = up_states * torch.nn.functional.silu(gate) + + return torch.nn.functional.linear(up_states, down_proj_w) + + @staticmethod + def fromTorch( + layer: torch.nn.Module, dtype: torch.dtype = torch.float16 + ) -> "Phi3MLP": + """Generate a NPU Phi-3 MLP layer from a transformer one. + + Args: + layer (torch.nn.Linear): the original Phi-3 MLP model to run on the NPU + dtype (torch.dtype): the desired datatype + + Returns: + Phi3MLP: A NPU Phi-3 MLP layer + """ + new_layer = Phi3MLP(config=layer.config) + + return new_layer + + class FusedLlamaMLP(torch.nn.Module): """LLAMA MLP operation NPU backend.""" diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 8e4dbf0..7e29dbd 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -5,6 +5,7 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaConfig from transformers.models.phi.modeling_phi import PhiConfig, PhiMLP +from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM from sklearn.metrics import r2_score import intel_npu_acceleration_library @@ -76,3 +77,30 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): out = model(x) assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 + + +@torch.no_grad +@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("hidden_size", [256, 512]) +@pytest.mark.parametrize("intermediate_size", [512]) +def test_phi3_mlp(seq_len, hidden_size, intermediate_size): + conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + conf.num_hidden_layers = 1 + conf.hidden_size = hidden_size + conf.intermediate_size = intermediate_size + + mlp = Phi3MLP(conf) + + hidden_states = torch.rand((seq_len, conf.hidden_size)).half() + + reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) + + model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp) + + assert model + + out = model( + hidden_states, mlp.gate_up_proj.weight.half(), mlp.down_proj.weight.half() + ) + + assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 From ea4b27a76d6965d4eb0be01ac89533df28f09f61 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Tue, 2 Jul 2024 10:26:02 +0100 Subject: [PATCH 02/18] Updating support for Phi-3 MLP --- intel_npu_acceleration_library/compiler.py | 5 +++-- intel_npu_acceleration_library/nn/llm.py | 20 +++++++++++++------- test/python/test_llm.py | 11 +++++++++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 4132cfe..760ea83 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -177,19 +177,20 @@ def optimize_llama_attention( @module_optimization def optimize_phi3_MLP( - name: str, layer: torch.nn.Module + name: str, layer: torch.nn.Module, activation_fn: torch.nn ) -> Union[torch.nn.Module, None]: """Optimize Phi-3 MLP block. 
Args: name (str): Module name layer (torch.nn.Module): Original Module + activation_fn (torch.nn): Activation function Returns: Union[torch.nn.Module, None]: optimized Phi-3 module """ if isinstance(layer, Phi3MLP): - return nn.Phi3MLP.fromTorch(layer) + return nn.Phi3MLP.fromTorch(layer, activation_fn) return None diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 17a93d6..88f65ae 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -76,14 +76,16 @@ def fromTorch( class Phi3MLP(torch.nn.Module): """Phi-3 MLP operation NPU backend.""" - def __init__(self, config): + def __init__(self, config, activation_fn): """Initialize Phi-3 MLP operation. Args: config (Phi3Config): Phi-3 MLP configuration + activation_fn (torch.nn): Phi-3 MLP activation function """ super().__init__() self.config = config + self.activation_fn = activation_fn self.op_id = str(uuid.uuid4()) def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: @@ -100,27 +102,31 @@ def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tenso """ gate_up_states = torch.nn.functional.linear(hidden_states, gate_up_proj_w) - gate = gate_up_states[:, : self.config.intermediate_size] - up_states = gate_up_states[:, self.config.intermediate_size :] + midpoint = gate_up_states.size(dim=-1) // 2 + gate = gate_up_states[:, :midpoint] + up_states = gate_up_states[:, midpoint:] - up_states = up_states * torch.nn.functional.silu(gate) + up_states = up_states * self.activation_fn(gate) return torch.nn.functional.linear(up_states, down_proj_w) @staticmethod def fromTorch( - layer: torch.nn.Module, dtype: torch.dtype = torch.float16 + layer: torch.nn.Module, + activation_fn: torch.nn, + dtype: torch.dtype = torch.float16, ) -> "Phi3MLP": """Generate a NPU Phi-3 MLP layer from a transformer one. 
Args: - layer (torch.nn.Linear): the original Phi-3 MLP model to run on the NPU + layer (torch.nn.Module): the original Phi-3 MLP model to run on the NPU + activation_fn (torch.nn): the activation function dtype (torch.dtype): the desired datatype Returns: Phi3MLP: A NPU Phi-3 MLP layer """ - new_layer = Phi3MLP(config=layer.config) + new_layer = Phi3MLP(config=layer.config, activation_fn=activation_fn) return new_layer diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 7e29dbd..a830f78 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -11,6 +11,7 @@ import intel_npu_acceleration_library import pytest import torch +import numpy as np @pytest.fixture @@ -80,7 +81,7 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): @torch.no_grad -@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("seq_len", [16, 128, 256]) @pytest.mark.parametrize("hidden_size", [256, 512]) @pytest.mark.parametrize("intermediate_size", [512]) def test_phi3_mlp(seq_len, hidden_size, intermediate_size): @@ -95,7 +96,9 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) - model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp) + model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch( + layer=mlp, activation_fn=torch.nn.functional.silu + ).to("npu") assert model @@ -103,4 +106,8 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): hidden_states, mlp.gate_up_proj.weight.half(), mlp.down_proj.weight.half() ) + assert out.shape == reference.shape, "Output shape mismatch" + assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" + assert np.isfinite(out).all(), "NPU output contains NaN or Inf" + assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 From 39c070c7076f8b6a42157b73cfa1b368e6c5b87a Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Wed, 3 Jul 2024 11:23:41 +0100 Subject: [PATCH 03/18] Update for Phi-3 MLP testing --- intel_npu_acceleration_library/activations.py | 86 +++++++++++++++++++ intel_npu_acceleration_library/compiler.py | 6 +- intel_npu_acceleration_library/nn/llm.py | 12 ++- test/python/test_llm.py | 4 +- 4 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 intel_npu_acceleration_library/activations.py diff --git a/intel_npu_acceleration_library/activations.py b/intel_npu_acceleration_library/activations.py new file mode 100644 index 0000000..e879030 --- /dev/null +++ b/intel_npu_acceleration_library/activations.py @@ -0,0 +1,86 @@ +# +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 +# + +import torch + + +def get_activation(act_function: str): + """Return an activation function for the NPU. 
+ + Args: + act_function (str): an NPU supported activation function + + Returns: + torch.nn: activation function + """ + match act_function: + case "cos": + return torch.cos + case "sin": + return torch.sin + case "tan": + return torch.tan + case "acos": + return torch.acos + case "asin": + return torch.asin + case "atan": + return torch.atan + case "cosh": + return torch.cosh + case "sinh": + return torch.sinh + case "tanh": + return torch.tanh + case "acosh": + return torch.acosh + case "asinh": + return torch.asinh + case "atanh": + return torch.atanh + case "abs": + return torch.abs + case "ceil": + return torch.ceil + case "clamp": + return torch.clamp + case "elu": + return torch.nn.functional.elu + case "erf": + return torch.erf + case "exp": + return torch.exp + case "floor": + return torch.floor + case "gelu": + return torch.nn.functional.gelu + case "hardsigmoid": + return torch.nn.functional.hardsigmoid + case "hardswish": + return torch.nn.functional.hardswish + case "log": + return torch.log + case "mish": + return torch.nn.functional.mish + case "neg": + return torch.neg + case "relu": + return torch.nn.functional.relu + case "round": + return torch.round + case "sigmoid": + return torch.nn.functional.sigmoid + case "sign": + return torch.sign + case "silu": + return torch.nn.functional.silu + case "softmax": + return torch.nn.functional.softmax + case "softplus": + return torch.nn.functional.softplus + case "sqrt": + return torch.sqrt + case _: + return torch.nn.functional.silu diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 760ea83..a97e36a 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -64,6 +64,7 @@ def apply_general_optimizations(model: torch.nn.Module): """ apply_horizontal_fusion(model) optimize_llama_attention(model) + optimize_phi3_MLP(model) def create_npu_kernels(model: torch.nn.Module): @@ -177,20 +178,19 @@ def optimize_llama_attention( @module_optimization def optimize_phi3_MLP( - name: str, layer: torch.nn.Module, activation_fn: torch.nn + name: str, layer: torch.nn.Module ) -> Union[torch.nn.Module, None]: """Optimize Phi-3 MLP block. Args: name (str): Module name layer (torch.nn.Module): Original Module - activation_fn (torch.nn): Activation function Returns: Union[torch.nn.Module, None]: optimized Phi-3 module """ if isinstance(layer, Phi3MLP): - return nn.Phi3MLP.fromTorch(layer, activation_fn) + return nn.Phi3MLP.fromTorch(layer) return None diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 88f65ae..925ec7b 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -12,6 +12,7 @@ from intel_npu_acceleration_library.backend.tensor import Tensor from intel_npu_acceleration_library.nn import Linear from intel_npu_acceleration_library.backend import run_factory, MLP +from intel_npu_acceleration_library.activations import get_activation from functools import partial from typing import Optional, List, Generator from transformers.cache_utils import Cache @@ -76,16 +77,15 @@ def fromTorch( class Phi3MLP(torch.nn.Module): """Phi-3 MLP operation NPU backend.""" - def __init__(self, config, activation_fn): + def __init__(self, config): """Initialize Phi-3 MLP operation. 
Args: config (Phi3Config): Phi-3 MLP configuration - activation_fn (torch.nn): Phi-3 MLP activation function """ super().__init__() self.config = config - self.activation_fn = activation_fn + self.activation_fn = config.hidden_act self.op_id = str(uuid.uuid4()) def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: @@ -106,27 +106,25 @@ def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tenso gate = gate_up_states[:, :midpoint] up_states = gate_up_states[:, midpoint:] - up_states = up_states * self.activation_fn(gate) + up_states = up_states * get_activation(self.activation_fn)(gate) return torch.nn.functional.linear(up_states, down_proj_w) @staticmethod def fromTorch( layer: torch.nn.Module, - activation_fn: torch.nn, dtype: torch.dtype = torch.float16, ) -> "Phi3MLP": """Generate a NPU Phi-3 MLP layer from a transformer one. Args: layer (torch.nn.Module): the original Phi-3 MLP model to run on the NPU - activation_fn (torch.nn): the activation function dtype (torch.dtype): the desired datatype Returns: Phi3MLP: A NPU Phi-3 MLP layer """ - new_layer = Phi3MLP(config=layer.config, activation_fn=activation_fn) + new_layer = Phi3MLP(config=layer.config) return new_layer diff --git a/test/python/test_llm.py b/test/python/test_llm.py index a830f78..742e9ee 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -96,9 +96,7 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) - model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch( - layer=mlp, activation_fn=torch.nn.functional.silu - ).to("npu") + model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp).to("npu") assert model From 727454e6805051f197163b30c04ac76d0819caaa Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 8 Jul 2024 11:09:38 +0100 Subject: [PATCH 04/18] Update for phi-3 mlp layer --- intel_npu_acceleration_library/compiler.py | 38 +++++++++---- intel_npu_acceleration_library/nn/__init__.py | 4 +- intel_npu_acceleration_library/nn/llm.py | 57 ------------------- intel_npu_acceleration_library/nn/module.py | 7 ++- test/python/test_llm.py | 22 ++++--- 5 files changed, 48 insertions(+), 80 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index a97e36a..2dfdfc6 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -10,6 +10,7 @@ from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from intel_npu_acceleration_library.quantization import quantize_model from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.nn.module import NPUModuleWrapper import intel_npu_acceleration_library.nn as nn from torch._dynamo import register_backend from typing import Union, Callable, Any @@ -40,14 +41,18 @@ def compile( # Prepare and optimize model for NPU with torch.no_grad(): - # General optimizations - apply_general_optimizations(model) + if dtype in (int8, int4): # Quantize model model = quantize_model(model, dtype) # Model lowering to NPU ops - create_npu_kernels(model) + if isinstance(model, Phi3MLP): + model = model.to("npu") + else: + # General optimizations + apply_general_optimizations(model) + create_npu_kernels(model) if dtype.is_floating_point and training: # Set model to evaluation only as quantized training is not supported yet @@ -97,13 +102,22 @@ def wrapper(model: torch.nn.Module, 
*args: Any, **kwargs: Any): kwargs (Any): keyword arguments """ - for name, layer in model.named_children(): - new_layer = func(name, layer, *args, **kwargs) - if new_layer: - model.add_module(name, new_layer) - wrapper(new_layer, *args, **kwargs) - else: - wrapper(layer, *args, **kwargs) + if not isinstance(model, NPUModuleWrapper): + for name, layer in model.named_children(): + print(f"MODEL: {model} \n\n") + if isinstance(model, Phi3MLP): + new_layer = func(model.__class__.__name__, model, *args, **kwargs) + if new_layer: + model.add_module(model.__class__.__name__, new_layer) + else: + new_layer = func(name, layer, *args, **kwargs) + if new_layer: + model.add_module(name, new_layer) + if not isinstance(new_layer, NPUModuleWrapper): + wrapper(new_layer, *args, **kwargs) + else: + if not isinstance(layer, NPUModuleWrapper): + wrapper(layer, *args, **kwargs) return wrapper @@ -189,8 +203,8 @@ def optimize_phi3_MLP( Returns: Union[torch.nn.Module, None]: optimized Phi-3 module """ - if isinstance(layer, Phi3MLP): - return nn.Phi3MLP.fromTorch(layer) + if layer.__class__.__name__ == "Phi3MLP": + return layer.to("npu") return None diff --git a/intel_npu_acceleration_library/nn/__init__.py b/intel_npu_acceleration_library/nn/__init__.py index 58033bb..408d1b2 100644 --- a/intel_npu_acceleration_library/nn/__init__.py +++ b/intel_npu_acceleration_library/nn/__init__.py @@ -9,9 +9,9 @@ from .module import Module # noqa try: - from .llm import LlamaAttention, PhiMLP, Phi3MLP # noqa + from .llm import LlamaAttention, PhiMLP # noqa - llm_modules = ["LlamaAttention", "PhiMLP", "Phi3MLP"] + llm_modules = ["LlamaAttention", "PhiMLP"] except ModuleNotFoundError: # Transformer library is not installed llm_modules = [] diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 925ec7b..8cf6cd3 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -9,10 +9,8 @@ LlamaConfig, ) from transformers import AutoTokenizer -from intel_npu_acceleration_library.backend.tensor import Tensor from intel_npu_acceleration_library.nn import Linear from intel_npu_acceleration_library.backend import run_factory, MLP -from intel_npu_acceleration_library.activations import get_activation from functools import partial from typing import Optional, List, Generator from transformers.cache_utils import Cache @@ -74,61 +72,6 @@ def fromTorch( return new_layer -class Phi3MLP(torch.nn.Module): - """Phi-3 MLP operation NPU backend.""" - - def __init__(self, config): - """Initialize Phi-3 MLP operation. - - Args: - config (Phi3Config): Phi-3 MLP configuration - """ - super().__init__() - self.config = config - self.activation_fn = config.hidden_act - self.op_id = str(uuid.uuid4()) - - def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: - """NPU module forward method. - - Args: - hidden_states (Tensor): The input tensor. - gate_up_proj_w (Tensor): The gate up projection input weight tensor. - down_proj_w (Tensor): The down projection input weight tensor. 
- kwargs: Additional arguments - - Returns: - Tensor: The output tensor - """ - gate_up_states = torch.nn.functional.linear(hidden_states, gate_up_proj_w) - - midpoint = gate_up_states.size(dim=-1) // 2 - gate = gate_up_states[:, :midpoint] - up_states = gate_up_states[:, midpoint:] - - up_states = up_states * get_activation(self.activation_fn)(gate) - - return torch.nn.functional.linear(up_states, down_proj_w) - - @staticmethod - def fromTorch( - layer: torch.nn.Module, - dtype: torch.dtype = torch.float16, - ) -> "Phi3MLP": - """Generate a NPU Phi-3 MLP layer from a transformer one. - - Args: - layer (torch.nn.Module): the original Phi-3 MLP model to run on the NPU - dtype (torch.dtype): the desired datatype - - Returns: - Phi3MLP: A NPU Phi-3 MLP layer - """ - new_layer = Phi3MLP(config=layer.config) - - return new_layer - - class FusedLlamaMLP(torch.nn.Module): """LLAMA MLP operation NPU backend.""" diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index ef23c8e..0ac46cc 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -4,6 +4,7 @@ # from intel_npu_acceleration_library.backend import NNFactory, Tensor from typing import MutableMapping, Sequence, Any, List +from torch.profiler import record_function import numpy as np import torch @@ -249,7 +250,8 @@ def _call_impl(self, *args: Any, **kwargs: Any) -> Any: # Run the model by replacing the forward method with the factory_forward old_forward = self.forward self.forward = self.factory_forward # type: ignore - out = super()._call_impl(*args, **kwargs) + with record_function(f"npu_{self.__class__.__name__}"): + out = super()._call_impl(*args, **kwargs) # Restore the original forward method self.forward = old_forward # type: ignore @@ -322,7 +324,8 @@ def forward(self, *args, **kwargs) -> torch.Tensor: Returns: torch.Tensor: The output tensor. 
""" - return self.module(*args, **kwargs) + with record_function(f"npu_{self.module.__class__.__name__}"): + return self.module(*args, **kwargs) def convert_to_npu_module(module: torch.nn.Module) -> Module: diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 742e9ee..75a663b 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -8,6 +8,7 @@ from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM from sklearn.metrics import r2_score +from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library import pytest import torch @@ -84,7 +85,7 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): @pytest.mark.parametrize("seq_len", [16, 128, 256]) @pytest.mark.parametrize("hidden_size", [256, 512]) @pytest.mark.parametrize("intermediate_size", [512]) -def test_phi3_mlp(seq_len, hidden_size, intermediate_size): +def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size): conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") conf.num_hidden_layers = 1 conf.hidden_size = hidden_size @@ -92,20 +93,27 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): mlp = Phi3MLP(conf) - hidden_states = torch.rand((seq_len, conf.hidden_size)).half() + hidden_states = torch.rand((seq_len, conf.hidden_size)) - reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) + reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() - model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp).to("npu") + model = intel_npu_acceleration_library.compile(mlp) assert model - out = model( - hidden_states, mlp.gate_up_proj.weight.half(), mlp.down_proj.weight.half() + with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: + out = model(hidden_states) + + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cpu_time_total", row_limit=20 + ) ) + out = out.detach().numpy() + assert out.shape == reference.shape, "Output shape mismatch" assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" assert np.isfinite(out).all(), "NPU output contains NaN or Inf" - assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 + assert 1 - r2_score(reference, out) < 0.001 From ea4ea19dc4748c4ebe05426b80ef375546e76f1d Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 8 Jul 2024 11:35:45 +0100 Subject: [PATCH 05/18] Remove old code for phi-3 mlp layer --- intel_npu_acceleration_library/activations.py | 86 ------------------- intel_npu_acceleration_library/compiler.py | 20 ++--- 2 files changed, 7 insertions(+), 99 deletions(-) delete mode 100644 intel_npu_acceleration_library/activations.py diff --git a/intel_npu_acceleration_library/activations.py b/intel_npu_acceleration_library/activations.py deleted file mode 100644 index e879030..0000000 --- a/intel_npu_acceleration_library/activations.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Copyright © 2024 Intel Corporation -# SPDX-License-Identifier: Apache 2.0 -# - -import torch - - -def get_activation(act_function: str): - """Return an activation function for the NPU. 
- - Args: - act_function (str): an NPU supported activation function - - Returns: - torch.nn: activation function - """ - match act_function: - case "cos": - return torch.cos - case "sin": - return torch.sin - case "tan": - return torch.tan - case "acos": - return torch.acos - case "asin": - return torch.asin - case "atan": - return torch.atan - case "cosh": - return torch.cosh - case "sinh": - return torch.sinh - case "tanh": - return torch.tanh - case "acosh": - return torch.acosh - case "asinh": - return torch.asinh - case "atanh": - return torch.atanh - case "abs": - return torch.abs - case "ceil": - return torch.ceil - case "clamp": - return torch.clamp - case "elu": - return torch.nn.functional.elu - case "erf": - return torch.erf - case "exp": - return torch.exp - case "floor": - return torch.floor - case "gelu": - return torch.nn.functional.gelu - case "hardsigmoid": - return torch.nn.functional.hardsigmoid - case "hardswish": - return torch.nn.functional.hardswish - case "log": - return torch.log - case "mish": - return torch.nn.functional.mish - case "neg": - return torch.neg - case "relu": - return torch.nn.functional.relu - case "round": - return torch.round - case "sigmoid": - return torch.nn.functional.sigmoid - case "sign": - return torch.sign - case "silu": - return torch.nn.functional.silu - case "softmax": - return torch.nn.functional.softmax - case "softplus": - return torch.nn.functional.softplus - case "sqrt": - return torch.sqrt - case _: - return torch.nn.functional.silu diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 2dfdfc6..21024e6 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -104,20 +104,14 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """ if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): - print(f"MODEL: {model} \n\n") - if isinstance(model, Phi3MLP): - new_layer = func(model.__class__.__name__, model, *args, **kwargs) - if new_layer: - model.add_module(model.__class__.__name__, new_layer) + new_layer = func(name, layer, *args, **kwargs) + if new_layer: + model.add_module(name, new_layer) + if not isinstance(new_layer, NPUModuleWrapper): + wrapper(new_layer, *args, **kwargs) else: - new_layer = func(name, layer, *args, **kwargs) - if new_layer: - model.add_module(name, new_layer) - if not isinstance(new_layer, NPUModuleWrapper): - wrapper(new_layer, *args, **kwargs) - else: - if not isinstance(layer, NPUModuleWrapper): - wrapper(layer, *args, **kwargs) + if not isinstance(layer, NPUModuleWrapper): + wrapper(layer, *args, **kwargs) return wrapper From 1fef8a40948ed562b5f9e2e9cae9fcc05898928a Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Fri, 12 Jul 2024 09:55:17 +0100 Subject: [PATCH 06/18] Add type tensor op and quantisation support --- .../backend/tensor.py | 12 ++++ intel_npu_acceleration_library/compiler.py | 58 +++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/intel_npu_acceleration_library/backend/tensor.py b/intel_npu_acceleration_library/backend/tensor.py index 2236eda..e8cca7f 100644 --- a/intel_npu_acceleration_library/backend/tensor.py +++ b/intel_npu_acceleration_library/backend/tensor.py @@ -948,6 +948,18 @@ def to(self, dtype: NPUDtype) -> "Tensor": """ return generate_op([self], "to", dtype) + def type(self, dtype: NPUDtype) -> "Tensor": + """ + Convert the tensor to the specified data type. 
+ + Args: + dtype (NPUDtype): The data type to convert the tensor to. + + Returns: + Tensor: The converted tensor. + """ + return self.to(dtype) + @classmethod def __torch_function__( cls: Any, diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 21024e6..7325c0b 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -16,6 +16,7 @@ from typing import Union, Callable, Any from typing import List import torch +from functools import partial def compile( @@ -45,6 +46,7 @@ def compile( if dtype in (int8, int4): # Quantize model model = quantize_model(model, dtype) + weights_quantization(model) # Model lowering to NPU ops if isinstance(model, Phi3MLP): @@ -92,6 +94,7 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ + module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -105,7 +108,13 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) + if (func.__name__ == "optimize_phi3_MLP") and ( + module_optimization.counter >= 5 # type: ignore[attr-defined] + ): + new_layer = None + if new_layer: + module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) @@ -202,6 +211,55 @@ def optimize_phi3_MLP( return None +@module_optimization +def weights_quantization( + name: str, layer: torch.nn.Module +) -> Union[torch.nn.Module, None]: + """Apply weights quantization. + + Args: + name (str): Layer name + layer (torch.nn.Module): Original torch.nn.Linear module + + Raises: + RuntimeError: unsupported quantization bits + + Returns: + None: Returns None + """ + if isinstance(layer, WeightOnlyLinear): + if layer.bits == 4: + print("This works - int4 !!") + layer.forward = partial(forward, layer) + elif layer.bits == 8: + print("This works - int8 !!") + layer.forward = partial(forward, layer) + else: + raise RuntimeError(f"Unsupported quantization bits: {layer.bits}") + return None + + +def forward(self, input): + """Override forward method for WeightOnlyLinear class. + + Args: + input: Thr input tensor. + + Returns: + torch.Tensor: The output tensor. 
+ """ + w = self.qweight.to(torch.float16) + # output = torch.nn.functional.linear(input, w, None) * self.scales + # if self.bias: + # return output + self.bias + output = torch.nn.functional.linear(input.to(w.dtype), w, self.bias) * self.scales + if self.bias: + output = torch.nn.functional.linear(input, w, self.bias) * self.scales + else: + output = torch.nn.functional.linear(input, w, None) * self.scales + return output + + @register_backend def npu( gm: Union[torch.nn.Module, torch.fx.GraphModule], example_inputs: List[torch.Tensor] From cc5d3739fa2f72feb2085cd4bb28b9b809b1371f Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 15 Jul 2024 11:07:34 +0100 Subject: [PATCH 07/18] add support for model quantisation and code clean up --- intel_npu_acceleration_library/compiler.py | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 7325c0b..98c0202 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -50,7 +50,7 @@ def compile( # Model lowering to NPU ops if isinstance(model, Phi3MLP): - model = model.to("npu") + model = model else: # General optimizations apply_general_optimizations(model) @@ -94,7 +94,6 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ - module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -108,13 +107,8 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) - if (func.__name__ == "optimize_phi3_MLP") and ( - module_optimization.counter >= 5 # type: ignore[attr-defined] - ): - new_layer = None if new_layer: - module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) @@ -207,7 +201,7 @@ def optimize_phi3_MLP( Union[torch.nn.Module, None]: optimized Phi-3 module """ if layer.__class__.__name__ == "Phi3MLP": - return layer.to("npu") + return layer return None @@ -228,11 +222,7 @@ def weights_quantization( None: Returns None """ if isinstance(layer, WeightOnlyLinear): - if layer.bits == 4: - print("This works - int4 !!") - layer.forward = partial(forward, layer) - elif layer.bits == 8: - print("This works - int8 !!") + if (layer.bits == 4) or (layer.bits == 8): layer.forward = partial(forward, layer) else: raise RuntimeError(f"Unsupported quantization bits: {layer.bits}") @@ -248,15 +238,27 @@ def forward(self, input): Returns: torch.Tensor: The output tensor. 
""" - w = self.qweight.to(torch.float16) - # output = torch.nn.functional.linear(input, w, None) * self.scales - # if self.bias: - # return output + self.bias - output = torch.nn.functional.linear(input.to(w.dtype), w, self.bias) * self.scales + if self.bits == 4: + # Unpack the int4 values + lower_int4 = self.qweight & 0x0F + lower_int4 = lower_int4 - (lower_int4 & 0x8) * 2 + upper_int4 = (self.qweight >> 4) & 0x0F + upper_int4 = upper_int4 - (upper_int4 & 0x8) * 2 + + w = torch.stack((lower_int4, upper_int4), dim=2) + w = w.contiguous().view(self.qweight.shape[0], -1) + + elif self.bits == 8: + w = self.qweight.view(torch.int8) + + output = ( + torch.nn.functional.linear(input.to(torch.float16), w.to(torch.float16), None) + * self.scales.T + ) + if self.bias: - output = torch.nn.functional.linear(input, w, self.bias) * self.scales - else: - output = torch.nn.functional.linear(input, w, None) * self.scales + return output + self.bias + return output From d2fe9feb9182423c2fc641081deb8484d8f90145 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 15 Jul 2024 15:12:46 +0100 Subject: [PATCH 08/18] Fix for model quantization --- intel_npu_acceleration_library/compiler.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 98c0202..2c82763 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -42,18 +42,25 @@ def compile( # Prepare and optimize model for NPU with torch.no_grad(): - - if dtype in (int8, int4): - # Quantize model - model = quantize_model(model, dtype) - weights_quantization(model) - # Model lowering to NPU ops if isinstance(model, Phi3MLP): + # Apply optimizations to a single MLP block model model = model + + if dtype in (int8, int4): + # Quantize model + model = quantize_model(model, dtype) + weights_quantization(model) + else: # General optimizations apply_general_optimizations(model) + + if dtype in (int8, int4): + # Quantize model + model = quantize_model(model, dtype) + weights_quantization(model) + create_npu_kernels(model) if dtype.is_floating_point and training: @@ -233,7 +240,7 @@ def forward(self, input): """Override forward method for WeightOnlyLinear class. Args: - input: Thr input tensor. + input: The input tensor. Returns: torch.Tensor: The output tensor. 
From b7825e754e90b5e6c7607406ece7d0e077970d8b Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Tue, 16 Jul 2024 11:24:45 +0100 Subject: [PATCH 09/18] Add testing for phi-3 mlp quantisation --- test/python/test_llm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 75a663b..13b5d79 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -7,6 +7,7 @@ from transformers.models.phi.modeling_phi import PhiConfig, PhiMLP from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM +from intel_npu_acceleration_library.dtypes import int8, int4 from sklearn.metrics import r2_score from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library @@ -85,19 +86,27 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): @pytest.mark.parametrize("seq_len", [16, 128, 256]) @pytest.mark.parametrize("hidden_size", [256, 512]) @pytest.mark.parametrize("intermediate_size", [512]) -def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size): +@pytest.mark.parametrize("dtype", ["float16", "int8", "int4"]) +def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size, dtype): conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") conf.num_hidden_layers = 1 conf.hidden_size = hidden_size conf.intermediate_size = intermediate_size + if dtype == "int8": + dtype = int8 + elif dtype == "int4": + dtype = int4 + else: + dtype = torch.float16 + mlp = Phi3MLP(conf) hidden_states = torch.rand((seq_len, conf.hidden_size)) reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() - model = intel_npu_acceleration_library.compile(mlp) + model = intel_npu_acceleration_library.compile(mlp, dtype) assert model @@ -116,4 +125,7 @@ def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size): assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" assert np.isfinite(out).all(), "NPU output contains NaN or Inf" - assert 1 - r2_score(reference, out) < 0.001 + if dtype == int4: + assert 1 - r2_score(reference, out) < 0.05 + else: + assert 1 - r2_score(reference, out) < 0.001 From c65285907635a96b0b24ae0ba02666650713a89d Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Wed, 17 Jul 2024 09:26:35 +0100 Subject: [PATCH 10/18] Add phi-3 mlp test and enable model profiling toggling --- intel_npu_acceleration_library/nn/module.py | 3 +- script/profile_mlp.py | 124 ++++++++++++++++++++ 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 script/profile_mlp.py diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index 0ac46cc..b5b2f37 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -111,6 +111,7 @@ def __init__(self) -> None: self._nn_factory_cache: MutableMapping[str, NNFactory] = {} self._npu_inference = False self.npu_top_level_module = True + self.profile = False def extract_tensors_from_arguments( self, args: Sequence[Any] @@ -171,7 +172,7 @@ def create_model( Returns: NNFactory: The model. """ - model = NNFactory() + model = NNFactory(profile=self.profile) def create_args_from_list(args: Sequence[Any]) -> Sequence[Any]: """Create arguments from a list. 
diff --git a/script/profile_mlp.py b/script/profile_mlp.py new file mode 100644 index 0000000..3b61b6a --- /dev/null +++ b/script/profile_mlp.py @@ -0,0 +1,124 @@ +# +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 +# + +from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP +from intel_npu_acceleration_library.dtypes import int8, int4 +from torch.profiler import profile, ProfilerActivity +from sklearn.metrics import r2_score +import intel_npu_acceleration_library +import argparse +import torch +import numpy as np + + +def main( + seq_len=128, + hidden_size=256, + intermediate_size=512, + dtype="float16", + _profile=False, +): + + conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + conf.num_hidden_layers = 1 + conf.hidden_size = hidden_size + conf.intermediate_size = intermediate_size + + # Define a single Phi-3 MLP layer + mlp = Phi3MLP(conf) + + hidden_states = torch.rand((seq_len, conf.hidden_size)) + + reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) + + if dtype == "float16": + dtype = torch.float16 + elif dtype == "int8": + dtype = int8 + elif dtype == "int4": + dtype = int4 + else: + raise RuntimeError(f"Invalid dtype: {dtype}") + + # Compile model + model = intel_npu_acceleration_library.compile(mlp, dtype) + if _profile: + model.profile = True + + with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: + for _ in range(1000): + results = model(hidden_states) + + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cpu_time_total", row_limit=20 + ) + ) + + prof.export_chrome_trace("trace.json") + + results = results.detach().numpy() + reference = reference.detach().numpy() + + assert results.shape == reference.shape, "Output shape mismatch" + assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" + assert np.isfinite(results).all(), "NPU output contains NaN or Inf" + + if dtype == int4: + assert 1 - r2_score(reference, results) < 0.05 + else: + assert 1 - r2_score(reference, results) < 0.001 + + +def define_and_parse_args(): + parser = argparse.ArgumentParser(description="Profiling a MLP layer in the NPU") + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length (default: %(default)s)", + ) + parser.add_argument( + "--hidden-size", + type=int, + default=256, + help="Hidden size (default: %(default)s)", + ) + parser.add_argument( + "--intermediate-size", + type=int, + default=512, + help="Intermediate size (default: %(default)s)", + ) + parser.add_argument( + "--dtype", + default="float16", + choices=["float16", "int8", "int4"], + help="Select the target dtype (default: %(default)s)", + ) + parser.add_argument( + "--profile", + action="store_true", + default=False, + help="Enable the profiling (default: False)", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = define_and_parse_args() + + print( + f"Profiling with sequence length {args.seq_len}, hidden size {args.hidden_size}, intermediate size {args.intermediate_size}, dtype {args.dtype}" + ) + + main( + seq_len=args.seq_len, + hidden_size=args.hidden_size, + intermediate_size=args.intermediate_size, + dtype=args.dtype, + _profile=args.profile, + ) From 786c663398d71ff7c4b61e326cd4899775201b4b Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Wed, 17 Jul 2024 09:55:36 +0100 Subject: [PATCH 11/18] Update for model profiling toggle --- intel_npu_acceleration_library/nn/module.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 
deletions(-) diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index b5b2f37..e861260 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -105,13 +105,17 @@ def patch_modules(module: torch.nn.Module, model: NNFactory): class Module(torch.nn.Module): """A PyTorch module that runs on the NPU.""" - def __init__(self) -> None: - """Initialize the module.""" + def __init__(self, profile: bool = False) -> None: + """Initialize the module. + + Args: + profile (bool): Enable model profiling. Defaults to False. + """ super().__init__() self._nn_factory_cache: MutableMapping[str, NNFactory] = {} self._npu_inference = False self.npu_top_level_module = True - self.profile = False + self.profile = profile def extract_tensors_from_arguments( self, args: Sequence[Any] From 003d639b96e7d7e0b74bb0fababce7f66d9c0330 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 08:40:05 +0100 Subject: [PATCH 12/18] Add compile config feature --- intel_npu_acceleration_library/compiler.py | 73 +++++++++++++-------- intel_npu_acceleration_library/modelling.py | 13 ++-- script/profile_llm.py | 4 +- script/profile_mlp.py | 4 +- test/python/test_llm.py | 4 +- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 2c82763..7e76a2f 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -6,10 +6,9 @@ from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention from transformers.models.gemma.modeling_gemma import GemmaMLP, GemmaAttention -from transformers.models.phi3.modeling_phi3 import Phi3MLP from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from intel_npu_acceleration_library.quantization import quantize_model -from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.dtypes import int8, int4, NPUDtype from intel_npu_acceleration_library.nn.module import NPUModuleWrapper import intel_npu_acceleration_library.nn as nn from torch._dynamo import register_backend @@ -19,15 +18,33 @@ from functools import partial -def compile( - model: torch.nn.Module, dtype: torch.dtype = torch.float16, training: bool = False -) -> torch.nn.Module: +class CompilerConfig: + """Configuration class to store the compilation configuration of a model for the NPU.""" + + def __init__( + self, + use_to: bool = False, + dtype: Union[torch.dtype, NPUDtype] = torch.float16, + training: bool = False, + ) -> None: + """Initialize the configuration class. + + Args: + use_to (bool): Enable model compiling using .to() . Defaults to disabled + dtype (Union[torch.dtype, NPUDtype]): The dtype to compile the model with. Defaults to torch.float16 + training (bool): Enable training. Defaults to disabled + """ + self.use_to = use_to + self.dtype = dtype + self.training = training + + +def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: """Compile a model for the NPU. Args: model (torch.nn.Module): a pytorch nn.Module to compile and optimize for the npu - dtype (torch.dtype): the model target datatype, default to torch.float16 - training (bool): enable training. 
Default disabled + config (CompilerConfig): the compiler configuration Raises: RuntimeError: invalid datatypes @@ -35,35 +52,29 @@ def compile( Returns: torch.nn.Module: compiled NPU nn.Module """ - if not (dtype.is_floating_point or dtype in (int8, int4)): + if not (config.dtype.is_floating_point or config.dtype in (int8, int4)): raise RuntimeError( - f"intel-npu-acceleration-library library do not support yet the requeste datatype: {dtype}" + f"intel-npu-acceleration-library library do not support yet the requeste datatype: {config.dtype}" ) # Prepare and optimize model for NPU with torch.no_grad(): # Model lowering to NPU ops - if isinstance(model, Phi3MLP): - # Apply optimizations to a single MLP block model - model = model - - if dtype in (int8, int4): - # Quantize model - model = quantize_model(model, dtype) - weights_quantization(model) - + if config.use_to: + model = model.to("npu") else: # General optimizations apply_general_optimizations(model) - if dtype in (int8, int4): - # Quantize model - model = quantize_model(model, dtype) - weights_quantization(model) + if config.dtype in (int8, int4): + # Quantize model + model = quantize_model(model, config.dtype) + weights_quantization(model) + if not config.use_to: create_npu_kernels(model) - if dtype.is_floating_point and training: + if config.dtype.is_floating_point and config.training: # Set model to evaluation only as quantized training is not supported yet return model @@ -101,6 +112,7 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ + module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -114,8 +126,12 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) - + if (func.__name__ == "optimize_phi3_MLP") and ( + module_optimization.counter >= 5 # type: ignore[attr-defined] + ): + new_layer = None if new_layer: + module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) @@ -208,7 +224,7 @@ def optimize_phi3_MLP( Union[torch.nn.Module, None]: optimized Phi-3 module """ if layer.__class__.__name__ == "Phi3MLP": - return layer + return layer.to("npu") return None @@ -271,12 +287,15 @@ def forward(self, input): @register_backend def npu( - gm: Union[torch.nn.Module, torch.fx.GraphModule], example_inputs: List[torch.Tensor] + gm: Union[torch.nn.Module, torch.fx.GraphModule], + config: CompilerConfig, + example_inputs: List[torch.Tensor], ) -> Union[torch.nn.Module, torch.fx.GraphModule]: """Implement the custom torch 2.0 compile backend for the NPU. 
Args: gm (Union[torch.nn.Module, torch.fx.GraphModule]): The torch fx Module + config (CompilerConfig): The compiler configuration example_inputs (List[torch.Tensor]): A list of example inputs Returns: @@ -286,4 +305,4 @@ def npu( gm = horizontal_fusion_linear(gm) # For now compile in fp16 - return compile(gm) + return compile(gm, config) diff --git a/intel_npu_acceleration_library/modelling.py b/intel_npu_acceleration_library/modelling.py index 420db3c..606cbd4 100644 --- a/intel_npu_acceleration_library/modelling.py +++ b/intel_npu_acceleration_library/modelling.py @@ -4,6 +4,7 @@ # from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM import intel_npu_acceleration_library as npu_lib +from intel_npu_acceleration_library.compiler import CompilerConfig from functools import partialmethod from typing import Type, Any, Tuple, Optional import hashlib @@ -62,8 +63,7 @@ class NPUModel: @staticmethod def from_pretrained( model_name_or_path: str, - dtype: torch.dtype = torch.float16, - training: bool = False, + config: CompilerConfig, transformers_class: Optional[Type] = None, export=True, *args: Any, @@ -73,8 +73,7 @@ def from_pretrained( Args: model_name_or_path (str): model name or path - dtype (torch.dtype, optional): compilation dtype. Defaults to torch.float16. - training (bool, optional): enable training. Defaults to False. + config (CompilerConfig): compiler configuration transformers_class (Optional[Type], optional): base class to use. Must have a `from_pretrained` method. Defaults to None. export (bool, optional): enable the caching of the model. Defaults to True. args (Any): positional arguments @@ -91,18 +90,18 @@ def from_pretrained( raise RuntimeError(f"Invalid transformer class {type(transformers_class)}") # get the model cache dir and path from the name and arguments model_dir_path, model_path = get_model_path( - model_name_or_path, dtype, training, *args, **kwargs + model_name_or_path, config.dtype, config.training, *args, **kwargs ) if os.path.isdir(model_dir_path) and os.path.isfile(model_path): # Model already exist so I can load it directly return torch.load(model_path) else: # Model does not exists, so I need to compile it first - print(f"Compiling model {model_name_or_path} {dtype} for the NPU") + print(f"Compiling model {model_name_or_path} {config.dtype} for the NPU") model = transformers_class.from_pretrained( model_name_or_path, *args, **kwargs ) - model = npu_lib.compile(model, dtype, training) + model = npu_lib.compile(model, config) if export: if kwargs.get("trust_remote_code", False): raise AttributeError( diff --git a/script/profile_llm.py b/script/profile_llm.py index 6a69089..cdf7c76 100644 --- a/script/profile_llm.py +++ b/script/profile_llm.py @@ -6,6 +6,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from intel_npu_acceleration_library.nn.llm import generate_with_static_shape from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library @@ -52,7 +53,8 @@ def main( if not disable_intel_npu_acceleration_library: if not compiled: - model = intel_npu_acceleration_library.compile(model, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + model = intel_npu_acceleration_library.compile(model, compiler_conf) intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( tokenizer, model, context_size ) diff --git a/script/profile_mlp.py b/script/profile_mlp.py index 
3b61b6a..4c64fc1 100644 --- a/script/profile_mlp.py +++ b/script/profile_mlp.py @@ -5,6 +5,7 @@ from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig from torch.profiler import profile, ProfilerActivity from sklearn.metrics import r2_score import intel_npu_acceleration_library @@ -43,7 +44,8 @@ def main( raise RuntimeError(f"Invalid dtype: {dtype}") # Compile model - model = intel_npu_acceleration_library.compile(mlp, dtype) + compiler_conf = CompilerConfig(use_to=True, dtype=dtype) + model = intel_npu_acceleration_library.compile(mlp, compiler_conf) if _profile: model.profile = True diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 13b5d79..d5b093e 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -8,6 +8,7 @@ from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library @@ -106,7 +107,8 @@ def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size, dtype): reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() - model = intel_npu_acceleration_library.compile(mlp, dtype) + compiler_conf = CompilerConfig(use_to=True, dtype=dtype) + model = intel_npu_acceleration_library.compile(mlp, compiler_conf) assert model From c63c22316d8c82a9376f462342d93cd295e13f04 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 09:41:59 +0100 Subject: [PATCH 13/18] Fix test for compile config and remove old code --- intel_npu_acceleration_library/compiler.py | 4 ++-- test/python/test_llm.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 7e76a2f..6b12dea 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -61,7 +61,7 @@ def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: with torch.no_grad(): # Model lowering to NPU ops if config.use_to: - model = model.to("npu") + model = model else: # General optimizations apply_general_optimizations(model) @@ -224,7 +224,7 @@ def optimize_phi3_MLP( Union[torch.nn.Module, None]: optimized Phi-3 module """ if layer.__class__.__name__ == "Phi3MLP": - return layer.to("npu") + return layer return None diff --git a/test/python/test_llm.py b/test/python/test_llm.py index d5b093e..3aa16e7 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -50,7 +50,10 @@ def test_compilation(tokenizer, decoder_model, dtype): prefill = tokenizer("test sentence", return_tensors="pt")["input_ids"].to("cpu") y_ref = decoder_model(prefill).logits.detach() - compiled_model = intel_npu_acceleration_library.compile(decoder_model, dtype=dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compiled_model = intel_npu_acceleration_library.compile( + decoder_model, compiler_conf + ) assert compiled_model From e652eaa9887cb603379f0ea725f8aea6b67aed64 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 10:32:56 +0100 Subject: [PATCH 14/18] Fix tests with compile config --- test/python/test_compile.py | 10 +++++++--- 
test/python/test_conv.py | 4 +++- test/python/test_llm.py | 3 ++- test/python/test_optimizations.py | 4 +++- test/python/test_quantization.py | 5 ++++- test/python/test_training.py | 10 +++++++--- 6 files changed, 26 insertions(+), 10 deletions(-) diff --git a/test/python/test_compile.py b/test/python/test_compile.py index 07fb144..faf3d28 100644 --- a/test/python/test_compile.py +++ b/test/python/test_compile.py @@ -4,6 +4,7 @@ # from intel_npu_acceleration_library.compiler import compile +from intel_npu_acceleration_library.compiler import CompilerConfig from intel_npu_acceleration_library.dtypes import int4 from sklearn.metrics import r2_score import intel_npu_acceleration_library @@ -39,7 +40,8 @@ def test_compilation(dtype): y_ref = model(x).detach() - compiled_model = compile(model, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compiled_model = compile(model, compiler_conf) assert compiled_model @@ -104,7 +106,8 @@ def test_compile_training(dtype): model = NN() - compiled_model = compile(model, dtype, training=True) + compiler_conf = CompilerConfig(dtype=dtype, training=True) + compiled_model = compile(model, compiler_conf) for name, layer in compiled_model.named_children(): if dtype == torch.int8: @@ -118,7 +121,8 @@ def test_compile_inference(dtype): model = NN() - compiled_model = compile(model, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compiled_model = compile(model, compiler_conf) for name, layer in compiled_model.named_children(): assert layer.training == False diff --git a/test/python/test_conv.py b/test/python/test_conv.py index 5a0ec5b..6fa94a6 100644 --- a/test/python/test_conv.py +++ b/test/python/test_conv.py @@ -5,6 +5,7 @@ import intel_npu_acceleration_library +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score import pytest import torch @@ -71,7 +72,8 @@ def test_conv( conv.conv.weight.data *= 128 y_ref = conv(X) - npu_conv = intel_npu_acceleration_library.compile(conv, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + npu_conv = intel_npu_acceleration_library.compile(conv, compiler_conf) y = npu_conv(X) assert y.dtype == y_ref.dtype diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 3aa16e7..49e2952 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -39,7 +39,8 @@ def tokenizer(): @pytest.mark.parametrize("model_seq_length", [128, 256]) def test_warm_up(tokenizer, model, model_seq_length): - compiled_model = intel_npu_acceleration_library.compile(model) + compiler_conf = CompilerConfig() + compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( tokenizer, compiled_model, model_seq_length ) diff --git a/test/python/test_optimizations.py b/test/python/test_optimizations.py index 0f02f07..b3c5b97 100644 --- a/test/python/test_optimizations.py +++ b/test/python/test_optimizations.py @@ -7,6 +7,7 @@ from transformers.models.llama.modeling_llama import LlamaConfig, LlamaMLP, LlamaModel from transformers.models.gemma.modeling_gemma import GemmaConfig, GemmaMLP, GemmaModel from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score import torch.nn as nn import intel_npu_acceleration_library @@ -142,7 +143,8 @@ def test_model(model_name, hidden_size, intermediate_size, sequence_length, bias reference = model(example_input)[0] - optimized = 
intel_npu_acceleration_library.compile(model, torch.float16) + compiler_conf = CompilerConfig(dtype=torch.float16) + optimized = intel_npu_acceleration_library.compile(model, compiler_conf) output = optimized(example_input)[0] diff --git a/test/python/test_quantization.py b/test/python/test_quantization.py index 50044b2..c0a1c27 100644 --- a/test/python/test_quantization.py +++ b/test/python/test_quantization.py @@ -4,6 +4,7 @@ # from sklearn.metrics import r2_score +from intel_npu_acceleration_library.compiler import CompilerConfig import numpy as np import intel_npu_acceleration_library import pytest @@ -88,7 +89,9 @@ def test_compiled_quantized(batch, inC, outC): model = NN(inC, outC) y_ref = model(X.to(torch.float32)).detach() - compiled_model = intel_npu_acceleration_library.compile(model, torch.int8) + + compiler_conf = CompilerConfig(dtype=torch.int8) + compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) assert compiled_model y1 = compiled_model(X).detach() diff --git a/test/python/test_training.py b/test/python/test_training.py index aa8f390..adc398d 100644 --- a/test/python/test_training.py +++ b/test/python/test_training.py @@ -6,6 +6,7 @@ from sklearn.metrics import r2_score from intel_npu_acceleration_library import compile +from intel_npu_acceleration_library.compiler import CompilerConfig import torch import pytest import copy @@ -28,12 +29,14 @@ def forward(self, x): @pytest.fixture def model_no_bias(): - return compile(NN(inc=in_c, outc=out_c, bias=False)) + compiler_conf = CompilerConfig() + return compile(NN(inc=in_c, outc=out_c, bias=False), compiler_conf) @pytest.fixture def model(): - return compile(NN(inc=in_c, outc=out_c, bias=True)) + compiler_conf = CompilerConfig() + return compile(NN(inc=in_c, outc=out_c, bias=True), compiler_conf) def test_parameters(model, model_no_bias): @@ -48,7 +51,8 @@ def test_gradient(): cpu_model.load_state_dict(copy.deepcopy(npu_model.state_dict())) # Compile one of the model on npu - compile(npu_model, training=True) + compiler_conf = CompilerConfig(training=True) + compile(npu_model, compiler_conf) x = torch.rand([batch, in_c]).half() yref = torch.rand([batch, in_c]).half() From 7f2faf979e749a05c25f175cabd0aad4b40e43e2 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 11:58:54 +0100 Subject: [PATCH 15/18] Fix for compiler, updates for tests and examples, doc update --- docs/source/usage.md | 18 ++++++++++++++++-- examples/compile_model.py | 6 ++++-- examples/llava.py | 4 +++- examples/tiny_llama_chat.py | 4 +++- examples/train_mnist.py | 5 +++-- intel_npu_acceleration_library/compiler.py | 3 +-- script/export.py | 12 +++++++++--- 7 files changed, 39 insertions(+), 13 deletions(-) diff --git a/docs/source/usage.md b/docs/source/usage.md index 62a4cdb..aff2716 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -38,19 +38,33 @@ optimized_model = torch.compile(model, backend="npu") In windows torch.compile is not supported yet. So you might want to use the explicit function `intel_npu_acceleration_library.compile`. This is true also if you use a `pytorch` version < 2.0.0 +To do this, you just need to call the `compile` function with your model and the compiler configuration `CompilerConfig` to compile and optimize the model for the NPU. 
```python import intel_npu_acceleration_library -optimized_model = intel_npu_acceleration_library.compile(model, dtype=torch.int8) +from intel_npu_acceleration_library.compiler import CompilerConfig +compiler_conf = CompilerConfig(dtype=torch.int8) +optimized_model = intel_npu_acceleration_library.compile(model, compiler_conf) # Use the model as usual ``` +To compile and optimize a single layer of a model to be pushed to the NPU as one block, you can set `use_to=True` in the compiler configuration `CompilerConfig`. +```python +import intel_npu_acceleration_library +from intel_npu_acceleration_library.compiler import CompilerConfig +compiler_conf = CompilerConfig(use_to=True, dtype=torch.int8) +optimized_block = intel_npu_acceleration_library.compile(single_block, compiler_conf) + +``` + ## Training (**Experimental!**) It is possible to use Intel® NPU Acceleration Library to train a model. As before you just need to call the `compile` function, this time with `training=True`. This allows to use the same training script you use in other device with a very minimal modifications. ```python import intel_npu_acceleration_library -compiled_model = intel_npu_acceleration_library.compile(model, dtype=torch.float32, training=True) +from intel_npu_acceleration_library.compiler import CompilerConfig +compiler_conf = CompilerConfig(dtype=torch.float32, training=True) +compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) ``` diff --git a/examples/compile_model.py b/examples/compile_model.py index 2146fcd..afe51ce 100644 --- a/examples/compile_model.py +++ b/examples/compile_model.py @@ -5,6 +5,7 @@ from intel_npu_acceleration_library import compile +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score import intel_npu_acceleration_library import pytest @@ -41,7 +42,8 @@ def forward(self, x): print( "Windows do not support torch.compile, fallback to intel_npu_acceleration_library.compile" ) - compiled_model = intel_npu_acceleration_library.compile(model) + compiler_conf = CompilerConfig() + compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) else: compiled_model = torch.compile(model, backend="npu") @@ -49,4 +51,4 @@ def forward(self, x): with torch.no_grad(): y = compiled_model(x) - print(f"Reference vs actual R2 score: {r2_score(y_ref, y):.2f}") + print(f"Reference vs actual R2 score: {r2_score(y_ref.numpy(), y.numpy()):.2f}") diff --git a/examples/llava.py b/examples/llava.py index a8e5545..dafa22d 100644 --- a/examples/llava.py +++ b/examples/llava.py @@ -12,6 +12,7 @@ TextStreamer, ) from transformers.feature_extraction_utils import BatchFeature +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library import torch @@ -21,7 +22,8 @@ # Load model model = LlavaForConditionalGeneration.from_pretrained(checkpoint) -model = intel_npu_acceleration_library.compile(model) +compiler_conf = CompilerConfig() +model = intel_npu_acceleration_library.compile(model, compiler_conf) image_processor = CLIPImageProcessor.from_pretrained(checkpoint) tokenizer = AutoTokenizer.from_pretrained(checkpoint) diff --git a/examples/tiny_llama_chat.py b/examples/tiny_llama_chat.py index 13f595c..82a699e 100644 --- a/examples/tiny_llama_chat.py +++ b/examples/tiny_llama_chat.py @@ -4,6 +4,7 @@ # from transformers import pipeline, TextStreamer, set_seed +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library import torch import
os @@ -15,7 +16,8 @@ "text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto" ) print("Compiling the model for NPU...") -pipe.model = intel_npu_acceleration_library.compile(pipe.model, dtype=torch.int8) +compiler_conf = CompilerConfig(dtype=torch.int8) +pipe.model = intel_npu_acceleration_library.compile(pipe.model, compiler_conf) streamer = TextStreamer(pipe.tokenizer, skip_special_tokens=True, skip_prompt=True) diff --git a/examples/train_mnist.py b/examples/train_mnist.py index 972eb81..6e14a22 100644 --- a/examples/train_mnist.py +++ b/examples/train_mnist.py @@ -7,6 +7,7 @@ import torch from torch import nn import intel_npu_acceleration_library +from intel_npu_acceleration_library.compiler import CompilerConfig from torch.utils.data import DataLoader from torchvision import datasets from torchvision.transforms import ToTensor @@ -90,8 +91,8 @@ def test_loop(dataloader, model, loss_fn): model = NeuralNetwork() - -model = intel_npu_acceleration_library.compile(model, torch.float32, training=True) +compiler_conf = CompilerConfig(dtype=torch.float32, training=True) +model = intel_npu_acceleration_library.compile(model, compiler_conf) learning_rate = 1e-3 batch_size = 64 diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 6b12dea..88183b6 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -288,14 +288,12 @@ def forward(self, input): @register_backend def npu( gm: Union[torch.nn.Module, torch.fx.GraphModule], - config: CompilerConfig, example_inputs: List[torch.Tensor], ) -> Union[torch.nn.Module, torch.fx.GraphModule]: """Implement the custom torch 2.0 compile backend for the NPU. Args: gm (Union[torch.nn.Module, torch.fx.GraphModule]): The torch fx Module - config (CompilerConfig): The compiler configuration example_inputs (List[torch.Tensor]): A list of example inputs Returns: @@ -305,4 +303,5 @@ def npu( gm = horizontal_fusion_linear(gm) # For now compile in fp16 + config = CompilerConfig() return compile(gm, config) diff --git a/script/export.py b/script/export.py index 892711e..4f63f71 100644 --- a/script/export.py +++ b/script/export.py @@ -5,6 +5,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from intel_npu_acceleration_library.compiler import compile +from intel_npu_acceleration_library.compiler import CompilerConfig +from intel_npu_acceleration_library.dtypes import int8, int4 import argparse import torch import os @@ -41,15 +43,19 @@ def export(model_id, dtype, output): if dtype == "fp16": print(f"Compiling model {model_id}") - torch_dtype = torch.float16 + dtype = torch.float16 elif dtype == "int8": print(f"Quantizing & Compiling model {model_id}") - torch_dtype = torch.int8 + dtype = int8 + elif dtype == "int4": + print(f"Quantizing & Compiling model {model_id}") + dtype = int4 else: raise RuntimeError(f"Invalid dtype {dtype}") with torch.no_grad(): - compile(model, dtype=torch_dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compile(model, compiler_conf) filename = os.path.join(PATH, "model.pth") os.makedirs(PATH, exist_ok=True) From 4b5f857c7e7185d9858bf99eb068f8ed269361d7 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 15:58:48 +0100 Subject: [PATCH 16/18] Update for model examples and remove test code --- examples/llama.py | 4 +++- examples/llama3.py | 6 +++++- examples/phi-2.py | 4 +++- examples/phi-3.py | 4 +++- examples/t5.py | 6 +++++- intel_npu_acceleration_library/compiler.py 
| 6 ------ 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/examples/llama.py b/examples/llama.py index 9c2aaba..e4aebb3 100644 --- a/examples/llama.py +++ b/examples/llama.py @@ -5,11 +5,13 @@ from transformers import AutoTokenizer, TextStreamer from intel_npu_acceleration_library import NPUModelForCausalLM, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +compiler_conf = CompilerConfig(dtype=int4) model = NPUModelForCausalLM.from_pretrained( - model_id, use_cache=True, dtype=int4, attn_implementation="sdpa" + model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa" ).eval() tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) tokenizer.pad_token_id = tokenizer.eos_token_id diff --git a/examples/llama3.py b/examples/llama3.py index 5a4fb95..9f6ec2a 100644 --- a/examples/llama3.py +++ b/examples/llama3.py @@ -5,10 +5,14 @@ from transformers import AutoTokenizer, TextStreamer from intel_npu_acceleration_library import NPUModelForCausalLM, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = NPUModelForCausalLM.from_pretrained(model_id, dtype=int4, use_cache=True).eval() +compiler_conf = CompilerConfig(dtype=int4) +model = NPUModelForCausalLM.from_pretrained( + model_id, use_cache=True, config=compiler_conf +).eval() tokenizer = AutoTokenizer.from_pretrained(model_id) streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) diff --git a/examples/phi-2.py b/examples/phi-2.py index 8bf59d4..7b4a4ae 100644 --- a/examples/phi-2.py +++ b/examples/phi-2.py @@ -7,12 +7,14 @@ from langchain.chains import LLMChain from langchain.llms import HuggingFacePipeline from transformers import AutoTokenizer, pipeline, TextStreamer +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library as npu_lib model_id = "microsoft/Phi-2" +compiler_conf = CompilerConfig(dtype=npu_lib.int4) model = npu_lib.NPUModelForCausalLM.from_pretrained( - model_id, use_cache=True, dtype=npu_lib.int4 + model_id, use_cache=True, config=compiler_conf ).eval() tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) diff --git a/examples/phi-3.py b/examples/phi-3.py index 184b428..87ec94b 100644 --- a/examples/phi-3.py +++ b/examples/phi-3.py @@ -5,15 +5,17 @@ import torch from transformers import AutoTokenizer, pipeline, TextStreamer +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library as npu_lib import warnings torch.random.manual_seed(0) +compiler_conf = CompilerConfig(dtype=npu_lib.int4) model = npu_lib.NPUModelForCausalLM.from_pretrained( "microsoft/Phi-3-mini-4k-instruct", + config=compiler_conf, torch_dtype="auto", - dtype=npu_lib.int4, ) tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") diff --git a/examples/t5.py b/examples/t5.py index bec55b3..1607e22 100644 --- a/examples/t5.py +++ b/examples/t5.py @@ -5,10 +5,14 @@ from transformers import AutoTokenizer, TextStreamer from intel_npu_acceleration_library import NPUModelForSeq2SeqLM +from intel_npu_acceleration_library.compiler import CompilerConfig model_id = "google/flan-t5-small" -model = NPUModelForSeq2SeqLM.from_pretrained(model_id, use_cache=True).eval() +compiler_conf = CompilerConfig() +model = 
NPUModelForSeq2SeqLM.from_pretrained( + model_id, use_cache=True, config=compiler_conf +).eval() tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) tokenizer.pad_token_id = tokenizer.eos_token_id streamer = TextStreamer(tokenizer, skip_special_tokens=True) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 88183b6..9cb56d9 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -112,7 +112,6 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ - module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -126,12 +125,7 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) - if (func.__name__ == "optimize_phi3_MLP") and ( - module_optimization.counter >= 5 # type: ignore[attr-defined] - ): - new_layer = None if new_layer: - module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) From ae1fd61159ce042123baa6386ea0085377cabcda Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Fri, 19 Jul 2024 11:07:27 +0100 Subject: [PATCH 17/18] Fix for quantization and remove unused code --- intel_npu_acceleration_library/compiler.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 9cb56d9..656ec23 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -69,7 +69,8 @@ def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: if config.dtype in (int8, int4): # Quantize model model = quantize_model(model, config.dtype) - weights_quantization(model) + if config.use_to: + weights_quantization(model) if not config.use_to: create_npu_kernels(model) @@ -89,7 +90,6 @@ def apply_general_optimizations(model: torch.nn.Module): """ apply_horizontal_fusion(model) optimize_llama_attention(model) - optimize_phi3_MLP(model) def create_npu_kernels(model: torch.nn.Module): @@ -204,24 +204,6 @@ def optimize_llama_attention( return None -@module_optimization -def optimize_phi3_MLP( - name: str, layer: torch.nn.Module -) -> Union[torch.nn.Module, None]: - """Optimize Phi-3 MLP block. 
- - Args: - name (str): Module name - layer (torch.nn.Module): Original Module - - Returns: - Union[torch.nn.Module, None]: optimized Phi-3 module - """ - if layer.__class__.__name__ == "Phi3MLP": - return layer - return None - - @module_optimization def weights_quantization( name: str, layer: torch.nn.Module From 28902996628ff017ee279545f556b30062850417 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Fri, 19 Jul 2024 13:17:40 +0100 Subject: [PATCH 18/18] Update for quantization of a model --- intel_npu_acceleration_library/compiler.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 656ec23..ea357f4 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -69,8 +69,7 @@ def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: if config.dtype in (int8, int4): # Quantize model model = quantize_model(model, config.dtype) - if config.use_to: - weights_quantization(model) + weights_quantization(model) if not config.use_to: create_npu_kernels(model) @@ -122,15 +121,21 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): kwargs (Any): keyword arguments """ - if not isinstance(model, NPUModuleWrapper): + if not isinstance(model, NPUModuleWrapper) or kwargs.get( + "ignore_isinstance", False + ): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) if new_layer: model.add_module(name, new_layer) - if not isinstance(new_layer, NPUModuleWrapper): + if not isinstance(new_layer, NPUModuleWrapper) or kwargs.get( + "ignore_isinstance", False + ): wrapper(new_layer, *args, **kwargs) else: - if not isinstance(layer, NPUModuleWrapper): + if not isinstance(layer, NPUModuleWrapper) or kwargs.get( + "ignore_isinstance", False + ): wrapper(layer, *args, **kwargs) return wrapper @@ -206,13 +211,14 @@ def optimize_llama_attention( @module_optimization def weights_quantization( - name: str, layer: torch.nn.Module + name: str, layer: torch.nn.Module, ignore_isinstance: bool = True ) -> Union[torch.nn.Module, None]: """Apply weights quantization. Args: name (str): Layer name layer (torch.nn.Module): Original torch.nn.Linear module + ignore_isinstance (bool): ignore isinstance check in module_optimization. Defaults to True. Raises: RuntimeError: unsupported quantization bits
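Taken together, the patches above settle on a single calling convention: build a `CompilerConfig` and pass it to `intel_npu_acceleration_library.compile`. The sketch below shows that end-to-end flow on a toy network; the `TinyMLP` module, the tensor shapes and the int8 choice are illustrative placeholders rather than code from this series, and running it assumes the library is installed and an Intel NPU is available.

```python
# Illustrative sketch of the post-patch API: CompilerConfig + compile.
# TinyMLP, the shapes and the int8 choice are placeholders, not library code.
import torch
import intel_npu_acceleration_library
from intel_npu_acceleration_library.compiler import CompilerConfig


class TinyMLP(torch.nn.Module):
    def __init__(self, hidden: int = 256):
        super().__init__()
        self.fc1 = torch.nn.Linear(hidden, hidden)
        self.fc2 = torch.nn.Linear(hidden, hidden)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(torch.nn.functional.relu(self.fc1(x)))


model = TinyMLP()

# Quantize the weights to int8 and lower the model for the NPU.
compiler_conf = CompilerConfig(dtype=torch.int8)
compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf)

with torch.no_grad():
    # Half-precision input; the compiled path runs in fp16 per the npu backend above.
    y = compiled_model(torch.rand(1, 256).half())
```

With int8 or int4 the weights are quantized before lowering (`quantize_model` followed by `weights_quantization`, as in the final patches), while a plain `CompilerConfig()` appears to keep the fp16 path used by the `npu` torch.compile backend.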