From 22c627cc74c67f738cb2f235f3d7f66f1c326089 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 1 Jul 2024 16:56:00 +0100 Subject: [PATCH 01/18] Add support for phi-3 MLP layer --- intel_npu_acceleration_library/compiler.py | 19 +++++++ intel_npu_acceleration_library/nn/__init__.py | 4 +- intel_npu_acceleration_library/nn/llm.py | 53 +++++++++++++++++++ test/python/test_llm.py | 28 ++++++++++ 4 files changed, 102 insertions(+), 2 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 4e80d04..4132cfe 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -6,6 +6,7 @@ from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention from transformers.models.gemma.modeling_gemma import GemmaMLP, GemmaAttention +from transformers.models.phi3.modeling_phi3 import Phi3MLP from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from intel_npu_acceleration_library.quantization import quantize_model from intel_npu_acceleration_library.dtypes import int8, int4 @@ -174,6 +175,24 @@ def optimize_llama_attention( return None +@module_optimization +def optimize_phi3_MLP( + name: str, layer: torch.nn.Module +) -> Union[torch.nn.Module, None]: + """Optimize Phi-3 MLP block. + + Args: + name (str): Module name + layer (torch.nn.Module): Original Module + + Returns: + Union[torch.nn.Module, None]: optimized Phi-3 module + """ + if isinstance(layer, Phi3MLP): + return nn.Phi3MLP.fromTorch(layer) + return None + + @register_backend def npu( gm: Union[torch.nn.Module, torch.fx.GraphModule], example_inputs: List[torch.Tensor] diff --git a/intel_npu_acceleration_library/nn/__init__.py b/intel_npu_acceleration_library/nn/__init__.py index 408d1b2..58033bb 100644 --- a/intel_npu_acceleration_library/nn/__init__.py +++ b/intel_npu_acceleration_library/nn/__init__.py @@ -9,9 +9,9 @@ from .module import Module # noqa try: - from .llm import LlamaAttention, PhiMLP # noqa + from .llm import LlamaAttention, PhiMLP, Phi3MLP # noqa - llm_modules = ["LlamaAttention", "PhiMLP"] + llm_modules = ["LlamaAttention", "PhiMLP", "Phi3MLP"] except ModuleNotFoundError: # Transformer library is not installed llm_modules = [] diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 8cf6cd3..17a93d6 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -9,6 +9,7 @@ LlamaConfig, ) from transformers import AutoTokenizer +from intel_npu_acceleration_library.backend.tensor import Tensor from intel_npu_acceleration_library.nn import Linear from intel_npu_acceleration_library.backend import run_factory, MLP from functools import partial @@ -72,6 +73,58 @@ def fromTorch( return new_layer +class Phi3MLP(torch.nn.Module): + """Phi-3 MLP operation NPU backend.""" + + def __init__(self, config): + """Initialize Phi-3 MLP operation. + + Args: + config (Phi3Config): Phi-3 MLP configuration + """ + super().__init__() + self.config = config + self.op_id = str(uuid.uuid4()) + + def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: + """NPU module forward method. + + Args: + hidden_states (Tensor): The input tensor. + gate_up_proj_w (Tensor): The gate up projection input weight tensor. + down_proj_w (Tensor): The down projection input weight tensor. 
+ kwargs: Additional arguments + + Returns: + Tensor: The output tensor + """ + gate_up_states = torch.nn.functional.linear(hidden_states, gate_up_proj_w) + + gate = gate_up_states[:, : self.config.intermediate_size] + up_states = gate_up_states[:, self.config.intermediate_size :] + + up_states = up_states * torch.nn.functional.silu(gate) + + return torch.nn.functional.linear(up_states, down_proj_w) + + @staticmethod + def fromTorch( + layer: torch.nn.Module, dtype: torch.dtype = torch.float16 + ) -> "Phi3MLP": + """Generate a NPU Phi-3 MLP layer from a transformer one. + + Args: + layer (torch.nn.Linear): the original Phi-3 MLP model to run on the NPU + dtype (torch.dtype): the desired datatype + + Returns: + Phi3MLP: A NPU Phi-3 MLP layer + """ + new_layer = Phi3MLP(config=layer.config) + + return new_layer + + class FusedLlamaMLP(torch.nn.Module): """LLAMA MLP operation NPU backend.""" diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 8e4dbf0..7e29dbd 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -5,6 +5,7 @@ from transformers.models.llama.modeling_llama import LlamaForCausalLM, LlamaConfig from transformers.models.phi.modeling_phi import PhiConfig, PhiMLP +from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM from sklearn.metrics import r2_score import intel_npu_acceleration_library @@ -76,3 +77,30 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): out = model(x) assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 + + +@torch.no_grad +@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("hidden_size", [256, 512]) +@pytest.mark.parametrize("intermediate_size", [512]) +def test_phi3_mlp(seq_len, hidden_size, intermediate_size): + conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + conf.num_hidden_layers = 1 + conf.hidden_size = hidden_size + conf.intermediate_size = intermediate_size + + mlp = Phi3MLP(conf) + + hidden_states = torch.rand((seq_len, conf.hidden_size)).half() + + reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) + + model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp) + + assert model + + out = model( + hidden_states, mlp.gate_up_proj.weight.half(), mlp.down_proj.weight.half() + ) + + assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 From ea4b27a76d6965d4eb0be01ac89533df28f09f61 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Tue, 2 Jul 2024 10:26:02 +0100 Subject: [PATCH 02/18] Updating support for Phi-3 MLP --- intel_npu_acceleration_library/compiler.py | 5 +++-- intel_npu_acceleration_library/nn/llm.py | 20 +++++++++++++------- test/python/test_llm.py | 11 +++++++++-- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 4132cfe..760ea83 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -177,19 +177,20 @@ def optimize_llama_attention( @module_optimization def optimize_phi3_MLP( - name: str, layer: torch.nn.Module + name: str, layer: torch.nn.Module, activation_fn: torch.nn ) -> Union[torch.nn.Module, None]: """Optimize Phi-3 MLP block. 
Args: name (str): Module name layer (torch.nn.Module): Original Module + activation_fn (torch.nn): Activation function Returns: Union[torch.nn.Module, None]: optimized Phi-3 module """ if isinstance(layer, Phi3MLP): - return nn.Phi3MLP.fromTorch(layer) + return nn.Phi3MLP.fromTorch(layer, activation_fn) return None diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 17a93d6..88f65ae 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -76,14 +76,16 @@ def fromTorch( class Phi3MLP(torch.nn.Module): """Phi-3 MLP operation NPU backend.""" - def __init__(self, config): + def __init__(self, config, activation_fn): """Initialize Phi-3 MLP operation. Args: config (Phi3Config): Phi-3 MLP configuration + activation_fn (torch.nn): Phi-3 MLP activation function """ super().__init__() self.config = config + self.activation_fn = activation_fn self.op_id = str(uuid.uuid4()) def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: @@ -100,27 +102,31 @@ def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tenso """ gate_up_states = torch.nn.functional.linear(hidden_states, gate_up_proj_w) - gate = gate_up_states[:, : self.config.intermediate_size] - up_states = gate_up_states[:, self.config.intermediate_size :] + midpoint = gate_up_states.size(dim=-1) // 2 + gate = gate_up_states[:, :midpoint] + up_states = gate_up_states[:, midpoint:] - up_states = up_states * torch.nn.functional.silu(gate) + up_states = up_states * self.activation_fn(gate) return torch.nn.functional.linear(up_states, down_proj_w) @staticmethod def fromTorch( - layer: torch.nn.Module, dtype: torch.dtype = torch.float16 + layer: torch.nn.Module, + activation_fn: torch.nn, + dtype: torch.dtype = torch.float16, ) -> "Phi3MLP": """Generate a NPU Phi-3 MLP layer from a transformer one. 
Args: - layer (torch.nn.Linear): the original Phi-3 MLP model to run on the NPU + layer (torch.nn.Module): the original Phi-3 MLP model to run on the NPU + activation_fn (torch.nn): the activation function dtype (torch.dtype): the desired datatype Returns: Phi3MLP: A NPU Phi-3 MLP layer """ - new_layer = Phi3MLP(config=layer.config) + new_layer = Phi3MLP(config=layer.config, activation_fn=activation_fn) return new_layer diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 7e29dbd..a830f78 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -11,6 +11,7 @@ import intel_npu_acceleration_library import pytest import torch +import numpy as np @pytest.fixture @@ -80,7 +81,7 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): @torch.no_grad -@pytest.mark.parametrize("seq_len", [16]) +@pytest.mark.parametrize("seq_len", [16, 128, 256]) @pytest.mark.parametrize("hidden_size", [256, 512]) @pytest.mark.parametrize("intermediate_size", [512]) def test_phi3_mlp(seq_len, hidden_size, intermediate_size): @@ -95,7 +96,9 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) - model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp) + model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch( + layer=mlp, activation_fn=torch.nn.functional.silu + ).to("npu") assert model @@ -103,4 +106,8 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): hidden_states, mlp.gate_up_proj.weight.half(), mlp.down_proj.weight.half() ) + assert out.shape == reference.shape, "Output shape mismatch" + assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" + assert np.isfinite(out).all(), "NPU output contains NaN or Inf" + assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 From 39c070c7076f8b6a42157b73cfa1b368e6c5b87a Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Wed, 3 Jul 2024 11:23:41 +0100 Subject: [PATCH 03/18] Update for Phi-3 MLP testing --- intel_npu_acceleration_library/activations.py | 86 +++++++++++++++++++ intel_npu_acceleration_library/compiler.py | 6 +- intel_npu_acceleration_library/nn/llm.py | 12 ++- test/python/test_llm.py | 4 +- 4 files changed, 95 insertions(+), 13 deletions(-) create mode 100644 intel_npu_acceleration_library/activations.py diff --git a/intel_npu_acceleration_library/activations.py b/intel_npu_acceleration_library/activations.py new file mode 100644 index 0000000..e879030 --- /dev/null +++ b/intel_npu_acceleration_library/activations.py @@ -0,0 +1,86 @@ +# +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 +# + +import torch + + +def get_activation(act_function: str): + """Return an activation function for the NPU. 
+ + Args: + act_function (str): an NPU supported activation function + + Returns: + torch.nn: activation function + """ + match act_function: + case "cos": + return torch.cos + case "sin": + return torch.sin + case "tan": + return torch.tan + case "acos": + return torch.acos + case "asin": + return torch.asin + case "atan": + return torch.atan + case "cosh": + return torch.cosh + case "sinh": + return torch.sinh + case "tanh": + return torch.tanh + case "acosh": + return torch.acosh + case "asinh": + return torch.asinh + case "atanh": + return torch.atanh + case "abs": + return torch.abs + case "ceil": + return torch.ceil + case "clamp": + return torch.clamp + case "elu": + return torch.nn.functional.elu + case "erf": + return torch.erf + case "exp": + return torch.exp + case "floor": + return torch.floor + case "gelu": + return torch.nn.functional.gelu + case "hardsigmoid": + return torch.nn.functional.hardsigmoid + case "hardswish": + return torch.nn.functional.hardswish + case "log": + return torch.log + case "mish": + return torch.nn.functional.mish + case "neg": + return torch.neg + case "relu": + return torch.nn.functional.relu + case "round": + return torch.round + case "sigmoid": + return torch.nn.functional.sigmoid + case "sign": + return torch.sign + case "silu": + return torch.nn.functional.silu + case "softmax": + return torch.nn.functional.softmax + case "softplus": + return torch.nn.functional.softplus + case "sqrt": + return torch.sqrt + case _: + return torch.nn.functional.silu diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 760ea83..a97e36a 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -64,6 +64,7 @@ def apply_general_optimizations(model: torch.nn.Module): """ apply_horizontal_fusion(model) optimize_llama_attention(model) + optimize_phi3_MLP(model) def create_npu_kernels(model: torch.nn.Module): @@ -177,20 +178,19 @@ def optimize_llama_attention( @module_optimization def optimize_phi3_MLP( - name: str, layer: torch.nn.Module, activation_fn: torch.nn + name: str, layer: torch.nn.Module ) -> Union[torch.nn.Module, None]: """Optimize Phi-3 MLP block. Args: name (str): Module name layer (torch.nn.Module): Original Module - activation_fn (torch.nn): Activation function Returns: Union[torch.nn.Module, None]: optimized Phi-3 module """ if isinstance(layer, Phi3MLP): - return nn.Phi3MLP.fromTorch(layer, activation_fn) + return nn.Phi3MLP.fromTorch(layer) return None diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 88f65ae..925ec7b 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -12,6 +12,7 @@ from intel_npu_acceleration_library.backend.tensor import Tensor from intel_npu_acceleration_library.nn import Linear from intel_npu_acceleration_library.backend import run_factory, MLP +from intel_npu_acceleration_library.activations import get_activation from functools import partial from typing import Optional, List, Generator from transformers.cache_utils import Cache @@ -76,16 +77,15 @@ def fromTorch( class Phi3MLP(torch.nn.Module): """Phi-3 MLP operation NPU backend.""" - def __init__(self, config, activation_fn): + def __init__(self, config): """Initialize Phi-3 MLP operation. 
Args: config (Phi3Config): Phi-3 MLP configuration - activation_fn (torch.nn): Phi-3 MLP activation function """ super().__init__() self.config = config - self.activation_fn = activation_fn + self.activation_fn = config.hidden_act self.op_id = str(uuid.uuid4()) def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: @@ -106,27 +106,25 @@ def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tenso gate = gate_up_states[:, :midpoint] up_states = gate_up_states[:, midpoint:] - up_states = up_states * self.activation_fn(gate) + up_states = up_states * get_activation(self.activation_fn)(gate) return torch.nn.functional.linear(up_states, down_proj_w) @staticmethod def fromTorch( layer: torch.nn.Module, - activation_fn: torch.nn, dtype: torch.dtype = torch.float16, ) -> "Phi3MLP": """Generate a NPU Phi-3 MLP layer from a transformer one. Args: layer (torch.nn.Module): the original Phi-3 MLP model to run on the NPU - activation_fn (torch.nn): the activation function dtype (torch.dtype): the desired datatype Returns: Phi3MLP: A NPU Phi-3 MLP layer """ - new_layer = Phi3MLP(config=layer.config, activation_fn=activation_fn) + new_layer = Phi3MLP(config=layer.config) return new_layer diff --git a/test/python/test_llm.py b/test/python/test_llm.py index a830f78..742e9ee 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -96,9 +96,7 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) - model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch( - layer=mlp, activation_fn=torch.nn.functional.silu - ).to("npu") + model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp).to("npu") assert model From 727454e6805051f197163b30c04ac76d0819caaa Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 8 Jul 2024 11:09:38 +0100 Subject: [PATCH 04/18] Update for phi-3 mlp layer --- intel_npu_acceleration_library/compiler.py | 38 +++++++++---- intel_npu_acceleration_library/nn/__init__.py | 4 +- intel_npu_acceleration_library/nn/llm.py | 57 ------------------- intel_npu_acceleration_library/nn/module.py | 7 ++- test/python/test_llm.py | 22 ++++--- 5 files changed, 48 insertions(+), 80 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index a97e36a..2dfdfc6 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -10,6 +10,7 @@ from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from intel_npu_acceleration_library.quantization import quantize_model from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.nn.module import NPUModuleWrapper import intel_npu_acceleration_library.nn as nn from torch._dynamo import register_backend from typing import Union, Callable, Any @@ -40,14 +41,18 @@ def compile( # Prepare and optimize model for NPU with torch.no_grad(): - # General optimizations - apply_general_optimizations(model) + if dtype in (int8, int4): # Quantize model model = quantize_model(model, dtype) # Model lowering to NPU ops - create_npu_kernels(model) + if isinstance(model, Phi3MLP): + model = model.to("npu") + else: + # General optimizations + apply_general_optimizations(model) + create_npu_kernels(model) if dtype.is_floating_point and training: # Set model to evaluation only as quantized training is not supported yet @@ -97,13 +102,22 @@ def wrapper(model: torch.nn.Module, 
*args: Any, **kwargs: Any): kwargs (Any): keyword arguments """ - for name, layer in model.named_children(): - new_layer = func(name, layer, *args, **kwargs) - if new_layer: - model.add_module(name, new_layer) - wrapper(new_layer, *args, **kwargs) - else: - wrapper(layer, *args, **kwargs) + if not isinstance(model, NPUModuleWrapper): + for name, layer in model.named_children(): + print(f"MODEL: {model} \n\n") + if isinstance(model, Phi3MLP): + new_layer = func(model.__class__.__name__, model, *args, **kwargs) + if new_layer: + model.add_module(model.__class__.__name__, new_layer) + else: + new_layer = func(name, layer, *args, **kwargs) + if new_layer: + model.add_module(name, new_layer) + if not isinstance(new_layer, NPUModuleWrapper): + wrapper(new_layer, *args, **kwargs) + else: + if not isinstance(layer, NPUModuleWrapper): + wrapper(layer, *args, **kwargs) return wrapper @@ -189,8 +203,8 @@ def optimize_phi3_MLP( Returns: Union[torch.nn.Module, None]: optimized Phi-3 module """ - if isinstance(layer, Phi3MLP): - return nn.Phi3MLP.fromTorch(layer) + if layer.__class__.__name__ == "Phi3MLP": + return layer.to("npu") return None diff --git a/intel_npu_acceleration_library/nn/__init__.py b/intel_npu_acceleration_library/nn/__init__.py index 58033bb..408d1b2 100644 --- a/intel_npu_acceleration_library/nn/__init__.py +++ b/intel_npu_acceleration_library/nn/__init__.py @@ -9,9 +9,9 @@ from .module import Module # noqa try: - from .llm import LlamaAttention, PhiMLP, Phi3MLP # noqa + from .llm import LlamaAttention, PhiMLP # noqa - llm_modules = ["LlamaAttention", "PhiMLP", "Phi3MLP"] + llm_modules = ["LlamaAttention", "PhiMLP"] except ModuleNotFoundError: # Transformer library is not installed llm_modules = [] diff --git a/intel_npu_acceleration_library/nn/llm.py b/intel_npu_acceleration_library/nn/llm.py index 925ec7b..8cf6cd3 100644 --- a/intel_npu_acceleration_library/nn/llm.py +++ b/intel_npu_acceleration_library/nn/llm.py @@ -9,10 +9,8 @@ LlamaConfig, ) from transformers import AutoTokenizer -from intel_npu_acceleration_library.backend.tensor import Tensor from intel_npu_acceleration_library.nn import Linear from intel_npu_acceleration_library.backend import run_factory, MLP -from intel_npu_acceleration_library.activations import get_activation from functools import partial from typing import Optional, List, Generator from transformers.cache_utils import Cache @@ -74,61 +72,6 @@ def fromTorch( return new_layer -class Phi3MLP(torch.nn.Module): - """Phi-3 MLP operation NPU backend.""" - - def __init__(self, config): - """Initialize Phi-3 MLP operation. - - Args: - config (Phi3Config): Phi-3 MLP configuration - """ - super().__init__() - self.config = config - self.activation_fn = config.hidden_act - self.op_id = str(uuid.uuid4()) - - def forward(self, hidden_states, gate_up_proj_w, down_proj_w, **kwargs) -> Tensor: - """NPU module forward method. - - Args: - hidden_states (Tensor): The input tensor. - gate_up_proj_w (Tensor): The gate up projection input weight tensor. - down_proj_w (Tensor): The down projection input weight tensor. 
- kwargs: Additional arguments - - Returns: - Tensor: The output tensor - """ - gate_up_states = torch.nn.functional.linear(hidden_states, gate_up_proj_w) - - midpoint = gate_up_states.size(dim=-1) // 2 - gate = gate_up_states[:, :midpoint] - up_states = gate_up_states[:, midpoint:] - - up_states = up_states * get_activation(self.activation_fn)(gate) - - return torch.nn.functional.linear(up_states, down_proj_w) - - @staticmethod - def fromTorch( - layer: torch.nn.Module, - dtype: torch.dtype = torch.float16, - ) -> "Phi3MLP": - """Generate a NPU Phi-3 MLP layer from a transformer one. - - Args: - layer (torch.nn.Module): the original Phi-3 MLP model to run on the NPU - dtype (torch.dtype): the desired datatype - - Returns: - Phi3MLP: A NPU Phi-3 MLP layer - """ - new_layer = Phi3MLP(config=layer.config) - - return new_layer - - class FusedLlamaMLP(torch.nn.Module): """LLAMA MLP operation NPU backend.""" diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index ef23c8e..0ac46cc 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -4,6 +4,7 @@ # from intel_npu_acceleration_library.backend import NNFactory, Tensor from typing import MutableMapping, Sequence, Any, List +from torch.profiler import record_function import numpy as np import torch @@ -249,7 +250,8 @@ def _call_impl(self, *args: Any, **kwargs: Any) -> Any: # Run the model by replacing the forward method with the factory_forward old_forward = self.forward self.forward = self.factory_forward # type: ignore - out = super()._call_impl(*args, **kwargs) + with record_function(f"npu_{self.__class__.__name__}"): + out = super()._call_impl(*args, **kwargs) # Restore the original forward method self.forward = old_forward # type: ignore @@ -322,7 +324,8 @@ def forward(self, *args, **kwargs) -> torch.Tensor: Returns: torch.Tensor: The output tensor. 
""" - return self.module(*args, **kwargs) + with record_function(f"npu_{self.module.__class__.__name__}"): + return self.module(*args, **kwargs) def convert_to_npu_module(module: torch.nn.Module) -> Module: diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 742e9ee..75a663b 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -8,6 +8,7 @@ from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM from sklearn.metrics import r2_score +from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library import pytest import torch @@ -84,7 +85,7 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): @pytest.mark.parametrize("seq_len", [16, 128, 256]) @pytest.mark.parametrize("hidden_size", [256, 512]) @pytest.mark.parametrize("intermediate_size", [512]) -def test_phi3_mlp(seq_len, hidden_size, intermediate_size): +def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size): conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") conf.num_hidden_layers = 1 conf.hidden_size = hidden_size @@ -92,20 +93,27 @@ def test_phi3_mlp(seq_len, hidden_size, intermediate_size): mlp = Phi3MLP(conf) - hidden_states = torch.rand((seq_len, conf.hidden_size)).half() + hidden_states = torch.rand((seq_len, conf.hidden_size)) - reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) + reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() - model = intel_npu_acceleration_library.nn.Phi3MLP.fromTorch(layer=mlp).to("npu") + model = intel_npu_acceleration_library.compile(mlp) assert model - out = model( - hidden_states, mlp.gate_up_proj.weight.half(), mlp.down_proj.weight.half() + with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: + out = model(hidden_states) + + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cpu_time_total", row_limit=20 + ) ) + out = out.detach().numpy() + assert out.shape == reference.shape, "Output shape mismatch" assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" assert np.isfinite(out).all(), "NPU output contains NaN or Inf" - assert 1 - r2_score(reference.numpy(), out.numpy()) < 0.001 + assert 1 - r2_score(reference, out) < 0.001 From ea4ea19dc4748c4ebe05426b80ef375546e76f1d Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 8 Jul 2024 11:35:45 +0100 Subject: [PATCH 05/18] Remove old code for phi-3 mlp layer --- intel_npu_acceleration_library/activations.py | 86 ------------------- intel_npu_acceleration_library/compiler.py | 20 ++--- 2 files changed, 7 insertions(+), 99 deletions(-) delete mode 100644 intel_npu_acceleration_library/activations.py diff --git a/intel_npu_acceleration_library/activations.py b/intel_npu_acceleration_library/activations.py deleted file mode 100644 index e879030..0000000 --- a/intel_npu_acceleration_library/activations.py +++ /dev/null @@ -1,86 +0,0 @@ -# -# Copyright © 2024 Intel Corporation -# SPDX-License-Identifier: Apache 2.0 -# - -import torch - - -def get_activation(act_function: str): - """Return an activation function for the NPU. 
- - Args: - act_function (str): an NPU supported activation function - - Returns: - torch.nn: activation function - """ - match act_function: - case "cos": - return torch.cos - case "sin": - return torch.sin - case "tan": - return torch.tan - case "acos": - return torch.acos - case "asin": - return torch.asin - case "atan": - return torch.atan - case "cosh": - return torch.cosh - case "sinh": - return torch.sinh - case "tanh": - return torch.tanh - case "acosh": - return torch.acosh - case "asinh": - return torch.asinh - case "atanh": - return torch.atanh - case "abs": - return torch.abs - case "ceil": - return torch.ceil - case "clamp": - return torch.clamp - case "elu": - return torch.nn.functional.elu - case "erf": - return torch.erf - case "exp": - return torch.exp - case "floor": - return torch.floor - case "gelu": - return torch.nn.functional.gelu - case "hardsigmoid": - return torch.nn.functional.hardsigmoid - case "hardswish": - return torch.nn.functional.hardswish - case "log": - return torch.log - case "mish": - return torch.nn.functional.mish - case "neg": - return torch.neg - case "relu": - return torch.nn.functional.relu - case "round": - return torch.round - case "sigmoid": - return torch.nn.functional.sigmoid - case "sign": - return torch.sign - case "silu": - return torch.nn.functional.silu - case "softmax": - return torch.nn.functional.softmax - case "softplus": - return torch.nn.functional.softplus - case "sqrt": - return torch.sqrt - case _: - return torch.nn.functional.silu diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 2dfdfc6..21024e6 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -104,20 +104,14 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """ if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): - print(f"MODEL: {model} \n\n") - if isinstance(model, Phi3MLP): - new_layer = func(model.__class__.__name__, model, *args, **kwargs) - if new_layer: - model.add_module(model.__class__.__name__, new_layer) + new_layer = func(name, layer, *args, **kwargs) + if new_layer: + model.add_module(name, new_layer) + if not isinstance(new_layer, NPUModuleWrapper): + wrapper(new_layer, *args, **kwargs) else: - new_layer = func(name, layer, *args, **kwargs) - if new_layer: - model.add_module(name, new_layer) - if not isinstance(new_layer, NPUModuleWrapper): - wrapper(new_layer, *args, **kwargs) - else: - if not isinstance(layer, NPUModuleWrapper): - wrapper(layer, *args, **kwargs) + if not isinstance(layer, NPUModuleWrapper): + wrapper(layer, *args, **kwargs) return wrapper From 1fef8a40948ed562b5f9e2e9cae9fcc05898928a Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Fri, 12 Jul 2024 09:55:17 +0100 Subject: [PATCH 06/18] Add type tensor op and quantisation support --- .../backend/tensor.py | 12 ++++ intel_npu_acceleration_library/compiler.py | 58 +++++++++++++++++++ 2 files changed, 70 insertions(+) diff --git a/intel_npu_acceleration_library/backend/tensor.py b/intel_npu_acceleration_library/backend/tensor.py index 2236eda..e8cca7f 100644 --- a/intel_npu_acceleration_library/backend/tensor.py +++ b/intel_npu_acceleration_library/backend/tensor.py @@ -948,6 +948,18 @@ def to(self, dtype: NPUDtype) -> "Tensor": """ return generate_op([self], "to", dtype) + def type(self, dtype: NPUDtype) -> "Tensor": + """ + Convert the tensor to the specified data type. 
+ + Args: + dtype (NPUDtype): The data type to convert the tensor to. + + Returns: + Tensor: The converted tensor. + """ + return self.to(dtype) + @classmethod def __torch_function__( cls: Any, diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 21024e6..7325c0b 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -16,6 +16,7 @@ from typing import Union, Callable, Any from typing import List import torch +from functools import partial def compile( @@ -45,6 +46,7 @@ def compile( if dtype in (int8, int4): # Quantize model model = quantize_model(model, dtype) + weights_quantization(model) # Model lowering to NPU ops if isinstance(model, Phi3MLP): @@ -92,6 +94,7 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ + module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -105,7 +108,13 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) + if (func.__name__ == "optimize_phi3_MLP") and ( + module_optimization.counter >= 5 # type: ignore[attr-defined] + ): + new_layer = None + if new_layer: + module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) @@ -202,6 +211,55 @@ def optimize_phi3_MLP( return None +@module_optimization +def weights_quantization( + name: str, layer: torch.nn.Module +) -> Union[torch.nn.Module, None]: + """Apply weights quantization. + + Args: + name (str): Layer name + layer (torch.nn.Module): Original torch.nn.Linear module + + Raises: + RuntimeError: unsupported quantization bits + + Returns: + None: Returns None + """ + if isinstance(layer, WeightOnlyLinear): + if layer.bits == 4: + print("This works - int4 !!") + layer.forward = partial(forward, layer) + elif layer.bits == 8: + print("This works - int8 !!") + layer.forward = partial(forward, layer) + else: + raise RuntimeError(f"Unsupported quantization bits: {layer.bits}") + return None + + +def forward(self, input): + """Override forward method for WeightOnlyLinear class. + + Args: + input: Thr input tensor. + + Returns: + torch.Tensor: The output tensor. 
+ """ + w = self.qweight.to(torch.float16) + # output = torch.nn.functional.linear(input, w, None) * self.scales + # if self.bias: + # return output + self.bias + output = torch.nn.functional.linear(input.to(w.dtype), w, self.bias) * self.scales + if self.bias: + output = torch.nn.functional.linear(input, w, self.bias) * self.scales + else: + output = torch.nn.functional.linear(input, w, None) * self.scales + return output + + @register_backend def npu( gm: Union[torch.nn.Module, torch.fx.GraphModule], example_inputs: List[torch.Tensor] From cc5d3739fa2f72feb2085cd4bb28b9b809b1371f Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 15 Jul 2024 11:07:34 +0100 Subject: [PATCH 07/18] add support for model quantisation and code clean up --- intel_npu_acceleration_library/compiler.py | 44 +++++++++++----------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 7325c0b..98c0202 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -50,7 +50,7 @@ def compile( # Model lowering to NPU ops if isinstance(model, Phi3MLP): - model = model.to("npu") + model = model else: # General optimizations apply_general_optimizations(model) @@ -94,7 +94,6 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ - module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -108,13 +107,8 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) - if (func.__name__ == "optimize_phi3_MLP") and ( - module_optimization.counter >= 5 # type: ignore[attr-defined] - ): - new_layer = None if new_layer: - module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) @@ -207,7 +201,7 @@ def optimize_phi3_MLP( Union[torch.nn.Module, None]: optimized Phi-3 module """ if layer.__class__.__name__ == "Phi3MLP": - return layer.to("npu") + return layer return None @@ -228,11 +222,7 @@ def weights_quantization( None: Returns None """ if isinstance(layer, WeightOnlyLinear): - if layer.bits == 4: - print("This works - int4 !!") - layer.forward = partial(forward, layer) - elif layer.bits == 8: - print("This works - int8 !!") + if (layer.bits == 4) or (layer.bits == 8): layer.forward = partial(forward, layer) else: raise RuntimeError(f"Unsupported quantization bits: {layer.bits}") @@ -248,15 +238,27 @@ def forward(self, input): Returns: torch.Tensor: The output tensor. 
""" - w = self.qweight.to(torch.float16) - # output = torch.nn.functional.linear(input, w, None) * self.scales - # if self.bias: - # return output + self.bias - output = torch.nn.functional.linear(input.to(w.dtype), w, self.bias) * self.scales + if self.bits == 4: + # Unpack the int4 values + lower_int4 = self.qweight & 0x0F + lower_int4 = lower_int4 - (lower_int4 & 0x8) * 2 + upper_int4 = (self.qweight >> 4) & 0x0F + upper_int4 = upper_int4 - (upper_int4 & 0x8) * 2 + + w = torch.stack((lower_int4, upper_int4), dim=2) + w = w.contiguous().view(self.qweight.shape[0], -1) + + elif self.bits == 8: + w = self.qweight.view(torch.int8) + + output = ( + torch.nn.functional.linear(input.to(torch.float16), w.to(torch.float16), None) + * self.scales.T + ) + if self.bias: - output = torch.nn.functional.linear(input, w, self.bias) * self.scales - else: - output = torch.nn.functional.linear(input, w, None) * self.scales + return output + self.bias + return output From d2fe9feb9182423c2fc641081deb8484d8f90145 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Mon, 15 Jul 2024 15:12:46 +0100 Subject: [PATCH 08/18] Fix for model quantization --- intel_npu_acceleration_library/compiler.py | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 98c0202..2c82763 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -42,18 +42,25 @@ def compile( # Prepare and optimize model for NPU with torch.no_grad(): - - if dtype in (int8, int4): - # Quantize model - model = quantize_model(model, dtype) - weights_quantization(model) - # Model lowering to NPU ops if isinstance(model, Phi3MLP): + # Apply optimizations to a single MLP block model model = model + + if dtype in (int8, int4): + # Quantize model + model = quantize_model(model, dtype) + weights_quantization(model) + else: # General optimizations apply_general_optimizations(model) + + if dtype in (int8, int4): + # Quantize model + model = quantize_model(model, dtype) + weights_quantization(model) + create_npu_kernels(model) if dtype.is_floating_point and training: @@ -233,7 +240,7 @@ def forward(self, input): """Override forward method for WeightOnlyLinear class. Args: - input: Thr input tensor. + input: The input tensor. Returns: torch.Tensor: The output tensor. 
From b7825e754e90b5e6c7607406ece7d0e077970d8b Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Tue, 16 Jul 2024 11:24:45 +0100 Subject: [PATCH 09/18] Add testing for phi-3 mlp quantisation --- test/python/test_llm.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 75a663b..13b5d79 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -7,6 +7,7 @@ from transformers.models.phi.modeling_phi import PhiConfig, PhiMLP from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM +from intel_npu_acceleration_library.dtypes import int8, int4 from sklearn.metrics import r2_score from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library @@ -85,19 +86,27 @@ def test_phi2_mlp(seq_len, hidden_size, intermediate_size): @pytest.mark.parametrize("seq_len", [16, 128, 256]) @pytest.mark.parametrize("hidden_size", [256, 512]) @pytest.mark.parametrize("intermediate_size", [512]) -def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size): +@pytest.mark.parametrize("dtype", ["float16", "int8", "int4"]) +def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size, dtype): conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") conf.num_hidden_layers = 1 conf.hidden_size = hidden_size conf.intermediate_size = intermediate_size + if dtype == "int8": + dtype = int8 + elif dtype == "int4": + dtype = int4 + else: + dtype = torch.float16 + mlp = Phi3MLP(conf) hidden_states = torch.rand((seq_len, conf.hidden_size)) reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() - model = intel_npu_acceleration_library.compile(mlp) + model = intel_npu_acceleration_library.compile(mlp, dtype) assert model @@ -116,4 +125,7 @@ def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size): assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" assert np.isfinite(out).all(), "NPU output contains NaN or Inf" - assert 1 - r2_score(reference, out) < 0.001 + if dtype == int4: + assert 1 - r2_score(reference, out) < 0.05 + else: + assert 1 - r2_score(reference, out) < 0.001 From c65285907635a96b0b24ae0ba02666650713a89d Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Wed, 17 Jul 2024 09:26:35 +0100 Subject: [PATCH 10/18] Add phi-3 mlp test and enable model profiling toggling --- intel_npu_acceleration_library/nn/module.py | 3 +- script/profile_mlp.py | 124 ++++++++++++++++++++ 2 files changed, 126 insertions(+), 1 deletion(-) create mode 100644 script/profile_mlp.py diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index 0ac46cc..b5b2f37 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -111,6 +111,7 @@ def __init__(self) -> None: self._nn_factory_cache: MutableMapping[str, NNFactory] = {} self._npu_inference = False self.npu_top_level_module = True + self.profile = False def extract_tensors_from_arguments( self, args: Sequence[Any] @@ -171,7 +172,7 @@ def create_model( Returns: NNFactory: The model. """ - model = NNFactory() + model = NNFactory(profile=self.profile) def create_args_from_list(args: Sequence[Any]) -> Sequence[Any]: """Create arguments from a list. 
diff --git a/script/profile_mlp.py b/script/profile_mlp.py new file mode 100644 index 0000000..3b61b6a --- /dev/null +++ b/script/profile_mlp.py @@ -0,0 +1,124 @@ +# +# Copyright © 2024 Intel Corporation +# SPDX-License-Identifier: Apache 2.0 +# + +from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP +from intel_npu_acceleration_library.dtypes import int8, int4 +from torch.profiler import profile, ProfilerActivity +from sklearn.metrics import r2_score +import intel_npu_acceleration_library +import argparse +import torch +import numpy as np + + +def main( + seq_len=128, + hidden_size=256, + intermediate_size=512, + dtype="float16", + _profile=False, +): + + conf = Phi3Config.from_pretrained("microsoft/Phi-3-mini-4k-instruct") + conf.num_hidden_layers = 1 + conf.hidden_size = hidden_size + conf.intermediate_size = intermediate_size + + # Define a single Phi-3 MLP layer + mlp = Phi3MLP(conf) + + hidden_states = torch.rand((seq_len, conf.hidden_size)) + + reference = mlp(hidden_states.to(torch.float32)).to(torch.float16) + + if dtype == "float16": + dtype = torch.float16 + elif dtype == "int8": + dtype = int8 + elif dtype == "int4": + dtype = int4 + else: + raise RuntimeError(f"Invalid dtype: {dtype}") + + # Compile model + model = intel_npu_acceleration_library.compile(mlp, dtype) + if _profile: + model.profile = True + + with profile(activities=[ProfilerActivity.CPU], record_shapes=True) as prof: + for _ in range(1000): + results = model(hidden_states) + + print( + prof.key_averages(group_by_input_shape=True).table( + sort_by="cpu_time_total", row_limit=20 + ) + ) + + prof.export_chrome_trace("trace.json") + + results = results.detach().numpy() + reference = reference.detach().numpy() + + assert results.shape == reference.shape, "Output shape mismatch" + assert np.isfinite(reference).all(), "Pytorch Reference contains NaN or Inf" + assert np.isfinite(results).all(), "NPU output contains NaN or Inf" + + if dtype == int4: + assert 1 - r2_score(reference, results) < 0.05 + else: + assert 1 - r2_score(reference, results) < 0.001 + + +def define_and_parse_args(): + parser = argparse.ArgumentParser(description="Profiling a MLP layer in the NPU") + parser.add_argument( + "--seq-len", + type=int, + default=128, + help="Sequence length (default: %(default)s)", + ) + parser.add_argument( + "--hidden-size", + type=int, + default=256, + help="Hidden size (default: %(default)s)", + ) + parser.add_argument( + "--intermediate-size", + type=int, + default=512, + help="Intermediate size (default: %(default)s)", + ) + parser.add_argument( + "--dtype", + default="float16", + choices=["float16", "int8", "int4"], + help="Select the target dtype (default: %(default)s)", + ) + parser.add_argument( + "--profile", + action="store_true", + default=False, + help="Enable the profiling (default: False)", + ) + + return parser.parse_args() + + +if __name__ == "__main__": + args = define_and_parse_args() + + print( + f"Profiling with sequence length {args.seq_len}, hidden size {args.hidden_size}, intermediate size {args.intermediate_size}, dtype {args.dtype}" + ) + + main( + seq_len=args.seq_len, + hidden_size=args.hidden_size, + intermediate_size=args.intermediate_size, + dtype=args.dtype, + _profile=args.profile, + ) From 786c663398d71ff7c4b61e326cd4899775201b4b Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Wed, 17 Jul 2024 09:55:36 +0100 Subject: [PATCH 11/18] Update for model profiling toggle --- intel_npu_acceleration_library/nn/module.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 
deletions(-) diff --git a/intel_npu_acceleration_library/nn/module.py b/intel_npu_acceleration_library/nn/module.py index b5b2f37..e861260 100644 --- a/intel_npu_acceleration_library/nn/module.py +++ b/intel_npu_acceleration_library/nn/module.py @@ -105,13 +105,17 @@ def patch_modules(module: torch.nn.Module, model: NNFactory): class Module(torch.nn.Module): """A PyTorch module that runs on the NPU.""" - def __init__(self) -> None: - """Initialize the module.""" + def __init__(self, profile: bool = False) -> None: + """Initialize the module. + + Args: + profile (bool): Enable model profiling. Defaults to False. + """ super().__init__() self._nn_factory_cache: MutableMapping[str, NNFactory] = {} self._npu_inference = False self.npu_top_level_module = True - self.profile = False + self.profile = profile def extract_tensors_from_arguments( self, args: Sequence[Any] From 003d639b96e7d7e0b74bb0fababce7f66d9c0330 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 08:40:05 +0100 Subject: [PATCH 12/18] Add compile config feature --- intel_npu_acceleration_library/compiler.py | 73 +++++++++++++-------- intel_npu_acceleration_library/modelling.py | 13 ++-- script/profile_llm.py | 4 +- script/profile_mlp.py | 4 +- test/python/test_llm.py | 4 +- 5 files changed, 61 insertions(+), 37 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 2c82763..7e76a2f 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -6,10 +6,9 @@ from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear from transformers.models.llama.modeling_llama import LlamaMLP, LlamaAttention from transformers.models.gemma.modeling_gemma import GemmaMLP, GemmaAttention -from transformers.models.phi3.modeling_phi3 import Phi3MLP from neural_compressor.adaptor.torch_utils.model_wrapper import WeightOnlyLinear from intel_npu_acceleration_library.quantization import quantize_model -from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.dtypes import int8, int4, NPUDtype from intel_npu_acceleration_library.nn.module import NPUModuleWrapper import intel_npu_acceleration_library.nn as nn from torch._dynamo import register_backend @@ -19,15 +18,33 @@ from functools import partial -def compile( - model: torch.nn.Module, dtype: torch.dtype = torch.float16, training: bool = False -) -> torch.nn.Module: +class CompilerConfig: + """Configuration class to store the compilation configuration of a model for the NPU.""" + + def __init__( + self, + use_to: bool = False, + dtype: Union[torch.dtype, NPUDtype] = torch.float16, + training: bool = False, + ) -> None: + """Initialize the configuration class. + + Args: + use_to (bool): Enable model compiling using .to() . Defaults to disabled + dtype (Union[torch.dtype, NPUDtype]): The dtype to compile the model with. Defaults to torch.float16 + training (bool): Enable training. Defaults to disabled + """ + self.use_to = use_to + self.dtype = dtype + self.training = training + + +def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: """Compile a model for the NPU. Args: model (torch.nn.Module): a pytorch nn.Module to compile and optimize for the npu - dtype (torch.dtype): the model target datatype, default to torch.float16 - training (bool): enable training. 
Default disabled + config (CompilerConfig): the compiler configuration Raises: RuntimeError: invalid datatypes @@ -35,35 +52,29 @@ def compile( Returns: torch.nn.Module: compiled NPU nn.Module """ - if not (dtype.is_floating_point or dtype in (int8, int4)): + if not (config.dtype.is_floating_point or config.dtype in (int8, int4)): raise RuntimeError( - f"intel-npu-acceleration-library library do not support yet the requeste datatype: {dtype}" + f"intel-npu-acceleration-library library do not support yet the requeste datatype: {config.dtype}" ) # Prepare and optimize model for NPU with torch.no_grad(): # Model lowering to NPU ops - if isinstance(model, Phi3MLP): - # Apply optimizations to a single MLP block model - model = model - - if dtype in (int8, int4): - # Quantize model - model = quantize_model(model, dtype) - weights_quantization(model) - + if config.use_to: + model = model.to("npu") else: # General optimizations apply_general_optimizations(model) - if dtype in (int8, int4): - # Quantize model - model = quantize_model(model, dtype) - weights_quantization(model) + if config.dtype in (int8, int4): + # Quantize model + model = quantize_model(model, config.dtype) + weights_quantization(model) + if not config.use_to: create_npu_kernels(model) - if dtype.is_floating_point and training: + if config.dtype.is_floating_point and config.training: # Set model to evaluation only as quantized training is not supported yet return model @@ -101,6 +112,7 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ + module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -114,8 +126,12 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) - + if (func.__name__ == "optimize_phi3_MLP") and ( + module_optimization.counter >= 5 # type: ignore[attr-defined] + ): + new_layer = None if new_layer: + module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) @@ -208,7 +224,7 @@ def optimize_phi3_MLP( Union[torch.nn.Module, None]: optimized Phi-3 module """ if layer.__class__.__name__ == "Phi3MLP": - return layer + return layer.to("npu") return None @@ -271,12 +287,15 @@ def forward(self, input): @register_backend def npu( - gm: Union[torch.nn.Module, torch.fx.GraphModule], example_inputs: List[torch.Tensor] + gm: Union[torch.nn.Module, torch.fx.GraphModule], + config: CompilerConfig, + example_inputs: List[torch.Tensor], ) -> Union[torch.nn.Module, torch.fx.GraphModule]: """Implement the custom torch 2.0 compile backend for the NPU. 
Args: gm (Union[torch.nn.Module, torch.fx.GraphModule]): The torch fx Module + config (CompilerConfig): The compiler configuration example_inputs (List[torch.Tensor]): A list of example inputs Returns: @@ -286,4 +305,4 @@ def npu( gm = horizontal_fusion_linear(gm) # For now compile in fp16 - return compile(gm) + return compile(gm, config) diff --git a/intel_npu_acceleration_library/modelling.py b/intel_npu_acceleration_library/modelling.py index 420db3c..606cbd4 100644 --- a/intel_npu_acceleration_library/modelling.py +++ b/intel_npu_acceleration_library/modelling.py @@ -4,6 +4,7 @@ # from transformers import AutoModel, AutoModelForCausalLM, AutoModelForSeq2SeqLM import intel_npu_acceleration_library as npu_lib +from intel_npu_acceleration_library.compiler import CompilerConfig from functools import partialmethod from typing import Type, Any, Tuple, Optional import hashlib @@ -62,8 +63,7 @@ class NPUModel: @staticmethod def from_pretrained( model_name_or_path: str, - dtype: torch.dtype = torch.float16, - training: bool = False, + config: CompilerConfig, transformers_class: Optional[Type] = None, export=True, *args: Any, @@ -73,8 +73,7 @@ def from_pretrained( Args: model_name_or_path (str): model name or path - dtype (torch.dtype, optional): compilation dtype. Defaults to torch.float16. - training (bool, optional): enable training. Defaults to False. + config (CompilerConfig): compiler configuration transformers_class (Optional[Type], optional): base class to use. Must have a `from_pretrained` method. Defaults to None. export (bool, optional): enable the caching of the model. Defaults to True. args (Any): positional arguments @@ -91,18 +90,18 @@ def from_pretrained( raise RuntimeError(f"Invalid transformer class {type(transformers_class)}") # get the model cache dir and path from the name and arguments model_dir_path, model_path = get_model_path( - model_name_or_path, dtype, training, *args, **kwargs + model_name_or_path, config.dtype, config.training, *args, **kwargs ) if os.path.isdir(model_dir_path) and os.path.isfile(model_path): # Model already exist so I can load it directly return torch.load(model_path) else: # Model does not exists, so I need to compile it first - print(f"Compiling model {model_name_or_path} {dtype} for the NPU") + print(f"Compiling model {model_name_or_path} {config.dtype} for the NPU") model = transformers_class.from_pretrained( model_name_or_path, *args, **kwargs ) - model = npu_lib.compile(model, dtype, training) + model = npu_lib.compile(model, config) if export: if kwargs.get("trust_remote_code", False): raise AttributeError( diff --git a/script/profile_llm.py b/script/profile_llm.py index 6a69089..cdf7c76 100644 --- a/script/profile_llm.py +++ b/script/profile_llm.py @@ -6,6 +6,7 @@ from transformers import AutoTokenizer, AutoModelForCausalLM from intel_npu_acceleration_library.nn.llm import generate_with_static_shape from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library @@ -52,7 +53,8 @@ def main( if not disable_intel_npu_acceleration_library: if not compiled: - model = intel_npu_acceleration_library.compile(model, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + model = intel_npu_acceleration_library.compile(model, compiler_conf) intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( tokenizer, model, context_size ) diff --git a/script/profile_mlp.py b/script/profile_mlp.py index 
3b61b6a..4c64fc1 100644 --- a/script/profile_mlp.py +++ b/script/profile_mlp.py @@ -5,6 +5,7 @@ from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig from torch.profiler import profile, ProfilerActivity from sklearn.metrics import r2_score import intel_npu_acceleration_library @@ -43,7 +44,8 @@ def main( raise RuntimeError(f"Invalid dtype: {dtype}") # Compile model - model = intel_npu_acceleration_library.compile(mlp, dtype) + compiler_conf = CompilerConfig(use_to=True, dtype=dtype) + model = intel_npu_acceleration_library.compile(mlp, compiler_conf) if _profile: model.profile = True diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 13b5d79..d5b093e 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -8,6 +8,7 @@ from transformers.models.phi3.modeling_phi3 import Phi3Config, Phi3MLP from transformers import AutoTokenizer, AutoModelForCausalLM from intel_npu_acceleration_library.dtypes import int8, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score from torch.profiler import profile, ProfilerActivity import intel_npu_acceleration_library @@ -106,7 +107,8 @@ def test_phi3_mlp_compile(seq_len, hidden_size, intermediate_size, dtype): reference = mlp(hidden_states.to(torch.float32)).to(torch.float16).detach().numpy() - model = intel_npu_acceleration_library.compile(mlp, dtype) + compiler_conf = CompilerConfig(use_to=True, dtype=dtype) + model = intel_npu_acceleration_library.compile(mlp, compiler_conf) assert model From c63c22316d8c82a9376f462342d93cd295e13f04 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 09:41:59 +0100 Subject: [PATCH 13/18] Fix test for compile config and remove old code --- intel_npu_acceleration_library/compiler.py | 4 ++-- test/python/test_llm.py | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 7e76a2f..6b12dea 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -61,7 +61,7 @@ def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: with torch.no_grad(): # Model lowering to NPU ops if config.use_to: - model = model.to("npu") + model = model else: # General optimizations apply_general_optimizations(model) @@ -224,7 +224,7 @@ def optimize_phi3_MLP( Union[torch.nn.Module, None]: optimized Phi-3 module """ if layer.__class__.__name__ == "Phi3MLP": - return layer.to("npu") + return layer return None diff --git a/test/python/test_llm.py b/test/python/test_llm.py index d5b093e..3aa16e7 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -50,7 +50,10 @@ def test_compilation(tokenizer, decoder_model, dtype): prefill = tokenizer("test sentence", return_tensors="pt")["input_ids"].to("cpu") y_ref = decoder_model(prefill).logits.detach() - compiled_model = intel_npu_acceleration_library.compile(decoder_model, dtype=dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compiled_model = intel_npu_acceleration_library.compile( + decoder_model, compiler_conf + ) assert compiled_model From e652eaa9887cb603379f0ea725f8aea6b67aed64 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 10:32:56 +0100 Subject: [PATCH 14/18] Fix tests with compile config --- test/python/test_compile.py | 10 +++++++--- 
test/python/test_conv.py | 4 +++- test/python/test_llm.py | 3 ++- test/python/test_optimizations.py | 4 +++- test/python/test_quantization.py | 5 ++++- test/python/test_training.py | 10 +++++++--- 6 files changed, 26 insertions(+), 10 deletions(-) diff --git a/test/python/test_compile.py b/test/python/test_compile.py index 07fb144..faf3d28 100644 --- a/test/python/test_compile.py +++ b/test/python/test_compile.py @@ -4,6 +4,7 @@ # from intel_npu_acceleration_library.compiler import compile +from intel_npu_acceleration_library.compiler import CompilerConfig from intel_npu_acceleration_library.dtypes import int4 from sklearn.metrics import r2_score import intel_npu_acceleration_library @@ -39,7 +40,8 @@ def test_compilation(dtype): y_ref = model(x).detach() - compiled_model = compile(model, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compiled_model = compile(model, compiler_conf) assert compiled_model @@ -104,7 +106,8 @@ def test_compile_training(dtype): model = NN() - compiled_model = compile(model, dtype, training=True) + compiler_conf = CompilerConfig(dtype=dtype, training=True) + compiled_model = compile(model, compiler_conf) for name, layer in compiled_model.named_children(): if dtype == torch.int8: @@ -118,7 +121,8 @@ def test_compile_inference(dtype): model = NN() - compiled_model = compile(model, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compiled_model = compile(model, compiler_conf) for name, layer in compiled_model.named_children(): assert layer.training == False diff --git a/test/python/test_conv.py b/test/python/test_conv.py index 5a0ec5b..6fa94a6 100644 --- a/test/python/test_conv.py +++ b/test/python/test_conv.py @@ -5,6 +5,7 @@ import intel_npu_acceleration_library +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score import pytest import torch @@ -71,7 +72,8 @@ def test_conv( conv.conv.weight.data *= 128 y_ref = conv(X) - npu_conv = intel_npu_acceleration_library.compile(conv, dtype) + compiler_conf = CompilerConfig(dtype=dtype) + npu_conv = intel_npu_acceleration_library.compile(conv, compiler_conf) y = npu_conv(X) assert y.dtype == y_ref.dtype diff --git a/test/python/test_llm.py b/test/python/test_llm.py index 3aa16e7..49e2952 100644 --- a/test/python/test_llm.py +++ b/test/python/test_llm.py @@ -39,7 +39,8 @@ def tokenizer(): @pytest.mark.parametrize("model_seq_length", [128, 256]) def test_warm_up(tokenizer, model, model_seq_length): - compiled_model = intel_npu_acceleration_library.compile(model) + compiler_conf = CompilerConfig() + compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) intel_npu_acceleration_library.nn.llm.warm_up_decoder_model( tokenizer, compiled_model, model_seq_length ) diff --git a/test/python/test_optimizations.py b/test/python/test_optimizations.py index 0f02f07..b3c5b97 100644 --- a/test/python/test_optimizations.py +++ b/test/python/test_optimizations.py @@ -7,6 +7,7 @@ from transformers.models.llama.modeling_llama import LlamaConfig, LlamaMLP, LlamaModel from transformers.models.gemma.modeling_gemma import GemmaConfig, GemmaMLP, GemmaModel from intel_npu_acceleration_library.optimizations import horizontal_fusion_linear +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score import torch.nn as nn import intel_npu_acceleration_library @@ -142,7 +143,8 @@ def test_model(model_name, hidden_size, intermediate_size, sequence_length, bias reference = model(example_input)[0] - optimized = 
intel_npu_acceleration_library.compile(model, torch.float16) + compiler_conf = CompilerConfig(dtype=torch.float16) + optimized = intel_npu_acceleration_library.compile(model, compiler_conf) output = optimized(example_input)[0] diff --git a/test/python/test_quantization.py b/test/python/test_quantization.py index 50044b2..c0a1c27 100644 --- a/test/python/test_quantization.py +++ b/test/python/test_quantization.py @@ -4,6 +4,7 @@ # from sklearn.metrics import r2_score +from intel_npu_acceleration_library.compiler import CompilerConfig import numpy as np import intel_npu_acceleration_library import pytest @@ -88,7 +89,9 @@ def test_compiled_quantized(batch, inC, outC): model = NN(inC, outC) y_ref = model(X.to(torch.float32)).detach() - compiled_model = intel_npu_acceleration_library.compile(model, torch.int8) + + compiler_conf = CompilerConfig(dtype=torch.int8) + compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) assert compiled_model y1 = compiled_model(X).detach() diff --git a/test/python/test_training.py b/test/python/test_training.py index aa8f390..adc398d 100644 --- a/test/python/test_training.py +++ b/test/python/test_training.py @@ -6,6 +6,7 @@ from sklearn.metrics import r2_score from intel_npu_acceleration_library import compile +from intel_npu_acceleration_library.compiler import CompilerConfig import torch import pytest import copy @@ -28,12 +29,14 @@ def forward(self, x): @pytest.fixture def model_no_bias(): - return compile(NN(inc=in_c, outc=out_c, bias=False)) + compiler_conf = CompilerConfig() + return compile(NN(inc=in_c, outc=out_c, bias=False), compiler_conf) @pytest.fixture def model(): - return compile(NN(inc=in_c, outc=out_c, bias=True)) + compiler_conf = CompilerConfig() + return compile(NN(inc=in_c, outc=out_c, bias=True), compiler_conf) def test_parameters(model, model_no_bias): @@ -48,7 +51,8 @@ def test_gradient(): cpu_model.load_state_dict(copy.deepcopy(npu_model.state_dict())) # Compile one of the model on npu - compile(npu_model, training=True) + compiler_conf = CompilerConfig(training=True) + compile(npu_model, compiler_conf) x = torch.rand([batch, in_c]).half() yref = torch.rand([batch, in_c]).half() From 7f2faf979e749a05c25f175cabd0aad4b40e43e2 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 11:58:54 +0100 Subject: [PATCH 15/18] Fix for compiler, updates for tests and examples, doc update --- docs/source/usage.md | 18 ++++++++++++++++-- examples/compile_model.py | 6 ++++-- examples/llava.py | 4 +++- examples/tiny_llama_chat.py | 4 +++- examples/train_mnist.py | 5 +++-- intel_npu_acceleration_library/compiler.py | 3 +-- script/export.py | 12 +++++++++--- 7 files changed, 39 insertions(+), 13 deletions(-) diff --git a/docs/source/usage.md b/docs/source/usage.md index 62a4cdb..aff2716 100644 --- a/docs/source/usage.md +++ b/docs/source/usage.md @@ -38,19 +38,33 @@ optimized_model = torch.compile(model, backend="npu") In windows torch.compile is not supported yet. So you might want to use the explicit function `intel_npu_acceleration_library.compile`. This is true also if you use a `pytorch` version < 2.0.0 +To do this, you just need to call the `compile` function with your model and the compiler configuration `CompilerConfig` to compile and optimize the model for the NPU. 
```python import intel_npu_acceleration_library -optimized_model = intel_npu_acceleration_library.compile(model, dtype=torch.int8) +from intel_npu_acceleration_library.compiler import CompilerConfig +compiler_conf = CompilerConfig(dtype=torch.int8) +optimized_model = intel_npu_acceleration_library.compile(model, compiler_conf) # Use the model as usual ``` +To compile and optimize a single layer of a model to be pushed to the NPU as one block, you can set `use_to=True` in the compiler configuration `CompilerConfig`. +```python +import intel_npu_acceleration_library +from intel_npu_acceleration_library.compiler import CompilerConfig +compiler_conf = CompilerConfig(use_to=True, dtype=torch.int8) +optimized_block = intel_npu_acceleration_library.compile(single_block, compiler_conf) + +``` + ## Training (**Experimental!**) It is possible to use Intel® NPU Acceleration Library to train a model. As before you just need to call the `compile` function, this time with `training=True`. This allows to use the same training script you use in other device with a very minimal modifications. ```python import intel_npu_acceleration_library -compiled_model = intel_npu_acceleration_library.compile(model, dtype=torch.float32, training=True) +from intel_npu_acceleration_library.compiler import CompilerConfig +compiler_conf = CompilerConfig(dtype=torch.float32, training=True) +compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) ``` diff --git a/examples/compile_model.py b/examples/compile_model.py index 2146fcd..afe51ce 100644 --- a/examples/compile_model.py +++ b/examples/compile_model.py @@ -5,6 +5,7 @@ from intel_npu_acceleration_library import compile +from intel_npu_acceleration_library.compiler import CompilerConfig from sklearn.metrics import r2_score import intel_npu_acceleration_library import pytest @@ -41,7 +42,8 @@ def forward(self, x): print( "Windows do not support torch.compile, fallback to intel_npu_acceleration_library.compile" ) - compiled_model = intel_npu_acceleration_library.compile(model) + compiler_conf = CompilerConfig() + compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf) else: compiled_model = torch.compile(model, backend="npu") @@ -49,4 +51,4 @@ def forward(self, x): with torch.no_grad(): y = compiled_model(x) - print(f"Reference vs actual R2 score: {r2_score(y_ref, y):.2f}") + print(f"Reference vs actual R2 score: {r2_score(y_ref.numpy(), y.numpy()):.2f}") diff --git a/examples/llava.py b/examples/llava.py index a8e5545..dafa22d 100644 --- a/examples/llava.py +++ b/examples/llava.py @@ -12,6 +12,7 @@ TextStreamer, ) from transformers.feature_extraction_utils import BatchFeature +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library import torch @@ -21,7 +22,8 @@ # Load model model = LlavaForConditionalGeneration.from_pretrained(checkpoint) -model = intel_npu_acceleration_library.compile(model) +compiler_conf = CompilerConfig() +model = intel_npu_acceleration_library.compile(model, compiler_conf) image_processor = CLIPImageProcessor.from_pretrained(checkpoint) tokenizer = AutoTokenizer.from_pretrained(checkpoint) diff --git a/examples/tiny_llama_chat.py b/examples/tiny_llama_chat.py index 13f595c..82a699e 100644 --- a/examples/tiny_llama_chat.py +++ b/examples/tiny_llama_chat.py @@ -4,6 +4,7 @@ # from transformers import pipeline, TextStreamer, set_seed +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library import torch import
os @@ -15,7 +16,8 @@ "text-generation", model=model_id, torch_dtype=torch.bfloat16, device_map="auto" ) print("Compiling the model for NPU...") -pipe.model = intel_npu_acceleration_library.compile(pipe.model, dtype=torch.int8) +compiler_conf = CompilerConfig(dtype=torch.int8) +pipe.model = intel_npu_acceleration_library.compile(pipe.model, compiler_conf) streamer = TextStreamer(pipe.tokenizer, skip_special_tokens=True, skip_prompt=True) diff --git a/examples/train_mnist.py b/examples/train_mnist.py index 972eb81..6e14a22 100644 --- a/examples/train_mnist.py +++ b/examples/train_mnist.py @@ -7,6 +7,7 @@ import torch from torch import nn import intel_npu_acceleration_library +from intel_npu_acceleration_library.compiler import CompilerConfig from torch.utils.data import DataLoader from torchvision import datasets from torchvision.transforms import ToTensor @@ -90,8 +91,8 @@ def test_loop(dataloader, model, loss_fn): model = NeuralNetwork() - -model = intel_npu_acceleration_library.compile(model, torch.float32, training=True) +compiler_conf = CompilerConfig(dtype=torch.float32, training=True) +model = intel_npu_acceleration_library.compile(model, compiler_conf) learning_rate = 1e-3 batch_size = 64 diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 6b12dea..88183b6 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -288,14 +288,12 @@ def forward(self, input): @register_backend def npu( gm: Union[torch.nn.Module, torch.fx.GraphModule], - config: CompilerConfig, example_inputs: List[torch.Tensor], ) -> Union[torch.nn.Module, torch.fx.GraphModule]: """Implement the custom torch 2.0 compile backend for the NPU. Args: gm (Union[torch.nn.Module, torch.fx.GraphModule]): The torch fx Module - config (CompilerConfig): The compiler configuration example_inputs (List[torch.Tensor]): A list of example inputs Returns: @@ -305,4 +303,5 @@ def npu( gm = horizontal_fusion_linear(gm) # For now compile in fp16 + config = CompilerConfig() return compile(gm, config) diff --git a/script/export.py b/script/export.py index 892711e..4f63f71 100644 --- a/script/export.py +++ b/script/export.py @@ -5,6 +5,8 @@ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig from intel_npu_acceleration_library.compiler import compile +from intel_npu_acceleration_library.compiler import CompilerConfig +from intel_npu_acceleration_library.dtypes import int8, int4 import argparse import torch import os @@ -41,15 +43,19 @@ def export(model_id, dtype, output): if dtype == "fp16": print(f"Compiling model {model_id}") - torch_dtype = torch.float16 + dtype = torch.float16 elif dtype == "int8": print(f"Quantizing & Compiling model {model_id}") - torch_dtype = torch.int8 + dtype = int8 + elif dtype == "int4": + print(f"Quantizing & Compiling model {model_id}") + dtype = int4 else: raise RuntimeError(f"Invalid dtype {dtype}") with torch.no_grad(): - compile(model, dtype=torch_dtype) + compiler_conf = CompilerConfig(dtype=dtype) + compile(model, compiler_conf) filename = os.path.join(PATH, "model.pth") os.makedirs(PATH, exist_ok=True) From 4b5f857c7e7185d9858bf99eb068f8ed269361d7 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Thu, 18 Jul 2024 15:58:48 +0100 Subject: [PATCH 16/18] Update for model examples and remove test code --- examples/llama.py | 4 +++- examples/llama3.py | 6 +++++- examples/phi-2.py | 4 +++- examples/phi-3.py | 4 +++- examples/t5.py | 6 +++++- intel_npu_acceleration_library/compiler.py 
| 6 ------ 6 files changed, 19 insertions(+), 11 deletions(-) diff --git a/examples/llama.py b/examples/llama.py index 9c2aaba..e4aebb3 100644 --- a/examples/llama.py +++ b/examples/llama.py @@ -5,11 +5,13 @@ from transformers import AutoTokenizer, TextStreamer from intel_npu_acceleration_library import NPUModelForCausalLM, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0" +compiler_conf = CompilerConfig(dtype=int4) model = NPUModelForCausalLM.from_pretrained( - model_id, use_cache=True, dtype=int4, attn_implementation="sdpa" + model_id, use_cache=True, config=compiler_conf, attn_implementation="sdpa" ).eval() tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) tokenizer.pad_token_id = tokenizer.eos_token_id diff --git a/examples/llama3.py b/examples/llama3.py index 5a4fb95..9f6ec2a 100644 --- a/examples/llama3.py +++ b/examples/llama3.py @@ -5,10 +5,14 @@ from transformers import AutoTokenizer, TextStreamer from intel_npu_acceleration_library import NPUModelForCausalLM, int4 +from intel_npu_acceleration_library.compiler import CompilerConfig model_id = "meta-llama/Meta-Llama-3-8B-Instruct" -model = NPUModelForCausalLM.from_pretrained(model_id, dtype=int4, use_cache=True).eval() +compiler_conf = CompilerConfig(dtype=int4) +model = NPUModelForCausalLM.from_pretrained( + model_id, use_cache=True, config=compiler_conf +).eval() tokenizer = AutoTokenizer.from_pretrained(model_id) streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) diff --git a/examples/phi-2.py b/examples/phi-2.py index 8bf59d4..7b4a4ae 100644 --- a/examples/phi-2.py +++ b/examples/phi-2.py @@ -7,12 +7,14 @@ from langchain.chains import LLMChain from langchain.llms import HuggingFacePipeline from transformers import AutoTokenizer, pipeline, TextStreamer +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library as npu_lib model_id = "microsoft/Phi-2" +compiler_conf = CompilerConfig(dtype=npu_lib.int4) model = npu_lib.NPUModelForCausalLM.from_pretrained( - model_id, use_cache=True, dtype=npu_lib.int4 + model_id, use_cache=True, config=compiler_conf ).eval() tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) streamer = TextStreamer(tokenizer, skip_special_tokens=True, skip_prompt=True) diff --git a/examples/phi-3.py b/examples/phi-3.py index 184b428..87ec94b 100644 --- a/examples/phi-3.py +++ b/examples/phi-3.py @@ -5,15 +5,17 @@ import torch from transformers import AutoTokenizer, pipeline, TextStreamer +from intel_npu_acceleration_library.compiler import CompilerConfig import intel_npu_acceleration_library as npu_lib import warnings torch.random.manual_seed(0) +compiler_conf = CompilerConfig(dtype=npu_lib.int4) model = npu_lib.NPUModelForCausalLM.from_pretrained( "microsoft/Phi-3-mini-4k-instruct", + config=compiler_conf, torch_dtype="auto", - dtype=npu_lib.int4, ) tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct") diff --git a/examples/t5.py b/examples/t5.py index bec55b3..1607e22 100644 --- a/examples/t5.py +++ b/examples/t5.py @@ -5,10 +5,14 @@ from transformers import AutoTokenizer, TextStreamer from intel_npu_acceleration_library import NPUModelForSeq2SeqLM +from intel_npu_acceleration_library.compiler import CompilerConfig model_id = "google/flan-t5-small" -model = NPUModelForSeq2SeqLM.from_pretrained(model_id, use_cache=True).eval() +compiler_conf = CompilerConfig() +model = 
NPUModelForSeq2SeqLM.from_pretrained( + model_id, use_cache=True, config=compiler_conf +).eval() tokenizer = AutoTokenizer.from_pretrained(model_id, use_default_system_prompt=True) tokenizer.pad_token_id = tokenizer.eos_token_id streamer = TextStreamer(tokenizer, skip_special_tokens=True) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 88183b6..9cb56d9 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -112,7 +112,6 @@ def module_optimization(func: Callable) -> torch.nn.Module: Returns: torch.nn.Module: optimized module """ - module_optimization.counter = 0 # type: ignore[attr-defined] def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): """Recursively apply the optimization function. @@ -126,12 +125,7 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): if not isinstance(model, NPUModuleWrapper): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) - if (func.__name__ == "optimize_phi3_MLP") and ( - module_optimization.counter >= 5 # type: ignore[attr-defined] - ): - new_layer = None if new_layer: - module_optimization.counter += 1 # type: ignore[attr-defined] model.add_module(name, new_layer) if not isinstance(new_layer, NPUModuleWrapper): wrapper(new_layer, *args, **kwargs) From ae1fd61159ce042123baa6386ea0085377cabcda Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Fri, 19 Jul 2024 11:07:27 +0100 Subject: [PATCH 17/18] Fix for quantization and remove unused code --- intel_npu_acceleration_library/compiler.py | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 9cb56d9..656ec23 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -69,7 +69,8 @@ def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: if config.dtype in (int8, int4): # Quantize model model = quantize_model(model, config.dtype) - weights_quantization(model) + if config.use_to: + weights_quantization(model) if not config.use_to: create_npu_kernels(model) @@ -89,7 +90,6 @@ def apply_general_optimizations(model: torch.nn.Module): """ apply_horizontal_fusion(model) optimize_llama_attention(model) - optimize_phi3_MLP(model) def create_npu_kernels(model: torch.nn.Module): @@ -204,24 +204,6 @@ def optimize_llama_attention( return None -@module_optimization -def optimize_phi3_MLP( - name: str, layer: torch.nn.Module -) -> Union[torch.nn.Module, None]: - """Optimize Phi-3 MLP block. 
- - Args: - name (str): Module name - layer (torch.nn.Module): Original Module - - Returns: - Union[torch.nn.Module, None]: optimized Phi-3 module - """ - if layer.__class__.__name__ == "Phi3MLP": - return layer - return None - - @module_optimization def weights_quantization( name: str, layer: torch.nn.Module From 28902996628ff017ee279545f556b30062850417 Mon Sep 17 00:00:00 2001 From: SarahByrneIntel Date: Fri, 19 Jul 2024 13:17:40 +0100 Subject: [PATCH 18/18] Update for quantization of a model --- intel_npu_acceleration_library/compiler.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/intel_npu_acceleration_library/compiler.py b/intel_npu_acceleration_library/compiler.py index 656ec23..ea357f4 100644 --- a/intel_npu_acceleration_library/compiler.py +++ b/intel_npu_acceleration_library/compiler.py @@ -69,8 +69,7 @@ def compile(model: torch.nn.Module, config: CompilerConfig) -> torch.nn.Module: if config.dtype in (int8, int4): # Quantize model model = quantize_model(model, config.dtype) - if config.use_to: - weights_quantization(model) + weights_quantization(model) if not config.use_to: create_npu_kernels(model) @@ -122,15 +121,21 @@ def wrapper(model: torch.nn.Module, *args: Any, **kwargs: Any): kwargs (Any): keyword arguments """ - if not isinstance(model, NPUModuleWrapper): + if not isinstance(model, NPUModuleWrapper) or kwargs.get( + "ignore_isinstance", False + ): for name, layer in model.named_children(): new_layer = func(name, layer, *args, **kwargs) if new_layer: model.add_module(name, new_layer) - if not isinstance(new_layer, NPUModuleWrapper): + if not isinstance(new_layer, NPUModuleWrapper) or kwargs.get( + "ignore_isinstance", False + ): wrapper(new_layer, *args, **kwargs) else: - if not isinstance(layer, NPUModuleWrapper): + if not isinstance(layer, NPUModuleWrapper) or kwargs.get( + "ignore_isinstance", False + ): wrapper(layer, *args, **kwargs) return wrapper @@ -206,13 +211,14 @@ def optimize_llama_attention( @module_optimization def weights_quantization( - name: str, layer: torch.nn.Module + name: str, layer: torch.nn.Module, ignore_isinstance: bool = True ) -> Union[torch.nn.Module, None]: """Apply weights quantization. Args: name (str): Layer name layer (torch.nn.Module): Original torch.nn.Linear module + ignore_isinstance (bool): ignore isinstance check in module_optimization. Defaults to True. Raises: RuntimeError: unsupported quantization bits
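Taken together, the patches above settle on a single calling convention: build a `CompilerConfig` and pass it to `intel_npu_acceleration_library.compile`. The sketch below shows that end-to-end flow on a toy network; the `TinyMLP` module, the tensor shapes and the int8 choice are illustrative placeholders rather than code from this series, and running it assumes the library is installed and an Intel NPU is available.

```python
# Illustrative sketch of the post-patch API: CompilerConfig + compile.
# TinyMLP, the shapes and the int8 choice are placeholders, not library code.
import torch
import intel_npu_acceleration_library
from intel_npu_acceleration_library.compiler import CompilerConfig


class TinyMLP(torch.nn.Module):
    def __init__(self, hidden: int = 256):
        super().__init__()
        self.fc1 = torch.nn.Linear(hidden, hidden)
        self.fc2 = torch.nn.Linear(hidden, hidden)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.fc2(torch.nn.functional.relu(self.fc1(x)))


model = TinyMLP()

# Quantize the weights to int8 and lower the model for the NPU.
compiler_conf = CompilerConfig(dtype=torch.int8)
compiled_model = intel_npu_acceleration_library.compile(model, compiler_conf)

with torch.no_grad():
    # Half-precision input; the compiled path runs in fp16 per the npu backend above.
    y = compiled_model(torch.rand(1, 256).half())
```

With int8 or int4 the weights are quantized before lowering (`quantize_model` followed by `weights_quantization`, as in the final patches), while a plain `CompilerConfig()` appears to keep the fp16 path used by the `npu` torch.compile backend.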