
Commit 1089e80

Author: Sanggyu Lee (committed)
Fuse LlamaAttention to attention (onert)

It fuses LlamaAttention from the TinyLlama model. The fused attention works as the onert attention op.

TICO-DCO-1.0-Signed-off-by: Sanggyu Lee <[email protected]>
1 parent fc4cb16 commit 1089e80
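In practice the fusion is applied by monkey-patching LlamaAttention.forward with the adapter added in this commit before calling tico.convert. A condensed sketch of what the example script below does (captured_input is the decode-phase input recorded with RecordingInput, as shown in the script):

import tico
from transformers import AutoModelForCausalLM
from transformers.models.llama.modeling_llama import LlamaAttention
from tico.serialize.operators.adapters.onert.op_attention import (
    llama_attention_forward_adapter,
)

# Route every LlamaAttention.forward through the adapter so torch.export sees one
# circle::attention.llama call per decoder layer instead of the decomposed attention math.
LlamaAttention.forward = llama_attention_forward_adapter

model = AutoModelForCausalLM.from_pretrained("Maykeye/TinyLLama-v0").eval()
circle_model = tico.convert(model, captured_input)  # captured_input: recorded decode-phase inputs
circle_model.save("tinyllama.decode.circle")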

File tree

5 files changed, +228 -0 lines changed
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# DO NOT REMOVE THIS FILE
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
# User input
prompt = "Lily picked up a flower."
model_name = "Maykeye/TinyLLama-v0"

# Tokenizer
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"
inputs = tokenizer(
    prompt,
    return_tensors="pt",
    padding="max_length",
    max_length=30,
    truncation=True,
)

# Generator
import torch

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()

from tico.utils.record_input import RecordingInput

# past_key_values
# ---------------
# During prefill, "past_key_values" is not None but an empty Cache instance.
# Passing None makes torch.export happy.


input_to_remove = [
    "attention_mask",
    # For left pad, [0, ⋯, 0, 1, ⋯, 1]
    # For right pad, [1, ⋯, 1, 0, ⋯, 0]
    # ( 0 is pad-token )
    # This script uses right pad and passes an all-1 attention mask (including pad).
    # The NPU computes all positions whether they are pad or not.
]
condition_fn = lambda args_dict: args_dict["past_key_values"].get_seq_length() != 0
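# Note: with transformers' DynamicCache, get_seq_length() is 0 during prefill (empty cache)
# and greater than 0 afterwards, so this condition is meant to capture a decode-phase call,
# whose inputs become the sample used for tinyllama.decode.circle below.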

with torch.no_grad(), RecordingInput(
    model, condition_fn, input_to_remove=input_to_remove
) as rec:
    outputs = model.generate(
        **inputs,
        max_new_tokens=32,
        do_sample=False,
        pad_token_id=tokenizer.eos_token_id,
    )
    captured_input = rec.captured_input

generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(generated_text)

# Tico
import tico
from tico.serialize.operators.adapters.onert.op_attention import (
    llama_attention_forward_adapter,
)
from transformers.models.llama.modeling_llama import LlamaAttention

LlamaAttention.forward = llama_attention_forward_adapter

model = AutoModelForCausalLM.from_pretrained(model_name)
model.eval()
circle_model = tico.convert(model, captured_input)
circle_model.save("tinyllama.decode.circle")
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
transformers>=4.50.1
Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
# DO NOT REMOVE THIS FILE
Lines changed: 154 additions & 0 deletions
@@ -0,0 +1,154 @@
# Copyright (c) 2025 Samsung Electronics Co., Ltd. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, TYPE_CHECKING

if TYPE_CHECKING:
    import torch._ops
    import torch.fx
import torch
from circle_schema import circle

from torch.library import Library

from tico.serialize.circle_graph import CircleSubgraph
from tico.serialize.operators.hashable_opcode import OpCode
from tico.serialize.operators.node_visitor import NodeVisitor, register_node_visitor
from tico.serialize.operators.utils import create_builtin_operator, get_op_index

lib = Library("circle", "DEF")
lib.define(
    """
    attention.llama(
        Tensor hidden_states,
        Tensor wq,
        Tensor wk,
        Tensor wv,
        Tensor wo,
        Tensor position_cos,
        Tensor position_sin,
        Tensor attention_mask,
        Tensor past_key,
        Tensor past_value,
        Tensor cache_position
    ) -> Tensor
    """
)
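
# The op takes everything one fused decoder-layer attention consumes: the layer input,
# the q/k/v/o projection weights, rotary cos/sin, the attention mask, this layer's
# key/value cache tensors, and cache_position; its single output is the attention
# block's output tensor.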

# ATTENTION FUSER


@torch.library.register_fake("circle::attention.llama")
def attention_llama(*args, **kwargs):
    (
        hidden_states,
        q_proj,
        k_proj,
        v_proj,
        o_proj,
        position_cos,
        position_sin,
        attention_mask,
        past_key,
        past_value,
        cache_position,
    ) = args
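    # A fake (meta) kernel only has to report the output's shape/dtype for tracing; the
    # fused attention output has the same shape as hidden_states, so it serves as the proxy.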
    return hidden_states


from typing import List, Optional

from transformers.cache_utils import DynamicCache
from transformers.models.llama.modeling_llama import LlamaAttention


def llama_attention_forward_adapter(
    self: LlamaAttention,
    hidden_states: torch.Tensor,
    position_embeddings: List[torch.Tensor],
    attention_mask: torch.Tensor,
    past_key_value: DynamicCache,
    cache_position: torch.Tensor,
    **kwargs,
):
    # past_key_value is a DynamicCache holding key_cache and value_cache.
    # It needs to be decomposed for tico and circle, which do not know about such containers.
    key_cache = past_key_value.key_cache  # type: ignore[union-attr]
    value_cache = past_key_value.value_cache  # type: ignore[union-attr]
    return (
        torch.ops.circle.attention.llama(
            hidden_states,
            self.q_proj.weight,
            self.k_proj.weight,
            self.v_proj.weight,
            self.o_proj.weight,
            position_embeddings[0],  # cos
            position_embeddings[1],  # sin
            attention_mask,
            # key_cache is a list with one cache tensor per decoder layer.
            # Assumption: the key cache is contiguous:
            #
            #   k_cache[0] | k_cache[1] | ... | k_cache[n]
            key_cache[self.layer_idx],
            value_cache[self.layer_idx],  # Same for value_cache
            cache_position,
        ),
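        # LlamaAttention.forward returns (attn_output, attn_weights); None stands in for
        # the attention weights, which the fused op does not compute.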
        None,
    )


@register_node_visitor
class AttentionVisitor(NodeVisitor):
    target: List[torch._ops.OpOverload] = [
        torch.ops.circle.attention.llama,
    ]

    def __init__(self, op_codes: Dict[OpCode, int], graph: CircleSubgraph):
        super().__init__(op_codes, graph)

    def define_node(
        self,
        node: torch.fx.Node,
    ) -> circle.Operator.OperatorT:
        (
            hidden_states,
            wq,
            wk,
            wv,
            wo,
            position_cos,
            position_sin,
            attention_mask,
            past_key,
            past_value,
            cache_position,
        ) = node.args

        op_index = get_op_index(
            circle.BuiltinOperator.BuiltinOperator.ATTENTION, self._op_codes
        )

        # Remove the last arg from inputs; it is treated as the attention op's param,
        # not a graph input.
        inputs = node.args[:-1]
        outputs = [node]
        operator = create_builtin_operator(self.graph, op_index, inputs, outputs)

        # Op-specific option
        operator.builtinOptionsType = (
            circle.BuiltinOptions.BuiltinOptions.AttentionOptions
        )
        operator.builtinOptions = circle.AttentionOptions.AttentionOptionsT()

        return operator
