diff --git a/docs/en/transformer_entries/OLMoTransformer.md b/docs/en/transformer_entries/OLMoTransformer.md
new file mode 100644
index 00000000000000..77f7235481d9c4
--- /dev/null
+++ b/docs/en/transformer_entries/OLMoTransformer.md
@@ -0,0 +1,135 @@
+{%- capture title -%}
+OLMoTransformer
+{%- endcapture -%}
+
+{%- capture description -%}
+OLMo, a series of Open Language Models, is designed to enable the science of language models. These models are trained on the Dolma dataset and released with open weights, data, and code for language model research and application. The OLMo models support various NLP tasks, including text generation and summarization.
+
+Pretrained models can be loaded using the `pretrained` method from the companion object:
+
+```scala
+val olmo = OLMoTransformer.pretrained()
+  .setInputCols("document")
+  .setOutputCol("generation")
+```
+
+The default model is `"olmo_1b_int4"`, if no name is provided.
+
+For available pretrained models please see the
+[Models Hub](https://sparknlp.org/models?q=OLMo).
+
+For extended examples of usage, see
+[OLMoTestSpec](https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTestSpec.scala).
+
+**Sources**:
+[OLMo Project Page](https://allenai.org/olmo)
+[OLMo GitHub Repository](https://github.com/allenai/OLMo)
+[OLMo: Accelerating the Science of Language Models (Paper)](https://arxiv.org/pdf/2402.00838.pdf)
+
+**Paper abstract**
+
+*Language models (LMs) have become ubiquitous in both NLP research and commercial products.
+As their commercial importance has surged, the most powerful models have become proprietary,
+limiting scientific study. OLMo addresses this gap by offering an open-source framework,
+including training data, models, and code.
+This initiative aims to empower the research community,
+fostering transparency and innovation in language model development.*
+{%- endcapture -%}
+
+{%- capture input_anno -%}
+DOCUMENT
+{%- endcapture -%}
+
+{%- capture output_anno -%}
+DOCUMENT
+{%- endcapture -%}
+
+{%- capture python_example -%}
+import sparknlp
+from sparknlp.base import *
+from sparknlp.annotator import *
+from pyspark.ml import Pipeline
+
+# Document Assembler
+document_assembler = DocumentAssembler() \
+    .setInputCol("text") \
+    .setOutputCol("document")
+
+# OLMo Transformer
+olmo = OLMoTransformer.pretrained("olmo_1b_int4") \
+    .setInputCols(["document"]) \
+    .setMinOutputLength(10) \
+    .setMaxOutputLength(50) \
+    .setDoSample(False) \
+    .setTopK(50) \
+    .setNoRepeatNgramSize(3) \
+    .setOutputCol("generation")
+
+# Pipeline
+pipeline = Pipeline(stages=[document_assembler, olmo])
+
+# Sample Data
+data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+result = pipeline.fit(data).transform(data)
+
+# Display Results
+result.select("generation.result").show(truncate=False)
+
+{%- endcapture -%}
+
+{%- capture scala_example -%}
+import spark.implicits._
+import com.johnsnowlabs.nlp.base.DocumentAssembler
+import com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer
+import org.apache.spark.ml.Pipeline
+
+// Document Assembler
+val documentAssembler = new DocumentAssembler()
+  .setInputCol("text")
+  .setOutputCol("document")
+
+// OLMo Transformer
+val olmo = OLMoTransformer.pretrained("olmo_1b_int4")
+  .setInputCols(Array("document"))
+  .setMinOutputLength(10)
+  .setMaxOutputLength(50)
+  .setDoSample(false)
+  .setTopK(50)
+  .setNoRepeatNgramSize(3)
+  .setOutputCol("generation")
+
+// Pipeline
+val pipeline = new Pipeline().setStages(Array(documentAssembler, olmo))
+
+// Sample Data
+val data = Seq("My name is Leonardo.").toDF("text")
+val result = pipeline.fit(data).transform(data)
+
+// Display Results
+result.select("generation.result").show(truncate = false)
+
+{%- endcapture -%}
+
+{%- capture api_link -%}
+[OLMoTransformer](/api/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTransformer)
+{%- endcapture -%}
+
+{%- capture python_api_link -%}
+[OLMoTransformer](/api/python/reference/autosummary/sparknlp/annotator/seq2seq/olmo_transformer/index.html#sparknlp.annotator.seq2seq.olmo_transformer.OLMoTransformer)
+{%- endcapture -%}
+
+{%- capture source_link -%}
+[OLMoTransformer](https://github.com/JohnSnowLabs/spark-nlp/tree/master/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTransformer.scala)
+{%- endcapture -%}
+
+{% include templates/anno_template.md
+title=title
+description=description
+input_anno=input_anno
+output_anno=output_anno
+python_example=python_example
+scala_example=scala_example
+api_link=api_link
+python_api_link=python_api_link
+source_link=source_link
+%}
\ No newline at end of file
diff --git a/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_OLMO.ipynb b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_OLMO.ipynb
new file mode 100644
index 00000000000000..dd2d0b08b4f8f7
--- /dev/null
+++ b/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_OLMO.ipynb
@@ -0,0 +1,1217 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GB-OotnsS-JG"
+   },
+   "source": [
+    "![JohnSnowLabs](https://sparknlp.org/assets/images/logo.png)\n",
+    "\n",
+    "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/JohnSnowLabs/spark-nlp/blob/master/examples/python/transformers/onnx/HuggingFace_ONNX_in_Spark_NLP_OLMO.ipynb)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "gRuRMH7QS-JI"
+   },
+   "source": [
+    "## Import ONNX OLMo models from HuggingFace 🤗 into Spark NLP 🚀\n",
+    "\n",
+    "Let's keep in mind a few things before we start 😊\n",
+    "\n",
+    "- ONNX support was introduced in `Spark NLP 5.0.0`, enabling high-performance inference for models.\n",
+    "- You can import OLMo models via `OLMoTransformer`. These models are usually under the `Text Generation` category and have `olmo` in their labels.\n",
+    "- This is a very computationally expensive module, especially on longer sequences. The use of an accelerator such as a GPU is recommended.\n",
+    "- Reference: [OlmoModel](https://huggingface.co/docs/transformers/en/model_doc/olmo)\n",
+    "- Some [example models](https://huggingface.co/models?other=olmo)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Vd98DUZxS-JJ"
+   },
+   "source": [
+    "## Export and Save HuggingFace model\n",
+    "\n",
+    "- Let's install the `transformers` package with the `onnx` extension and its dependencies. You don't need `onnx` to be installed for Spark NLP, however, we need it to load and save models from HuggingFace.\n",
+    "- We lock `transformers` to version `4.41.0`. This doesn't mean it won't work with future releases.\n",
+    "- We will also need `sentencepiece` for tokenization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 0
+    },
+    "id": "wFf3GagOS-JJ",
+    "outputId": "78b6529d-afad-414c-baa3-e8087061072f"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Requirement already satisfied: optimum in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (1.24.0)\n",
+      "Requirement already satisfied: sentencepiece in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (0.2.0)\n",
+      "Requirement already satisfied: onnx in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (1.17.0)\n",
+      "Requirement already satisfied: onnxruntime in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (1.19.2)\n",
+      "Collecting ai2-olmo\n",
+      "  Downloading ai2_olmo-0.6.0-py3-none-any.whl.metadata (25 kB)\n",
+      "Requirement already satisfied: transformers>=4.29 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from optimum) (4.41.0)\n",
+      "Requirement already satisfied: torch>=1.11 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from optimum) (2.6.0)\n",
+      "Requirement already satisfied: packaging in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from optimum) (24.2)\n",
+      "Requirement already satisfied: numpy in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from optimum) (2.0.2)\n",
+      "Requirement already satisfied: huggingface-hub>=0.8.0 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from optimum) (0.28.1)\n",
+      "Requirement already satisfied: protobuf>=3.20.2 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from onnx) (3.20.2)\n",
+      "Requirement already satisfied: coloredlogs in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from onnxruntime) (15.0.1)\n",
+      "Requirement already satisfied: flatbuffers in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from onnxruntime) (25.2.10)\n",
+ "Requirement already satisfied: sympy in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from onnxruntime) (1.13.1)\n", + "Collecting numpy (from optimum)\n", + " Using cached numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)\n", + "Collecting ai2-olmo-core==0.1.0 (from ai2-olmo)\n", + " Downloading ai2_olmo_core-0.1.0-py3-none-any.whl.metadata (14 kB)\n", + "Collecting omegaconf (from ai2-olmo)\n", + " Using cached omegaconf-2.3.0-py3-none-any.whl.metadata (3.9 kB)\n", + "Collecting rich (from ai2-olmo)\n", + " Downloading rich-13.9.4-py3-none-any.whl.metadata (18 kB)\n", + "Collecting boto3 (from ai2-olmo)\n", + " Downloading boto3-1.36.18-py3-none-any.whl.metadata (6.7 kB)\n", + "Collecting google-cloud-storage (from ai2-olmo)\n", + " Downloading google_cloud_storage-3.0.0-py2.py3-none-any.whl.metadata (12 kB)\n", + "Requirement already satisfied: tokenizers in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from ai2-olmo) (0.19.1)\n", + "Collecting cached_path>=1.6.2 (from ai2-olmo)\n", + " Downloading cached_path-1.6.7-py3-none-any.whl.metadata (19 kB)\n", + "Collecting importlib_resources (from ai2-olmo)\n", + " Downloading importlib_resources-6.5.2-py3-none-any.whl.metadata (3.9 kB)\n", + "Requirement already satisfied: safetensors in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from ai2-olmo-core==0.1.0->ai2-olmo) (0.5.2)\n", + "Collecting pydantic<3.0,>=2.0 (from ai2-olmo-core==0.1.0->ai2-olmo)\n", + " Downloading pydantic-2.10.6-py3-none-any.whl.metadata (30 kB)\n", + "Requirement already satisfied: requests in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from ai2-olmo-core==0.1.0->ai2-olmo) (2.32.3)\n", + "Requirement already satisfied: filelock<4.0,>=3.4 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from cached_path>=1.6.2->ai2-olmo) (3.17.0)\n", + "Collecting google-cloud-storage (from ai2-olmo)\n", + " Downloading google_cloud_storage-2.19.0-py2.py3-none-any.whl.metadata (9.1 kB)\n", + "Collecting huggingface-hub>=0.8.0 (from optimum)\n", + " Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)\n", + "Collecting botocore<1.37.0,>=1.36.18 (from boto3->ai2-olmo)\n", + " Downloading botocore-1.36.18-py3-none-any.whl.metadata (5.7 kB)\n", + "Collecting jmespath<2.0.0,>=0.7.1 (from boto3->ai2-olmo)\n", + " Using cached jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)\n", + "Collecting s3transfer<0.12.0,>=0.11.0 (from boto3->ai2-olmo)\n", + " Downloading s3transfer-0.11.2-py3-none-any.whl.metadata (1.7 kB)\n", + "Collecting google-auth<3.0dev,>=2.26.1 (from google-cloud-storage->ai2-olmo)\n", + " Downloading google_auth-2.38.0-py2.py3-none-any.whl.metadata (4.8 kB)\n", + "Collecting google-api-core<3.0.0dev,>=2.15.0 (from google-cloud-storage->ai2-olmo)\n", + " Downloading google_api_core-2.24.1-py3-none-any.whl.metadata (3.0 kB)\n", + "Collecting google-cloud-core<3.0dev,>=2.3.0 (from google-cloud-storage->ai2-olmo)\n", + " Using cached google_cloud_core-2.4.1-py2.py3-none-any.whl.metadata (2.7 kB)\n", + "Collecting google-resumable-media>=2.7.2 (from google-cloud-storage->ai2-olmo)\n", + " Downloading google_resumable_media-2.7.2-py2.py3-none-any.whl.metadata (2.2 kB)\n", + "Collecting google-crc32c<2.0dev,>=1.0 (from google-cloud-storage->ai2-olmo)\n", + " Downloading google_crc32c-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (2.3 kB)\n", + "Requirement already satisfied: fsspec>=2023.5.0 in 
/home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from huggingface-hub>=0.8.0->optimum) (2025.2.0)\n", + "Requirement already satisfied: pyyaml>=5.1 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from huggingface-hub>=0.8.0->optimum) (6.0.2)\n", + "Requirement already satisfied: tqdm>=4.42.1 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from huggingface-hub>=0.8.0->optimum) (4.67.1)\n", + "Requirement already satisfied: typing-extensions>=3.7.4.3 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from huggingface-hub>=0.8.0->optimum) (4.12.2)\n", + "Collecting markdown-it-py>=2.2.0 (from rich->ai2-olmo)\n", + " Using cached markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)\n", + "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from rich->ai2-olmo) (2.19.1)\n", + "Requirement already satisfied: networkx in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (3.2.1)\n", + "Requirement already satisfied: jinja2 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (3.1.5)\n", + "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.4.127 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.4.127)\n", + "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.4.127 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.4.127)\n", + "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.4.127 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.4.127)\n", + "Requirement already satisfied: nvidia-cudnn-cu12==9.1.0.70 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (9.1.0.70)\n", + "Requirement already satisfied: nvidia-cublas-cu12==12.4.5.8 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.4.5.8)\n", + "Requirement already satisfied: nvidia-cufft-cu12==11.2.1.3 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (11.2.1.3)\n", + "Requirement already satisfied: nvidia-curand-cu12==10.3.5.147 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (10.3.5.147)\n", + "Requirement already satisfied: nvidia-cusolver-cu12==11.6.1.9 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (11.6.1.9)\n", + "Requirement already satisfied: nvidia-cusparse-cu12==12.3.1.170 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.3.1.170)\n", + "Requirement already satisfied: nvidia-cusparselt-cu12==0.6.2 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (0.6.2)\n", + "Requirement already satisfied: nvidia-nccl-cu12==2.21.5 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (2.21.5)\n", + "Requirement already satisfied: nvidia-nvtx-cu12==12.4.127 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.4.127)\n", + "Requirement already satisfied: nvidia-nvjitlink-cu12==12.4.127 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from torch>=1.11->optimum) (12.4.127)\n", + "Requirement already satisfied: triton==3.2.0 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from 
torch>=1.11->optimum) (3.2.0)\n", + "Requirement already satisfied: mpmath<1.4,>=1.1.0 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from sympy->onnxruntime) (1.3.0)\n", + "Requirement already satisfied: regex!=2019.12.17 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from transformers>=4.29->optimum) (2024.11.6)\n", + "Requirement already satisfied: humanfriendly>=9.1 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from coloredlogs->onnxruntime) (10.0)\n", + "Collecting zipp>=3.1.0 (from importlib_resources->ai2-olmo)\n", + " Downloading zipp-3.21.0-py3-none-any.whl.metadata (3.7 kB)\n", + "Collecting antlr4-python3-runtime==4.9.* (from omegaconf->ai2-olmo)\n", + " Using cached antlr4_python3_runtime-4.9.3-py3-none-any.whl\n", + "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from botocore<1.37.0,>=1.36.18->boto3->ai2-olmo) (2.9.0.post0)\n", + "Collecting urllib3<1.27,>=1.25.4 (from botocore<1.37.0,>=1.36.18->boto3->ai2-olmo)\n", + " Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)\n", + "Collecting googleapis-common-protos<2.0.dev0,>=1.56.2 (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->ai2-olmo)\n", + " Downloading googleapis_common_protos-1.67.0rc1-py2.py3-none-any.whl.metadata (5.1 kB)\n", + "Collecting proto-plus<2.0.0dev,>=1.22.3 (from google-api-core<3.0.0dev,>=2.15.0->google-cloud-storage->ai2-olmo)\n", + " Downloading proto_plus-1.26.0-py3-none-any.whl.metadata (2.2 kB)\n", + "Collecting cachetools<6.0,>=2.0.0 (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->ai2-olmo)\n", + " Downloading cachetools-5.5.1-py3-none-any.whl.metadata (5.4 kB)\n", + "Collecting pyasn1-modules>=0.2.1 (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->ai2-olmo)\n", + " Downloading pyasn1_modules-0.4.1-py3-none-any.whl.metadata (3.5 kB)\n", + "Collecting rsa<5,>=3.1.4 (from google-auth<3.0dev,>=2.26.1->google-cloud-storage->ai2-olmo)\n", + " Using cached rsa-4.9-py3-none-any.whl.metadata (4.2 kB)\n", + "Collecting mdurl~=0.1 (from markdown-it-py>=2.2.0->rich->ai2-olmo)\n", + " Using cached mdurl-0.1.2-py3-none-any.whl.metadata (1.6 kB)\n", + "Collecting annotated-types>=0.6.0 (from pydantic<3.0,>=2.0->ai2-olmo-core==0.1.0->ai2-olmo)\n", + " Using cached annotated_types-0.7.0-py3-none-any.whl.metadata (15 kB)\n", + "Collecting pydantic-core==2.27.2 (from pydantic<3.0,>=2.0->ai2-olmo-core==0.1.0->ai2-olmo)\n", + " Downloading pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)\n", + "Requirement already satisfied: charset-normalizer<4,>=2 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from requests->ai2-olmo-core==0.1.0->ai2-olmo) (3.4.1)\n", + "Requirement already satisfied: idna<4,>=2.5 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from requests->ai2-olmo-core==0.1.0->ai2-olmo) (3.10)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from requests->ai2-olmo-core==0.1.0->ai2-olmo) (2025.1.31)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from jinja2->torch>=1.11->optimum) (3.0.2)\n", + "Collecting pyasn1<0.7.0,>=0.4.6 (from pyasn1-modules>=0.2.1->google-auth<3.0dev,>=2.26.1->google-cloud-storage->ai2-olmo)\n", + " Downloading pyasn1-0.6.1-py3-none-any.whl.metadata (8.4 kB)\n", + "Requirement already 
satisfied: six>=1.5 in /home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.37.0,>=1.36.18->boto3->ai2-olmo) (1.17.0)\n", + "Downloading ai2_olmo-0.6.0-py3-none-any.whl (144.9 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m144.9/144.9 MB\u001b[0m \u001b[31m14.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n", + "\u001b[?25hDownloading ai2_olmo_core-0.1.0-py3-none-any.whl (56 kB)\n", + "Downloading cached_path-1.6.7-py3-none-any.whl (35 kB)\n", + "Downloading boto3-1.36.18-py3-none-any.whl (139 kB)\n", + "Downloading google_cloud_storage-2.19.0-py2.py3-none-any.whl (131 kB)\n", + "Downloading huggingface_hub-0.27.1-py3-none-any.whl (450 kB)\n", + "Using cached numpy-1.26.4-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (18.2 MB)\n", + "Downloading rich-13.9.4-py3-none-any.whl (242 kB)\n", + "Downloading importlib_resources-6.5.2-py3-none-any.whl (37 kB)\n", + "Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n", + "Downloading botocore-1.36.18-py3-none-any.whl (13.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m13.3/13.3 MB\u001b[0m \u001b[31m36.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m \u001b[36m0:00:01\u001b[0m\n", + "\u001b[?25hDownloading google_api_core-2.24.1-py3-none-any.whl (160 kB)\n", + "Downloading google_auth-2.38.0-py2.py3-none-any.whl (210 kB)\n", + "Downloading google_cloud_core-2.4.1-py2.py3-none-any.whl (29 kB)\n", + "Downloading google_crc32c-1.6.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (37 kB)\n", + "Downloading google_resumable_media-2.7.2-py2.py3-none-any.whl (81 kB)\n", + "Downloading jmespath-1.0.1-py3-none-any.whl (20 kB)\n", + "Using cached markdown_it_py-3.0.0-py3-none-any.whl (87 kB)\n", + "Downloading pydantic-2.10.6-py3-none-any.whl (431 kB)\n", + "Downloading pydantic_core-2.27.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m35.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hDownloading s3transfer-0.11.2-py3-none-any.whl (84 kB)\n", + "Downloading zipp-3.21.0-py3-none-any.whl (9.6 kB)\n", + "Using cached annotated_types-0.7.0-py3-none-any.whl (13 kB)\n", + "Downloading cachetools-5.5.1-py3-none-any.whl (9.5 kB)\n", + "Downloading googleapis_common_protos-1.67.0rc1-py2.py3-none-any.whl (165 kB)\n", + "Using cached mdurl-0.1.2-py3-none-any.whl (10.0 kB)\n", + "Downloading proto_plus-1.26.0-py3-none-any.whl (50 kB)\n", + "Downloading pyasn1_modules-0.4.1-py3-none-any.whl (181 kB)\n", + "Using cached rsa-4.9-py3-none-any.whl (34 kB)\n", + "Downloading urllib3-1.26.20-py2.py3-none-any.whl (144 kB)\n", + "Downloading pyasn1-0.6.1-py3-none-any.whl (83 kB)\n", + "Installing collected packages: antlr4-python3-runtime, zipp, urllib3, pydantic-core, pyasn1, proto-plus, omegaconf, numpy, mdurl, jmespath, googleapis-common-protos, google-crc32c, cachetools, annotated-types, rsa, pydantic, pyasn1-modules, markdown-it-py, importlib_resources, google-resumable-media, botocore, s3transfer, rich, huggingface-hub, google-auth, google-api-core, boto3, google-cloud-core, google-cloud-storage, cached_path, ai2-olmo-core, ai2-olmo\n", + " Attempting uninstall: urllib3\n", + " Found existing installation: urllib3 2.3.0\n", + " Uninstalling urllib3-2.3.0:\n", + " Successfully uninstalled urllib3-2.3.0\n", + " Attempting uninstall: 
numpy\n",
+      "    Found existing installation: numpy 2.0.2\n",
+      "    Uninstalling numpy-2.0.2:\n",
+      "      Successfully uninstalled numpy-2.0.2\n",
+      "  Attempting uninstall: huggingface-hub\n",
+      "    Found existing installation: huggingface-hub 0.28.1\n",
+      "    Uninstalling huggingface-hub-0.28.1:\n",
+      "      Successfully uninstalled huggingface-hub-0.28.1\n",
+      "Successfully installed ai2-olmo-0.6.0 ai2-olmo-core-0.1.0 annotated-types-0.7.0 antlr4-python3-runtime-4.9.3 boto3-1.36.18 botocore-1.36.18 cached_path-1.6.7 cachetools-5.5.1 google-api-core-2.24.1 google-auth-2.38.0 google-cloud-core-2.4.1 google-cloud-storage-2.19.0 google-crc32c-1.6.0 google-resumable-media-2.7.2 googleapis-common-protos-1.67.0rc1 huggingface-hub-0.27.1 importlib_resources-6.5.2 jmespath-1.0.1 markdown-it-py-3.0.0 mdurl-0.1.2 numpy-1.26.4 omegaconf-2.3.0 proto-plus-1.26.0 pyasn1-0.6.1 pyasn1-modules-0.4.1 pydantic-2.10.6 pydantic-core-2.27.2 rich-13.9.4 rsa-4.9 s3transfer-0.11.2 urllib3-1.26.20 zipp-3.21.0\n"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install -q --upgrade transformers[onnx]==4.41.0\n",
+    "!pip install optimum sentencepiece onnx onnxruntime ai2-olmo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "GX1TUzkhS-JK"
+   },
+   "source": [
+    "- HuggingFace has an extension called Optimum which offers specialized model inference, including ONNX. We can use it to import and export ONNX models with `from_pretrained` and `save_pretrained`.\n",
+    "- We'll use the [allenai/OLMo-1B-hf](https://huggingface.co/allenai/OLMo-1B-hf) model from HuggingFace as an example.\n",
+    "- In addition to the `OLMo` model we also need to save the tokenizer. This is the same for every model; these are the assets needed for tokenization inside Spark NLP.\n",
+    "- If we want to optimize the model, a GPU will be needed. Make sure to select the correct runtime."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "-ibF3eK_S-JK"
+   },
+   "outputs": [],
+   "source": [
+    "MODEL_NAME = \"allenai/OLMo-1B-hf\"\n",
+    "\n",
+    "# Path to store the exported models\n",
+    "EXPORT_PATH = f\"onnx_models/{MODEL_NAME}\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 0
+    },
+    "id": "kDH5EpwnS-JK",
+    "outputId": "e32328ad-45b3-4d6c-d5d6-d2da88dbbd4a"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "/home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "config.json: 100%|█████████████████████████████| 632/632 [00:00<00:00, 38.6kB/s]\n",
+      "model.safetensors: 100%|███████████████████| 4.71G/4.71G [03:24<00:00, 23.1MB/s]\n",
+      "generation_config.json: 100%|██████████████████| 116/116 [00:00<00:00, 12.9kB/s]\n",
+      "tokenizer_config.json: 100%|████████████████| 5.37k/5.37k [00:00<00:00, 698kB/s]\n",
+      "tokenizer.json: 100%|██████████████████████| 2.12M/2.12M [00:00<00:00, 2.45MB/s]\n",
+      "special_tokens_map.json: 100%|███████████████| 65.0/65.0 [00:00<00:00, 25.5kB/s]\n",
+      "/home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n",
+      "/home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages/transformers/models/olmo/modeling_olmo.py:1039: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
+      "  if sequence_length != 1:\n",
+      "Weight deduplication check in the ONNX export requires accelerate. Please install accelerate to run it.\n",
+      "\t\t-[x] values not close enough, max diff: 0.0007228851318359375 (atol: 0.0001)\n",
+      "The ONNX export succeeded with the warning: The maximum absolute difference between the output of the reference model and the ONNX exported model is not within the set tolerance 0.0001:\n",
+      "- logits: max diff = 0.0007228851318359375.\n",
+      " The exported model was saved at: onnx_models/allenai/OLMo-1B-hf\n"
+     ]
+    }
+   ],
+   "source": [
+    "!optimum-cli export onnx --trust-remote-code --task text-generation --model {MODEL_NAME} {EXPORT_PATH}"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "oDAyLDCcS-JL"
+   },
+   "source": [
+    "Let's have a look inside the export directory and see what we are dealing with:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 0
+    },
+    "id": "jp2ssmF2S-JL",
+    "outputId": "7c3379db-18cd-4990-de7e-51e9b4eada8c"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 5001720\n",
+      "-rw-rw-r-- 1 prabod prabod 646 Feb 12 03:51 config.json\n",
+      "-rw-rw-r-- 1 prabod prabod 111 Feb 12 03:51 generation_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod 468660 Feb 12 03:52 model.onnx\n",
+      "-rw-rw-r-- 1 prabod prabod 5119148032 Feb 12 03:52 model.onnx_data\n",
+      "-rw-rw-r-- 1 prabod prabod 293 Feb 12 03:51 special_tokens_map.json\n",
+      "-rw-rw-r-- 1 prabod prabod 5372 Feb 12 03:51 tokenizer_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod 2115417 Feb 12 03:51 tokenizer.json\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -l {EXPORT_PATH}"
+   ]
+  },
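+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optionally, we can sanity-check the exported model with Optimum's ONNX Runtime API before preparing the Spark NLP assets. This is a minimal sketch (the prompt and generation settings here are only illustrative, and loading the float32 graph needs roughly 5 GB of memory):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional sanity check of the exported ONNX model (a sketch, not required for the import)\n",
+    "from optimum.onnxruntime import ORTModelForCausalLM\n",
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "ort_model = ORTModelForCausalLM.from_pretrained(EXPORT_PATH)\n",
+    "ort_tokenizer = AutoTokenizer.from_pretrained(EXPORT_PATH)\n",
+    "\n",
+    "inputs = ort_tokenizer(\"My name is Leonardo.\", return_tensors=\"pt\")\n",
+    "generated = ort_model.generate(**inputs, max_new_tokens=10)\n",
+    "print(ort_tokenizer.batch_decode(generated, skip_special_tokens=True)[0])"
+   ]
+  },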
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "TJ-z0eSzS-JL"
+   },
+   "source": [
+    "- Next, we need to save the tokenizer assets into an `assets` folder inside the export directory, which is where Spark NLP will look for them. Note that this model uses a BPE tokenizer (`vocab.json` and `merges.txt`) rather than a SentencePiece `spiece.model`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/home/prabod/anaconda3/envs/olmo/lib/python3.9/site-packages/huggingface_hub/file_download.py:795: FutureWarning: `resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "('onnx_models/allenai/OLMo-1B-hf/assets/tokenizer_config.json',\n",
+       " 'onnx_models/allenai/OLMo-1B-hf/assets/special_tokens_map.json',\n",
+       " 'onnx_models/allenai/OLMo-1B-hf/assets/tokenizer.json')"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from transformers import AutoTokenizer, AutoConfig\n",
+    "from pathlib import Path\n",
+    "\n",
+    "model_id = 'allenai/OLMo-1B-hf'\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)\n",
+    "config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)\n",
+    "\n",
+    "ASSETS_PATH = f\"{EXPORT_PATH}/assets\"\n",
+    "\n",
+    "# make sure the directory exists\n",
+    "Path(ASSETS_PATH).mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "# save the config, the vocabulary (vocab.json + merges.txt) and the tokenizer files into assets\n",
+    "config.save_pretrained(ASSETS_PATH)\n",
+    "tokenizer.save_vocabulary(ASSETS_PATH)\n",
+    "\n",
+    "tokenizer.save_pretrained(ASSETS_PATH)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "id": "062OnFBIS-JL"
+   },
+   "outputs": [],
+   "source": [
+    "! mkdir -p {EXPORT_PATH}/assets\n",
+    "! mv -t {EXPORT_PATH}/assets {EXPORT_PATH}/merges.txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {
+    "id": "xZMCq14PUdrG"
+   },
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "# Spark NLP expects a plain-text vocabulary, one token per line, ordered by token id\n",
+    "with open(f\"{ASSETS_PATH}/vocab.json\", \"r\") as F:\n",
+    "    vocab_json = json.load(F)\n",
+    "    vocab = [\"\" for i in range(len(vocab_json))]\n",
+    "    for word in vocab_json:\n",
+    "        vocab[vocab_json[word]] = word\n",
+    "    with open(f\"{ASSETS_PATH}/vocab.txt\", \"w\") as F2:\n",
+    "        F2.writelines(map(lambda x: str(x) + \"\\n\", vocab))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/",
+     "height": 0
+    },
+    "id": "3fbDIHVFS-JL",
+    "outputId": "ebe0a435-3c5c-4c20-df51-534397802fbd"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "total 3716\n",
+      "-rw-rw-r-- 1 prabod prabod 673 Feb 12 03:59 config.json\n",
+      "-rw-rw-r-- 1 prabod prabod 456598 Feb 12 03:59 merges.txt\n",
+      "-rw-rw-r-- 1 prabod prabod 293 Feb 12 03:59 special_tokens_map.json\n",
+      "-rw-rw-r-- 1 prabod prabod 5372 Feb 12 03:59 tokenizer_config.json\n",
+      "-rw-rw-r-- 1 prabod prabod 2115417 Feb 12 03:59 tokenizer.json\n",
+      "-rw-rw-r-- 1 prabod prabod 799451 Feb 12 03:59 vocab.json\n",
+      "-rw-rw-r-- 1 prabod prabod 407614 Feb 12 04:00 vocab.txt\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls -l {EXPORT_PATH}/assets"
+   ]
+  },
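+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Next, the MatMul weights of the exported graph are quantized to 4 bits (producing the `int4` variant) with ONNX Runtime's `MatMul4BitsQuantizer`; the log below comes from that step. A minimal sketch of such a call follows, where `block_size=32` and symmetric quantization are assumptions and may differ from the settings used to produce this log:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Sketch: 4-bit weight-only quantization of the exported ONNX graph.\n",
+    "# block_size=32 and is_symmetric=True are assumed settings, not confirmed ones.\n",
+    "import onnx\n",
+    "from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer\n",
+    "\n",
+    "onnx_model = onnx.load(f\"{EXPORT_PATH}/model.onnx\")\n",
+    "quantizer = MatMul4BitsQuantizer(onnx_model, block_size=32, is_symmetric=True)\n",
+    "quantizer.process()\n",
+    "# the quantized weights fit in a single file, but we keep external data for consistency\n",
+    "quantizer.model.save_model_to_file(\n",
+    "    f\"{EXPORT_PATH}/model_int4.onnx\", use_external_data_format=True\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2025-02-12 04:30:03,971 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/q_proj/MatMul ...\n",
+      "2025-02-12 04:30:03,994 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/self_attn/q_proj/MatMul ...\n",
+      "2025-02-12 04:30:03,995 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/k_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,016 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/self_attn/k_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,017 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/v_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,039 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/self_attn/v_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,041 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/rotary_emb/MatMul ...\n",
+      "2025-02-12 04:30:04,042 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n",
+      "2025-02-12 04:30:04,043 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/MatMul ...\n",
+      "2025-02-12 04:30:04,045 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n",
+      "2025-02-12 04:30:04,046 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/MatMul_1 ...\n",
+      "2025-02-12 04:30:04,047 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n",
+      "2025-02-12 04:30:04,048 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/o_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,073 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/self_attn/o_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,074 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/mlp/gate_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,186 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/mlp/gate_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,192 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/mlp/up_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,279 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/mlp/up_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,283 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/mlp/down_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,370 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/mlp/down_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,373 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/q_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,402 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/q_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,403 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/k_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,422 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/k_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,423 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/v_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,442 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/v_proj/MatMul ...\n",
+      "2025-02-12 04:30:04,444 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/MatMul ...\n",
+      "2025-02-12 04:30:04,445 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 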
quantize /model/layers.0/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:04,039 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:04,041 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/rotary_emb/MatMul ...\n", + "2025-02-12 04:30:04,042 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:04,043 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/MatMul ...\n", + "2025-02-12 04:30:04,045 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:04,046 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:04,047 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:04,048 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:04,073 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:04,074 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:04,186 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:04,192 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:04,279 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:04,283 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.0/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:04,370 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.0/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:04,373 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:04,402 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:04,403 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:04,422 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:04,423 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:04,442 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:04,444 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/MatMul ...\n", + "2025-02-12 04:30:04,445 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:04,446 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:04,447 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:04,448 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:04,470 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:04,471 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:04,569 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:04,573 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:04,672 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:04,676 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.1/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:04,775 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.1/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:04,779 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:04,807 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:04,808 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:04,827 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:04,828 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:04,848 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:04,849 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/self_attn/MatMul ...\n", + "2025-02-12 04:30:04,850 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:04,851 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:04,852 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:04,855 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:04,874 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:04,875 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:04,964 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:04,968 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:05,057 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:05,060 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.2/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:05,151 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.2/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:05,155 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:05,183 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:05,184 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:05,203 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:05,204 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:05,223 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:05,224 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/self_attn/MatMul ...\n", + "2025-02-12 04:30:05,225 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:05,226 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:05,227 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:05,228 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:05,250 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:05,251 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:05,348 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:05,352 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:05,459 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:05,464 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.3/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:05,564 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.3/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:05,568 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:05,601 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:05,602 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:05,623 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:05,624 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:05,645 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:05,646 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/self_attn/MatMul ...\n", + "2025-02-12 04:30:05,647 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:05,649 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:05,650 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:05,651 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:05,671 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:05,672 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:05,768 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:05,772 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:05,859 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:05,863 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.4/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:05,952 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.4/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:05,956 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:05,989 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:05,990 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:06,010 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:06,011 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:06,032 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:06,033 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/self_attn/MatMul ...\n", + "2025-02-12 04:30:06,034 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:06,036 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:06,037 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:06,038 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:06,061 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:06,062 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:06,175 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:06,182 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:06,268 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:06,272 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.5/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:06,368 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.5/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:06,375 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:06,403 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:06,404 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:06,423 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:06,424 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:06,443 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:06,445 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/self_attn/MatMul ...\n", + "2025-02-12 04:30:06,446 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:06,447 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:06,448 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:06,449 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:06,469 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:06,470 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:06,555 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:06,559 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:06,652 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:06,655 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.6/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:06,743 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.6/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:06,747 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:06,775 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:06,776 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:06,795 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:06,796 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:06,815 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:06,816 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/self_attn/MatMul ...\n", + "2025-02-12 04:30:06,818 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:06,819 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:06,820 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:06,821 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:06,844 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:06,846 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:06,947 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:06,952 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:07,053 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:07,058 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.7/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:07,161 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.7/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:07,166 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.8/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:07,198 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.8/self_attn/q_proj/MatMul ...\n", + "2025-02-12 04:30:07,199 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.8/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:07,220 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.8/self_attn/k_proj/MatMul ...\n", + "2025-02-12 04:30:07,221 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.8/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:07,241 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.8/self_attn/v_proj/MatMul ...\n", + "2025-02-12 04:30:07,243 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.8/self_attn/MatMul ...\n", + "2025-02-12 04:30:07,244 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. Skip to quantize\n", + "2025-02-12 04:30:07,245 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.8/self_attn/MatMul_1 ...\n", + "2025-02-12 04:30:07,246 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n",
+            "[... the same per-layer quantization pattern repeats for layers 8 through 15; log lines trimmed for brevity ...]\n",
+            "2025-02-12 04:30:10,050 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - MatMul doesn't have const weight. 
Skip to quantize\n", + "2025-02-12 04:30:10,051 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.15/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:10,072 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.15/self_attn/o_proj/MatMul ...\n", + "2025-02-12 04:30:10,073 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.15/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:10,193 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.15/mlp/gate_proj/MatMul ...\n", + "2025-02-12 04:30:10,198 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.15/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:10,326 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.15/mlp/up_proj/MatMul ...\n", + "2025-02-12 04:30:10,331 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /model/layers.15/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:10,456 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /model/layers.15/mlp/down_proj/MatMul ...\n", + "2025-02-12 04:30:10,462 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - start to quantize /lm_head/MatMul ...\n", + "2025-02-12 04:30:11,248 onnxruntime.quantization.matmul_4bits_quantizer [INFO] - complete quantization of /lm_head/MatMul ...\n" + ] + } + ], + "source": [ + "import onnx\n", + "# from onnxruntime import quantization as ort_quantization\n", + "from onnxruntime.quantization.matmul_4bits_quantizer import MatMul4BitsQuantizer\n", + "\n", + "Path(f'onnx_models/{model_id}_int4').mkdir(parents=True, exist_ok=True)\n", + "\n", + "model = onnx.load_model(f\"onnx_models/{model_id}/model.onnx\", load_external_data=True)\n", + "quant = MatMul4BitsQuantizer(\n", + " model=model,\n", + " block_size=32,\n", + " is_symmetric=True,\n", + " nodes_to_exclude=[],\n", + ")\n", + "quant.process()\n", + "quant.model.save_model_to_file(f'onnx_models/{model_id}_int4/model.onnx', use_external_data_format=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "model_id = 'allenai/OLMo-1B-hf'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import onnx\n", + "model = onnx.load(f\"onnx_models/{model_id}_int4/model.onnx\")\n", + "EXPORT_PATH = f\"onnx_models/{model_id}_int4\"\n", + "onnx.save_model(model, f\"{EXPORT_PATH}/decoder_model.onnx\", save_as_external_data=True, all_tensors_to_one_file=True, location=\"_olmo_decoder_model.onnx_data\", size_threshold=1024, convert_attribute=False)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}/model.onnx {EXPORT_PATH}/model.onnx_data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "#copy the assets\n", + "!cp -r onnx_models/{model_id}/assets onnx_models/{model_id}_int4/assets" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing /home/prabod/Projects/spark-nlp/python/dist/spark_nlp-5.5.3-py2.py3-none-any.whl\n", + "Collecting pyspark==3.2.3\n", + " Using cached pyspark-3.2.3.tar.gz (281.5 MB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25ldone\n", + "\u001b[?25hCollecting py4j==0.10.9.5 (from pyspark==3.2.3)\n", + " Using cached py4j-0.10.9.5-py2.py3-none-any.whl.metadata (1.5 kB)\n", + "spark-nlp is already installed with the same version as the provided wheel. Use --force-reinstall to force an installation of the wheel.\n", + "Using cached py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)\n", + "Building wheels for collected packages: pyspark\n", + " Building wheel for pyspark (setup.py) ... \u001b[?25ldone\n", + "\u001b[?25h Created wheel for pyspark: filename=pyspark-3.2.3-py2.py3-none-any.whl size=281990715 sha256=ec075358b0ed3cc8cae95e6699c93f9e9949e54045ca13ced0d05052e0143361\n", + " Stored in directory: /home/prabod/.cache/pip/wheels/cc/f4/8d/dfbbd536587311afde33711613a0c193f18e7d90b120801108\n", + "Successfully built pyspark\n", + "Installing collected packages: py4j, pyspark\n", + "Successfully installed py4j-0.10.9.5 pyspark-3.2.3\n" + ] + } + ], + "source": [ + "!pip install /home/prabod/Projects/spark-nlp/python/dist/spark_nlp-5.5.3-py2.py3-none-any.whl pyspark==3.2.3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NZZqEbvvS-JM" + }, + "source": [ + "## Import and Save OLMO in Spark NLP\n", + "\n", + "- Let's install and setup Spark NLP in Google Colab\n", + "- This part is pretty easy via our simple script" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "SLlypPRaS-JM", + "outputId": "54ab8af5-a1cb-4c29-f982-2f5aac5e6e35" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Installing PySpark 3.2.3 and Spark NLP 5.4.2\n", + "setup Colab for PySpark 3.2.3 and Spark NLP 5.4.2\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m281.5/281.5 MB\u001b[0m \u001b[31m5.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.6/55.6 kB\u001b[0m \u001b[31m3.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m579.5/579.5 kB\u001b[0m \u001b[31m29.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m199.7/199.7 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n" + ] + } + ], + "source": [ + "! 
wget -q http://setup.johnsnowlabs.com/colab.sh -O - | bash"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "QEy-zFjnS-JM"
+      },
+      "source": [
+        "Let's start Spark with Spark NLP included via our simple `start()` function"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": 9,
+      "metadata": {
+        "colab": {
+          "base_uri": "https://localhost:8080/",
+          "height": 0
+        },
+        "id": "0KOd7hwNS-JM",
+        "outputId": "8e408b69-db08-42f5-9d14-c163034f9c04"
+      },
+      "outputs": [
+        {
+          "name": "stdout",
+          "output_type": "stream",
+          "text": [
+            "Collecting spark-nlp==5.5.0rc1\n",
+            "  Downloading spark_nlp-5.5.0rc1-py2.py3-none-any.whl.metadata (55 kB)\n",
+            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/55.8 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m55.8/55.8 kB\u001b[0m \u001b[31m2.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hDownloading spark_nlp-5.5.0rc1-py2.py3-none-any.whl (629 kB)\n",
+            "\u001b[?25l   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/629.6 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m \u001b[32m624.6/629.6 kB\u001b[0m \u001b[31m25.1 MB/s\u001b[0m eta \u001b[36m0:00:01\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m629.6/629.6 kB\u001b[0m \u001b[31m17.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
+            "\u001b[?25hInstalling collected packages: spark-nlp\n",
+            "  Attempting uninstall: spark-nlp\n",
+            "    Found existing installation: spark-nlp 5.4.2\n",
+            "    Uninstalling spark-nlp-5.4.2:\n",
+            "      Successfully uninstalled spark-nlp-5.4.2\n",
+            "Successfully installed spark-nlp-5.5.0rc1\n"
+          ]
+        },
+        {
+          "name": "stderr",
+          "output_type": "stream",
+          "text": [
+            "/usr/lib/python3.10/subprocess.py:1796: RuntimeWarning: os.fork() was called. os.fork() is incompatible with multithreaded code, and JAX is multithreaded, so this will likely lead to a deadlock.\n",
+            "  self.pid = _posixsubprocess.fork_exec(\n"
+          ]
+        }
+      ],
+      "source": [
+        "import sparknlp\n",
+        "# let's start Spark with Spark NLP\n",
+        "spark = sparknlp.start()\n",
+        "print(\"Apache Spark version: {}\".format(spark.version))"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Qgl_T39AS-JM"
+      },
+      "source": [
+        "- Let's use the `loadSavedModel` function in `OLMoTransformer`, which allows us to load the ONNX model\n",
+        "- Most params will be set automatically. They can also be set later after loading the model into `OLMoTransformer` at runtime, so don't worry about setting them now\n",
+        "- `loadSavedModel` accepts two params: the first is the path to the exported model, and the second is the SparkSession, i.e. the `spark` variable we previously started via `sparknlp.start()`\n",
+        "- NOTE: `loadSavedModel` accepts local paths in addition to distributed file systems such as `HDFS`, `S3`, `DBFS`, etc. This feature was introduced in the Spark NLP 4.2.2 release. Keep in mind that the best and recommended way to move/share/reuse Spark NLP models is to use `write.save`, so you can then use `.load()` from any file system natively.\n",
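+        "\n",
+        "As a minimal sketch (using the `EXPORT_PATH` and `spark` variables from the earlier cells; the `/tmp` path below is just an illustrative example), the whole save/load round trip looks like this:\n",
+        "\n",
+        "```python\n",
+        "from sparknlp.annotator import OLMoTransformer\n",
+        "\n",
+        "# load the exported ONNX model from a local folder\n",
+        "olmo = OLMoTransformer.loadSavedModel(EXPORT_PATH, spark)\n",
+        "\n",
+        "# persist it in Spark NLP format so it can be moved/shared/reused\n",
+        "olmo.write().overwrite().save(\"/tmp/olmo_spark_nlp\")  # illustrative path\n",
+        "\n",
+        "# later, or on another machine/cluster, restore it natively\n",
+        "olmo_loaded = OLMoTransformer.load(\"/tmp/olmo_spark_nlp\")\n",
+        "```\n",
+        "\n",
+        "The next cells run exactly these steps on the model we just exported."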
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "Ij_8ZwLxS-JM" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Could not extract bos_token_id from config.json, assigning default value -1\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING: An illegal reflective access operation has occurred\n", + "WARNING: Illegal reflective access by org.apache.spark.util.SizeEstimator$ (file:/home/prabod/spark/jars/spark-core_2.12-3.3.2.jar) to field java.util.regex.Pattern.pattern\n", + "WARNING: Please consider reporting this to the maintainers of org.apache.spark.util.SizeEstimator$\n", + "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", + "WARNING: All illegal access operations will be denied in a future release\n" + ] + } + ], + "source": [ + "from sparknlp.annotator import *\n", + "\n", + "olmo = OLMoTransformer.loadSavedModel(EXPORT_PATH, spark)\\\n", + " .setInputCols([\"documents\"])\\\n", + " .setMaxOutputLength(100)\\\n", + " .setDoSample(False)\\\n", + " .setOutputCol(\"generation\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v_eeGHNZS-JM" + }, + "source": [ + "Let's save it on disk so it is easier to be moved around and also be used later via `.load` function" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "0rmW0bXLS-JM" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "olmo.write().overwrite().save(f\"/tmp/{MODEL_NAME}_spark_nlp_int4\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VnmGJlakS-JM" + }, + "source": [ + "Let's clean up stuff we don't need anymore" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "id": "kWkdSCjIS-JN" + }, + "outputs": [], + "source": [ + "!rm -rf {EXPORT_PATH}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I9YtKl-aS-JN" + }, + "source": [ + "Awesome 😎 !\n", + "\n", + "This is your ONNX OLMO model from HuggingFace 🤗 loaded and saved by Spark NLP 🚀" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "9nbzEjwWS-JN", + "outputId": "4b20ba7c-41c5-440f-89c8-fd4e6a0ec541" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "total 1121168\n", + "-rw-r--r-- 1 prabod prabod 496159 Feb 12 11:54 decoder_model.onnx\n", + "drwxr-xr-x 5 prabod prabod 4096 Feb 12 11:54 fields\n", + "drwxr-xr-x 2 prabod prabod 4096 Feb 12 11:54 metadata\n", + "-rw-r--r-- 1 prabod prabod 1147568128 Feb 12 11:54 _olmo_decoder_model.onnx_data\n" + ] + } + ], + "source": [ + "! 
ls -l /tmp/{MODEL_NAME}_spark_nlp_int4" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lcNqKR7mS-JN" + }, + "source": [ + "Now let's see how we can use it on other machines, clusters, or any place you wish to use your new and shiny OLMO model 😊" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 0 + }, + "id": "DZyaiumUS-JN", + "outputId": "d7db52cb-b85d-4d9a-fd94-24e5b0af7f4b" + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Using CPUs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Stage 21:======================================================> (30 + 1) / 31]\r" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|text |document |generation |\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", + "|Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code.|[{document, 0, 1008, Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text-based language problems into a text-to-text format. Our systematic study compares pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens of language understanding tasks. By combining the insights from our exploration with scale and our new Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering summarization, question answering, text classification, and more. To facilitate future work on transfer learning for NLP, we release our data set, pre-trained models, and code., {sentence -> 0}, []}]|[{document, 0, 1195, Transfer learning , where a model is first pre - trained on a data - rich task before being fine - tuned on a downstream task , has emerged as a powerful technique in natural language processing ( NLP ). The effectiveness of transfer learning has given rise to a diversity of approaches , methodology , and practice . In this paper , we explore the landscape of transfer learning techniques for NLP by introducing a unified framework that converts all text - based language problems into a text - to - text format . Our systematic study compares pre - training objectives , architectures , unlabeled data sets , transfer approaches , and other factors on dozens of language understanding tasks . By combining the insights from our exploration with scale and our new Colossal Clean Crawled Corpus , we achieve state - of - the - art results on many benchmarks covering summarization , question answering , text classification , and more . 
To facilitate future work on transfer learning for NLP , we release our data set , pre - trained models , and code . We also release the Colossala testset and a full report on our results , which we provide for researchers . The paper is available at https ., {sentence -> 0}, []}]|\n", + "+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
----------------------------------------+\n", + "\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + " \r" + ] + } + ], + "source": [ + "import sparknlp\n", + "from sparknlp.base import *\n", + "from sparknlp.annotator import *\n", + "from pyspark.ml import Pipeline\n", + "\n", + "test_data = spark.createDataFrame([\n", + " [\"Transfer learning, where a model is first pre-trained on a data-rich task before being fine-tuned on a \" +\n", + " \"downstream task, has emerged as a powerful technique in natural language processing (NLP). The effectiveness\" +\n", + " \" of transfer learning has given rise to a diversity of approaches, methodology, and practice. In this \" +\n", + " \"paper, we explore the landscape of transfer learning techniques for NLP by introducing a unified framework \" +\n", + " \"that converts all text-based language problems into a text-to-text format. Our systematic study compares \" +\n", + " \"pre-training objectives, architectures, unlabeled data sets, transfer approaches, and other factors on dozens \" +\n", + " \"of language understanding tasks. By combining the insights from our exploration with scale and our new \" +\n", + " \"Colossal Clean Crawled Corpus, we achieve state-of-the-art results on many benchmarks covering \" +\n", + " \"summarization, question answering, text classification, and more. To facilitate future work on transfer \" +\n", + " \"learning for NLP, we release our data set, pre-trained models, and code.\"]\n", + "]).toDF(\"text\")\n", + "\n", + "\n", + "document_assembler = DocumentAssembler() \\\n", + " .setInputCol(\"text\")\\\n", + " .setOutputCol(\"document\")\n", + "\n", + "olmo = OLMoTransformer.load(f\"file:///tmp/{MODEL_NAME}_spark_nlp_int4\")\\\n", + " .setInputCols([\"document\"])\\\n", + " .setMaxOutputLength(50)\\\n", + " .setDoSample(True)\\\n", + " .setTopK(50)\\\n", + " .setTemperature(0)\\\n", + " .setBatchSize(5)\\\n", + " .setNoRepeatNgramSize(3)\\\n", + " .setOutputCol(\"generation\")\n", + "\n", + "pipeline = Pipeline().setStages([document_assembler, olmo])\n", + "\n", + "result = pipeline.fit(test_data).transform(test_data)\n", + "result.show(truncate=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "uTnIQ3HKS-JN" + }, + "source": [ + "That's it! 
You can now go wild and use hundreds of OLMO models from HuggingFace 🤗 in Spark NLP 🚀\n" + ] + } + ], + "metadata": { + "colab": { + "provenance": [] + }, + "kernelspec": { + "display_name": "olmo", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.21" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/python/sparknlp/annotator/seq2seq/__init__.py b/python/sparknlp/annotator/seq2seq/__init__.py index e9c3984c21ecc1..45f44b44ccc400 100644 --- a/python/sparknlp/annotator/seq2seq/__init__.py +++ b/python/sparknlp/annotator/seq2seq/__init__.py @@ -28,3 +28,4 @@ from sparknlp.annotator.seq2seq.qwen_transformer import * from sparknlp.annotator.seq2seq.starcoder_transformer import * from sparknlp.annotator.seq2seq.llama3_transformer import * +from sparknlp.annotator.seq2seq.olmo_transformer import * diff --git a/python/sparknlp/annotator/seq2seq/olmo_transformer.py b/python/sparknlp/annotator/seq2seq/olmo_transformer.py new file mode 100644 index 00000000000000..eb1b63d71cdcf1 --- /dev/null +++ b/python/sparknlp/annotator/seq2seq/olmo_transformer.py @@ -0,0 +1,326 @@ +# Copyright 2017-2022 John Snow Labs +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains classes for the OLMoTransformer.""" + +from sparknlp.common import * + + +class OLMoTransformer(AnnotatorModel, HasBatchedAnnotate, HasEngine): + """OLMo: Open Language Models + + OLMo is a series of Open Language Models designed to enable the science of language models. + The OLMo models are trained on the Dolma dataset. We release all code, checkpoints, logs + (coming soon), and details involved in training these models. + + Pretrained models can be loaded with :meth:`.pretrained` of the companion + object: + + >>> olmo = OLMoTransformer.pretrained() \\ + ... .setInputCols(["document"]) \\ + ... .setOutputCol("generation") + + + The default model is ``"olmo_1b_int4"``, if no name is provided. For available + pretrained models please see the `Models Hub + `__. + + ====================== ====================== + Input Annotation types Output Annotation type + ====================== ====================== + ``DOCUMENT`` ``DOCUMENT`` + ====================== ====================== + + Parameters + ---------- + configProtoBytes + ConfigProto from tensorflow, serialized into byte array. 
+    minOutputLength
+        Minimum length of the sequence to be generated, by default 0
+    maxOutputLength
+        Maximum length of output text, by default 20
+    doSample
+        Whether or not to use sampling; use greedy decoding otherwise, by default False
+    temperature
+        The value used to module the next token probabilities, by default 0.6
+    topK
+        The number of highest probability vocabulary tokens to keep for
+        top-k-filtering, by default 50
+    topP
+        Top cumulative probability for vocabulary tokens, by default 0.9
+
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+    repetitionPenalty
+        The parameter for repetition penalty; 1.0 means no penalty, by default
+        1.0
+    noRepeatNgramSize
+        If set to int > 0, all ngrams of that size can only occur once, by
+        default 0
+    ignoreTokenIds
+        A list of token ids which are ignored in the decoder's output, by
+        default []
+
+    Notes
+    -----
+    This is a very computationally expensive module, especially on longer
+    sequences. The use of an accelerator such as GPU is recommended.
+
+    References
+    ----------
+    - `OLMo Project Page.
+      <https://allenai.org/olmo>`__
+    - `OLMo GitHub Repository.
+      <https://github.com/allenai/OLMo>`__
+    - `OLMo: Accelerating the Science of Language Models
+      <https://arxiv.org/pdf/2402.00838.pdf>`__
+
+    **Paper Abstract:**
+
+    *Language models (LMs) have become ubiquitous in both NLP research and in commercial product offerings.
+    As their commercial importance has surged, the most powerful models have become closed off, gated behind
+    proprietary interfaces, with important details of their training data, architectures, and development
+    undisclosed. Given the importance of these details in scientifically studying these models, including
+    their biases and potential risks, we believe it is essential for the research community to have access
+    to powerful, truly open LMs. To this end, this technical report details the first release of OLMo,
+    a state-of-the-art, truly Open Language Model and its framework to build and study the science of
+    language modeling. Unlike most prior efforts that have only released model weights and inference code,
+    we release OLMo and the whole framework, including training data and training and evaluation code.
+    We hope this release will empower and strengthen the open research community and inspire a new wave
+    of innovation.*
+
+    Examples
+    --------
+    >>> import sparknlp
+    >>> from sparknlp.base import *
+    >>> from sparknlp.annotator import *
+    >>> from pyspark.ml import Pipeline
+    >>> documentAssembler = DocumentAssembler() \\
+    ...     .setInputCol("text") \\
+    ...     .setOutputCol("documents")
+    >>> olmo = OLMoTransformer.pretrained("olmo_1b_int4") \\
+    ...     .setInputCols(["documents"]) \\
+    ...     .setMaxOutputLength(50) \\
+    ...     .setOutputCol("generation")
+    >>> pipeline = Pipeline().setStages([documentAssembler, olmo])
+    >>> data = spark.createDataFrame([["My name is Leonardo."]]).toDF("text")
+    >>> result = pipeline.fit(data).transform(data)
+    >>> result.select("generation.result").show(truncate=False)
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |result                                                                                                                                                                                              |
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    |[My name is Leonardo . I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong   |
+    | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                                           |
+    +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+    """
+
+    name = "OLMoTransformer"
+
+    inputAnnotatorTypes = [AnnotatorType.DOCUMENT]
+
+    outputAnnotatorType = AnnotatorType.DOCUMENT
+
+    configProtoBytes = Param(Params._dummy(), "configProtoBytes",
+                             "ConfigProto from tensorflow, serialized into byte array. Get with config_proto.SerializeToString()",
+                             TypeConverters.toListInt)
+
+    minOutputLength = Param(Params._dummy(), "minOutputLength", "Minimum length of the sequence to be generated",
+                            typeConverter=TypeConverters.toInt)
+
+    maxOutputLength = Param(Params._dummy(), "maxOutputLength", "Maximum length of output text",
+                            typeConverter=TypeConverters.toInt)
+
+    doSample = Param(Params._dummy(), "doSample", "Whether or not to use sampling; use greedy decoding otherwise",
+                     typeConverter=TypeConverters.toBoolean)
+
+    temperature = Param(Params._dummy(), "temperature", "The value used to module the next token probabilities",
+                        typeConverter=TypeConverters.toFloat)
+
+    topK = Param(Params._dummy(), "topK",
+                 "The number of highest probability vocabulary tokens to keep for top-k-filtering",
+                 typeConverter=TypeConverters.toInt)
+
+    topP = Param(Params._dummy(), "topP",
+                 "If set to float < 1, only the most probable tokens with probabilities that add up to ``top_p`` or higher are kept for generation",
+                 typeConverter=TypeConverters.toFloat)
+
+    repetitionPenalty = Param(Params._dummy(), "repetitionPenalty",
+                              "The parameter for repetition penalty. 1.0 means no penalty. See `this paper <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details",
+                              typeConverter=TypeConverters.toFloat)
+
+    noRepeatNgramSize = Param(Params._dummy(), "noRepeatNgramSize",
+                              "If set to int > 0, all ngrams of that size can only occur once",
+                              typeConverter=TypeConverters.toInt)
+
+    ignoreTokenIds = Param(Params._dummy(), "ignoreTokenIds",
+                           "A list of token ids which are ignored in the decoder's output",
+                           typeConverter=TypeConverters.toListInt)
+
+    def setIgnoreTokenIds(self, value):
+        """A list of token ids which are ignored in the decoder's output.
+
+        Parameters
+        ----------
+        value : List[int]
+            The words to be filtered out
+        """
+        return self._set(ignoreTokenIds=value)
+
+    def setConfigProtoBytes(self, b):
+        """Sets configProto from tensorflow, serialized into byte array.
+
+        Parameters
+        ----------
+        b : List[int]
+            ConfigProto from tensorflow, serialized into byte array
+        """
+        return self._set(configProtoBytes=b)
+
+    def setMinOutputLength(self, value):
+        """Sets minimum length of the sequence to be generated.
+
+        Parameters
+        ----------
+        value : int
+            Minimum length of the sequence to be generated
+        """
+        return self._set(minOutputLength=value)
+
+    def setMaxOutputLength(self, value):
+        """Sets maximum length of output text.
+
+        Parameters
+        ----------
+        value : int
+            Maximum length of output text
+        """
+        return self._set(maxOutputLength=value)
+
+    def setDoSample(self, value):
+        """Sets whether or not to use sampling; use greedy decoding otherwise.
+
+        Parameters
+        ----------
+        value : bool
+            Whether or not to use sampling; use greedy decoding otherwise
+        """
+        return self._set(doSample=value)
+
+    def setTemperature(self, value):
+        """Sets the value used to module the next token probabilities.
+
+        Parameters
+        ----------
+        value : float
+            The value used to module the next token probabilities
+        """
+        return self._set(temperature=value)
+
+    def setTopK(self, value):
+        """Sets the number of highest probability vocabulary tokens to keep for
+        top-k-filtering.
+
+        Parameters
+        ----------
+        value : int
+            Number of highest probability vocabulary tokens to keep
+        """
+        return self._set(topK=value)
+
+    def setTopP(self, value):
+        """Sets the top cumulative probability for vocabulary tokens.
+
+        If set to float < 1, only the most probable tokens with probabilities
+        that add up to ``topP`` or higher are kept for generation.
+
+        Parameters
+        ----------
+        value : float
+            Cumulative probability for vocabulary tokens
+        """
+        return self._set(topP=value)
+
+    def setRepetitionPenalty(self, value):
+        """Sets the parameter for repetition penalty. 1.0 means no penalty.
+
+        Parameters
+        ----------
+        value : float
+            The repetition penalty
+
+        References
+        ----------
+        See `Ctrl: A Conditional Transformer Language Model For Controllable
+        Generation <https://arxiv.org/pdf/1909.05858.pdf>`__ for more details.
+        """
+        return self._set(repetitionPenalty=value)
+
+    def setNoRepeatNgramSize(self, value):
+        """Sets size of n-grams that can only occur once.
+
+        If set to int > 0, all ngrams of that size can only occur once.
+
+        Parameters
+        ----------
+        value : int
+            N-gram size can only occur once
+        """
+        return self._set(noRepeatNgramSize=value)
+
+    @keyword_only
+    def __init__(self, classname="com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer", java_model=None):
+        super(OLMoTransformer, self).__init__(classname=classname, java_model=java_model)
+        self._setDefault(minOutputLength=0, maxOutputLength=20, doSample=False, temperature=0.6, topK=50, topP=0.9,
+                         repetitionPenalty=1.0, noRepeatNgramSize=0, ignoreTokenIds=[], batchSize=1)
+
+    @staticmethod
+    def loadSavedModel(folder, spark_session):
+        """Loads a locally saved model.
+
+        Parameters
+        ----------
+        folder : str
+            Folder of the saved model
+        spark_session : pyspark.sql.SparkSession
+            The current SparkSession
+
+        Returns
+        -------
+        OLMoTransformer
+            The restored model
+        """
+        from sparknlp.internal import _OLMoLoader
+        jModel = _OLMoLoader(folder, spark_session._jsparkSession)._java_obj
+        return OLMoTransformer(java_model=jModel)
+
+    @staticmethod
+    def pretrained(name="olmo_1b_int4", lang="en", remote_loc=None):
+        """Downloads and loads a pretrained model.
+
+        Parameters
+        ----------
+        name : str, optional
+            Name of the pretrained model, by default "olmo_1b_int4"
+        lang : str, optional
+            Language of the pretrained model, by default "en"
+        remote_loc : str, optional
+            Optional remote address of the resource, by default None. Will use
+            Spark NLP's repositories otherwise.
diff --git a/python/sparknlp/internal/__init__.py b/python/sparknlp/internal/__init__.py
index 4cb5321e8a8691..f96ba37bcbe57a 100644
--- a/python/sparknlp/internal/__init__.py
+++ b/python/sparknlp/internal/__init__.py
@@ -345,6 +345,10 @@ def __init__(self, path, jspark):
         )
 
 
+class _OLMoLoader(ExtendedJavaWrapper):
+    def __init__(self, path, jspark):
+        super(_OLMoLoader, self).__init__(
+            "com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer.loadSavedModel", path, jspark)
 class _Phi2Loader(ExtendedJavaWrapper):
     def __init__(self, path, jspark, use_openvino=False):
         super(_Phi2Loader, self).__init__(
@@ -992,8 +996,8 @@ class _AutoGGUFLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark):
         super(_AutoGGUFLoader, self).__init__(
             "com.johnsnowlabs.nlp.annotators.seq2seq.AutoGGUFModel.loadSavedModel", path, jspark)
-
-
+
+
 class _MxbaiEmbeddingsLoader(ExtendedJavaWrapper):
     def __init__(self, path, jspark):
         super(_MxbaiEmbeddingsLoader, self).__init__(
diff --git a/python/test/annotator/seq2seq/olmo_transformer_test.py b/python/test/annotator/seq2seq/olmo_transformer_test.py
new file mode 100644
index 00000000000000..8c09b3cfa2e4cf
--- /dev/null
+++ b/python/test/annotator/seq2seq/olmo_transformer_test.py
@@ -0,0 +1,47 @@
+# Copyright 2017-2024 John Snow Labs
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+
+import pytest
+
+from sparknlp.annotator import *
+from sparknlp.base import *
+from test.util import SparkContextForTest
+
+
+@pytest.mark.slow
+class OLMoTransformerTextGenerationTestSpec(unittest.TestCase):
+    def setUp(self):
+        self.spark = SparkContextForTest.spark
+
+    def runTest(self):
+        data = self.spark.createDataFrame([
+            [1, """Leonardo Da Vinci invented the microscope?""".strip().replace("\n", " ")]]).toDF("id", "text")
+
+        document_assembler = DocumentAssembler() \
+            .setInputCol("text") \
+            .setOutputCol("documents")
+
+        olmo = OLMoTransformer \
+            .pretrained() \
+            .setMaxOutputLength(50) \
+            .setDoSample(False) \
+            .setInputCols(["documents"]) \
+            .setOutputCol("generation")
+
+        pipeline = Pipeline().setStages([document_assembler, olmo])
+        results = pipeline.fit(data).transform(data)
+
+        results.select("generation.result").show(truncate=False)
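The test above only prints the generated column. A hedged sketch of an explicit check it could add after `show()`, reusing the `results` DataFrame from the test (the expectation of non-empty text is an assumption, not part of the committed test):

```python
# One input row in, one output row out; each row should carry non-empty text.
rows = results.select("generation.result").collect()
assert len(rows) == 1
assert all(len(row.result) > 0 and row.result[0].strip() != "" for row in rows)
```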
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/OLMo.scala b/src/main/scala/com/johnsnowlabs/ml/ai/OLMo.scala
new file mode 100644
index 00000000000000..4ac08acc05d7ae
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/OLMo.scala
@@ -0,0 +1,363 @@
+/*
+ * Copyright 2017 - 2023 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.ml.ai
+
+import ai.onnxruntime.{OnnxTensor, OrtEnvironment, OrtSession}
+import com.johnsnowlabs.ml.ai.util.Generation.{Generate, GenerationConfig}
+import com.johnsnowlabs.ml.onnx.OnnxSession
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.onnx.TensorResources.implicits._
+import com.johnsnowlabs.ml.tensorflow.sentencepiece.SentencePieceWrapper
+import com.johnsnowlabs.nlp.Annotation
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp.annotators.common.SentenceSplit
+import com.johnsnowlabs.nlp.annotators.tokenizer.bpe.{BpeTokenizer, OLMoTokenizer}
+import org.intel.openvino.InferRequest
+import org.tensorflow.{Session, Tensor}
+
+import scala.collection.JavaConverters._
+
+private[johnsnowlabs] class OLMo(
+    val onnxWrappers: DecoderWrappers,
+    merges: Map[(String, String), Int],
+    vocabulary: Map[String, Int],
+    generationConfig: GenerationConfig)
+    extends Serializable
+    with Generate {
+
+  private val onnxSessionOptions: Map[String, String] = new OnnxSession().getSessionOptions
+
+  val bpeTokenizer: OLMoTokenizer = BpeTokenizer
+    .forModel("olmo", merges = merges, vocab = vocabulary, padWithSequenceTokens = false)
+    .asInstanceOf[OLMoTokenizer]
+
+  private val GenerationConfig(
+    bosTokenId: Int,
+    paddingTokenId: Int,
+    eosTokenId: Int,
+    vocabSize: Int,
+    beginSuppressTokens,
+    suppressTokenIds,
+    forcedDecoderIds) =
+    generationConfig
+
+  /** Decode a sequence of sentences
+    * @param sentences
+    *   Sequence of sentences
+    * @return
+    *   Sequence of decoded sentences
+    */
+  def decode(sentences: Array[Array[Int]]): Seq[String] = {
+    sentences.map(s => bpeTokenizer.decodeTokens(s.map(_.toInt)))
+  }
+
+  /** Encode a sequence of sentences
+    * @param sentences
+    *   Sequence of sentences
+    * @return
+    *   Sequence of encoded sentences
+    */
+  def encode(sentences: Seq[Annotation]): Seq[Array[Int]] = {
+    SentenceSplit
+      .unpack(sentences)
+      .map(s => {
+        val sentWithTask = s
+        bpeTokenizer
+          .tokenize(sentWithTask)
+          .map(bpeTokenizer.encode)
+          .flatMap(_.map(_.pieceId))
+      })
+  }
+
+  def tag(
+      batch: Seq[Array[Int]],
+      minOutputLength: Int,
+      maxOutputLength: Int,
+      doSample: Boolean,
+      temperature: Double,
+      topK: Int,
+      topP: Double,
+      repetitionPenalty: Double,
+      noRepeatNgramSize: Int,
+      randomSeed: Option[Long],
+      ignoreTokenIds: Array[Int] = Array(),
+      beamSize: Int,
+      maxInputLength: Int): Array[Array[Int]] = {
+    val (encoderSession, env) = onnxWrappers.decoder.getSession(onnxSessionOptions)
+    val ignoreTokenIdsInt = ignoreTokenIds
+    val expandedDecoderInputsVals = batch
+    val sequencesLength = expandedDecoderInputsVals.map(x => x.length).toArray
+    val maxSentenceLength = sequencesLength.max // - curLen
+
+    val numReturn_sequences = 1
+    // from config
+
+    var effectiveBatch_size = 1
+    var effectiveBatch_mult = 1
+
+    if (doSample) {
+      effectiveBatch_size = expandedDecoderInputsVals.length * numReturn_sequences
+      effectiveBatch_mult = numReturn_sequences
+    } else {
+      effectiveBatch_size = expandedDecoderInputsVals.length
+      effectiveBatch_mult = 1
+    }
+
+    // Run the prompt through the decoder and get the past
+//    val decoderOutputs =
+//      generateGreedyOnnx(
+//        expandedDecoderInputsVals.toArray,
+//        (encoderSession, env),
+//        maxOutputLength)
+
+    // dummy tensors for decoder encode state and attention mask
+    val decoderEncoderStateTensors = Right(OnnxTensor.createTensor(env, Array(0)))
+    val encoderAttentionMaskTensors = Right(OnnxTensor.createTensor(env, Array(1)))
+
+    // output with beam search
+    val modelOutputs = generate(
+      batch,
+      decoderEncoderStateTensors,
+      encoderAttentionMaskTensors,
+      expandedDecoderInputsVals.toArray,
+      maxOutputLength + maxSentenceLength,
+      minOutputLength,
+      doSample,
+      beamSize,
+      1,
+      temperature,
+      topK,
+      topP,
+      repetitionPenalty,
+      noRepeatNgramSize,
+      this.vocabSize,
+      this.eosTokenId,
+      this.paddingTokenId,
+      randomSeed,
+      ignoreTokenIdsInt,
+      Right((env, encoderSession)),
+      applySoftmax = false)
+
+//    decoderOutputs
+    modelOutputs
+  }
+
+  def predict(
+      sentences: Seq[Annotation],
+      batchSize: Int,
+      minOutputLength: Int,
+      maxOutputLength: Int,
+      doSample: Boolean,
+      temperature: Double,
+      topK: Int,
+      topP: Double,
+      repetitionPenalty: Double,
+      noRepeatNgramSize: Int,
+      randomSeed: Option[Long] = None,
+      ignoreTokenIds: Array[Int] = Array(),
+      beamSize: Int,
+      maxInputLength: Int): Seq[Annotation] = {
+
+    val batchDecoder = sentences.grouped(batchSize).toArray.flatMap { batch =>
+      val batchSP = encode(batch)
+      val spIds = tag(
+        batchSP,
+        minOutputLength,
+        maxOutputLength,
+        doSample,
+        temperature,
+        topK,
+        topP,
+        repetitionPenalty,
+        noRepeatNgramSize,
+        randomSeed,
+        ignoreTokenIds,
+        beamSize,
+        maxInputLength)
+
+      decode(spIds)
+
+    }
+
+    var sentBegin, nextSentEnd = 0
+    val annotations = batchDecoder.zip(sentences).map { case (content, sent) =>
+      nextSentEnd += content.length - 1
+      val annots = new Annotation(
+        annotatorType = DOCUMENT,
+        begin = sentBegin,
+        end = nextSentEnd,
+        result = content,
+        metadata = sent.metadata)
+      sentBegin += nextSentEnd + 1
+      annots
+    }
+    annotations
+  }
+
+  private def getDecoderOutputsWithPast(
+      inputIds: Array[Array[Int]],
+      decoderPast: Map[String, OnnxTensor],
+      onnxSession: (OrtSession, OrtEnvironment))
+      : (Array[Array[Float]], Map[String, OnnxTensor]) = {
+    val (session, env) = onnxSession
+
+    val lastTokens: Array[Array[Long]] =
+      inputIds.map { tokenIds =>
+        Array(tokenIds.last.toLong)
+      }
+
+    val lastTokensTensor: OnnxTensor =
+      OnnxTensor.createTensor(env, lastTokens)
+    val decoderAttentionMask: OnnxTensor =
+      OnnxTensor.createTensor(env, lastTokens.map(_.map(_ => 1L)))
+    val decoderWithPastInputs: java.util.Map[String, OnnxTensor] = (Map(
+      OnnxSignatures.decoderInputIDs -> lastTokensTensor,
+      OnnxSignatures.decoderAttentionMask -> decoderAttentionMask) ++ decoderPast).asJava
+    val sessionOutput = session.run(decoderWithPastInputs)
+    val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+    val decoderPresent = sessionOutput.getOnnxTensors(OnnxSignatures.decoderPresent)
+    lastTokensTensor.close()
+    val batchLogits = logits.grouped(vocabSize).toArray
+    (batchLogits, decoderPresent)
+
+  }
+
+  override def getModelOutput(
+      encoderInputIds: Seq[Array[Int]],
+      decoderInputIds: Seq[Array[Int]],
+      decoderEncoderStateTensors: Either[Tensor, OnnxTensor],
+      encoderAttentionMaskTensors: Either[Tensor, OnnxTensor],
+      maxLength: Int,
+      session: Either[Session, (OrtEnvironment, OrtSession)],
+      ovInferRequest: Option[InferRequest]): Array[Array[Float]] = {
+
+    session.fold(
+      tfSession => {
+        // not implemented yet
+        Array()
+      },
+      onnxSession => {
+        val (env, decoderSession) = onnxSession
+        val decoderOutputs =
+          getDecoderOutputs(decoderInputIds.toArray, onnxSession = (decoderSession, env))
+        decoderOutputs
+      })
+
+  }
+
+  private def getDecoderOutputs(
+      inputIds: Array[Array[Int]],
+      onnxSession: (OrtSession, OrtEnvironment)): (Array[Array[Float]]) = {
+    val (session, env) = onnxSession
+
+    val inputIdsLong: Array[Array[Long]] =
+      inputIds.map { tokenIds => tokenIds.map(_.toLong) }
+
+    val inputPositionIDsLong: Array[Array[Long]] =
+      inputIds.map { tokenIds =>
+        tokenIds.zipWithIndex.map { case (_, i) =>
+          i.toLong
+        }
+      }
+
+    val inputIdsLongTensor: OnnxTensor =
+      OnnxTensor.createTensor(env, inputIdsLong)
+    val decoderAttentionMask: OnnxTensor =
+      OnnxTensor.createTensor(env, inputIdsLong.map(_.map(_ => 1L)))
+    val decoderPositionIDs: OnnxTensor =
+      OnnxTensor.createTensor(env, inputPositionIDsLong)
+
+    val decoderInputs: java.util.Map[String, OnnxTensor] = Map(
+      OnnxSignatures.decoderInputIDs -> inputIdsLongTensor,
+      OnnxSignatures.decoderAttentionMask -> decoderAttentionMask,
+      OnnxSignatures.decoderPositionIDs -> decoderPositionIDs).asJava
+    val sessionOutput = session.run(decoderInputs)
+
+    val sequenceLength = inputIds.head.length
+    val batchSize = inputIds.length
+
+//    val logits = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+//    inputIdsLongTensor.close()
+//    decoderPositionIDs.close()
+//    decoderAttentionMask.close()
+//    val batchLogits = logits.grouped(vocabSize).toArray
+//    batchLogits
+
+    val logitsRaw = sessionOutput.getFloatArray(OnnxSignatures.decoderOutput)
+    val decoderOutputs = (0 until batchSize).map(i => {
+      logitsRaw
+        .slice(
+          i * sequenceLength * vocabSize + (sequenceLength - 1) * vocabSize,
+          i * sequenceLength * vocabSize + sequenceLength * vocabSize)
+    })
+    decoderOutputs.toArray
+  }
+
+  /** Gets the index with the highest score
+    *
+    * @param scores
+    *   Array of Scores to max
+    * @return
+    *   Index of the highest score
+    */
+  private def argmax(scores: Array[Float]): Int =
+    scores.zipWithIndex.maxBy { case (score, _) =>
+      score
+    }._2
+
+  private def greedyGenerationFinished(
+      decoderIds: Seq[Array[Int]],
+      eosTokenId: Int,
+      maxOutputLength: Int): Boolean =
+    decoderIds.map(_.last).forall(_ == eosTokenId) || decoderIds.head.length == maxOutputLength
+
+  private def generateGreedyOnnx(
+      inputIds: Array[Array[Int]],
+      onnxSession: (OrtSession, OrtEnvironment),
+      maxOutputLength: Int): (Array[Array[Int]]) = {
+
+    val sequencesLength = inputIds.map(x => x.length).toArray
+    val maxSentenceLength = sequencesLength.max // - curLen
+    var generatedIds: Array[Array[Int]] = inputIds
+    while (!greedyGenerationFinished(
+        generatedIds,
+        eosTokenId,
+        maxOutputLength + maxSentenceLength)) {
+
+      val (batchLogits: Array[Array[Float]]) =
+        Array(getDecoderOutputs(generatedIds, onnxSession).last)
+
+      val nextTokenIds: Array[Int] = batchLogits.map(argmax)
+      generatedIds =
+        generatedIds.zip(nextTokenIds).map { case (currentIds: Array[Int], nextId: Int) =>
+          currentIds ++ Array(nextId)
+        }
+    }
+    generatedIds
+  }
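`generateGreedyOnnx` above is plain greedy search: run the decoder, take the argmax of the last position's logits, append that token, and stop on EOS or when the length budget is exhausted. The same loop, sketched in Python with a hypothetical `step_logits` helper standing in for one ONNX decoder run:

```python
import numpy as np

def greedy_generate(step_logits, prompt_ids, eos_id, max_length):
    # step_logits(ids) -> np.ndarray of shape (vocab_size,) holding the
    # logits of the last position; hypothetical stand-in for session.run(...)
    ids = list(prompt_ids)
    while len(ids) < max_length:
        next_id = int(np.argmax(step_logits(ids)))  # highest-scoring token
        ids.append(next_id)
        if next_id == eos_id:  # mirrors greedyGenerationFinished
            break
    return ids
```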
+
+  private object OnnxSignatures {
+    val decoderInputIDs: String = "input_ids"
+    val decoderAttentionMask: String = "attention_mask"
+    val decoderPositionIDs: String = "position_ids"
+
+    // create decoder past for 32 layers of key and value eg. past_key_values.0.key and past_key_values.0.value
+    val decoderPast: Array[String] = (0 until 32)
+      .flatMap(i => Seq(s"past_key_values.$i.key", s"past_key_values.$i.value"))
+      .toArray
+    val decoderOutput: String = "logits"
+    val decoderPresent: Array[String] =
+      (0 until 32).flatMap(i => Seq(s"present.$i.key", s"present.$i.value")).toArray
+  }
+
+}
diff --git a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala
index 24d2ac1d3f6696..912a35409673be 100644
--- a/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/ai/util/Generation/Generate.scala
@@ -311,7 +311,7 @@ trait Generate {
           beamIndices(beamIdx(elem)) :+ beamIdx(elem)
         }
       currentLength = currentLength + 1
-      if (beamScorer.isDone || (expandedInputs.head.length >= maxLength)) {
+      if (beamScorer.isDone || (expandedInputs.head.length > maxLength)) {
        break
      }
diff --git a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala
index e985f2b0bcac99..27250cd5fceff6 100644
--- a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxSerializeModel.scala
@@ -98,7 +98,11 @@ trait ReadOnnxModel {
     val fsPath = new Path(path, localModelFile).toString
 
     val onnxDataFile: Option[String] = if (modelName.isDefined && dataFilePostfix.isDefined) {
-      Some(fsPath.replaceAll(modelName.get, s"${suffix}_${modelName.get}${dataFilePostfix.get}"))
+      var modelNameWithoutSuffix = modelName.get.replace(".onnx", "")
+      Some(
+        fsPath.replaceAll(
+          modelName.get,
+          s"${suffix}_${modelNameWithoutSuffix}${dataFilePostfix.get}"))
     } else None
 
     if (onnxDataFile.isDefined) {
@@ -117,7 +121,8 @@ trait ReadOnnxModel {
       zipped = zipped,
       useBundle = useBundle,
       modelName = if (modelName.isDefined) modelName.get else onnxFile,
-      onnxFileSuffix = Some(suffix))
+      onnxFileSuffix = Some(suffix),
+      dataFileSuffix = dataFilePostfix)
 
     onnxWrapper
diff --git a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala
index 6e748faa72ee63..1b5131446a944e 100644
--- a/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala
+++ b/src/main/scala/com/johnsnowlabs/ml/onnx/OnnxWrapper.scala
@@ -134,7 +134,9 @@ object OnnxWrapper {
 
     val onnxDataFileExist: Boolean = {
       if (onnxFileSuffix.isDefined && dataFileSuffix.isDefined) {
-        val onnxDataFilePath = s"${onnxFileSuffix.get}_$modelName${dataFileSuffix.get}"
+        var modelNameWithoutSuffix = modelName.replace(".onnx", "")
+        val onnxDataFilePath =
+          s"${onnxFileSuffix.get}_$modelNameWithoutSuffix${dataFileSuffix.get}"
         onnxDataFile = Paths.get(parentDir, onnxDataFilePath).toFile
         onnxDataFile.exists()
       } else false
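The `OnnxSerializeModel` and `OnnxWrapper` hunks above fix how the external ONNX data file is located: the `.onnx` extension is now stripped from the model name before the suffix and data-file postfix are composed. The intended mapping, sketched for the OLMo decoder:

```python
def onnx_data_file_name(model_name: str, suffix: str, postfix: str) -> str:
    # Strip the ".onnx" extension before composing the external-data file name.
    base = model_name.replace(".onnx", "")
    return f"{suffix}_{base}{postfix}"

# Before the fix the name kept the extension ("_olmo_decoder_model.onnx.onnx_data");
# after it, the lookup matches the file the export actually writes.
assert onnx_data_file_name("decoder_model.onnx", "_olmo", ".onnx_data") \
    == "_olmo_decoder_model.onnx_data"
```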
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTransformer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTransformer.scala
new file mode 100644
index 00000000000000..a5afd467478eaf
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTransformer.scala
@@ -0,0 +1,436 @@
+/*
+ * Copyright 2017-2024 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import com.johnsnowlabs.ml.ai.util.Generation.GenerationConfig
+import com.johnsnowlabs.ml.ai.OLMo
+import com.johnsnowlabs.ml.onnx.OnnxWrapper.DecoderWrappers
+import com.johnsnowlabs.ml.onnx.{OnnxWrapper, ReadOnnxModel, WriteOnnxModel}
+import com.johnsnowlabs.ml.util.LoadExternalModel.{
+  loadJsonStringAsset,
+  loadSentencePieceAsset,
+  loadTextAsset,
+  modelSanityCheck,
+  notSupportedEngineError
+}
+import com.johnsnowlabs.ml.util.ONNX
+import com.johnsnowlabs.nlp.AnnotatorType.DOCUMENT
+import com.johnsnowlabs.nlp._
+import com.johnsnowlabs.ml.tensorflow.sentencepiece.{
+  ReadSentencePieceModel,
+  SentencePieceWrapper,
+  WriteSentencePieceModel
+}
+import com.johnsnowlabs.nlp.serialization.MapFeature
+import org.apache.spark.broadcast.Broadcast
+import org.apache.spark.ml.param._
+import org.apache.spark.ml.util.Identifiable
+import org.apache.spark.sql.SparkSession
+import com.johnsnowlabs.nlp.serialization.{MapFeature, StructFeature}
+import org.json4s._
+import org.json4s.jackson.JsonMethods._
+
+/** OLMo: Open Language Models
+  *
+  * OLMo is a series of Open Language Models designed to enable the science of language models.
+  * The OLMo models are trained on the Dolma dataset.
+  *
+  * Pretrained models can be loaded with `pretrained` of the companion object:
+  * {{{
+  * val OLMo = OLMoTransformer.pretrained()
+  *   .setInputCols("document")
+  *   .setOutputCol("generation")
+  * }}}
+  * The default model is `"olmo_1b_int4"`, if no name is provided. For available pretrained models
+  * please see the [[https://sparknlp.org/models?q=OLMo Models Hub]].
+  *
+  * For extended examples of usage, see
+  * [[https://github.com/JohnSnowLabs/spark-nlp/blob/master/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTestSpec.scala OLMoTestSpec]].
+  *
+  * '''References:'''
+  *   - [[https://allenai.org/olmo OLMo Project Page.]]
+  *   - [[https://github.com/allenai/OLMo OLMo GitHub Repository.]]
+  *   - [[https://arxiv.org/pdf/2402.00838.pdf OLMo: Accelerating the Science of Language Models]]
+  *
+  * '''Paper Abstract:'''
+  *
+  * ''Language models (LMs) have become ubiquitous in both NLP research and in commercial product
+  * offerings. As their commercial importance has surged, the most powerful models have become
+  * closed off, gated behind proprietary interfaces, with important details of their training
+  * data, architectures, and development undisclosed. Given the importance of these details in
+  * scientifically studying these models, including their biases and potential risks, we believe
+  * it is essential for the research community to have access to powerful, truly open LMs. To this
+  * end, this technical report details the first release of OLMo, a state-of-the-art, truly Open
+  * Language Model and its framework to build and study the science of language modeling. Unlike
+  * most prior efforts that have only released model weights and inference code, we release OLMo
+  * and the whole framework, including training data and training and evaluation code.
+  * We hope this release will empower and strengthen the open research community and inspire a
+  * new wave of innovation.''
+  *
+  * '''Note:'''
+  *
+  * This is a very computationally expensive module, especially on larger sequences. The use of
+  * an accelerator such as GPU is recommended.
+  *
+  * ==Example==
+  * {{{
+  * import spark.implicits._
+  * import com.johnsnowlabs.nlp.base.DocumentAssembler
+  * import com.johnsnowlabs.nlp.annotators.seq2seq.OLMoTransformer
+  * import org.apache.spark.ml.Pipeline
+  *
+  * val documentAssembler = new DocumentAssembler()
+  *   .setInputCol("text")
+  *   .setOutputCol("documents")
+  *
+  * val OLMo = OLMoTransformer.pretrained("olmo_1b_int4")
+  *   .setInputCols(Array("documents"))
+  *   .setMinOutputLength(10)
+  *   .setMaxOutputLength(50)
+  *   .setDoSample(false)
+  *   .setTopK(50)
+  *   .setNoRepeatNgramSize(3)
+  *   .setOutputCol("generation")
+  *
+  * val pipeline = new Pipeline().setStages(Array(documentAssembler, OLMo))
+  *
+  * val data = Seq(
+  *   "My name is Leonardo."
+  * ).toDF("text")
+  * val result = pipeline.fit(data).transform(data)
+  *
+  * result.select("generation.result").show(truncate = false)
+  * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * |result                                                                                                                                                                                              |
+  * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * |[ My name is Leonardo . I am a student of the University of California, Berkeley. I am interested in the field of Artificial Intelligence and its applications in the real world. I have a strong   |
+  * | passion for learning and am always looking for ways to improve my knowledge and skills]                                                                                                            |
+  * +----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+  * }}}
+  *
+  * @param uid
+  *   required uid for storing annotator to disk
+  * @groupname anno Annotator types
+  * @groupdesc anno
+  *   Required input and expected output annotator types
+  * @groupname Ungrouped Members
+  * @groupname param Parameters
+  * @groupname setParam Parameter setters
+  * @groupname getParam Parameter getters
+  * @groupprio param 1
+  * @groupprio anno 2
+  * @groupprio Ungrouped 3
+  * @groupprio setParam 4
+  * @groupprio getParam 5
+  * @groupdesc param
+  *   A list of (hyper-)parameter keys this annotator can take. Users can set and get the
+  *   parameter values through setters and getters, respectively.
+  */
+class OLMoTransformer(override val uid: String)
+    extends AnnotatorModel[OLMoTransformer]
+    with HasBatchedAnnotate[OLMoTransformer]
+    with ParamsAndFeaturesWritable
+    with WriteOnnxModel
+    with HasGeneratorProperties
+    with HasEngine {
+
+  def this() = this(Identifiable.randomUID("OLMoTRANSFORMER"))
+
+  /** Input annotator type : DOCUMENT
+    *
+    * @group param
+    */
+  override val inputAnnotatorTypes: Array[AnnotatorType] = Array(DOCUMENT)
+
+  /** Output annotator type : DOCUMENT
+    *
+    * @group param
+    */
+  override val outputAnnotatorType: String = DOCUMENT
+
+  /** @group setParam */
+  def setRandomSeed(value: Int): OLMoTransformer.this.type = {
+    if (randomSeed.isEmpty) {
+      this.randomSeed = Some(value)
+    }
+    this
+  }
+
+  /** A list of token ids which are ignored in the decoder's output (Default: `Array()`)
+    *
+    * @group param
+    */
+  var ignoreTokenIds = new IntArrayParam(
+    this,
+    "ignoreTokenIds",
+    "A list of token ids which are ignored in the decoder's output")
+
+  /** @group setParam */
+  def setIgnoreTokenIds(tokenIds: Array[Int]): OLMoTransformer.this.type = {
+    set(ignoreTokenIds, tokenIds)
+  }
+
+  /** @group getParam */
+  def getIgnoreTokenIds: Array[Int] = $(ignoreTokenIds)
+
+  /** Vocabulary used to encode the words to ids with bpeTokenizer.encode
+    *
+    * @group param
+    */
+  val vocabulary: MapFeature[String, Int] = new MapFeature(this, "vocabulary").setProtected()
+
+  /** @group setParam */
+  def setVocabulary(value: Map[String, Int]): this.type = set(vocabulary, value)
+
+  /** Holding merges.txt coming from RoBERTa model
+    *
+    * @group param
+    */
+  val merges: MapFeature[(String, String), Int] = new MapFeature(this, "merges").setProtected()
+
+  /** @group setParam */
+  def setMerges(value: Map[(String, String), Int]): this.type = set(merges, value)
+
+  private var _model: Option[Broadcast[OLMo]] = None
+
+  val generationConfig: StructFeature[GenerationConfig] =
+    new StructFeature(this, "generationConfig").setProtected()
+
+  def setGenerationConfig(value: GenerationConfig): this.type =
+    set(generationConfig, value)
+
+  def getGenerationConfig: GenerationConfig = $$(generationConfig)
+
+  /** @group setParam */
+  def setModelIfNotSet(spark: SparkSession, onnxWrappers: DecoderWrappers): this.type = {
+    if (_model.isEmpty) {
+      _model = Some(
+        spark.sparkContext.broadcast(
+          new OLMo(
+            onnxWrappers,
+            $$(merges),
+            $$(vocabulary),
+            generationConfig = getGenerationConfig)))
+    }
+    this
+  }
+
+  /** @group getParam */
+  def getModelIfNotSet: OLMo = _model.get.value
+
+  setDefault(
+    minOutputLength -> 0,
+    maxOutputLength -> 20,
+    doSample -> false,
+    temperature -> 0.6,
+    topK -> 50,
+    topP -> 0.9,
+    repetitionPenalty -> 1.0,
+    noRepeatNgramSize -> 3,
+    ignoreTokenIds -> Array(),
+    batchSize -> 1,
+    beamSize -> 1,
+    maxInputLength -> 4096)
+
+  /** takes a document and annotations and produces new annotations of this annotator's annotation
+    * type
+    *
+    * @param batchedAnnotations
+    *   Annotations that correspond to inputAnnotationCols generated by previous annotators if any
+    * @return
+    *   any number of annotations processed for every input annotation.
+    *   Not necessarily a one-to-one relationship
+    */
+  override def batchAnnotate(batchedAnnotations: Seq[Array[Annotation]]): Seq[Seq[Annotation]] = {
+
+    val allAnnotations = batchedAnnotations
+      .filter(_.nonEmpty)
+      .zipWithIndex
+      .flatMap { case (annotations, i) =>
+        annotations.filter(_.result.nonEmpty).map(x => (x, i))
+      }
+    val processedAnnotations = if (allAnnotations.nonEmpty) {
+      this.getModelIfNotSet.predict(
+        sentences = allAnnotations.map(_._1),
+        batchSize = $(batchSize),
+        minOutputLength = $(minOutputLength),
+        maxOutputLength = $(maxOutputLength),
+        doSample = $(doSample),
+        temperature = $(temperature),
+        topK = $(topK),
+        topP = $(topP),
+        repetitionPenalty = $(repetitionPenalty),
+        noRepeatNgramSize = $(noRepeatNgramSize),
+        randomSeed = this.randomSeed,
+        ignoreTokenIds = $(ignoreTokenIds),
+        beamSize = $(beamSize),
+        maxInputLength = $(maxInputLength))
+    } else {
+      Seq()
+    }
+    Seq(processedAnnotations)
+  }
+
+  override def onWrite(path: String, spark: SparkSession): Unit = {
+    super.onWrite(path, spark)
+    getEngine match {
+      case ONNX.name =>
+        val wrappers = getModelIfNotSet.onnxWrappers
+        writeOnnxModels(
+          path,
+          spark,
+          Seq((wrappers.decoder, "decoder_model.onnx")),
+          OLMoTransformer.suffix)
+    }
+  }
+}
+
+trait ReadablePretrainedOLMoTransformerModel
+    extends ParamsAndFeaturesReadable[OLMoTransformer]
+    with HasPretrained[OLMoTransformer] {
+  override val defaultModelName: Some[String] = Some("olmo_1b_int4")
+
+  /** Java compliant-overrides */
+  override def pretrained(): OLMoTransformer = super.pretrained()
+
+  override def pretrained(name: String): OLMoTransformer = super.pretrained(name)
+
+  override def pretrained(name: String, lang: String): OLMoTransformer =
+    super.pretrained(name, lang)
+
+  override def pretrained(name: String, lang: String, remoteLoc: String): OLMoTransformer =
+    super.pretrained(name, lang, remoteLoc)
+}
+
+trait ReadOLMoTransformerDLModel extends ReadOnnxModel {
+  this: ParamsAndFeaturesReadable[OLMoTransformer] =>
+
+  override val onnxFile: String = "decoder_model.onnx"
+  val suffix: String = "_olmo"
+
+  def readModel(instance: OLMoTransformer, path: String, spark: SparkSession): Unit = {
+    instance.getEngine match {
+      case ONNX.name =>
+        val wrapper =
+          readOnnxModel(
+            path,
+            spark,
+            suffix,
+            zipped = true,
+            useBundle = false,
+            modelName = Some("decoder_model.onnx"),
+            dataFilePostfix = Some(".onnx_data"))
+        val onnxWrappers =
+          DecoderWrappers(decoder = wrapper)
+        instance.setModelIfNotSet(spark, onnxWrappers)
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+  }
+
+  addReader(readModel)
+
+  def loadSavedModel(modelPath: String, spark: SparkSession): OLMoTransformer = {
+    implicit val formats: DefaultFormats.type = DefaultFormats // for json4s
+    val (localModelPath, detectedEngine) =
+      modelSanityCheck(modelPath, isDecoder = true)
+    val modelConfig: JValue =
+      parse(loadJsonStringAsset(localModelPath, "config.json"))
+
+    val beginSuppressTokens: Array[Int] =
+      (modelConfig \ "begin_suppress_tokens").extract[Array[Int]]
+
+    val suppressTokenIds: Array[Int] =
+      (modelConfig \ "suppress_tokens").extract[Array[Int]]
+
+    val forcedDecoderIds: Array[(Int, Int)] =
+      (modelConfig \ "forced_decoder_ids").extract[Array[Array[Int]]].map {
+        case idxWithTokenId: Array[Int] if idxWithTokenId.length == 2 =>
+          (idxWithTokenId(0), idxWithTokenId(1))
+        case _ =>
+          throw new Exception(
+            "Could not extract forced_decoder_ids. Should be a list of tuples with 2 entries.")
+      }
+
+    def arrayOrNone[T](array: Array[T]): Option[Array[T]] =
+      if (array.nonEmpty) Some(array) else None
+
+    var bosTokenId = -1
+    try {
+      bosTokenId = (modelConfig \ "bos_token_id").extract[Int]
+    } catch {
+      case _: Exception =>
+        println("Could not extract bos_token_id from config.json, assigning default value -1")
+    }
+    val eosTokenId = (modelConfig \ "eos_token_id").extract[Int]
+    val padTokenId = (modelConfig \ "eos_token_id").extract[Int]
+    val vocabSize = (modelConfig \ "vocab_size").extract[Int]
+
+    val vocabs = loadTextAsset(localModelPath, "vocab.txt").zipWithIndex.toMap
+
+    val bytePairs = loadTextAsset(localModelPath, "merges.txt")
+      .map(_.split(" "))
+      .filter(w => w.length == 2)
+      .map { case Array(c1, c2) => (c1, c2) }
+      .zipWithIndex
+      .toMap
+
+    val annotatorModel = new OLMoTransformer()
+      .setGenerationConfig(
+        GenerationConfig(
+          bosTokenId,
+          padTokenId,
+          eosTokenId,
+          vocabSize,
+          arrayOrNone(beginSuppressTokens),
+          arrayOrNone(suppressTokenIds),
+          arrayOrNone(forcedDecoderIds)))
+      .setVocabulary(vocabs)
+      .setMerges(bytePairs)
+
+    annotatorModel.set(annotatorModel.engine, detectedEngine)
+
+    detectedEngine match {
+      case ONNX.name =>
+        val onnxWrapperDecoder =
+          OnnxWrapper.read(
+            spark,
+            localModelPath,
+            zipped = false,
+            useBundle = true,
+            modelName = "decoder_model",
+            dataFileSuffix = Some(".onnx_data"),
+            onnxFileSuffix = Some(suffix))
+
+        val onnxWrappers = DecoderWrappers(onnxWrapperDecoder)
+
+        annotatorModel
+          .setModelIfNotSet(spark, onnxWrappers)
+
+      case _ =>
+        throw new Exception(notSupportedEngineError)
+    }
+
+    annotatorModel
+  }
+
+}
+
+object OLMoTransformer
+    extends ReadablePretrainedOLMoTransformerModel
+    with ReadOLMoTransformerDLModel
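`loadSavedModel` above expects a local export directory containing `config.json`, `vocab.txt`, `merges.txt` and the ONNX decoder. A hedged import sketch from the Python side (the export path is hypothetical; `spark` is an active SparkSession):

```python
from sparknlp.annotator import OLMoTransformer

# Import a locally exported ONNX OLMo model into Spark NLP.
olmo = OLMoTransformer.loadSavedModel("/tmp/olmo_onnx_export", spark) \
    .setInputCols(["document"]) \
    .setOutputCol("generation")

# Persist in Spark NLP format so it can later be restored with OLMoTransformer.load(...)
olmo.write().overwrite().save("/tmp/olmo_1b_spark_nlp")
```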
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala
index 4afb1d5b9bf18c..4e790cf171a1cd 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeSpecialTokens.scala
@@ -137,6 +137,14 @@ private[johnsnowlabs] object SpecialTokens {
           unkTokenString = "",
           maskTokenString = "",
           padTokenString = "")
+      case "olmo" =>
+        SpecialTokens(
+          vocab,
+          startTokenString = "<|endoftext|>",
+          endTokenString = "<|endoftext|>",
+          unkTokenString = "<|endoftext|>",
+          maskTokenString = "<|endoftext|>",
+          padTokenString = "<|padding|>")
       case "clip" =>
         SpecialTokens(
           vocab,
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
index 8c72a8f99d6685..e5d075e31317ad 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/BpeTokenizer.scala
@@ -352,6 +352,13 @@ object BpeTokenizer {
           modelSpecialTokens(),
           padWithSequenceTokens,
           addPrefixSpaceToSentence = addPrefixSpaceToSentence)
+      case "olmo" =>
+        new OLMoTokenizer(
+          merges,
+          vocab,
+          modelSpecialTokens(),
+          padWithSequenceTokens,
+          addPrefixSpaceToSentence = addPrefixSpaceToSentence)
       case "clip" =>
         new CLIPTokenizer(merges, vocab, modelSpecialTokens())
       case "phi2" =>
diff --git a/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/OLMoTokenizer.scala b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/OLMoTokenizer.scala
new file mode 100644
index 00000000000000..95f046f5913670
--- /dev/null
+++ b/src/main/scala/com/johnsnowlabs/nlp/annotators/tokenizer/bpe/OLMoTokenizer.scala
@@ -0,0 +1,31 @@
+/*
+ * Copyright 2017-2023 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.tokenizer.bpe
+
+class OLMoTokenizer(
+    merges: Map[(String, String), Int],
+    vocab: Map[String, Int],
+    specialTokens: SpecialTokens,
+    padWithSequenceTokens: Boolean = false,
+    addPrefixSpaceToSentence: Boolean = false)
+    extends Gpt2Tokenizer(
+      merges,
+      vocab,
+      specialTokens,
+      padWithSequenceTokens,
+      prependString = "Ġ",
+      addPrefixSpaceToSentence)
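For reference, `OLMoTokenizer` above reuses the GPT-2 BPE machinery with GPT-NeoX-style special tokens: every control role maps to `<|endoftext|>` except padding. The `"olmo"` case from `SpecialTokens`, restated as a plain mapping:

```python
# Mirror of the "olmo" case added to BpeSpecialTokens.scala above.
OLMO_SPECIAL_TOKENS = {
    "start": "<|endoftext|>",
    "end": "<|endoftext|>",
    "unk": "<|endoftext|>",
    "mask": "<|endoftext|>",
    "pad": "<|padding|>",
}
```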
diff --git a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
index 0e457d4d6e20df..b2164805efd415 100644
--- a/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
+++ b/src/main/scala/com/johnsnowlabs/nlp/pretrained/ResourceDownloader.scala
@@ -697,7 +697,8 @@ object PythonResourceDownloader {
     "NLLBTransformer" -> NLLBTransformer,
     "Phi3Transformer" -> Phi3Transformer,
     "QwenTransformer" -> QwenTransformer,
-    "AutoGGUFEmbeddings" -> AutoGGUFEmbeddings)
+    "AutoGGUFEmbeddings" -> AutoGGUFEmbeddings,
+    "OLMoTransformer" -> OLMoTransformer)
 
   // List pairs of types such as the one with key type can load a pretrained model from the value type
   val typeMapper: Map[String, String] = Map("ZeroShotNerModel" -> "RoBertaForQuestionAnswering")
diff --git a/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTestSpec.scala b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTestSpec.scala
new file mode 100644
index 00000000000000..55cfaffa6f2474
--- /dev/null
+++ b/src/test/scala/com/johnsnowlabs/nlp/annotators/seq2seq/OLMoTestSpec.scala
@@ -0,0 +1,75 @@
+/*
+ * Copyright 2017-2023 John Snow Labs
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.johnsnowlabs.nlp.annotators.seq2seq
+
+import com.johnsnowlabs.nlp.base.DocumentAssembler
+import com.johnsnowlabs.nlp.util.io.ResourceHelper
+import com.johnsnowlabs.tags.{FastTest, SlowTest}
+import org.apache.spark.ml.Pipeline
+import org.scalatest.flatspec.AnyFlatSpec
+
+class OLMoTestSpec extends AnyFlatSpec {
+
+  "olmo" should "handle temperature=0 correctly and not crash when predicting more than 1 element with doSample=true" taggedAs SlowTest in {
+    // Even though the paper states temperature in the interval [0,1), using temperature=0 results in a division by zero error.
+    // Also, doSample=true may result in infinities being generated and distFiltered.length==0, which results in an exception if we don't return 0 instead internally.
+    val testData = ResourceHelper.spark
+      .createDataFrame(Seq((1, "My name is Leonardo.")))
+      .toDF("id", "text")
+      .repartition(1)
+
+    val documentAssembler = new DocumentAssembler()
+      .setInputCol("text")
+      .setOutputCol("documents")
+
+    val olmo = OLMoTransformer
+      .pretrained()
+      .setInputCols(Array("documents"))
+      .setDoSample(false)
+      .setMaxOutputLength(100)
+      .setOutputCol("generation")
+      .setBeamSize(1)
+
+    val pipeline = new Pipeline()
+      .setStages(Array(documentAssembler, olmo))
+
+    val pipelineModel = pipeline.fit(testData)
+
+    pipelineModel
+      .transform(testData)
+      .show(truncate = false)
+
+    pipelineModel
+      .transform(testData)
+      .show(truncate = false)
+
+    pipelineModel.stages.last
+      .asInstanceOf[OLMoTransformer]
+      .write
+      .overwrite()
+      .save("/tmp/olmo-1b-4bit-model")
+
+    val loadedOLMo = OLMoTransformer.load("/tmp/olmo-1b-4bit-model")
+
+    val loadedPipeline = new Pipeline().setStages(Array(documentAssembler, loadedOLMo))
+
+    loadedPipeline
+      .fit(testData)
+      .transform(testData)
+      .show(truncate = false)
+
+  }
+}