Commit 51eb664
Merge pull request #828 from siddhivelankar23/main
Add hpu support for Intel® Gaudi®
PromtEngineer authored Oct 28, 2024
2 parents b654a59 + c83249b commit 51eb664
Showing 8 changed files with 292 additions and 8 deletions.
45 changes: 45 additions & 0 deletions Dockerfile_hpu
@@ -0,0 +1,45 @@
FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest

ENV HABANA_VISIBLE_DEVICES=all
ENV OMPI_MCA_btl_vader_single_copy_mechanism=none
ENV PT_HPU_LAZY_ACC_PAR_MODE=0
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=1

# Install linux packages
ENV DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC
RUN apt-get update && apt-get install -y tzdata bash-completion python3-pip openssh-server \
vim git iputils-ping net-tools protobuf-compiler curl bc gawk tmux \
&& rm -rf /var/lib/apt/lists/*

# Add repo contents
ADD localGPT /root/localGPT
WORKDIR /root/localGPT

# Install python packages
RUN pip install --upgrade pip \
&& pip install langchain-experimental==0.0.62 \
&& pip install langchain==0.0.329 \
&& pip install protobuf==3.20.2 \
&& pip install grpcio-tools \
&& pip install pymilvus==2.4.0 \
&& pip install chromadb==0.5.15 \
&& pip install llama-cpp-python==0.1.66 \
&& pip install pdfminer.six==20221105 \
&& pip install transformers==4.43.1 \
&& pip install optimum[habana]==1.13.1 \
&& pip install InstructorEmbedding==1.0.1 \
&& pip install sentence-transformers==3.0.1 \
&& pip install faiss-cpu==1.7.4 \
&& pip install huggingface_hub==0.16.4 \
&& pip install protobuf==3.20.2 \
&& pip install auto-gptq==0.2.2 \
&& pip install docx2txt unstructured unstructured[pdf] urllib3 accelerate \
&& pip install bitsandbytes \
&& pip install click flask requests openpyxl \
&& pip install git+https://github.com/HabanaAI/[email protected] \
&& pip install python-multipart \
&& pip install fastapi \
&& pip install uvicorn \
&& pip install gptcache==0.1.43 \
&& pip install pypdf==4.3.1 \
&& pip install python-jose[cryptography]
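
The image above bundles the Habana PyTorch bridge plus the pinned Python dependencies. A quick way to verify that a container built from `Dockerfile_hpu` actually sees the Gaudi device is a short Python check. This is a sketch only; it mirrors the `load_habana_module()` / `torch.hpu` pattern used in `gaudi_utils/embeddings.py` below.

```python
# Hedged sanity check: run inside a container built from Dockerfile_hpu.
import torch
from habana_frameworks.torch.utils.library_loader import load_habana_module

load_habana_module()  # registers the hpu backend with PyTorch
print("HPU available:", torch.hpu.is_available())

# Move a small tensor to the device to confirm the bridge works end to end.
x = torch.ones(2, 2).to("hpu")
print("Tensor device:", x.device)
```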
9 changes: 8 additions & 1 deletion README.md
@@ -19,7 +19,7 @@
- **Chat History**: Remembers your previous conversations (in a session).
- **API**: LocalGPT has an API that you can use for building RAG Applications.
- **Graphical Interface**: LocalGPT comes with two GUIs, one uses the API and the other is standalone (based on streamlit).
- **GPU, CPU & MPS Support**: Supports multiple platforms out of the box, Chat with your data using `CUDA`, `CPU` or `MPS` and more!
- **GPU, CPU, HPU & MPS Support**: Supports multiple platforms out of the box. Chat with your data using `CUDA`, `CPU`, `HPU` (Intel® Gaudi®) or `MPS`, and more!

## Dive Deeper with Our Videos 🎥
- [Detailed code-walkthrough](https://youtu.be/MlyoObdIHyo)
@@ -98,6 +98,7 @@ It includes CUDA, your system just needs Docker, BuildKit, your NVIDIA GPU drive
Build as `docker build -t localgpt .`, requires BuildKit.
Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`.
To run on Intel® Gaudi® HPUs, build the image from `Dockerfile_hpu` instead.

## Test dataset

@@ -173,6 +174,12 @@ You can also specify the device type just like `ingest.py`
python run_localGPT.py --device_type mps # to run on Apple silicon
```

```shell
# To run on Intel® Gaudi® HPU
# First set MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" in constants.py
python run_localGPT.py --device_type hpu
```

This will load the ingested vector store and embedding model. You will be presented with a prompt:

```shell
5 changes: 4 additions & 1 deletion constants.py
Expand Up @@ -29,7 +29,7 @@
)

# Context Window and Max New Tokens
CONTEXT_WINDOW_SIZE = 8096
CONTEXT_WINDOW_SIZE = 2048
MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)

#### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
@@ -106,6 +106,9 @@
# MODEL_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
# MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"

# Use Mistral to run on HPU
# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# LLAMA 3 # use for Apple Silicon
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_BASENAME = None
40 changes: 40 additions & 0 deletions gaudi_utils/embeddings.py
@@ -0,0 +1,40 @@
import logging
import torch

from langchain.embeddings import HuggingFaceEmbeddings
from habana_frameworks.torch.utils.library_loader import load_habana_module
from optimum.habana.sentence_transformers.modeling_utils import (
    adapt_sentence_transformers_to_gaudi,
)

from constants import EMBEDDING_MODEL_NAME


def load_embeddings():
    """Load HuggingFace Embeddings object onto Gaudi or CPU"""
    load_habana_module()
    if torch.hpu.is_available():
        logging.info("Loading embedding model on hpu")

        adapt_sentence_transformers_to_gaudi()
        embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "hpu"}
        )
    else:
        logging.info("Loading embedding model on cpu")
        embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "cpu"}
        )
    return embeddings


def calculate_similarity(model, response, expected_answer):
    """Calculate similarity between response and expected answer using the model"""
    response_embedding = model.client.encode(response, convert_to_tensor=True).squeeze()
    expected_embedding = model.client.encode(
        expected_answer, convert_to_tensor=True
    ).squeeze()
    similarity_score = torch.nn.functional.cosine_similarity(
        response_embedding, expected_embedding, dim=0
    )
    return similarity_score.item()
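
For reference, a minimal way to exercise these helpers (a sketch, assuming the Gaudi software stack and the embedding model named in `constants.py` are available) looks like this:

```python
# Hedged usage sketch for gaudi_utils/embeddings.py.
from gaudi_utils.embeddings import load_embeddings, calculate_similarity

# Loads EMBEDDING_MODEL_NAME onto the HPU when available, otherwise onto the CPU.
embeddings = load_embeddings()

# Compare a model response against an expected answer; returns cosine similarity in [-1, 1].
score = calculate_similarity(
    embeddings,
    "LocalGPT answers questions using documents stored on your machine.",
    "LocalGPT performs retrieval-augmented QA over local documents.",
)
print(f"cosine similarity: {score:.3f}")
```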
168 changes: 168 additions & 0 deletions gaudi_utils/pipeline.py
@@ -0,0 +1,168 @@
import copy
import os
import torch
from pathlib import Path
from typing import List

import habana_frameworks.torch.hpu as torch_hpu

from habana_frameworks.torch.hpu import wrap_in_hpu_graph
from huggingface_hub import snapshot_download
from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
from optimum.habana.utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline
from transformers.utils import is_offline_mode


def get_repo_root(model_name_or_path, local_rank=-1, token=None):
    """
    Downloads the specified model checkpoint and returns the repository where it was downloaded.
    """
    if Path(model_name_or_path).is_dir():
        # If it is a local model, no need to download anything
        return model_name_or_path
    else:
        # Checks if online or not
        if is_offline_mode():
            if local_rank == 0:
                print("Offline mode: forcing local_files_only=True")

        # Only download PyTorch weights by default
        allow_patterns = ["*.bin"]

        # Download only on first process
        if local_rank in [-1, 0]:
            cache_dir = snapshot_download(
                model_name_or_path,
                local_files_only=is_offline_mode(),
                cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
                allow_patterns=allow_patterns,
                max_workers=16,
                token=token,
            )
            if local_rank == -1:
                # If there is only one process, then the method is finished
                return cache_dir

        # Make all processes wait so that other processes can get the checkpoint directly from cache
        torch.distributed.barrier()

        return snapshot_download(
            model_name_or_path,
            local_files_only=is_offline_mode(),
            cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
            allow_patterns=allow_patterns,
            token=token,
        )


def get_optimized_model_name(config):
    for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES:
        if model_type == config.model_type:
            return model_type

    return None


def model_is_optimized(config):
    """
    Checks if the given config belongs to a model in optimum/habana/transformers/models, which has a
    new input token_idx.
    """
    return get_optimized_model_name(config) is not None


class GaudiTextGenerationPipeline(TextGenerationPipeline):
    """
    An end-to-end text-generation pipeline that can be used to initialize LangChain classes.
    """
    def __init__(self, model_name_or_path=None, revision="main", **kwargs):
        self.task = "text-generation"
        self.device = "hpu"

        # Tweak generation so that it runs faster on Gaudi
        adapt_transformers_to_gaudi()
        set_seed(27)

        # Initialize tokenizer and define datatype
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, revision=revision)
        model_dtype = torch.bfloat16

        # Initialize model
        get_repo_root(model_name_or_path)
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path, revision=revision, torch_dtype=model_dtype)
        model = model.eval().to(self.device)
        is_optimized = model_is_optimized(model.config)
        model = wrap_in_hpu_graph(model)
        self.model = model

        # Used for padding input to fixed length
        self.tokenizer.padding_side = "left"
        self.max_padding_length = kwargs.get("max_padding_length", self.model.config.max_position_embeddings)

        # Define config params for llama and mistral models
        if self.model.config.model_type in ["llama", "mistral"]:
            self.model.generation_config.pad_token_id = 0
            self.model.generation_config.bos_token_id = 1
            self.model.generation_config.eos_token_id = 2
            self.tokenizer.bos_token_id = self.model.generation_config.bos_token_id
            self.tokenizer.eos_token_id = self.model.generation_config.eos_token_id
            self.tokenizer.pad_token_id = self.model.generation_config.pad_token_id
            self.tokenizer.pad_token = self.tokenizer.decode(self.tokenizer.pad_token_id)
            self.tokenizer.eos_token = self.tokenizer.decode(self.tokenizer.eos_token_id)
            self.tokenizer.bos_token = self.tokenizer.decode(self.tokenizer.bos_token_id)

        # Applicable to models that do not have pad tokens
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id

        # Edit generation configuration based on input arguments
        self.generation_config = copy.deepcopy(self.model.generation_config)
        self.generation_config.max_new_tokens = kwargs.get("max_new_tokens", 100)
        self.generation_config.use_cache = kwargs.get("use_kv_cache", True)
        self.generation_config.static_shapes = is_optimized
        self.generation_config.do_sample = kwargs.get("do_sample", False)
        self.generation_config.num_beams = kwargs.get("num_beams", 1)
        self.generation_config.temperature = kwargs.get("temperature", 1.0)
        self.generation_config.top_p = kwargs.get("top_p", 1.0)
        self.generation_config.repetition_penalty = kwargs.get("repetition_penalty", 1.0)
        self.generation_config.num_return_sequences = kwargs.get("num_return_sequences", 1)
        self.generation_config.bad_words_ids = None
        self.generation_config.force_words_ids = None
        self.generation_config.ignore_eos = False

        # Define empty post-process params dict as there is no postprocessing
        self._postprocess_params = {}

        # Warm up the HPU and compile computation graphs
        self.compile_graph()

    def __call__(self, prompt: List[str]):
        """
        __call__ method of pipeline class
        """
        # Tokenize input string
        model_inputs = self.tokenizer.encode_plus(prompt[0], return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True)

        # Move tensors to hpu
        for t in model_inputs:
            if torch.is_tensor(model_inputs[t]):
                model_inputs[t] = model_inputs[t].to(self.device)

        # Call model's generate method
        output = self.model.generate(**model_inputs, generation_config=self.generation_config, lazy_mode=True, hpu_graphs=True, profiling_steps=0, profiling_warmup_steps=0).cpu()

        # Decode and return result
        output_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
        del output, model_inputs
        return [{"generated_text": output_text}]

    def compile_graph(self):
        """
        Compile computation graphs and synchronize HPUs.
        """
        for _ in range(3):
            self(["Here is my prompt"])
        torch_hpu.synchronize()
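
A minimal way to drive this pipeline directly on a Gaudi host (a sketch; the model ID and generation parameters below are illustrative and mirror the values `run_localGPT.py` passes in) would be:

```python
# Hedged usage sketch for gaudi_utils/pipeline.py; run on a Gaudi host.
from gaudi_utils.pipeline import GaudiTextGenerationPipeline

# The constructor downloads the checkpoint, moves it to the HPU, wraps it in an
# HPU graph, and warms up the computation graphs via compile_graph().
pipe = GaudiTextGenerationPipeline(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",  # illustrative; see constants.py
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    repetition_penalty=1.15,
    max_padding_length=5000,
)

# __call__ expects a list of prompts and returns [{"generated_text": ...}].
result = pipe(["What is retrieval-augmented generation?"])
print(result[0]["generated_text"])
```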
3 changes: 3 additions & 0 deletions ingest.py
Expand Up @@ -18,6 +18,9 @@
SOURCE_DIRECTORY,
)

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

def file_log(logentry):
file1 = open("file_ingest.log", "a")
2 changes: 1 addition & 1 deletion load_models.py
Expand Up @@ -135,7 +135,7 @@ def load_full_model(model_id, model_basename, device_type, logging):
- Additional settings are provided for NVIDIA GPUs, such as loading in 4-bit and setting the compute dtype.
"""

if device_type.lower() in ["mps", "cpu"]:
if device_type.lower() in ["mps", "cpu", "hpu"]:
logging.info("Using AutoModelForCausalLM")
# tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
# model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
28 changes: 23 additions & 5 deletions run_localGPT.py
@@ -35,7 +35,7 @@
MODEL_BASENAME,
MAX_NEW_TOKENS,
MODELS_PATH,
CHROMA_SETTINGS,
CHROMA_SETTINGS,
)


@@ -59,7 +59,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
logging.info(f"Loading Model: {model_id}, on: {device_type}")
logging.info("This action can take a few minutes!")

if model_basename is not None:
if ".gguf" in model_basename.lower():
llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
@@ -80,7 +80,21 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
# main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns

# Create a pipeline for text generation
pipe = pipeline(
if device_type == "hpu":
from gaudi_utils.pipeline import GaudiTextGenerationPipeline

pipe = GaudiTextGenerationPipeline(
model_name_or_path=model_id,
max_new_tokens=1000,
temperature=0.2,
top_p=0.95,
repetition_penalty=1.15,
do_sample=True,
max_padding_length=5000,
)
pipe.compile_graph()
else:
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
@@ -122,12 +136,16 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):

"""
(1) Chooses an appropriate langchain library based on the embedding model name. Matching code is contained within ingest.py.
(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on
their respective huggingface repository, project page or github repository.
"""
if device_type == "hpu":
from gaudi_utils.embeddings import load_embeddings

embeddings = get_embeddings(device_type)
embeddings = load_embeddings()
else:
embeddings = get_embeddings(device_type)

logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}")

