Commit 51eb664
Merge pull request #828 from siddhivelankar23/main
Add hpu support for Intel® Gaudi®
PromtEngineer authored Oct 28, 2024
2 parents b654a59 + c83249b commit 51eb664
Showing 8 changed files with 292 additions and 8 deletions.
45 changes: 45 additions & 0 deletions Dockerfile_hpu
@@ -0,0 +1,45 @@
FROM vault.habana.ai/gaudi-docker/1.17.0/ubuntu22.04/habanalabs/pytorch-installer-2.3.1:latest

ENV HABANA_VISIBLE_DEVICES=all
ENV OMPI_MCA_btl_vader_single_copy_mechanism=none
ENV PT_HPU_LAZY_ACC_PAR_MODE=0
ENV PT_HPU_ENABLE_LAZY_COLLECTIVES=1

# Install linux packages
ENV DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC
RUN apt-get update && apt-get install -y tzdata bash-completion python3-pip openssh-server \
vim git iputils-ping net-tools protobuf-compiler curl bc gawk tmux \
&& rm -rf /var/lib/apt/lists/*

# Add repo contents
ADD localGPT /root/localGPT
WORKDIR /root/localGPT

# Install python packages
RUN pip install --upgrade pip \
&& pip install langchain-experimental==0.0.62 \
&& pip install langchain==0.0.329 \
&& pip install protobuf==3.20.2 \
&& pip install grpcio-tools \
&& pip install pymilvus==2.4.0 \
&& pip install chromadb==0.5.15 \
&& pip install llama-cpp-python==0.1.66 \
&& pip install pdfminer.six==20221105 \
&& pip install transformers==4.43.1 \
&& pip install optimum[habana]==1.13.1 \
&& pip install InstructorEmbedding==1.0.1 \
&& pip install sentence-transformers==3.0.1 \
&& pip install faiss-cpu==1.7.4 \
&& pip install huggingface_hub==0.16.4 \
&& pip install protobuf==3.20.2 \
&& pip install auto-gptq==0.2.2 \
&& pip install docx2txt unstructured unstructured[pdf] urllib3 accelerate \
&& pip install bitsandbytes \
&& pip install click flask requests openpyxl \
&& pip install git+https://github.com/HabanaAI/[email protected] \
&& pip install python-multipart \
&& pip install fastapi \
&& pip install uvicorn \
&& pip install gptcache==0.1.43 \
&& pip install pypdf==4.3.1 \
&& pip install python-jose[cryptography]
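
The image above bundles the Habana PyTorch bridge plus the pinned Python dependencies. A quick way to verify that a container built from `Dockerfile_hpu` actually sees the Gaudi device is a short Python check. This is a sketch only; it mirrors the `load_habana_module()` / `torch.hpu` pattern used in `gaudi_utils/embeddings.py` below.

```python
# Hedged sanity check: run inside a container built from Dockerfile_hpu.
import torch
from habana_frameworks.torch.utils.library_loader import load_habana_module

load_habana_module()  # registers the hpu backend with PyTorch
print("HPU available:", torch.hpu.is_available())

# Move a small tensor to the device to confirm the bridge works end to end.
x = torch.ones(2, 2).to("hpu")
print("Tensor device:", x.device)
```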
9 changes: 8 additions & 1 deletion README.md
@@ -19,7 +19,7 @@
- **Chat History**: Remembers your previous conversations (in a session).
- **API**: LocalGPT has an API that you can use for building RAG Applications.
- **Graphical Interface**: LocalGPT comes with two GUIs, one uses the API and the other is standalone (based on streamlit).
- **GPU, CPU & MPS Support**: Supports multiple platforms out of the box, Chat with your data using `CUDA`, `CPU` or `MPS` and more!
- **GPU, CPU, HPU & MPS Support**: Supports multiple platforms out of the box. Chat with your data using `CUDA`, `CPU`, `HPU` (Intel® Gaudi®) or `MPS`, and more!

## Dive Deeper with Our Videos 🎥
- [Detailed code-walkthrough](https://youtu.be/MlyoObdIHyo)
@@ -98,6 +98,7 @@ It includes CUDA, your system just needs Docker, BuildKit, your NVIDIA GPU drive
Build as `docker build -t localgpt .`, requires BuildKit.
Docker BuildKit does not support GPU during *docker build* time right now, only during *docker run*.
Run as `docker run -it --mount src="$HOME/.cache",target=/root/.cache,type=bind --gpus=all localgpt`.
To run on Intel® Gaudi® HPUs, build the image from `Dockerfile_hpu` instead.

## Test dataset

@@ -173,6 +174,12 @@ You can also specify the device type just like `ingest.py`
python run_localGPT.py --device_type mps # to run on Apple silicon
```

```shell
# To run on Intel® Gaudi® HPU
# First set MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2" in constants.py
python run_localGPT.py --device_type hpu
```

This will load the ingested vector store and embedding model. You will be presented with a prompt:

```shell
5 changes: 4 additions & 1 deletion constants.py
Expand Up @@ -29,7 +29,7 @@
)

# Context Window and Max New Tokens
CONTEXT_WINDOW_SIZE = 8096
CONTEXT_WINDOW_SIZE = 2048
MAX_NEW_TOKENS = CONTEXT_WINDOW_SIZE # int(CONTEXT_WINDOW_SIZE/4)

#### If you get a "not enough space in the buffer" error, you should reduce the values below, start with half of the original values and keep halving the value until the error stops appearing
@@ -106,6 +106,9 @@
# MODEL_ID = "QuantFactory/Meta-Llama-3-8B-Instruct-GGUF"
# MODEL_BASENAME = "Meta-Llama-3-8B-Instruct.Q4_K_M.gguf"

# Use Mistral to run on HPU
# MODEL_ID = "mistralai/Mistral-7B-Instruct-v0.2"

# LLAMA 3 # use for Apple Silicon
MODEL_ID = "meta-llama/Meta-Llama-3-8B-Instruct"
MODEL_BASENAME = None
40 changes: 40 additions & 0 deletions gaudi_utils/embeddings.py
@@ -0,0 +1,40 @@
import logging
import torch

from langchain.embeddings import HuggingFaceEmbeddings
from habana_frameworks.torch.utils.library_loader import load_habana_module
from optimum.habana.sentence_transformers.modeling_utils import (
    adapt_sentence_transformers_to_gaudi,
)

from constants import EMBEDDING_MODEL_NAME


def load_embeddings():
    """Load HuggingFace Embeddings object onto Gaudi or CPU"""
    load_habana_module()
    if torch.hpu.is_available():
        logging.info("Loading embedding model on hpu")

        adapt_sentence_transformers_to_gaudi()
        embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "hpu"}
        )
    else:
        logging.info("Loading embedding model on cpu")
        embeddings = HuggingFaceEmbeddings(
            model_name=EMBEDDING_MODEL_NAME, model_kwargs={"device": "cpu"}
        )
    return embeddings


def calculate_similarity(model, response, expected_answer):
    """Calculate similarity between response and expected answer using the model"""
    response_embedding = model.client.encode(response, convert_to_tensor=True).squeeze()
    expected_embedding = model.client.encode(
        expected_answer, convert_to_tensor=True
    ).squeeze()
    similarity_score = torch.nn.functional.cosine_similarity(
        response_embedding, expected_embedding, dim=0
    )
    return similarity_score.item()
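
For reference, a minimal way to exercise these helpers (a sketch, assuming the Gaudi software stack and the embedding model named in `constants.py` are available) looks like this:

```python
# Hedged usage sketch for gaudi_utils/embeddings.py.
from gaudi_utils.embeddings import load_embeddings, calculate_similarity

# Loads EMBEDDING_MODEL_NAME onto the HPU when available, otherwise onto the CPU.
embeddings = load_embeddings()

# Compare a model response against an expected answer; returns cosine similarity in [-1, 1].
score = calculate_similarity(
    embeddings,
    "LocalGPT answers questions using documents stored on your machine.",
    "LocalGPT performs retrieval-augmented QA over local documents.",
)
print(f"cosine similarity: {score:.3f}")
```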
168 changes: 168 additions & 0 deletions gaudi_utils/pipeline.py
@@ -0,0 +1,168 @@
import copy
import os
import torch
from pathlib import Path
from typing import List

import habana_frameworks.torch.hpu as torch_hpu

from habana_frameworks.torch.hpu import wrap_in_hpu_graph
from huggingface_hub import snapshot_download
from optimum.habana.transformers.generation import MODELS_OPTIMIZED_WITH_STATIC_SHAPES
from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi
from optimum.habana.utils import set_seed
from transformers import AutoModelForCausalLM, AutoTokenizer, TextGenerationPipeline
from transformers.utils import is_offline_mode


def get_repo_root(model_name_or_path, local_rank=-1, token=None):
    """
    Downloads the specified model checkpoint and returns the repository where it was downloaded.
    """
    if Path(model_name_or_path).is_dir():
        # If it is a local model, no need to download anything
        return model_name_or_path
    else:
        # Checks if online or not
        if is_offline_mode():
            if local_rank == 0:
                print("Offline mode: forcing local_files_only=True")

        # Only download PyTorch weights by default
        allow_patterns = ["*.bin"]

        # Download only on first process
        if local_rank in [-1, 0]:
            cache_dir = snapshot_download(
                model_name_or_path,
                local_files_only=is_offline_mode(),
                cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
                allow_patterns=allow_patterns,
                max_workers=16,
                token=token,
            )
            if local_rank == -1:
                # If there is only one process, then the method is finished
                return cache_dir

        # Make all processes wait so that other processes can get the checkpoint directly from cache
        torch.distributed.barrier()

        return snapshot_download(
            model_name_or_path,
            local_files_only=is_offline_mode(),
            cache_dir=os.getenv("TRANSFORMERS_CACHE", None),
            allow_patterns=allow_patterns,
            token=token,
        )


def get_optimized_model_name(config):
    for model_type in MODELS_OPTIMIZED_WITH_STATIC_SHAPES:
        if model_type == config.model_type:
            return model_type

    return None


def model_is_optimized(config):
    """
    Checks if the given config belongs to a model in optimum/habana/transformers/models, which has a
    new input token_idx.
    """
    return get_optimized_model_name(config) is not None


class GaudiTextGenerationPipeline(TextGenerationPipeline):
    """
    An end-to-end text-generation pipeline that can be used to initialize LangChain classes.
    """
    def __init__(self, model_name_or_path=None, revision="main", **kwargs):
        self.task = "text-generation"
        self.device = "hpu"

        # Tweak generation so that it runs faster on Gaudi
        adapt_transformers_to_gaudi()
        set_seed(27)

        # Initialize tokenizer and define datatype
        self.tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, revision=revision)
        model_dtype = torch.bfloat16

        # Initialize model
        get_repo_root(model_name_or_path)
        model = AutoModelForCausalLM.from_pretrained(model_name_or_path, revision=revision, torch_dtype=model_dtype)
        model = model.eval().to(self.device)
        is_optimized = model_is_optimized(model.config)
        model = wrap_in_hpu_graph(model)
        self.model = model

        # Used for padding input to fixed length
        self.tokenizer.padding_side = "left"
        self.max_padding_length = kwargs.get("max_padding_length", self.model.config.max_position_embeddings)

        # Define config params for llama and mistral models
        if self.model.config.model_type in ["llama", "mistral"]:
            self.model.generation_config.pad_token_id = 0
            self.model.generation_config.bos_token_id = 1
            self.model.generation_config.eos_token_id = 2
            self.tokenizer.bos_token_id = self.model.generation_config.bos_token_id
            self.tokenizer.eos_token_id = self.model.generation_config.eos_token_id
            self.tokenizer.pad_token_id = self.model.generation_config.pad_token_id
            self.tokenizer.pad_token = self.tokenizer.decode(self.tokenizer.pad_token_id)
            self.tokenizer.eos_token = self.tokenizer.decode(self.tokenizer.eos_token_id)
            self.tokenizer.bos_token = self.tokenizer.decode(self.tokenizer.bos_token_id)

        # Applicable to models that do not have pad tokens
        if self.tokenizer.pad_token is None:
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model.generation_config.pad_token_id = self.model.generation_config.eos_token_id

        # Edit generation configuration based on input arguments
        self.generation_config = copy.deepcopy(self.model.generation_config)
        self.generation_config.max_new_tokens = kwargs.get("max_new_tokens", 100)
        self.generation_config.use_cache = kwargs.get("use_kv_cache", True)
        self.generation_config.static_shapes = is_optimized
        self.generation_config.do_sample = kwargs.get("do_sample", False)
        self.generation_config.num_beams = kwargs.get("num_beams", 1)
        self.generation_config.temperature = kwargs.get("temperature", 1.0)
        self.generation_config.top_p = kwargs.get("top_p", 1.0)
        self.generation_config.repetition_penalty = kwargs.get("repetition_penalty", 1.0)
        self.generation_config.num_return_sequences = kwargs.get("num_return_sequences", 1)
        self.generation_config.bad_words_ids = None
        self.generation_config.force_words_ids = None
        self.generation_config.ignore_eos = False

        # Define empty post-process params dict as there is no postprocessing
        self._postprocess_params = {}

        # Warm up the HPU and compile computation graphs
        self.compile_graph()

    def __call__(self, prompt: List[str]):
        """
        __call__ method of pipeline class
        """
        # Tokenize input string
        model_inputs = self.tokenizer.encode_plus(prompt[0], return_tensors="pt", max_length=self.max_padding_length, padding="max_length", truncation=True)

        # Move tensors to hpu
        for t in model_inputs:
            if torch.is_tensor(model_inputs[t]):
                model_inputs[t] = model_inputs[t].to(self.device)

        # Call model's generate method
        output = self.model.generate(**model_inputs, generation_config=self.generation_config, lazy_mode=True, hpu_graphs=True, profiling_steps=0, profiling_warmup_steps=0).cpu()

        # Decode and return result
        output_text = self.tokenizer.decode(output[0], skip_special_tokens=True)
        del output, model_inputs
        return [{"generated_text": output_text}]

    def compile_graph(self):
        """
        Compile computation graphs and synchronize HPUs.
        """
        for _ in range(3):
            self(["Here is my prompt"])
        torch_hpu.synchronize()
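
A minimal way to drive this pipeline directly on a Gaudi host (a sketch; the model ID and generation parameters below are illustrative and mirror the values `run_localGPT.py` passes in) would be:

```python
# Hedged usage sketch for gaudi_utils/pipeline.py; run on a Gaudi host.
from gaudi_utils.pipeline import GaudiTextGenerationPipeline

# The constructor downloads the checkpoint, moves it to the HPU, wraps it in an
# HPU graph, and warms up the computation graphs via compile_graph().
pipe = GaudiTextGenerationPipeline(
    model_name_or_path="mistralai/Mistral-7B-Instruct-v0.2",  # illustrative; see constants.py
    max_new_tokens=256,
    do_sample=True,
    temperature=0.2,
    top_p=0.95,
    repetition_penalty=1.15,
    max_padding_length=5000,
)

# __call__ expects a list of prompts and returns [{"generated_text": ...}].
result = pipe(["What is retrieval-augmented generation?"])
print(result[0]["generated_text"])
```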
3 changes: 3 additions & 0 deletions ingest.py
Expand Up @@ -18,6 +18,9 @@
SOURCE_DIRECTORY,
)

import nltk
nltk.download('punkt_tab')
nltk.download('averaged_perceptron_tagger_eng')

def file_log(logentry):
file1 = open("file_ingest.log", "a")
2 changes: 1 addition & 1 deletion load_models.py
Expand Up @@ -135,7 +135,7 @@ def load_full_model(model_id, model_basename, device_type, logging):
- Additional settings are provided for NVIDIA GPUs, such as loading in 4-bit and setting the compute dtype.
"""

if device_type.lower() in ["mps", "cpu"]:
if device_type.lower() in ["mps", "cpu", "hpu"]:
logging.info("Using AutoModelForCausalLM")
# tokenizer = LlamaTokenizer.from_pretrained(model_id, cache_dir="./models/")
# model = LlamaForCausalLM.from_pretrained(model_id, cache_dir="./models/")
28 changes: 23 additions & 5 deletions run_localGPT.py
@@ -35,7 +35,7 @@
MODEL_BASENAME,
MAX_NEW_TOKENS,
MODELS_PATH,
CHROMA_SETTINGS,
CHROMA_SETTINGS,
)


@@ -59,7 +59,7 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
"""
logging.info(f"Loading Model: {model_id}, on: {device_type}")
logging.info("This action can take a few minutes!")

if model_basename is not None:
if ".gguf" in model_basename.lower():
llm = load_quantized_model_gguf_ggml(model_id, model_basename, device_type, LOGGING)
@@ -80,7 +80,21 @@ def load_model(device_type, model_id, model_basename=None, LOGGING=logging):
# main_classes/text_generation#transformers.GenerationConfig.from_pretrained.returns

# Create a pipeline for text generation
pipe = pipeline(
if device_type == "hpu":
from gaudi_utils.pipeline import GaudiTextGenerationPipeline

pipe = GaudiTextGenerationPipeline(
model_name_or_path=model_id,
max_new_tokens=1000,
temperature=0.2,
top_p=0.95,
repetition_penalty=1.15,
do_sample=True,
max_padding_length=5000,
)
pipe.compile_graph()
else:
pipe = pipeline(
"text-generation",
model=model,
tokenizer=tokenizer,
@@ -122,12 +136,16 @@ def retrieval_qa_pipline(device_type, use_history, promptTemplate_type="llama"):

"""
(1) Chooses an appropriate langchain library based on the embedding model name. Matching code is contained within ingest.py.
(2) Provides additional arguments for instructor and BGE models to improve results, pursuant to the instructions contained on
their respective huggingface repository, project page or github repository.
"""
if device_type == "hpu":
from gaudi_utils.embeddings import load_embeddings

embeddings = get_embeddings(device_type)
embeddings = load_embeddings()
else:
embeddings = get_embeddings(device_type)

logging.info(f"Loaded embeddings from {EMBEDDING_MODEL_NAME}")

