From 43b919d057f58ab21e7a2ea10cf94a1cd107f37a Mon Sep 17 00:00:00 2001
From: kyriediculous
Date: Thu, 17 Oct 2024 13:43:33 +0200
Subject: [PATCH] fixup! llm: use vLLM

---
 runner/Dockerfile                      |   6 +-
 runner/app/pipelines/llm.py            |  91 +++--
 runner/app/pipelines/utils/__init__.py |   1 +
 runner/app/pipelines/utils/utils.py    |  20 +-
 runner/gateway.openapi.yaml            |  10 +-
 runner/openapi.yaml                    |  10 +-
 runner/requirements.in                 |  22 ++
 runner/requirements.txt                | 454 ++++++++++++++++++++++++-
 8 files changed, 555 insertions(+), 59 deletions(-)
 create mode 100644 runner/requirements.in

diff --git a/runner/Dockerfile b/runner/Dockerfile
index 5d00e2d2..36aced7e 100644
--- a/runner/Dockerfile
+++ b/runner/Dockerfile
@@ -29,9 +29,9 @@ RUN pyenv install $PYTHON_VERSION && \
     pyenv rehash
 
 # Upgrade pip and install your desired packages
-ARG PIP_VERSION=23.3.2
+ARG PIP_VERSION=24.2
 RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools==69.5.1 wheel==0.43.0 && \
-    pip install --no-cache-dir torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1
+    pip install --no-cache-dir torch==2.4.0 torchvision torchaudio
 
 WORKDIR /app
 COPY ./requirements.txt /app
@@ -48,6 +48,8 @@ ENV MAX_WORKERS=$max_workers
 ENV HUGGINGFACE_HUB_CACHE=/models
 ENV DIFFUSERS_CACHE=/models
 ENV MODEL_DIR=/models
+# This ensures compatibility with how GPUs are addressed within go-livepeer
+ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
 
 COPY app/ /app/app
 COPY images/ /app/images
diff --git a/runner/app/pipelines/llm.py b/runner/app/pipelines/llm.py
index 5b04a2c1..ef91b48c 100644
--- a/runner/app/pipelines/llm.py
+++ b/runner/app/pipelines/llm.py
@@ -2,49 +2,59 @@
 import logging
 import os
 from typing import Dict, Any, List, Optional, AsyncGenerator, Union
-
+from app.pipelines.base import Pipeline
+from app.pipelines.utils import get_model_dir, get_max_memory
+from torch import cuda
 from vllm import LLM, SamplingParams
-from vllm.utils import InferenceRequest
-from vllm.model_executor.parallel_utils import get_gpu_memory
+from vllm.outputs import RequestOutput
+from huggingface_hub import file_download
 
 logger = logging.getLogger(__name__)
 
-
 class LLMPipeline(Pipeline):
     def __init__(self, model_id: str):
         self.model_id = model_id
-        self.local_model_path = os.path.join(get_model_dir(), model_id)
+        folder_name = file_download.repo_folder_name(repo_id=model_id, repo_type="model")
+        base_path = os.path.join(get_model_dir(), folder_name)
+
+        # Find the actual model path
+        self.local_model_path = self._find_model_path(base_path)
+
+        if not self.local_model_path:
+            raise ValueError(f"Could not find model files for {model_id}")
 
         use_8bit = os.getenv("USE_8BIT", "").strip().lower() == "true"
-        max_batch_size = os.getenv("MAX_BATCH_SIZE", "4096")
-        max_num_seqs = os.getenv("MAX_NUM_SEQS", "256")
-        mem_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.90")
+        max_batch_size = int(os.getenv("MAX_BATCH_SIZE", "4096"))
+        max_num_seqs = int(os.getenv("MAX_NUM_SEQS", "256"))
+        mem_utilization = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.90"))
+
+        # Get available GPU memory
+        max_memory = get_max_memory()
+        logger.info(f"Available GPU memory: {max_memory.gpu_memory}")
+
+        llm_kwargs = {
+            "model": self.local_model_path,
+            "tokenizer": self.local_model_path,
+            "load_format": "auto",
+            "trust_remote_code": True,
+            "dtype": "bfloat16",  # BF16 precision; TODO: check GPU capabilities to set the best dtype
+            "tensor_parallel_size": max_memory.num_gpus,
+            "max_num_batched_tokens": max_batch_size,
+            "gpu_memory_utilization": mem_utilization,
+            "max_num_seqs": max_num_seqs,
+        }
 
         if use_8bit:
-            quantization = "int8"
+            llm_kwargs["quantization"] = "bitsandbytes"  # or another supported 8-bit quantization method
+            llm_kwargs["load_format"] = "bitsandbytes"
             logger.info("Using 8-bit quantization")
         else:
-            quantization = "float16"  # Default to FP16
-            logger.info("Using default FP16 precision")
+            logger.info("Using BF16 precision")
 
-        # Get available GPU memory
-        gpu_memory = get_gpu_memory()
-        logger.info(f"Available GPU memory: {gpu_memory}")
-
-        # Initialize vLLM with more specific parameters
-        self.llm = LLM(
-            model=self.local_model_path,
-            quantization=quantization,
-            trust_remote_code=True,
-            dtype="float16",
-            tensor_parallel_size=len(gpu_memory),  # Use all available GPUs
-            max_num_batched_tokens=max_batch_size,  # Adjust based on your needs
-            max_num_seqs=max_num_seqs,  # Adjust based on your needs
-            gpu_memory_utilization=mem_utilization,  # Adjust GPU memory utilization
-        )
+        self.llm = LLM(**llm_kwargs)
 
         logger.info(f"Model loaded: {self.model_id}")
-        logger.info(f"Using tensor parallelism across {len(gpu_memory)} GPUs")
+        logger.info(f"Using GPU memory utilization: {mem_utilization}")
 
     async def __call__(self, prompt: str, history: Optional[List[tuple]] = None, system_msg: Optional[str] = None, **kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         conversation = []
@@ -67,19 +77,28 @@ async def __call__(self, prompt: str, history: Optional[List[tuple]] = None, sys
             top_k=kwargs.get("top_k", -1),
         )
 
-        request_id = 0
-        request = InferenceRequest(request_id, full_prompt, sampling_params)
-
-        total_tokens = 0
-        async for output in self.llm.generate_stream(request):
-            if output.outputs:
+        async for output in self.llm.generate(prompt=full_prompt, sampling_params=sampling_params, stream=True):
+            if isinstance(output, RequestOutput):
                 generated_text = output.outputs[0].text
-                total_tokens += len(generated_text)
                 yield generated_text
                 await asyncio.sleep(0)  # Allow other tasks to run
 
-        input_length = len(self.llm.get_tokenizer().encode(full_prompt))
-        yield {"tokens_used": input_length + total_tokens}
+        # Get the final output to calculate total tokens
+        final_output = await self.llm.generate(prompt=full_prompt, sampling_params=sampling_params)
+        if isinstance(final_output, RequestOutput):
+            total_tokens = len(final_output.prompt_token_ids) + len(final_output.outputs[0].token_ids)
+            yield {"tokens_used": total_tokens}
 
     def __str__(self):
         return f"LLMPipeline(model_id={self.model_id})"
+    def _find_model_path(self, base_path):
+        # Check if the model files are directly in the base path
+        if any(file.endswith('.bin') or file.endswith('.safetensors') for file in os.listdir(base_path)):
+            return base_path
+
+        # If not, look in subdirectories
+        for root, dirs, files in os.walk(base_path):
+            if any(file.endswith('.bin') or file.endswith('.safetensors') for file in files):
+                return root
+
+        return None
\ No newline at end of file
diff --git a/runner/app/pipelines/utils/__init__.py b/runner/app/pipelines/utils/__init__.py
index 99e06686..777eb6c8 100644
--- a/runner/app/pipelines/utils/__init__.py
+++ b/runner/app/pipelines/utils/__init__.py
@@ -14,4 +14,5 @@
     is_numeric,
     split_prompt,
     validate_torch_device,
+    get_max_memory
 )
diff --git a/runner/app/pipelines/utils/utils.py b/runner/app/pipelines/utils/utils.py
index 5c4b8ccd..22cbca50 100644
--- a/runner/app/pipelines/utils/utils.py
+++ b/runner/app/pipelines/utils/utils.py
@@ -6,6 +6,7 @@
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional
+import psutil
 
 import numpy as np
 import 
torch @@ -37,7 +38,24 @@ def get_torch_device(): return torch.device("mps") else: return torch.device("cpu") - + +class MemoryInfo: + def __init__(self, gpu_memory, cpu_memory, num_gpus): + self.gpu_memory = gpu_memory + self.cpu_memory = cpu_memory + self.num_gpus = num_gpus + + def __repr__(self): + return f"" + +def get_max_memory() -> MemoryInfo: + num_gpus = torch.cuda.device_count() + gpu_memory = {i: f"{torch.cuda.get_device_properties(i).total_memory // 1024**3}GiB" for i in range(num_gpus)} + cpu_memory = f"{psutil.virtual_memory().available // 1024**3}GiB" + + memory_info = MemoryInfo(gpu_memory=gpu_memory, cpu_memory=cpu_memory, num_gpus=num_gpus) + + return memory_info def validate_torch_device(device_name: str) -> bool: """Checks if the given PyTorch device name is valid and available. diff --git a/runner/gateway.openapi.yaml b/runner/gateway.openapi.yaml index 92f8de04..4032da6b 100644 --- a/runner/gateway.openapi.yaml +++ b/runner/gateway.openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.1.0 info: title: Livepeer AI Runner description: An application to run AI pipelines - version: v0.7.0 + version: '' servers: - url: https://dream-gateway.livepeer.cloud description: Livepeer Cloud Community Gateway @@ -541,6 +541,14 @@ components: type: integer title: Max Tokens default: 256 + top_p: + type: number + title: Top P + default: 1.0 + top_k: + type: integer + title: Top K + default: -1 history: type: string title: History diff --git a/runner/openapi.yaml b/runner/openapi.yaml index 57a8b01b..46284e6c 100644 --- a/runner/openapi.yaml +++ b/runner/openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.1.0 info: title: Livepeer AI Runner description: An application to run AI pipelines - version: v0.7.0 + version: '' servers: - url: https://dream-gateway.livepeer.cloud description: Livepeer Cloud Community Gateway @@ -549,6 +549,14 @@ components: type: integer title: Max Tokens default: 256 + top_p: + type: number + title: Top P + default: 1.0 + top_k: + type: integer + title: Top K + default: -1 history: type: string title: History diff --git a/runner/requirements.in b/runner/requirements.in new file mode 100644 index 00000000..c4b004c2 --- /dev/null +++ b/runner/requirements.in @@ -0,0 +1,22 @@ +vllm==0.6.3 +diffusers +accelerate +transformers +fastapi +pydantic +Pillow +python-multipart +uvicorn +huggingface_hub +xformers +triton +peft +deepcache +safetensors +scipy +numpy +av +sentencepiece +protobuf +bitsandbytes +psutil \ No newline at end of file diff --git a/runner/requirements.txt b/runner/requirements.txt index 5ea7f79d..10581a71 100644 --- a/runner/requirements.txt +++ b/runner/requirements.txt @@ -1,22 +1,440 @@ -diffusers==0.30.0 -accelerate==0.30.1 -transformers==4.43.3 -fastapi==0.111.0 -pydantic==2.7.2 -Pillow==10.3.0 -python-multipart==0.0.9 -uvicorn==0.30.0 -huggingface_hub==0.23.2 -xformers==0.0.23 -triton>=2.1.0 -peft==0.11.1 +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements.in +# +accelerate==1.0.1 + # via + # -r requirements.in + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # datasets + # fsspec + # vllm +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via + # httpx + # openai + # starlette + # watchfiles +attrs==24.2.0 + # via + # aiohttp + # jsonschema + # referencing +av==13.1.0 + # via -r requirements.in +bitsandbytes==0.44.1 + # via -r requirements.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests 
+charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # ray + # uvicorn +cloudpickle==3.1.0 + # via outlines +datasets==3.0.1 + # via outlines deepcache==0.1.1 -safetensors==0.4.3 -scipy==1.13.0 + # via -r requirements.in +diffusers==0.30.3 + # via + # -r requirements.in + # deepcache +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +distro==1.9.0 + # via openai +einops==0.8.0 + # via vllm +fastapi==0.115.2 + # via + # -r requirements.in + # vllm +filelock==3.16.1 + # via + # datasets + # diffusers + # huggingface-hub + # ray + # torch + # transformers + # triton + # vllm +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal + # ray +fsspec[http]==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +gguf==0.10.0 + # via vllm +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.6 + # via httpx +httptools==0.6.4 + # via uvicorn +httpx==0.27.2 + # via openai +huggingface-hub==0.25.2 + # via + # -r requirements.in + # accelerate + # datasets + # diffusers + # peft + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.5.0 + # via + # diffusers + # vllm +interegular==0.3.3 + # via + # lm-format-enforcer + # outlines +jinja2==3.1.4 + # via + # outlines + # torch +jiter==0.6.1 + # via openai +jsonschema==4.23.0 + # via + # mistral-common + # outlines + # ray +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +llvmlite==0.43.0 + # via numba +lm-format-enforcer==0.10.6 + # via vllm +markupsafe==3.0.1 + # via jinja2 +mistral-common[opencv]==1.4.4 + # via vllm +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via ray +msgspec==0.18.6 + # via vllm +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.1 + # via torch +numba==0.60.0 + # via outlines numpy==1.26.4 -av==12.1.0 -sentencepiece== 0.2.0 -protobuf==5.27.2 -bitsandbytes==0.43.3 + # via + # -r requirements.in + # accelerate + # bitsandbytes + # datasets + # diffusers + # gguf + # mistral-common + # numba + # opencv-python-headless + # outlines + # pandas + # peft + # pyarrow + # scipy + # torchvision + # transformers + # vllm + # xformers +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via vllm +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +openai==1.51.2 + # via vllm +opencv-python-headless==4.10.0.84 + # via mistral-common +outlines==0.0.46 + # via vllm +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # lm-format-enforcer + # peft + # ray + # transformers +pandas==2.2.3 + # via datasets +partial-json-parser==0.2.1.1.post4 + # via vllm +peft==0.13.2 + # via -r requirements.in +pillow==10.4.0 + # via + # -r requirements.in + # diffusers + # mistral-common + # torchvision + # vllm +prometheus-client==0.21.0 + # via + # prometheus-fastapi-instrumentator + # vllm 
+prometheus-fastapi-instrumentator==7.0.0 + # via vllm +propcache==0.2.0 + # via yarl +protobuf==5.28.2 + # via + # -r requirements.in + # ray + # vllm psutil==6.0.0 + # via + # -r requirements.in + # accelerate + # peft + # vllm +py-cpuinfo==9.0.0 + # via vllm +pyairports==2.1.1 + # via outlines +pyarrow==17.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.9.2 + # via + # -r requirements.in + # fastapi + # lm-format-enforcer + # mistral-common + # openai + # outlines + # vllm +pydantic-core==2.23.4 + # via pydantic +python-dateutil==2.9.0.post0 + # via pandas +python-dotenv==1.0.1 + # via uvicorn +python-multipart==0.0.12 + # via -r requirements.in +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # gguf + # huggingface-hub + # lm-format-enforcer + # peft + # ray + # transformers + # uvicorn + # vllm +pyzmq==26.2.0 + # via vllm +ray==2.37.0 + # via vllm +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 + # via + # datasets + # diffusers + # huggingface-hub + # mistral-common + # outlines + # ray + # tiktoken + # transformers + # vllm +rpds-py==0.20.0 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # -r requirements.in + # accelerate + # diffusers + # peft + # transformers +scipy==1.14.1 + # via -r requirements.in +sentencepiece==0.2.0 + # via + # -r requirements.in + # mistral-common + # vllm +six==1.16.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +starlette==0.40.0 + # via + # fastapi + # prometheus-fastapi-instrumentator +sympy==1.13.3 + # via torch +tiktoken==0.7.0 + # via + # mistral-common + # vllm +tokenizers==0.20.1 + # via + # transformers + # vllm +torch==2.4.0 + # via + # accelerate + # bitsandbytes + # deepcache + # peft + # torchvision + # vllm + # xformers +torchvision==0.19.0 + # via vllm +tqdm==4.66.5 + # via + # datasets + # gguf + # huggingface-hub + # openai + # outlines + # peft + # transformers + # vllm +transformers==4.45.2 + # via + # -r requirements.in + # deepcache + # peft + # vllm +triton==3.0.0 + # via + # -r requirements.in + # torch +typing-extensions==4.12.2 + # via + # fastapi + # huggingface-hub + # mistral-common + # openai + # outlines + # pydantic + # pydantic-core + # torch + # vllm +tzdata==2024.2 + # via pandas +urllib3==2.2.3 + # via requests +uvicorn[standard]==0.32.0 + # via + # -r requirements.in + # vllm +uvloop==0.21.0 + # via uvicorn vllm==0.6.3 + # via -r requirements.in +watchfiles==0.24.0 + # via uvicorn +websockets==13.1 + # via uvicorn +xformers==0.0.27.post2 + # via + # -r requirements.in + # vllm +xxhash==3.5.0 + # via datasets +yarl==1.15.4 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata
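
A minimal usage sketch for the patched pipeline (illustrative only, not part of the diff): it assumes the runner's app package is importable, that the target model has already been downloaded under MODEL_DIR, and that at least one CUDA GPU is visible; the MODEL_ID value below is a placeholder.

import asyncio
import os

from app.pipelines.llm import LLMPipeline
from app.pipelines.utils import get_max_memory


async def main() -> None:
    # Inspect the memory the pipeline will size tensor_parallel_size against.
    mem = get_max_memory()
    print(f"GPUs: {mem.num_gpus}, GPU memory: {mem.gpu_memory}, CPU memory: {mem.cpu_memory}")

    # Placeholder model id; the weights must already exist under MODEL_DIR.
    pipeline = LLMPipeline(os.getenv("MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct"))

    # Stream text chunks, then receive the final {"tokens_used": ...} summary.
    async for chunk in pipeline(
        prompt="Explain what vLLM is in one paragraph.",
        system_msg="You are a helpful assistant.",
        max_tokens=128,
        temperature=0.7,
    ):
        if isinstance(chunk, dict):
            print(f"\n[tokens_used={chunk['tokens_used']}]")
        else:
            print(chunk, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())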