From 43b919d057f58ab21e7a2ea10cf94a1cd107f37a Mon Sep 17 00:00:00 2001
From: kyriediculous
Date: Thu, 17 Oct 2024 13:43:33 +0200
Subject: [PATCH] fixup! llm: use vLLM

---
 runner/Dockerfile                      |   6 +-
 runner/app/pipelines/llm.py            |  91 +++--
 runner/app/pipelines/utils/__init__.py |   1 +
 runner/app/pipelines/utils/utils.py    |  20 +-
 runner/gateway.openapi.yaml            |  10 +-
 runner/openapi.yaml                    |  10 +-
 runner/requirements.in                 |  22 ++
 runner/requirements.txt                | 454 ++++++++++++++++++++++++-
 8 files changed, 555 insertions(+), 59 deletions(-)
 create mode 100644 runner/requirements.in

diff --git a/runner/Dockerfile b/runner/Dockerfile
index 5d00e2d2..36aced7e 100644
--- a/runner/Dockerfile
+++ b/runner/Dockerfile
@@ -29,9 +29,9 @@ RUN pyenv install $PYTHON_VERSION && \
     pyenv rehash
 
 # Upgrade pip and install your desired packages
-ARG PIP_VERSION=23.3.2
+ARG PIP_VERSION=24.2
 RUN pip install --no-cache-dir --upgrade pip==${PIP_VERSION} setuptools==69.5.1 wheel==0.43.0 && \
-    pip install --no-cache-dir torch==2.1.1 torchvision==0.16.1 torchaudio==2.1.1
+    pip install --no-cache-dir torch==2.4.0 torchvision torchaudio
 
 WORKDIR /app
 COPY ./requirements.txt /app
@@ -48,6 +48,8 @@ ENV MAX_WORKERS=$max_workers
 ENV HUGGINGFACE_HUB_CACHE=/models
 ENV DIFFUSERS_CACHE=/models
 ENV MODEL_DIR=/models
+# This ensures compatibility with how GPUs are addressed within go-livepeer
+ENV CUDA_DEVICE_ORDER=PCI_BUS_ID
 
 COPY app/ /app/app
 COPY images/ /app/images
diff --git a/runner/app/pipelines/llm.py b/runner/app/pipelines/llm.py
index 5b04a2c1..ef91b48c 100644
--- a/runner/app/pipelines/llm.py
+++ b/runner/app/pipelines/llm.py
@@ -2,49 +2,59 @@
 import logging
 import os
 from typing import Dict, Any, List, Optional, AsyncGenerator, Union
-
+from app.pipelines.base import Pipeline
+from app.pipelines.utils import get_model_dir, get_max_memory
+from torch import cuda
 from vllm import LLM, SamplingParams
-from vllm.utils import InferenceRequest
-from vllm.model_executor.parallel_utils import get_gpu_memory
+from vllm.outputs import RequestOutput
+from huggingface_hub import file_download
 
 logger = logging.getLogger(__name__)
 
-
 class LLMPipeline(Pipeline):
     def __init__(self, model_id: str):
         self.model_id = model_id
-        self.local_model_path = os.path.join(get_model_dir(), model_id)
+        folder_name = file_download.repo_folder_name(repo_id=model_id, repo_type="model")
+        base_path = os.path.join(get_model_dir(), folder_name)
+
+        # Find the actual model path
+        self.local_model_path = self._find_model_path(base_path)
+
+        if not self.local_model_path:
+            raise ValueError(f"Could not find model files for {model_id}")
 
         use_8bit = os.getenv("USE_8BIT", "").strip().lower() == "true"
-        max_batch_size = os.getenv("MAX_BATCH_SIZE", "4096")
-        max_num_seqs = os.getenv("MAX_NUM_SEQS", "256")
-        mem_utilization = os.getenv("GPU_MEMORY_UTILIZATION", "0.90")
+        max_batch_size = int(os.getenv("MAX_BATCH_SIZE", "4096"))
+        max_num_seqs = int(os.getenv("MAX_NUM_SEQS", "256"))
+        mem_utilization = float(os.getenv("GPU_MEMORY_UTILIZATION", "0.90"))
+
+        # Get available GPU memory
+        max_memory = get_max_memory()
+        logger.info(f"Available GPU memory: {max_memory.gpu_memory}")
+
+        llm_kwargs = {
+            "model": self.local_model_path,
+            "tokenizer": self.local_model_path,
+            "load_format": "auto",
+            "trust_remote_code": True,
+            "dtype": "bfloat16",  # BF16 precision; TODO: check GPU capabilities to set the best dtype
+            "tensor_parallel_size": max_memory.num_gpus,
+            "max_num_batched_tokens": max_batch_size,
+            "gpu_memory_utilization": mem_utilization,
+            "max_num_seqs": max_num_seqs,
+        }
 
         if use_8bit:
-            quantization = "int8"
+            llm_kwargs["quantization"] = "bitsandbytes"  # or another supported 8-bit quantization method
+            llm_kwargs["load_format"] = "bitsandbytes"
             logger.info("Using 8-bit quantization")
         else:
-            quantization = "float16"  # Default to FP16
-            logger.info("Using default FP16 precision")
+            logger.info("Using BF16 precision")
 
-        # Get available GPU memory
-        gpu_memory = get_gpu_memory()
-        logger.info(f"Available GPU memory: {gpu_memory}")
-
-        # Initialize vLLM with more specific parameters
-        self.llm = LLM(
-            model=self.local_model_path,
-            quantization=quantization,
-            trust_remote_code=True,
-            dtype="float16",
-            tensor_parallel_size=len(gpu_memory),  # Use all available GPUs
-            max_num_batched_tokens=max_batch_size,  # Adjust based on your needs
-            max_num_seqs=max_num_seqs,  # Adjust based on your needs
-            gpu_memory_utilization=mem_utilization,  # Adjust GPU memory utilization
-        )
+        self.llm = LLM(**llm_kwargs)
 
         logger.info(f"Model loaded: {self.model_id}")
-        logger.info(f"Using tensor parallelism across {len(gpu_memory)} GPUs")
+        logger.info(f"Using GPU memory utilization: {mem_utilization}")
 
     async def __call__(self, prompt: str, history: Optional[List[tuple]] = None, system_msg: Optional[str] = None, **kwargs) -> AsyncGenerator[Union[str, Dict[str, Any]], None]:
         conversation = []
@@ -67,19 +77,28 @@ async def __call__(self, prompt: str, history: Optional[List[tuple]] = None, sys
             top_k=kwargs.get("top_k", -1),
         )
 
-        request_id = 0
-        request = InferenceRequest(request_id, full_prompt, sampling_params)
-
-        total_tokens = 0
-        async for output in self.llm.generate_stream(request):
-            if output.outputs:
+        async for output in self.llm.generate(prompt=full_prompt, sampling_params=sampling_params, stream=True):
+            if isinstance(output, RequestOutput):
                 generated_text = output.outputs[0].text
-                total_tokens += len(generated_text)
                 yield generated_text
                 await asyncio.sleep(0)  # Allow other tasks to run
 
-        input_length = len(self.llm.get_tokenizer().encode(full_prompt))
-        yield {"tokens_used": input_length + total_tokens}
+        # Get the final output to calculate total tokens
+        final_output = await self.llm.generate(prompt=full_prompt, sampling_params=sampling_params)
+        if isinstance(final_output, RequestOutput):
+            total_tokens = len(final_output.prompt_token_ids) + len(final_output.outputs[0].token_ids)
+            yield {"tokens_used": total_tokens}
 
     def __str__(self):
         return f"LLMPipeline(model_id={self.model_id})"
+    def _find_model_path(self, base_path):
+        # Check if the model files are directly in the base path
+        if any(file.endswith('.bin') or file.endswith('.safetensors') for file in os.listdir(base_path)):
+            return base_path
+
+        # If not, look in subdirectories
+        for root, dirs, files in os.walk(base_path):
+            if any(file.endswith('.bin') or file.endswith('.safetensors') for file in files):
+                return root
+
+        return None
\ No newline at end of file
diff --git a/runner/app/pipelines/utils/__init__.py b/runner/app/pipelines/utils/__init__.py
index 99e06686..777eb6c8 100644
--- a/runner/app/pipelines/utils/__init__.py
+++ b/runner/app/pipelines/utils/__init__.py
@@ -14,4 +14,5 @@
     is_numeric,
     split_prompt,
     validate_torch_device,
+    get_max_memory
 )
diff --git a/runner/app/pipelines/utils/utils.py b/runner/app/pipelines/utils/utils.py
index 5c4b8ccd..22cbca50 100644
--- a/runner/app/pipelines/utils/utils.py
+++ b/runner/app/pipelines/utils/utils.py
@@ -6,6 +6,7 @@
 import re
 from pathlib import Path
 from typing import Any, Dict, List, Optional
+import psutil
 
 import numpy as np
 import 
torch @@ -37,7 +38,24 @@ def get_torch_device(): return torch.device("mps") else: return torch.device("cpu") - + +class MemoryInfo: + def __init__(self, gpu_memory, cpu_memory, num_gpus): + self.gpu_memory = gpu_memory + self.cpu_memory = cpu_memory + self.num_gpus = num_gpus + + def __repr__(self): + return f"" + +def get_max_memory() -> MemoryInfo: + num_gpus = torch.cuda.device_count() + gpu_memory = {i: f"{torch.cuda.get_device_properties(i).total_memory // 1024**3}GiB" for i in range(num_gpus)} + cpu_memory = f"{psutil.virtual_memory().available // 1024**3}GiB" + + memory_info = MemoryInfo(gpu_memory=gpu_memory, cpu_memory=cpu_memory, num_gpus=num_gpus) + + return memory_info def validate_torch_device(device_name: str) -> bool: """Checks if the given PyTorch device name is valid and available. diff --git a/runner/gateway.openapi.yaml b/runner/gateway.openapi.yaml index 92f8de04..4032da6b 100644 --- a/runner/gateway.openapi.yaml +++ b/runner/gateway.openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.1.0 info: title: Livepeer AI Runner description: An application to run AI pipelines - version: v0.7.0 + version: '' servers: - url: https://dream-gateway.livepeer.cloud description: Livepeer Cloud Community Gateway @@ -541,6 +541,14 @@ components: type: integer title: Max Tokens default: 256 + top_p: + type: number + title: Top P + default: 1.0 + top_k: + type: integer + title: Top K + default: -1 history: type: string title: History diff --git a/runner/openapi.yaml b/runner/openapi.yaml index 57a8b01b..46284e6c 100644 --- a/runner/openapi.yaml +++ b/runner/openapi.yaml @@ -2,7 +2,7 @@ openapi: 3.1.0 info: title: Livepeer AI Runner description: An application to run AI pipelines - version: v0.7.0 + version: '' servers: - url: https://dream-gateway.livepeer.cloud description: Livepeer Cloud Community Gateway @@ -549,6 +549,14 @@ components: type: integer title: Max Tokens default: 256 + top_p: + type: number + title: Top P + default: 1.0 + top_k: + type: integer + title: Top K + default: -1 history: type: string title: History diff --git a/runner/requirements.in b/runner/requirements.in new file mode 100644 index 00000000..c4b004c2 --- /dev/null +++ b/runner/requirements.in @@ -0,0 +1,22 @@ +vllm==0.6.3 +diffusers +accelerate +transformers +fastapi +pydantic +Pillow +python-multipart +uvicorn +huggingface_hub +xformers +triton +peft +deepcache +safetensors +scipy +numpy +av +sentencepiece +protobuf +bitsandbytes +psutil \ No newline at end of file diff --git a/runner/requirements.txt b/runner/requirements.txt index 5ea7f79d..10581a71 100644 --- a/runner/requirements.txt +++ b/runner/requirements.txt @@ -1,22 +1,440 @@ -diffusers==0.30.0 -accelerate==0.30.1 -transformers==4.43.3 -fastapi==0.111.0 -pydantic==2.7.2 -Pillow==10.3.0 -python-multipart==0.0.9 -uvicorn==0.30.0 -huggingface_hub==0.23.2 -xformers==0.0.23 -triton>=2.1.0 -peft==0.11.1 +# +# This file is autogenerated by pip-compile with Python 3.11 +# by the following command: +# +# pip-compile requirements.in +# +accelerate==1.0.1 + # via + # -r requirements.in + # peft +aiohappyeyeballs==2.4.3 + # via aiohttp +aiohttp==3.10.10 + # via + # datasets + # fsspec + # vllm +aiosignal==1.3.1 + # via + # aiohttp + # ray +annotated-types==0.7.0 + # via pydantic +anyio==4.6.2.post1 + # via + # httpx + # openai + # starlette + # watchfiles +attrs==24.2.0 + # via + # aiohttp + # jsonschema + # referencing +av==13.1.0 + # via -r requirements.in +bitsandbytes==0.44.1 + # via -r requirements.in +certifi==2024.8.30 + # via + # httpcore + # httpx + # requests 
+charset-normalizer==3.4.0 + # via requests +click==8.1.7 + # via + # ray + # uvicorn +cloudpickle==3.1.0 + # via outlines +datasets==3.0.1 + # via outlines deepcache==0.1.1 -safetensors==0.4.3 -scipy==1.13.0 + # via -r requirements.in +diffusers==0.30.3 + # via + # -r requirements.in + # deepcache +dill==0.3.8 + # via + # datasets + # multiprocess +diskcache==5.6.3 + # via outlines +distro==1.9.0 + # via openai +einops==0.8.0 + # via vllm +fastapi==0.115.2 + # via + # -r requirements.in + # vllm +filelock==3.16.1 + # via + # datasets + # diffusers + # huggingface-hub + # ray + # torch + # transformers + # triton + # vllm +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal + # ray +fsspec[http]==2024.6.1 + # via + # datasets + # huggingface-hub + # torch +gguf==0.10.0 + # via vllm +h11==0.14.0 + # via + # httpcore + # uvicorn +httpcore==1.0.6 + # via httpx +httptools==0.6.4 + # via uvicorn +httpx==0.27.2 + # via openai +huggingface-hub==0.25.2 + # via + # -r requirements.in + # accelerate + # datasets + # diffusers + # peft + # tokenizers + # transformers +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.5.0 + # via + # diffusers + # vllm +interegular==0.3.3 + # via + # lm-format-enforcer + # outlines +jinja2==3.1.4 + # via + # outlines + # torch +jiter==0.6.1 + # via openai +jsonschema==4.23.0 + # via + # mistral-common + # outlines + # ray +jsonschema-specifications==2024.10.1 + # via jsonschema +lark==1.2.2 + # via outlines +llvmlite==0.43.0 + # via numba +lm-format-enforcer==0.10.6 + # via vllm +markupsafe==3.0.1 + # via jinja2 +mistral-common[opencv]==1.4.4 + # via vllm +mpmath==1.3.0 + # via sympy +msgpack==1.1.0 + # via ray +msgspec==0.18.6 + # via vllm +multidict==6.1.0 + # via + # aiohttp + # yarl +multiprocess==0.70.16 + # via datasets +nest-asyncio==1.6.0 + # via outlines +networkx==3.4.1 + # via torch +numba==0.60.0 + # via outlines numpy==1.26.4 -av==12.1.0 -sentencepiece== 0.2.0 -protobuf==5.27.2 -bitsandbytes==0.43.3 + # via + # -r requirements.in + # accelerate + # bitsandbytes + # datasets + # diffusers + # gguf + # mistral-common + # numba + # opencv-python-headless + # outlines + # pandas + # peft + # pyarrow + # scipy + # torchvision + # transformers + # vllm + # xformers +nvidia-cublas-cu12==12.1.3.1 + # via + # nvidia-cudnn-cu12 + # nvidia-cusolver-cu12 + # torch +nvidia-cuda-cupti-cu12==12.1.105 + # via torch +nvidia-cuda-nvrtc-cu12==12.1.105 + # via torch +nvidia-cuda-runtime-cu12==12.1.105 + # via torch +nvidia-cudnn-cu12==9.1.0.70 + # via torch +nvidia-cufft-cu12==11.0.2.54 + # via torch +nvidia-curand-cu12==10.3.2.106 + # via torch +nvidia-cusolver-cu12==11.4.5.107 + # via torch +nvidia-cusparse-cu12==12.1.0.106 + # via + # nvidia-cusolver-cu12 + # torch +nvidia-ml-py==12.560.30 + # via vllm +nvidia-nccl-cu12==2.20.5 + # via torch +nvidia-nvjitlink-cu12==12.6.77 + # via + # nvidia-cusolver-cu12 + # nvidia-cusparse-cu12 +nvidia-nvtx-cu12==12.1.105 + # via torch +openai==1.51.2 + # via vllm +opencv-python-headless==4.10.0.84 + # via mistral-common +outlines==0.0.46 + # via vllm +packaging==24.1 + # via + # accelerate + # datasets + # huggingface-hub + # lm-format-enforcer + # peft + # ray + # transformers +pandas==2.2.3 + # via datasets +partial-json-parser==0.2.1.1.post4 + # via vllm +peft==0.13.2 + # via -r requirements.in +pillow==10.4.0 + # via + # -r requirements.in + # diffusers + # mistral-common + # torchvision + # vllm +prometheus-client==0.21.0 + # via + # prometheus-fastapi-instrumentator + # vllm 
+prometheus-fastapi-instrumentator==7.0.0 + # via vllm +propcache==0.2.0 + # via yarl +protobuf==5.28.2 + # via + # -r requirements.in + # ray + # vllm psutil==6.0.0 + # via + # -r requirements.in + # accelerate + # peft + # vllm +py-cpuinfo==9.0.0 + # via vllm +pyairports==2.1.1 + # via outlines +pyarrow==17.0.0 + # via datasets +pycountry==24.6.1 + # via outlines +pydantic==2.9.2 + # via + # -r requirements.in + # fastapi + # lm-format-enforcer + # mistral-common + # openai + # outlines + # vllm +pydantic-core==2.23.4 + # via pydantic +python-dateutil==2.9.0.post0 + # via pandas +python-dotenv==1.0.1 + # via uvicorn +python-multipart==0.0.12 + # via -r requirements.in +pytz==2024.2 + # via pandas +pyyaml==6.0.2 + # via + # accelerate + # datasets + # gguf + # huggingface-hub + # lm-format-enforcer + # peft + # ray + # transformers + # uvicorn + # vllm +pyzmq==26.2.0 + # via vllm +ray==2.37.0 + # via vllm +referencing==0.35.1 + # via + # jsonschema + # jsonschema-specifications + # outlines +regex==2024.9.11 + # via + # diffusers + # tiktoken + # transformers +requests==2.32.3 + # via + # datasets + # diffusers + # huggingface-hub + # mistral-common + # outlines + # ray + # tiktoken + # transformers + # vllm +rpds-py==0.20.0 + # via + # jsonschema + # referencing +safetensors==0.4.5 + # via + # -r requirements.in + # accelerate + # diffusers + # peft + # transformers +scipy==1.14.1 + # via -r requirements.in +sentencepiece==0.2.0 + # via + # -r requirements.in + # mistral-common + # vllm +six==1.16.0 + # via python-dateutil +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +starlette==0.40.0 + # via + # fastapi + # prometheus-fastapi-instrumentator +sympy==1.13.3 + # via torch +tiktoken==0.7.0 + # via + # mistral-common + # vllm +tokenizers==0.20.1 + # via + # transformers + # vllm +torch==2.4.0 + # via + # accelerate + # bitsandbytes + # deepcache + # peft + # torchvision + # vllm + # xformers +torchvision==0.19.0 + # via vllm +tqdm==4.66.5 + # via + # datasets + # gguf + # huggingface-hub + # openai + # outlines + # peft + # transformers + # vllm +transformers==4.45.2 + # via + # -r requirements.in + # deepcache + # peft + # vllm +triton==3.0.0 + # via + # -r requirements.in + # torch +typing-extensions==4.12.2 + # via + # fastapi + # huggingface-hub + # mistral-common + # openai + # outlines + # pydantic + # pydantic-core + # torch + # vllm +tzdata==2024.2 + # via pandas +urllib3==2.2.3 + # via requests +uvicorn[standard]==0.32.0 + # via + # -r requirements.in + # vllm +uvloop==0.21.0 + # via uvicorn vllm==0.6.3 + # via -r requirements.in +watchfiles==0.24.0 + # via uvicorn +websockets==13.1 + # via uvicorn +xformers==0.0.27.post2 + # via + # -r requirements.in + # vllm +xxhash==3.5.0 + # via datasets +yarl==1.15.4 + # via aiohttp +zipp==3.20.2 + # via importlib-metadata
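
A minimal usage sketch for the patched pipeline (illustrative only, not part of the diff): it assumes the runner's app package is importable, that the target model has already been downloaded under MODEL_DIR, and that at least one CUDA GPU is visible; the MODEL_ID value below is a placeholder.

import asyncio
import os

from app.pipelines.llm import LLMPipeline
from app.pipelines.utils import get_max_memory


async def main() -> None:
    # Inspect the memory the pipeline will size tensor_parallel_size against.
    mem = get_max_memory()
    print(f"GPUs: {mem.num_gpus}, GPU memory: {mem.gpu_memory}, CPU memory: {mem.cpu_memory}")

    # Placeholder model id; the weights must already exist under MODEL_DIR.
    pipeline = LLMPipeline(os.getenv("MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct"))

    # Stream text chunks, then receive the final {"tokens_used": ...} summary.
    async for chunk in pipeline(
        prompt="Explain what vLLM is in one paragraph.",
        system_msg="You are a helpful assistant.",
        max_tokens=128,
        temperature=0.7,
    ):
        if isinstance(chunk, dict):
            print(f"\n[tokens_used={chunk['tokens_used']}]")
        else:
            print(chunk, end="", flush=True)


if __name__ == "__main__":
    asyncio.run(main())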