Commit a132897

Merge branch 'main' into patch-6

2 parents 9837a81 + 4192210

File tree

13 files changed (+309, −141 lines)

.github/workflows/build-and-release.yaml

Lines changed: 2 additions & 2 deletions
@@ -42,7 +42,7 @@ jobs:
         shell: cmd
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.21.1
+        uses: pypa/cibuildwheel@v2.22.0
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""

@@ -69,7 +69,7 @@ jobs:
         platforms: linux/arm64
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.21.1
+        uses: pypa/cibuildwheel@v2.22.0
         env:
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/build-wheels-cuda.yaml

Lines changed: 1 addition & 1 deletion
@@ -59,7 +59,7 @@ jobs:
         cache: 'pip'
 
       - name: Setup Mamba
-        uses: conda-incubator/setup-miniconda@v3.0.4
+        uses: conda-incubator/setup-miniconda@v3.1.0
         with:
           activate-environment: "build"
           python-version: ${{ matrix.pyver }}

.github/workflows/build-wheels-metal.yaml

Lines changed: 1 addition & 1 deletion
@@ -43,7 +43,7 @@ jobs:
         shell: cmd
 
       - name: Build wheels
-        uses: pypa/cibuildwheel@v2.21.1
+        uses: pypa/cibuildwheel@v2.22.0
         env:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""

.github/workflows/generate-index-from-release.yaml

Lines changed: 0 additions & 1 deletion
@@ -44,7 +44,6 @@ jobs:
         ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
         ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
         ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
-        ./scripts/releases-to-pep-503.sh index/whl/cu125 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu125$'
         ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
     - name: Upload artifact
       uses: actions/upload-pages-artifact@v3
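
Each releases-to-pep-503.sh call builds a PEP 503 package index from the release tags matching the given pattern; this change drops the cu125 index. To sanity-check what one of these patterns accepts, a minimal sketch (the tags below are hypothetical examples, not release data):

    import re

    # Pattern for the cu124 index, copied from the workflow above.
    pattern = re.compile(r'^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$')

    # Hypothetical release tags, for illustration only.
    for tag in ["v0.3.2-cu124", "0.3.2-cu124", "v0.3.2-cu125", "v0.3.2-metal", "v0.3.2"]:
        print(f"{tag!r:20} -> {bool(pattern.match(tag))}")
    # Only the first two match; cu125 tags no longer get an index at all.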

.github/workflows/test.yaml

Lines changed: 24 additions & 44 deletions
@@ -51,19 +51,11 @@ jobs:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
       - name: Install dependencies (Linux/MacOS)
-        if: runner.os != 'Windows'
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
         shell: bash
-      - name: Install dependencies (Windows)
-        if: runner.os == 'Windows'
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install uv
-          python -m uv pip install -e .[all] --verbose
-        shell: cmd
       - name: Test with pytest
         run: |
           python -m pytest

@@ -90,30 +82,21 @@ jobs:
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
-
-      - name: Install dependencies (Linux/MacOS)
-        if: runner.os != 'Windows'
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install uv
-          python -m uv pip install -e .[all] --verbose
-        shell: bash
 
       - name: Install dependencies (Windows)
-        if: runner.os == 'Windows'
         run: |
           python -m pip install --upgrade pip
           python -m pip install uv
           python -m uv pip install -e .[all] --verbose
-        shell: cmd
+        shell: cmd
 
       - name: Test with pytest
         run: |
           python -m pytest
 
   build-macos:
     needs: download-model
-    runs-on: macos-latest
+    runs-on: macos-13
     strategy:
       matrix:
         python-version: ["3.9", "3.10", "3.11", "3.12"]

@@ -128,35 +111,33 @@ jobs:
         python-version: ${{ matrix.python-version }}
         cache: 'pip'
 
+      - name: System Info
+        run: |
+          uname -a
+          sysctl -n machdep.cpu.brand_string
+          python3 -c "import platform; print(platform.machine(), platform.architecture())"
+
       - name: Restore model cache
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
 
       - name: Install dependencies (Linux/MacOS)
-        if: runner.os != 'Windows'
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install uv
-          python -m uv pip install -e .[all] --verbose
+          python3 -m pip install --upgrade pip
+          python3 -m pip install uv
+          python3 -m uv pip install -e .[all] --verbose
+          CMAKE_ARGS="-DLLAMA_METAL=off" python3 -m uv pip install .[all] --verbose
         shell: bash
 
-      - name: Install dependencies (Windows)
-        if: runner.os == 'Windows'
-        run: |
-          python -m pip install --upgrade pip
-          python -m pip install uv
-          python -m uv pip install -e .[all] --verbose
-        shell: cmd
-
       - name: Test with pytest
         run: |
-          python -m pytest
+          python3 -m pytest
 
   build-macos-metal:
     needs: download-model
-    runs-on: macos-latest
+    runs-on: macos-13
     steps:
       - uses: actions/checkout@v4
         with:

@@ -167,25 +148,24 @@ jobs:
       with:
         python-version: "3.9"
 
+      - name: System Info
+        run: |
+          uname -a
+          sysctl -n machdep.cpu.brand_string
+          python3 -c "import platform; print(platform.machine(), platform.architecture())"
+
       - name: Restore model cache
         uses: actions/cache@v4
         with:
           path: ~/.cache/huggingface/hub
           key: ${{ runner.os }}-model-${{ env.REPO_ID }}-${{ env.MODEL_FILE }}
 
-      - name: Install dependencies (Linux/MacOS)
-        if: runner.os != 'Windows'
+      - name: Install dependencies
         run: |
-          python -m pip install --upgrade pip
-          python -m pip install uv
-          CMAKE_ARGS="-DLLAMA_METAL=on" python -m uv pip install .[all] --verbose
+          python3 -m pip install --upgrade pip
+          CMAKE_ARGS="-DLLAMA_METAL=on" python3 -m pip install .[all] --verbose
         shell: bash
 
-      - name: Install dependencies (Windows)
-        if: runner.os == 'Windows'
-        run: |
-          python -m pip install --upgrade pip
-          CMAKE_ARGS="-DGGML_METAL=on" python -m pip install .[all] --verbose
       - name: Test with pytest
         run: |
-          python -m pytest
+          python3 -m pytest

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -7,6 +7,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.3.2]
+
+- feat: Update llama.cpp to ggerganov/llama.cpp@74d73dc85cc2057446bf63cc37ff649ae7cebd80
+
 ## [0.3.1]
 
 - feat: Update llama.cpp to ggerganov/llama.cpp@c919d5db39c8a7fcb64737f008e4b105ee0acd20

CMakeLists.txt

Lines changed: 64 additions & 12 deletions
@@ -6,6 +6,10 @@ option(LLAMA_BUILD "Build llama.cpp shared library and install alongside python
 option(LLAVA_BUILD "Build llava shared library and install alongside python package" ON)
 
 function(llama_cpp_python_install_target target)
+    if(NOT TARGET ${target})
+        return()
+    endif()
+
     install(
         TARGETS ${target}
         LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/llama_cpp/lib

@@ -55,24 +59,59 @@ if (LLAMA_BUILD)
     set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
     set(CMAKE_SKIP_RPATH FALSE)
 
-    # Building llama
-    if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
-        # Need to disable these llama.cpp flags on Apple x86_64,
-        # otherwise users may encounter invalid instruction errors
-        set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE)
-        set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE)
-        set(GGML_FMA "Off" CACHE BOOL "gml: enable FMA" FORCE)
-        set(GGML_F16C "Off" CACHE BOOL "gml: enable F16C" FORCE)
-    endif()
+    # Enable building of the common library
+    set(LLAMA_BUILD_COMMON ON CACHE BOOL "Build llama.cpp common library" FORCE)
 
+    # Architecture detection and settings for Apple platforms
     if (APPLE)
-        set(GGML_METAL_EMBED_LIBRARY "On" CACHE BOOL "llama: embed metal library" FORCE)
+        # Get the target architecture
+        execute_process(
+            COMMAND uname -m
+            OUTPUT_VARIABLE HOST_ARCH
+            OUTPUT_STRIP_TRAILING_WHITESPACE
+        )
+
+        # If CMAKE_OSX_ARCHITECTURES is not set, use the host architecture
+        if(NOT CMAKE_OSX_ARCHITECTURES)
+            set(CMAKE_OSX_ARCHITECTURES ${HOST_ARCH} CACHE STRING "Build architecture for macOS" FORCE)
+        endif()
+
+        message(STATUS "Host architecture: ${HOST_ARCH}")
+        message(STATUS "Target architecture: ${CMAKE_OSX_ARCHITECTURES}")
+
+        # Configure based on target architecture
+        if(CMAKE_OSX_ARCHITECTURES STREQUAL "x86_64")
+            # Intel Mac settings
+            set(GGML_AVX "OFF" CACHE BOOL "ggml: enable AVX" FORCE)
+            set(GGML_AVX2 "OFF" CACHE BOOL "ggml: enable AVX2" FORCE)
+            set(GGML_FMA "OFF" CACHE BOOL "ggml: enable FMA" FORCE)
+            set(GGML_F16C "OFF" CACHE BOOL "ggml: enable F16C" FORCE)
+        endif()
+
+        # Metal settings (enable for both architectures)
+        set(GGML_METAL "ON" CACHE BOOL "ggml: enable Metal" FORCE)
+        set(GGML_METAL_EMBED_LIBRARY "ON" CACHE BOOL "ggml: embed metal library" FORCE)
     endif()
 
     add_subdirectory(vendor/llama.cpp)
     llama_cpp_python_install_target(llama)
     llama_cpp_python_install_target(ggml)
-
+
+    llama_cpp_python_install_target(ggml-base)
+
+    llama_cpp_python_install_target(ggml-amx)
+    llama_cpp_python_install_target(ggml-blas)
+    llama_cpp_python_install_target(ggml-can)
+    llama_cpp_python_install_target(ggml-cpu)
+    llama_cpp_python_install_target(ggml-cuda)
+    llama_cpp_python_install_target(ggml-hip)
+    llama_cpp_python_install_target(ggml-kompute)
+    llama_cpp_python_install_target(ggml-metal)
+    llama_cpp_python_install_target(ggml-musa)
+    llama_cpp_python_install_target(ggml-rpc)
+    llama_cpp_python_install_target(ggml-sycl)
+    llama_cpp_python_install_target(ggml-vulkan)
+
     # Workaround for Windows + CUDA https://github.com/abetlen/llama-cpp-python/issues/563
     if (WIN32)
         install(

@@ -106,7 +145,7 @@ if (LLAMA_BUILD)
     # Building llava
     add_subdirectory(vendor/llama.cpp/examples/llava)
     set_target_properties(llava_shared PROPERTIES OUTPUT_NAME "llava")
-    # Set CUDA_ARCHITECTURES to OFF on windows
+
     if (WIN32)
         set_target_properties(llava_shared PROPERTIES CUDA_ARCHITECTURES OFF)
     endif()

@@ -121,5 +160,18 @@ if (LLAMA_BUILD)
             DESTINATION ${SKBUILD_PLATLIB_DIR}/llama_cpp/lib
         )
     endif()
+
+    # Fix for llava build: Add include directory for llama.h
+    # Move these commands after the add_subdirectory call
+    target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+    target_include_directories(llava PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+
+    if (BUILD_SHARED_LIBS)
+        target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+        target_include_directories(llava_shared PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/ggml/include)
+    endif()
+
+    target_include_directories(llama-llava-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
+    target_include_directories(llama-minicpmv-cli PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/vendor/llama.cpp/include)
     endif()
 endif()

llama_cpp/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
 
-__version__ = "0.3.1"
+__version__ = "0.3.2"
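
The package version is exposed at runtime, so a quick check after upgrading looks like this (a trivial sketch):

    import llama_cpp

    print(llama_cpp.__version__)  # expect "0.3.2" after this release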

llama_cpp/_internals.py

Lines changed: 0 additions & 14 deletions
@@ -362,13 +362,6 @@ def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: i
             self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep
         )
 
-    def sample_tail_free(
-        self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int
-    ):
-        llama_cpp.llama_sample_tail_free(
-            self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep
-        )
-
     def sample_typical(
         self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
     ):

@@ -685,9 +678,6 @@ def sample(
             ctx_main.sample_top_k(
                 token_data_array, self.params.top_k, min_keep=min_keep
             )
-            ctx_main.sample_tail_free(
-                token_data_array, self.params.tfs_z, min_keep=min_keep
-            )
             ctx_main.sample_typical(
                 token_data_array, self.params.typical_p, min_keep=min_keep
             )

@@ -776,10 +766,6 @@ def add_min_p(self, p: float, min_keep: int):
         sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep)
         self._add_sampler(sampler)
 
-    def add_tail_free(self, z: float, min_keep: int):
-        sampler = llama_cpp.llama_sampler_init_tail_free(z, min_keep)
-        self._add_sampler(sampler)
-
     def add_typical(self, p: float, min_keep: int):
         sampler = llama_cpp.llama_sampler_init_typical(p, min_keep)
         self._add_sampler(sampler)
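
Tail-free sampling was removed upstream in llama.cpp, so the bindings delete sample_tail_free/add_tail_free rather than wrap a symbol that no longer exists; the chain now runs top-k, then typical, then top-p, then min-p. For intuition, here is a minimal pure-Python sketch of two of those pruning stages (illustrative only; the real sampling happens inside the C library):

    import math

    def top_k(logits: dict, k: int) -> dict:
        # Keep only the k highest-logit tokens.
        keep = sorted(logits, key=logits.get, reverse=True)[:k]
        return {t: logits[t] for t in keep}

    def top_p(logits: dict, p: float) -> dict:
        # Keep the smallest high-probability set whose mass reaches p.
        z = sum(math.exp(v) for v in logits.values())
        probs = {t: math.exp(v) / z for t, v in logits.items()}
        kept, mass = {}, 0.0
        for t in sorted(probs, key=probs.get, reverse=True):
            kept[t] = logits[t]
            mass += probs[t]
            if mass >= p:
                break
        return kept

    # Same order as the chain above, minus the deleted tail-free stage.
    toy_logits = {0: 2.0, 1: 1.5, 2: 0.1, 3: -1.0}
    print(top_p(top_k(toy_logits, 3), 0.9))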

llama_cpp/llama.py

Lines changed: 5 additions & 6 deletions
@@ -745,7 +745,6 @@ def apply_func(token_data_array: llama_cpp.llama_token_data_array_p):
             n_probs = 0
             min_keep = max(1, n_probs)
             sampler.add_top_k(top_k)
-            sampler.add_tail_free(tfs_z, min_keep)
             sampler.add_typical(typical_p, min_keep)
             sampler.add_top_p(top_p, min_keep)
             sampler.add_min_p(min_p, min_keep)
@@ -1142,7 +1141,7 @@ def _create_completion(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
     ) -> Union[
         Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse]
     ]:

@@ -1762,7 +1761,7 @@ def create_completion(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
     ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.

@@ -1859,7 +1858,7 @@ def __call__(
         stopping_criteria: Optional[StoppingCriteriaList] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
     ) -> Union[CreateCompletionResponse, Iterator[CreateCompletionStreamResponse]]:
         """Generate text from a prompt.

@@ -1952,7 +1951,7 @@ def create_chat_completion(
         model: Optional[str] = None,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
-        logit_bias: Optional[Dict[str, float]] = None,
+        logit_bias: Optional[Dict[int, float]] = None,
         logprobs: Optional[bool] = None,
         top_logprobs: Optional[int] = None,
     ) -> Union[
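
These signature changes make the annotations match what the code expects: logit_bias keys are integer token IDs, not strings. A hedged usage sketch (the model path, prompt, and bias value are placeholders, not from this commit):

    from llama_cpp import Llama

    llm = Llama(model_path="./models/model.gguf")  # hypothetical path

    # Pick a token ID to bias against, e.g. the first token of "Hello".
    token_id = llm.tokenize(b"Hello", add_bos=False)[0]

    out = llm.create_completion(
        "Say hello:",
        max_tokens=16,
        logit_bias={token_id: -10.0},  # int key, additive bias on that token's logit
    )
    print(out["choices"][0]["text"])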
@@ -2074,7 +2073,7 @@ def __getstate__(self):
             use_mlock=self.model_params.use_mlock,
             kv_overrides=self.kv_overrides,
             # Context Params
-            seed=self.context_params.seed,
+            seed=self._seed,
             n_ctx=self.context_params.n_ctx,
             n_batch=self.n_batch,
             n_ubatch=self.context_params.n_ubatch,
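
After the llama.cpp sampler refactor the seed is tracked on the wrapper (self._seed) rather than in the context params, so pickling must read the tracked attribute or the restored object would get a stale seed. A generic sketch of the pattern (class and field names here are illustrative, not the library's):

    import pickle

    class Model:
        def __init__(self, seed: int):
            self._seed = seed        # what the caller actually asked for
            self.internal_seed = 0   # internal state; may not reflect the request

        def __getstate__(self):
            return {"seed": self._seed}  # persist the tracked seed

        def __setstate__(self, state):
            self.__init__(seed=state["seed"])

    restored = pickle.loads(pickle.dumps(Model(seed=42)))
    assert restored._seed == 42  # the original seed survives the round trip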
