
Merge branch 'antimatter15:master' into win-color
anzz1 committed Mar 17, 2023
2 parents 73580cf + 1e82fa8 commit 52b8a51
Showing 5 changed files with 85 additions and 46 deletions.
68 changes: 37 additions & 31 deletions .github/workflows/build.yml
@@ -14,37 +14,43 @@ on:
   paths: ['CMakeLists.txt', 'Makefile', '**.h', '*.c', '**.cpp']
 
 jobs:
-#  ubuntu-latest:
-#    runs-on: ubuntu-latest
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v1
-#
-#      - name: Dependencies
-#        run: |
-#          sudo apt-get update
-#          sudo apt-get install build-essential
-#
-#      - name: Build
-#        run: |
-#          make
-#
-#  macOS-latest:
-#    runs-on: macOS-latest
-#
-#    steps:
-#      - name: Clone
-#        uses: actions/checkout@v1
-#
-#      - name: Dependencies
-#        run: |
-#          brew update
-#
-#      - name: Build
-#        run: |
-#          make
-#
+  ubuntu-latest:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          sudo apt-get update
+          sudo apt-get install build-essential
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
+  macOS-latest:
+    runs-on: macOS-latest
+
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v1
+
+      - name: Dependencies
+        id: depends
+        run: |
+          brew update
+
+      - name: Build
+        id: make_build
+        run: |
+          make
+
   windows-latest:
     runs-on: windows-latest
 
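A note on the `id:` fields added above: on their own they do not change what the steps do; they register each step in the workflow's `steps` context so that later steps could refer to it. A minimal sketch of how such an id might be used — the `Report` step below is hypothetical and not part of this workflow:

```yaml
steps:
  - name: Build
    id: make_build
    run: make

  # Hypothetical follow-up step (not in this workflow): the id above lets a
  # later step read the build step's result from the steps context.
  - name: Report
    if: always()
    run: echo "make_build outcome was ${{ steps.make_build.outcome }}"
```
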
13 changes: 13 additions & 0 deletions .gitignore
@@ -15,9 +15,22 @@ build-sanitize-addr/
 build-sanitize-thread/
 
 models/*
+*.bin
 
 /main
 /quantize
 
 arm_neon.h
 compile_commands.json
+
+# Windows CMake files
+*.vcxproj
+*.filters
+*.cmake
+*.sln
+x64/
+Debug/
+Release/
+CMakeFiles/
+CMakeCache.txt
+*.dir/
6 changes: 1 addition & 5 deletions Makefile
@@ -176,7 +176,7 @@ $(info I CC: $(CCV))
 $(info I CXX: $(CXXV))
 $(info )
 
-default: main quantize
+default: chat quantize
 
 #
 # Build library
@@ -191,10 +191,6 @@ utils.o: utils.cpp utils.h
 clean:
 	rm -f *.o main quantize
 
-main: main.cpp ggml.o utils.o
-	$(CXX) $(CXXFLAGS) main.cpp ggml.o utils.o -o main $(LDFLAGS)
-	./main -h
-
 chat: chat.cpp ggml.o utils.o
 	$(CXX) $(CXXFLAGS) chat.cpp ggml.o utils.o -o chat $(LDFLAGS)
 
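The practical effect of this Makefile change is that a bare `make` now builds the `chat` binary rather than `main`, whose target is removed entirely. Assuming the new Makefile, the two invocations below are equivalent:

```sh
# With the new default target, these two commands build the same things.
make
make chat quantize
```
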
31 changes: 26 additions & 5 deletions README.md
@@ -6,19 +6,19 @@ Run a fast ChatGPT-like model locally on your device. The screencast below is no
 [![asciicast](screencast.gif)](https://asciinema.org/a/dfJ8QXZ4u978Ona59LPEldtKK)
 
 
-This combines the [LLaMA foundation model](https://github.com/facebookresearch/llama) with an [open reproduction](https://github.com/tloen/alpaca-lora) of [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) a fine-tuning of the base model to obey instructions (akin to the [RLHF](https://huggingface.co/blog/rlhf) used to train ChatGPT).
+This combines the [LLaMA foundation model](https://github.com/facebookresearch/llama) with an [open reproduction](https://github.com/tloen/alpaca-lora) of [Stanford Alpaca](https://github.com/tatsu-lab/stanford_alpaca) a fine-tuning of the base model to obey instructions (akin to the [RLHF](https://huggingface.co/blog/rlhf) used to train ChatGPT) and a set of modifications to [llama.cpp](https://github.com/ggerganov/llama.cpp) to add a chat interface.
 
 ## Get started
 
-```
+```sh
 git clone https://github.com/antimatter15/alpaca.cpp
 cd alpaca.cpp
 
 make chat
 ./chat
 ```
 
-You can download the weights for `ggml-alpaca-7b-14.bin` with BitTorrent `magnet:?xt=urn:btih:5aaceaec63b03e51a98f04fd5c42320b2a033010&dn=ggml-alpaca-7b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce`
+You can download the weights for `ggml-alpaca-7b-q4.bin` with BitTorrent `magnet:?xt=urn:btih:5aaceaec63b03e51a98f04fd5c42320b2a033010&dn=ggml-alpaca-7b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce`
 
 
 Alternatively you can download them with IPFS.
@@ -30,13 +30,34 @@ wget -O ggml-alpaca-7b-q4.bin -c https://ipfs.io/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2
 wget -O ggml-alpaca-7b-q4.bin -c https://cloudflare-ipfs.com/ipfs/QmQ1bf2BTnYxq73MFJWu1B7bQ2UD6qG7D7YDCxhTndVkPC
 ```
 
-Save the `ggml-alpaca-7b-14.bin` file in the same directory as your `./chat` executable.
+Save the `ggml-alpaca-7b-q4.bin` file in the same directory as your `./chat` executable.
 
 The weights are based on the published fine-tunes from `alpaca-lora`, converted back into a pytorch checkpoint with a [modified script](https://github.com/tloen/alpaca-lora/pull/19) and then quantized with llama.cpp the regular way.
 
+## Windows Setup
+
+- Download and install CMake: <https://cmake.org/download/>
+- Download and install `git`. If you've never used git before, consider a GUI client like <https://desktop.github.com/>
+- Clone this repo using your git client of choice (for GitHub Desktop, go to File -> Clone repository -> From URL and paste `https://github.com/antimatter15/alpaca.cpp` in as the URL)
+- Open a Windows Terminal inside the folder you cloned the repository to
+- Run the following commands one by one:
+
+```ps1
+cmake .
+cmake --build . --config Release
+```
+
+- Download the weights via any of the links in "Get started" above, and save the file as `ggml-alpaca-7b-q4.bin` in the main Alpaca directory.
+- In the terminal window, run this command:
+```ps1
+.\Release\chat.exe
+```
+- (You can add other launch options like `--n 8` as preferred onto the same line)
+- You can now type to the AI in the terminal and it will reply. Enjoy!
+
 ## Credit
 
-This combines [Facebook's LLaMA](https://github.com/facebookresearch/llama), [Stanford Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html), [alpaca-lora](https://github.com/tatsu-lab/stanford_alpaca) (which uses [Jason Phang's implementation of LLaMA](https://github.com/huggingface/transformers/pull/21955) on top of Hugging Face Transformers), and a modified version of [llama.cpp](https://github.com/ggerganov/llama.cpp) by Georgi Gerganov. The chat implementation is based on Matvey Soloviev's [Interactive Mode](https://github.com/ggerganov/llama.cpp/pull/61) for llama.cpp. Inspired by [Simon Willison's](https://til.simonwillison.net/llms/llama-7b-m2) getting started guide for LLaMA.
+This combines [Facebook's LLaMA](https://github.com/facebookresearch/llama), [Stanford Alpaca](https://crfm.stanford.edu/2023/03/13/alpaca.html), [alpaca-lora](https://github.com/tloen/alpaca-lora) and [corresponding weights](https://huggingface.co/tloen/alpaca-lora-7b/tree/main) by Eric Wang (which uses [Jason Phang's implementation of LLaMA](https://github.com/huggingface/transformers/pull/21955) on top of Hugging Face Transformers), and [llama.cpp](https://github.com/ggerganov/llama.cpp) by Georgi Gerganov. The chat implementation is based on Matvey Soloviev's [Interactive Mode](https://github.com/ggerganov/llama.cpp/pull/61) for llama.cpp. Inspired by [Simon Willison's](https://til.simonwillison.net/llms/llama-7b-m2) getting started guide for LLaMA.
 
 
 ## Disclaimer
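A side note for readers following the updated instructions on a headless machine: the README leaves the choice of BitTorrent client open, but a command-line client such as aria2 can consume the magnet link directly (`--seed-time=0` is simply aria2's option to exit once the download finishes instead of seeding):

```sh
# Fetch the weights over BitTorrent without a GUI client (assumes aria2 is installed).
aria2c --seed-time=0 \
  'magnet:?xt=urn:btih:5aaceaec63b03e51a98f04fd5c42320b2a033010&dn=ggml-alpaca-7b-q4.bin&tr=udp%3A%2F%2Ftracker.opentrackr.org%3A1337%2Fannounce&tr=udp%3A%2F%2Fopentracker.i2p.rocks%3A6969%2Fannounce'
```
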
13 changes: 8 additions & 5 deletions chat.cpp
@@ -917,11 +917,12 @@ int main(int argc, char ** argv) {
 #if defined (__unix__) || (defined (__APPLE__) && defined (__MACH__)) || defined (_WIN32)
                " - Press Ctrl+C to interject at any time.\n"
 #endif
-               " - Press Return to return control to LLaMa.\n"
+               " - Press Return to return control to LLaMA.\n"
                " - If you want to submit another line, end your input in '\\'.\n");
     }
 
-    int remaining_tokens = params.n_predict;
+    // we may want to slide the input window along with the context, but for now we restrict to the context length
+    int remaining_tokens = model.hparams.n_ctx - embd_inp.size();
     int input_consumed = 0;
     bool input_noecho = true;
 
@@ -937,7 +938,7 @@
 
 
 
-    while (true) {
+    while (remaining_tokens > 0) {
         // predict
         if (embd.size() > 0) {
             const int64_t t_start_us = ggml_time_us();
@@ -982,7 +983,7 @@
             input_noecho = false;
 
             // decrement remaining sampling budget
-            // --remaining_tokens;
+            --remaining_tokens;
         } else {
             // some user input remains from prompt or interaction, forward it to processing
             while (embd_inp.size() > input_consumed) {
@@ -1037,7 +1038,7 @@
                 if(params.use_color) printf(ANSI_BOLD ANSI_COLOR_GREEN);
                 if (scanf("%255[^\n]%n%*c", buf, &n_read) <= 0) {
                     // presumable empty line, consume the newline
-                    scanf("%*c");
+                    if (scanf("%*c") <= 0) { /*ignore*/ }
                     n_read=0;
                 }
                 if(params.use_color) printf(ANSI_COLOR_RESET);
@@ -1056,6 +1057,8 @@
         embd_inp.insert(embd_inp.end(), line_inp.begin(), line_inp.end());
         embd_inp.insert(embd_inp.end(), response_inp.begin(), response_inp.end());
 
+        remaining_tokens -= prompt_inp.size() + line_inp.size() + response_inp.size();
+
         input_noecho = true; // do not echo this again
     }
 
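Taken together, these hunks replace the old unbounded `while (true)` loop with a token budget derived from the context window: generation starts with `n_ctx` minus the prompt length, each sampled token costs one, and injected user input costs its tokenized length. A standalone sketch of that accounting, with simplified stand-ins for the real model and prompt state:

```cpp
#include <cstdio>
#include <vector>

// Standalone sketch of the budgeting this commit introduces; the constants
// and vectors below are stand-ins for the real llama model/prompt state.
int main() {
    const int n_ctx = 512;                // model.hparams.n_ctx
    std::vector<int> embd_inp(64, 0);     // tokenized prompt, say 64 tokens

    // Budget: whatever room the prompt leaves in the context window.
    int remaining_tokens = n_ctx - (int) embd_inp.size();

    while (remaining_tokens > 0) {
        --remaining_tokens;               // each generated token costs one

        if (remaining_tokens == 300) {
            // A user interjection also spends budget: its tokenized length
            // is subtracted, just like the generated tokens.
            std::vector<int> line_inp(20, 0);  // pretend the user typed 20 tokens
            remaining_tokens -= (int) line_inp.size();
        }
    }
    printf("context window exhausted, loop ends\n");
    return 0;
}
```
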
