Make tokenizer.cpp CLI tool have nicer command line arguments.
Tokenizer used to take arguments like this:

  tokenize MODEL_FILENAME PROMPT [--ids]

And it would load the model, read the prompt, and then print a list of
tokens it interpreted, or if --ids was given, just the integer values.

This changeset makes the command line arguments more flexible:

  tokenize [--model MODEL_FILENAME]
           [--ids]
           [--stdin]
           [--prompt PROMPT]
           [--file PROMPT_FILENAME]
           [--no-bos]

It will still recognize the old positional form, so as not to surprise existing users.
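As an illustration, an old-style invocation, its new-style equivalent, and a stdin example could look like this (the model path and prompts here are made up for the example):

  tokenize models/7B/ggml-model.gguf "Hello world" --ids
  tokenize --model models/7B/ggml-model.gguf --prompt "Hello world" --ids
  cat prompt.txt | tokenize --model models/7B/ggml-model.gguf --stdin --no-bos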
Noeda committed Mar 20, 2024
1 parent f9c7ba3 commit 0e5b526
Showing 1 changed file with 237 additions and 7 deletions.
244 changes: 237 additions & 7 deletions examples/tokenize/tokenize.cpp
@@ -3,33 +3,259 @@

#include <cerrno>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

static void print_usage_information(const char* argv0, FILE* stream) {
    fprintf(stream, "usage: %s [options]\n\n", argv0);
    fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n");
    fprintf(stream, "and prints the resulting tokens to standard output.\n\n");
    fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n");
    fprintf(stream, "to control the behavior of the tokenizer.\n\n");
    fprintf(stream, "Invoke '%s' like this:\n", argv0);
    fprintf(stream, "\n");
    fprintf(stream, "    %s MODEL_FNAME PROMPT [--ids]\n", argv0);
    fprintf(stream, "\n");
    fprintf(stream, "or this:\n");
    fprintf(stream, "\n");
    fprintf(stream, "    %s [options], where options are:\n", argv0);
    fprintf(stream, "\n");
    fprintf(stream, "    -h, --help                           print this help and exit\n");
    fprintf(stream, "    -m MODEL_PATH, --model MODEL_PATH    path to model.\n");
    fprintf(stream, "    --ids                                if given, only print numerical token IDs, and not token strings.\n");
    fprintf(stream, "    -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n");
    fprintf(stream, "    -p PROMPT, --prompt PROMPT           read prompt from the argument.\n");
    fprintf(stream, "    --stdin                              read prompt from standard input.\n");
    fprintf(stream, "    --no-bos                             do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n");
}

static std::string read_prompt_from_file(const char* filepath, bool &success) {
    success = false;

    std::ifstream in(filepath, std::ios::binary);
    if (!in) {
        fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno));
        return std::string();
    }
    // do not assume the file is seekable (e.g. /dev/stdin)
    std::stringstream buffer;
    buffer << in.rdbuf();
    if (in.fail()) {
        fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno));
        return std::string();
    }

    success = true;
    return buffer.str();
}

int main(int argc, char ** argv) {
    if (argc <= 1) {
        print_usage_information(argv[0], stderr);
        return 1;
    }

    //////
    // Handle all the command line arguments.
    //////

    // variables where to put any arguments we see.
    bool printing_ids = false;
    bool no_bos = false;
    const char* model_path = NULL;
    const char* prompt_path = NULL;
    const char* prompt_arg = NULL;

    // track which arguments were explicitly given
    // used for sanity checking down the line
    bool model_path_set = false;
    bool prompt_path_set = false;
    bool prompt_set = false;
    bool stdin_set = false;

    // If we see an unrecognized argument, we set
    // demand_old_style_arguments to true. It signifies we are expecting
    // the "old style arguments", i.e. simple positional arguments for
    // argv[1], argv[2] and possibly argv[3]:
    //
    //   tokenize MODEL_FNAME PROMPT [--ids]
    //
    // as opposed to the "new style arguments", which use flags such as
    // --model, --prompt, etc.
    //
    // We use 'unknown_arg' to keep track of the first argument that we
    // didn't recognize so we can complain to the user if we can't
    // recognize arguments even using the old style.
    bool demand_old_style_arguments = false;
    const char* unknown_arg = NULL;

    int iarg = 1;
    for (; iarg < argc; ++iarg) {
        std::string arg{argv[iarg]};
        if (arg == "-h" || arg == "--help") {
            print_usage_information(argv[0], stdout);
            return 0;
        }
        else if (arg == "--ids") {
            printing_ids = true;
        }
        else if (arg == "-m" || arg == "--model") {
            if (model_path_set) {
                fprintf(stderr, "Error: -m or --model specified multiple times.\n");
                return 1;
            }
            model_path = argv[++iarg];
            model_path_set = true;
        }
        else if (arg == "--no-bos") {
            no_bos = true;
        }
        else if (arg == "-p" || arg == "--prompt") {
            if (prompt_set) {
                fprintf(stderr, "Error: -p or --prompt specified multiple times.\n");
                return 1;
            }
            prompt_arg = argv[++iarg];
            prompt_set = true;
        }
        else if (arg == "-f" || arg == "--file") {
            if (prompt_path_set) {
                fprintf(stderr, "Error: -f or --file specified multiple times.\n");
                return 1;
            }
            prompt_path = argv[++iarg];
            prompt_path_set = true;
        }
        else if (arg == "--stdin") {
            stdin_set = true;
        }
        else {
            demand_old_style_arguments = true;
            if (unknown_arg == NULL) {
                unknown_arg = argv[iarg];
            }
        }
    }

    //////
    // Sanity check the command line arguments.
    //////

    // Old style arguments? (i.e. tokenizer MODEL_FNAME PROMPT [--ids])
    if ((argc == 3 || argc == 4) &&
        !prompt_set &&
        !prompt_path_set &&
        !model_path_set &&
        !stdin_set) {
        model_path = argv[1];
        prompt_arg = argv[2];
        if (argc == 4) {
            if (!strcmp(argv[3], "--ids")) {
                printing_ids = true;
            } else {
                fprintf(stderr, "Error: unknown option '%s'\n", argv[3]);
                return 1;
            }
        }
        model_path_set = true;
        prompt_set = true;
    } else if (demand_old_style_arguments) {
        GGML_ASSERT(unknown_arg);
        fprintf(stderr, "Unknown argument: '%s'\n", unknown_arg);
        return 1;
    }

    // Check that we have the required stuff set.
    if (model_path_set && model_path == NULL) {
        fprintf(stderr, "Error: --model requires an argument.\n");
        return 1;
    }
    if (!model_path_set) {
        fprintf(stderr, "Error: must specify --model.\n");
        return 1;
    }
    if (prompt_path_set && prompt_path == NULL) {
        fprintf(stderr, "Error: --file requires an argument.\n");
        return 1;
    }
    if (prompt_set && prompt_arg == NULL) {
        fprintf(stderr, "Error: --prompt requires an argument.\n");
        return 1;
    }
    const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set);
    if (prompts_set > 1) {
        fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n");
        return 1;
    }
    // Must have some prompt.
    if (prompts_set == 0) {
        fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n");
        return 1;
    }

    GGML_ASSERT(model_path);
    GGML_ASSERT(prompt_path || prompt_arg || stdin_set);

    //////
    // Figure out where the prompt will come from.
    //////

    std::string prompt;
    if (prompt_path_set) {
        bool success = false;
        prompt = read_prompt_from_file(prompt_path, success);
        if (!success) {
            return 1;
        }
    } else if (prompt_set) {
        prompt = prompt_arg;
    } else {
        GGML_ASSERT(stdin_set);
        // we read stdin *after* loading the model, so that if the model
        // cannot be loaded we can exit early (a nicer user experience)
    }

    //////
    // Start actually doing the tokenizing stuff.
    //////

    llama_backend_init();

    llama_model_params model_params = llama_model_default_params();
    model_params.vocab_only = true;
    llama_model * model = llama_load_model_from_file(model_path, model_params);
    if (!model) {
        fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path);
        return 1;
    }

    llama_context_params ctx_params = llama_context_default_params();
    llama_context * ctx = llama_new_context_with_model(model, ctx_params);
    if (!ctx) {
        fprintf(stderr, "Error: could not create context.\n");
        return 1;
    }

    // read entire prompt from stdin?
    if (stdin_set) {
        GGML_ASSERT(!prompt_path_set && !prompt_set);

        std::stringstream stdin_buffer;
        stdin_buffer << std::cin.rdbuf();
        if (std::cin.fail()) {
            fprintf(stderr, "Error: could not read the entire standard input.\n");
            return 1;
        }

        prompt = stdin_buffer.str();
    }

    const bool model_wants_add_bos = llama_should_add_bos_token(model);
    const bool add_bos = model_wants_add_bos && !no_bos;

    std::vector<llama_token> tokens;
    tokens = ::llama_tokenize(model, prompt, add_bos, true);

    for (int i = 0; i < (int) tokens.size(); i++) {
@@ -40,5 +266,9 @@ int main(int argc, char ** argv) {
        }
    }

    // silence valgrind
    llama_free(ctx);
    llama_free_model(model);

    return 0;
}
