From cd7b5f7f78f7ba450e075e3bf5d55ae439d4897c Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Wed, 20 Mar 2024 15:04:44 -0700 Subject: [PATCH 1/4] Make tokenize.cpp CLI tool nicer. Before this commit, tokenize was a simple CLI tool like this: tokenize MODEL_FILENAME PROMPT [--ids] This simple tool loads the model, takes the prompt, and shows the tokens llama.cpp is interpreting. This changeset makes tokenize more sophisticated, and more useful for debugging and troubleshooting: tokenize [-m, --model MODEL_FILENAME] [--ids] [--stdin] [--prompt] [-f, --file] [--no-bos] [--log-disable] It also behaves nicer on Windows now, interpreting and rendering Unicode from command line arguments and pipes no matter what code page the user has set on their terminal. --- examples/tokenize/tokenize.cpp | 417 ++++++++++++++++++++++++++++++++- 1 file changed, 407 insertions(+), 10 deletions(-) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index d95a9247525eb..6d5da5eb4c1c3 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -3,42 +3,439 @@ #include #include +#include +#include #include #include -int main(int argc, char ** argv) { - if (argc < 3 || argv[1][0] == '-') { - printf("usage: %s MODEL_PATH PROMPT [--ids]\n" , argv[0]); +#if defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +#include // For CommandLineToArgvW +#endif + +static void print_usage_information(const char * argv0, FILE * stream) { + fprintf(stream, "usage: %s [options]\n\n", argv0); + fprintf(stream, "The tokenize program tokenizes a prompt using a given model,\n"); + fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); + fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); + fprintf(stream, "to control the behavior of the tokenizer.\n\n"); + fprintf(stream, "Invoke '%s' like this:\n", argv0); + fprintf(stream, "\n"); + fprintf(stream, " %s MODEL_FNAME PROMPT [--ids]\n" , argv0); + 
fprintf(stream, "\n"); + fprintf(stream, " or this:\n"); + fprintf(stream, "\n"); + fprintf(stream, " %s [options], where options are:\n", argv0); + fprintf(stream, "\n"); + fprintf(stream, " -h, --help print this help and exit\n"); + fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); + fprintf(stream, " --ids if given, only print numerical token IDs, and not token strings.\n"); + fprintf(stream, " The output format looks like [1, 2, 3], i.e. parseable by Python.\n"); + fprintf(stream, " -f PROMPT_FNAME, --file PROMPT_FNAME read prompt from a file.\n"); + fprintf(stream, " -p PROMPT, --prompt PROMPT read prompt from the argument.\n"); + fprintf(stream, " --stdin read prompt from standard input.\n"); + fprintf(stream, " --no-bos do not ever add a BOS token to the prompt, even if normally the model uses a BOS token.\n"); + fprintf(stream, " --log-disable disable logs. Makes stderr quiet when loading the model.\n"); +} + +static void llama_log_callback_null(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) text; + (void) user_data; +} + +static std::string read_prompt_from_file(const char * filepath, bool & success) { + success = false; + + std::ifstream in(filepath, std::ios::binary); + if (!in) { + fprintf(stderr, "%s: could not open file '%s' for reading: %s\n", __func__, filepath, strerror(errno)); + return std::string(); + } + // do not assume the file is seekable (e.g. /dev/stdin) + std::stringstream buffer; + buffer << in.rdbuf(); + if (in.fail()) { + fprintf(stderr, "%s: could not read the entire file '%s': %s\n", __func__, filepath, strerror(errno)); + return std::string(); + } + + success = true; + return buffer.str(); +} + +// +// Function: ingest_args(...) -> vector +// +// Takes argc and argv arguments, and converts them to a vector of UTF-8 encoded +// strings, as an STL vector. +// +// In particular, it handles character encoding shenanigans on Windows. 
+// +// Note: raw_argc and raw_argv are not actually read at all on Windows. +// On Windows we call GetCommandLineW to get the arguments in wchar_t +// format, ignoring the regular argc/argv arguments to main(). +// +// TODO: potential opportunity to roll common stuff into common/console.cpp +// in relation to Windows wchar_t shenanigans. +static std::vector ingest_args(int raw_argc, char ** raw_argv) { + std::vector argv; + + // Handle Windows, if given non-ASCII arguments. + // We convert wchar_t arguments into UTF-8 char* on this platform. + // Lets you invoke 'tokenize' on Windows cmd.exe with non-ASCII characters + // without throwing tantrums. +#if defined(_WIN32) + int argc; + const LPWSTR cmdline_wargv = GetCommandLineW(); + LPWSTR * wargv = CommandLineToArgvW(cmdline_wargv, &argc); + + // silence unused arg warnings + (void) raw_argc; + (void) raw_argv; + + for (int i = 0; i < argc; ++i) { + int length_needed = WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), 0, 0, NULL, NULL); + char * output_buf = (char *) calloc(length_needed+1, sizeof(char)); + GGML_ASSERT(output_buf); + + WideCharToMultiByte(CP_UTF8, 0, wargv[i], wcslen(wargv[i]), output_buf, length_needed, NULL, NULL); + output_buf[length_needed] = '\0'; + + argv.push_back(output_buf); + free(output_buf); + } + + LocalFree((HLOCAL) wargv); +#else + int argc = raw_argc; + for (int i = 0; i < argc; ++i) { + argv.push_back(raw_argv[i]); + } +#endif + + GGML_ASSERT((unsigned int) argc == argv.size()); + + return argv; +} + +// +// Function: write_utf8_cstr_to_stdout(const char *) -> void +// +// writes a string to standard output; taking into account that on Windows +// to display correctly you have to use special handling. Works even if the +// user has not set a unicode code page on a Windows cmd.exe. +// +// In case of invalid UTF-8, invalid_utf8 is set to true on Windows, and something +// human-readable is written instead. +// +// On non-Windows systems, simply printfs() the string. 
+static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) { + invalid_utf8 = false; + +#if defined(_WIN32) + // Are we in a console? + HANDLE hConsole = GetStdHandle(STD_OUTPUT_HANDLE); + DWORD dwMode = 0; + + // According to Microsoft docs: + // "WriteConsole fails if it is used with a standard handle that is redirected to a file." + // Also according to the docs, you can use GetConsoleMode to check for that. + if (hConsole == INVALID_HANDLE_VALUE || !GetConsoleMode(hConsole, &dwMode)) { + printf("%s", str); + return; + } + + // MultiByteToWideChar reports an error if str is empty, don't report + // them as invalid_utf8. + if (strlen(str) == 0) { + return; + } + int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0); + if (length_needed == 0) { + DWORD err = GetLastError(); + if (err == ERROR_NO_UNICODE_TRANSLATION) { + invalid_utf8 = true; + int len = strlen(str); + printf("<"); + for (int i = 0; i < len; ++i) { + if (i > 0) { + printf(" "); + } + printf("%02x", (uint8_t) str[i]); + } + printf(">"); + return; + } + GGML_ASSERT(false && "MultiByteToWideChar() failed in an unexpected way."); + } + + LPWSTR wstr = (LPWSTR) calloc(length_needed+1, sizeof(*wstr)); + GGML_ASSERT(wstr); + + MultiByteToWideChar(CP_UTF8, 0, str, strlen(str), wstr, length_needed); + WriteConsoleW(hConsole, wstr, length_needed, NULL, NULL); + + free(wstr); +#else + // TODO: reporting invalid_utf8 would be useful on non-Windows too. + // printf will silently just write bad unicode. + printf("%s", str); +#endif +} + +int main(int raw_argc, char ** raw_argv) { + const std::vector argv = ingest_args(raw_argc, raw_argv); + const int argc = argv.size(); + + if (argc <= 1) { + print_usage_information(argv[0].c_str(), stderr); + return 1; + } + + ////// + // Read out all the command line arguments. + ////// + + // variables where to put any arguments we see. 
+ bool printing_ids = false; + bool no_bos = false; + bool disable_logging = false; + const char * model_path = NULL; + const char * prompt_path = NULL; + const char * prompt_arg = NULL; + + // track which arguments were explicitly given + // used for sanity checking down the line + bool model_path_set = false; + bool prompt_path_set = false; + bool prompt_set = false; + bool stdin_set = false; + + // If we see an unrecognized argument, we set + // demand_old_style_arguments to true. It signifies we are expecting + // the "old style arguments", i.e. simple positional arguments for + // argv[1] argv[2] and possibly argv[3]: + // + // tokenize MODEL_FNAME PROMPT [--ids] + // + // As opposed to "new style arguments" which uses --model, --prompt, + // etc. nice flags. + // + // We use 'unknown_arg' to keep track of the first argument that we + // didn't recognize so we can complain to the user if we can't + // recognize arguments even using the old style. + bool demand_old_style_arguments = false; + const char * unknown_arg = NULL; + + int iarg = 1; + for (; iarg < argc; ++iarg) { + std::string arg{argv[iarg]}; + if (arg == "-h" || arg == "--help") { + print_usage_information(argv[0].c_str(), stdout); + return 0; + } + else if (arg == "--ids") { + printing_ids = true; + } + else if (arg == "-m" || arg == "--model") { + if (model_path_set) { + fprintf(stderr, "Error: -m or --model specified multiple times.\n"); + return 1; + } + model_path = argv[++iarg].c_str(); + model_path_set = true; + } + else if (arg == "--no-bos") { + no_bos = true; + } + else if (arg == "-p" || arg == "--prompt") { + if (prompt_set) { + fprintf(stderr, "Error: -p or --prompt specified multiple times.\n"); + return 1; + } + prompt_arg = argv[++iarg].c_str(); + prompt_set = true; + } + else if (arg == "-f" || arg == "--file") { + if (prompt_path_set) { + fprintf(stderr, "Error: -f or --file specified multiple times.\n"); + return 1; + } + prompt_path = argv[++iarg].c_str(); + prompt_path_set = 
true; + } + else if (arg == "--stdin") { + stdin_set = true; + } + else if (arg == "--log-disable") { + disable_logging = true; + } + else { + demand_old_style_arguments = true; + if (unknown_arg == NULL) { + unknown_arg = argv[iarg].c_str(); + } + } + } + + ////// + // Sanity check the command line arguments. + ////// + + // Old style arguments? (i.e. tokenizer MODEL_FNAME PROMPT [--ids]) + if ((argc == 3 || argc == 4) && + !prompt_set && + !prompt_path_set && + !model_path_set && + !stdin_set) { + model_path = argv[1].c_str(); + prompt_arg = argv[2].c_str(); + if (argc == 4) { + if (argv[3] == "--ids") { + printing_ids = true; + } else { + fprintf(stderr, "Error: unknown option '%s'\n", argv[3].c_str()); + return 1; + } + } + model_path_set = true; + prompt_set = true; + } else if (demand_old_style_arguments) { + GGML_ASSERT(unknown_arg); + fprintf(stderr, "Unknown argument: '%s'\n", unknown_arg); + return 1; + } + + // Check that we have the required stuff set. + if (model_path_set && model_path == NULL) { + fprintf(stderr, "Error: --model requires an argument.\n"); + return 1; + } + if (!model_path_set) { + fprintf(stderr, "Error: must specify --model.\n"); + return 1; + } + if (prompt_path_set && prompt_path == NULL) { + fprintf(stderr, "Error: --file requires an argument.\n"); + return 1; + } + if (prompt_set && prompt_arg == NULL) { + fprintf(stderr, "Error: --prompt requires an argument.\n"); + return 1; + } + const int prompts_set = !!(prompt_path_set) + !!(prompt_set) + !!(stdin_set); + if (prompts_set > 1) { + fprintf(stderr, "Error: --stdin, --file and --prompt are mutually exclusive.\n"); + return 1; + } + // Must have some prompt. 
+ if (prompts_set == 0) { + fprintf(stderr, "Error: must specify one of: --stdin, --file or --prompt.\n"); return 1; } - const char * model_path = argv[1]; - const char * prompt = argv[2]; + GGML_ASSERT(model_path); + GGML_ASSERT(prompt_path || prompt_arg || stdin_set); - const bool printing_ids = argc > 3 && std::string(argv[3]) == "--ids"; + ////// + // Figure out where will the prompt come from. + ////// + + std::string prompt; + if (prompt_path_set) { + bool success = false; + prompt = read_prompt_from_file(prompt_path, success); + if (!success) { + return 1; + } + } else if (prompt_set) { + prompt = prompt_arg; + } else { + GGML_ASSERT(stdin_set); + // we read stdin *after* loading model (early exit if model cannot + // be loaded, which can be a nicer user experience) + } + + ////// + // Start actually doing the tokenizing stuff. + ////// + +#ifdef LOG_DISABLE_LOGS + disable_logging = true; +#endif + + if (disable_logging) { + llama_log_set(llama_log_callback_null, NULL); + } llama_backend_init(); llama_model_params model_params = llama_model_default_params(); model_params.vocab_only = true; llama_model * model = llama_load_model_from_file(model_path, model_params); + if (!model) { + fprintf(stderr, "Error: could not load model from file '%s'.\n", model_path); + return 1; + } llama_context_params ctx_params = llama_context_default_params(); llama_context * ctx = llama_new_context_with_model(model, ctx_params); + if (!ctx) { + fprintf(stderr, "Error: could not create context.\n"); + return 1; + } - const bool add_bos = llama_should_add_bos_token(model); + // read entire prompt from stdin? 
+ if (stdin_set) { + GGML_ASSERT(!prompt_path_set && !prompt_set); - std::vector tokens; + std::stringstream stdin_buffer; + stdin_buffer << std::cin.rdbuf(); + if (std::cin.fail()) { + fprintf(stderr, "Error: could not read the entire standard input.\n"); + return 1; + } + + prompt = stdin_buffer.str(); + } + + const bool model_wants_add_bos = llama_should_add_bos_token(model); + const bool add_bos = model_wants_add_bos && !no_bos; + std::vector tokens; tokens = ::llama_tokenize(model, prompt, add_bos, true); + if (printing_ids) { + printf("["); + } + for (int i = 0; i < (int) tokens.size(); i++) { if (printing_ids) { - printf("%d\n", tokens[i]); + if (i > 0) { + printf(", "); + } + printf("%d", tokens[i]); } else { - printf("%6d -> '%s'\n", tokens[i], llama_token_to_piece(ctx, tokens[i]).c_str()); + bool invalid_utf8 = false; + printf("%6d -> '", tokens[i]); + write_utf8_cstr_to_stdout(llama_token_to_piece(ctx, tokens[i]).c_str(), invalid_utf8); + if (invalid_utf8) { + printf("' (utf-8 decode failure)\n"); + } else { + printf("'\n"); + } } } + if (printing_ids) { + printf("]\n"); + } + + // silence valgrind + llama_free(ctx); + llama_free_model(model); + return 0; } From a837649711c5d37dea5a9517b454052ea536b958 Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Tue, 26 Mar 2024 14:54:18 -0700 Subject: [PATCH 2/4] style fix: strlen(str) == 0 --> *str == 0 --- examples/tokenize/tokenize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 6d5da5eb4c1c3..7d9cba8ab65c8 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -149,7 +149,7 @@ static void write_utf8_cstr_to_stdout(const char * str, bool & invalid_utf8) { // MultiByteToWideChar reports an error if str is empty, don't report // them as invalid_utf8. 
- if (strlen(str) == 0) { + if (*str == 0) { return; } int length_needed = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, str, strlen(str), NULL, 0); From 71a08675c06430aa3f803e4308770d8e94666aa0 Mon Sep 17 00:00:00 2001 From: Mikko Juola Date: Thu, 28 Mar 2024 10:59:23 -0700 Subject: [PATCH 3/4] Simplify tokenize.cpp by getting rid of the handling of positional-style arguments. It must now be invoked with long --model, --prompt, etc. arguments only. Shortens the code. --- examples/tokenize/tokenize.cpp | 54 ++-------------------------------- 1 file changed, 3 insertions(+), 51 deletions(-) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index 7d9cba8ab65c8..ad1c156c4820f 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -20,13 +20,7 @@ static void print_usage_information(const char * argv0, FILE * stream) { fprintf(stream, "and prints the resulting tokens to standard output.\n\n"); fprintf(stream, "It needs a model file, a prompt, and optionally other flags\n"); fprintf(stream, "to control the behavior of the tokenizer.\n\n"); - fprintf(stream, "Invoke '%s' like this:\n", argv0); - fprintf(stream, "\n"); - fprintf(stream, " %s MODEL_FNAME PROMPT [--ids]\n" , argv0); - fprintf(stream, "\n"); - fprintf(stream, " or this:\n"); - fprintf(stream, "\n"); - fprintf(stream, " %s [options], where options are:\n", argv0); + fprintf(stream, " The possible options are:\n"); fprintf(stream, "\n"); fprintf(stream, " -h, --help print this help and exit\n"); fprintf(stream, " -m MODEL_PATH, --model MODEL_PATH path to model.\n"); @@ -213,22 +207,6 @@ int main(int raw_argc, char ** raw_argv) { bool prompt_set = false; bool stdin_set = false; - // If we see an unrecognized argument, we set - // demand_old_style_arguments to true. It signifies we are expecting - // the "old style arguments", i.e. 
simple positional arguments for - // argv[1] argv[2] and possibly argv[3]: - // - // tokenize MODEL_FNAME PROMPT [--ids] - // - // As opposed to "new style arguments" which uses --model, --prompt, - // etc. nice flags. - // - // We use 'unknown_arg' to keep track of the first argument that we - // didn't recognize so we can complain to the user if we can't - // recognize arguments even using the old style. - bool demand_old_style_arguments = false; - const char * unknown_arg = NULL; - int iarg = 1; for (; iarg < argc; ++iarg) { std::string arg{argv[iarg]}; @@ -273,10 +251,8 @@ int main(int raw_argc, char ** raw_argv) { disable_logging = true; } else { - demand_old_style_arguments = true; - if (unknown_arg == NULL) { - unknown_arg = argv[iarg].c_str(); - } + fprintf(stderr, "Error: unknown option '%s'\n", argv[iarg].c_str()); + return 1; } } @@ -284,30 +260,6 @@ int main(int raw_argc, char ** raw_argv) { // Sanity check the command line arguments. ////// - // Old style arguments? (i.e. tokenizer MODEL_FNAME PROMPT [--ids]) - if ((argc == 3 || argc == 4) && - !prompt_set && - !prompt_path_set && - !model_path_set && - !stdin_set) { - model_path = argv[1].c_str(); - prompt_arg = argv[2].c_str(); - if (argc == 4) { - if (argv[3] == "--ids") { - printing_ids = true; - } else { - fprintf(stderr, "Error: unknown option '%s'\n", argv[3].c_str()); - return 1; - } - } - model_path_set = true; - prompt_set = true; - } else if (demand_old_style_arguments) { - GGML_ASSERT(unknown_arg); - fprintf(stderr, "Unknown argument: '%s'\n", unknown_arg); - return 1; - } - // Check that we have the required stuff set. 
if (model_path_set && model_path == NULL) { fprintf(stderr, "Error: --model requires an argument.\n"); From 4aaeb42598fabbef4102b7fdc3c84ea3d8b53257 Mon Sep 17 00:00:00 2001 From: brian khuu Date: Wed, 22 May 2024 17:31:48 +1000 Subject: [PATCH 4/4] tokenize.cpp: iostream header no longer required --- examples/tokenize/tokenize.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/tokenize/tokenize.cpp b/examples/tokenize/tokenize.cpp index ad1c156c4820f..54c9834afb1b9 100644 --- a/examples/tokenize/tokenize.cpp +++ b/examples/tokenize/tokenize.cpp @@ -3,7 +3,6 @@ #include #include -#include #include #include #include