From 83b11cdbd67f0d4224e2cf77997964752bff792d Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Wed, 13 Dec 2023 07:44:32 +0000
Subject: [PATCH 1/7] feat: basic json output format

Paths containing invalid characters are not handled yet.
---
 src/utils.c                 | 268 ++++++++++++++++++++++++++----------
 src/utils.h                 |  11 +-
 test/cmake/cli-option.cmake |  26 ++--
 3 files changed, 213 insertions(+), 92 deletions(-)

diff --git a/src/utils.c b/src/utils.c
index 6f45d58..dadd67f 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -36,7 +36,7 @@ DECL_OPT_CALLBACK(_set_output_path);
 DECL_OPT_CALLBACK(_option_deprecated);
 DECL_OPT_CALLBACK(_set_opt_delim);
 DECL_OPT_CALLBACK(_set_opt_noheading);
-DECL_OPT_CALLBACK(_set_output_xml);
+DECL_OPT_CALLBACK(_set_opt_format);
 DECL_OPT_CALLBACK(_show_ver_and_exit);
 
 /* pre-declared out of laziness */
@@ -87,7 +87,14 @@ static char *os_strings[] = {
     N_("Windows 10 or above")
 };
 
-static int          output_mode        = OUTPUT_NONE;
+static char *out_format_name[] = {
+    "unknown format",
+    "TSV format",
+    "XML format",
+    "JSON format",
+};
+
+static out_fmt      output_format      = FORMAT_UNKNOWN;
 static gboolean     no_heading         = FALSE;
 static gboolean     use_localtime      = FALSE;
 static gboolean     live_mode          = FALSE;
@@ -103,22 +110,27 @@ static FILE        *err_fh             = NULL; /*!< unused for Windows console *
        metarecord  *meta               = NULL;
 
 
-/* Options intended for tab delimited mode output only */
-static const GOptionEntry text_options[] = {
+/* Options controlling output format */
+static const GOptionEntry out_options[] = {
     {
         "delimiter", 't', 0,
         G_OPTION_ARG_CALLBACK, _set_opt_delim,
-        N_("String to use as delimiter (TAB by default)"), N_("STRING")
+        N_("Field delimiter for TSV ['\\t' (TAB) if not given]"), N_("STRING")
     },
     {
         "no-heading", 'n', G_OPTION_FLAG_NO_ARG,
         G_OPTION_ARG_CALLBACK, _set_opt_noheading,
-        N_("Don't show column header and metadata"), NULL
+        N_("Don't show TSV column header and metadata"), NULL
     },
     {
-        "always-utf8", '8', G_OPTION_FLAG_HIDDEN | G_OPTION_FLAG_NO_ARG,
+        "xml", 'x', G_OPTION_FLAG_NO_ARG,
         G_OPTION_ARG_CALLBACK, _option_deprecated,
-        N_("(This option is deprecated)"), NULL
+        N_("Deprecated, use '-f xml' in future"), NULL
+    },
+    {
+        "format", 'f', 0,
+        G_OPTION_ARG_CALLBACK, _set_opt_format,
+        N_("'text' (default), 'xml' or 'json'"), N_("FORMAT")
     },
     { 0 }
 };
@@ -130,11 +142,6 @@ static const GOptionEntry main_options[] = {
         G_OPTION_ARG_CALLBACK, _set_output_path,
         N_("Write output to FILE"), N_("FILE")
     },
-    {
-        "xml", 'x', G_OPTION_FLAG_NO_ARG,
-        G_OPTION_ARG_CALLBACK, _set_output_xml,
-        N_("Output in XML format instead of tab-delimited values"), NULL
-    },
     {
         "localtime", 'z', 0,
         G_OPTION_ARG_NONE, &use_localtime,
@@ -178,38 +185,50 @@ static const GOptionEntry live_options[] = {
 /* Following routines are command argument handling related */
 
 static gboolean
-_set_output_mode (int       mode,
-                  GError  **error)
+_set_out_format    (out_fmt     desired_format,
+                    GError    **error)
 {
-    if (output_mode == mode)
+    if (output_format == desired_format)
         return TRUE;
 
-    if (output_mode == OUTPUT_NONE) {
-        output_mode = mode;
+    if (output_format == FORMAT_UNKNOWN) {
+        output_format = desired_format;
         return TRUE;
     }
 
-    g_set_error_literal (error, G_OPTION_ERROR, G_OPTION_ERROR_FAILED,
-        _("Plain text format options can not be used in XML mode."));
+    g_set_error (error, G_OPTION_ERROR, G_OPTION_ERROR_FAILED,
+        "Output was already set in %s, but later argument "
+        "attempts to change to %s",
+        out_format_name[output_format],
+        out_format_name[desired_format]);
     return FALSE;
 }
 
 
-/**
- * @brief Option callback for setting output mode to XML
- * @return `FALSE` if option conflict exists, `TRUE` otherwise
- */
 static gboolean
-_set_output_xml (const gchar *opt_name,
-                 const gchar *value,
-                 gpointer     data,
-                 GError     **error)
+_set_opt_format   (const gchar *opt_name,
+                   const gchar *format,
+                   gpointer     data,
+                   GError     **error)
 {
     UNUSED(opt_name);
-    UNUSED(value);
     UNUSED(data);
 
-    return _set_output_mode (OUTPUT_XML, error);
+    if (g_strcmp0 (format, "text") == 0)
+        return _set_out_format (FORMAT_TEXT, error);
+    else if (g_strcmp0 (format, "tsv") == 0)  // aliases
+        return _set_out_format (FORMAT_TEXT, error);
+    else if (g_strcmp0 (format, "csv") == 0)
+        return _set_out_format (FORMAT_TEXT, error);
+    else if (g_strcmp0 (format, "xml") == 0)
+        return _set_out_format (FORMAT_XML, error);
+    else if (g_strcmp0 (format, "json") == 0)
+        return _set_out_format (FORMAT_JSON, error);
+    else {
+        g_set_error (error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE,
+            "Illegal output format '%s'", format);
+        return FALSE;
+    }
 }
 
 
@@ -229,16 +248,17 @@ _set_opt_noheading (const gchar *opt_name,
 
     no_heading = TRUE;
 
-    return _set_output_mode (OUTPUT_CSV, error);
+    return _set_out_format (FORMAT_TEXT, error);
 }
 
 
 /**
- * @brief Extra level of escape for escape sequences in delimiters
+ * @brief Convert escape sequences in delimiters
  * @param str The original delimiter string
  * @return Escaped delimiter string
- * @note Delimiter needs another escape because it is later used
- * in `printf` routines. It handles `\\r`, `\\n`, `\\t` and `\\e`.
+ * @note Similar to `g_strcompress()`, but only process a few
+ * characters, unlike glib routine which converts all 8bit chars.
+ * Currently handles `\\r`, `\\n`, `\\t` and `\\e`.
  */
 static char *
 _filter_escapes (const char *str)
@@ -314,7 +334,7 @@ _set_opt_delim (const gchar *opt_name,
 
     delim = (*value) ? _filter_escapes (value) : g_strdup ("");
 
-    return _set_output_mode (OUTPUT_CSV, error);
+    return _set_out_format (FORMAT_TEXT, error);
 }
 
 
@@ -366,14 +386,17 @@ _set_output_path (const gchar *opt_name,
  */
 static gboolean
 _option_deprecated (const gchar *opt_name,
-                    const gchar *unused,
+                    const gchar *value,
                     gpointer     data,
                     GError     **error)
 {
-    UNUSED(unused);
+    UNUSED(value);
     UNUSED(data);
-    UNUSED(error);
-    g_warning(_("Option '%s' is deprecated and ignored."), opt_name);
+    if (strcmp (opt_name, "-x") == 0 || strcmp (opt_name, "--xml") == 0)
+    {
+        g_warning(_("Option '%s' is deprecated. Use '-f xml' in future."), opt_name);
+        return _set_out_format (FORMAT_XML, error);
+    }
     return TRUE;
 }
 
@@ -552,14 +575,14 @@ _fileargs_handler (GOptionContext *context,
 
 
 /**
- * @brief post-callback after handling all text related arguments
+ * @brief post-callback after handling all output related args
  * @return Always `TRUE`, denoting success. It never fails.
  */
 static gboolean
-_text_default_options (GOptionContext *context,
-                       GOptionGroup   *group,
-                       gpointer        data,
-                       GError        **error)
+_set_def_output_opts    (GOptionContext *context,
+                         GOptionGroup   *group,
+                         gpointer        data,
+                         GError        **error)
 {
     UNUSED (context);
     UNUSED (group);
@@ -570,8 +593,8 @@ _text_default_options (GOptionContext *context,
     if (delim == NULL)
         delim = g_strdup ("\t");
 
-    if (output_mode == OUTPUT_NONE)
-        output_mode = OUTPUT_CSV;
+    if (output_format == FORMAT_UNKNOWN)
+        output_format = FORMAT_TEXT;
 
     return TRUE;
 }
@@ -898,7 +921,7 @@ _opt_ctxt_setup (GOptionContext **context,
                  rbin_type        type)
 {
     char         *desc_str;
-    GOptionGroup *group, *txt_group;
+    GOptionGroup *main_group, *output_group;
 
     /* FIXME Sneaky metadata modification! Think about cleaner way */
     meta->type = type;
@@ -912,17 +935,17 @@ _opt_ctxt_setup (GOptionContext **context,
     g_free (desc_str);
 
     /* main group */
-    group = g_option_group_new (NULL, NULL, NULL, meta, NULL);
+    main_group = g_option_group_new (NULL, NULL, NULL, meta, NULL);
 
-    g_option_group_add_entries (group, main_options);
+    g_option_group_add_entries (main_group, main_options);
     switch (type)
     {
         case RECYCLE_BIN_TYPE_FILE:
-            g_option_group_add_entries (group, rbinfile_options);
+            g_option_group_add_entries (main_group, rbinfile_options);
             break;
         case RECYCLE_BIN_TYPE_DIR:
 #if (defined G_OS_WIN32 || defined __GLIBC__)
-            g_option_group_add_entries (group, live_options);
+            g_option_group_add_entries (main_group, live_options);
 #else
             UNUSED (live_options);
 #endif
@@ -930,19 +953,19 @@ _opt_ctxt_setup (GOptionContext **context,
         default: break;
     }
 
-    g_option_group_set_parse_hooks (group, NULL,
+    g_option_group_set_parse_hooks (main_group, NULL,
         (GOptionParseFunc) _fileargs_handler);
-    g_option_context_set_main_group (*context, group);
+    g_option_context_set_main_group (*context, main_group);
 
-    /* text group */
-    txt_group = g_option_group_new ("text",
-        _("Plain text output options:"),
-        N_("Show plain text output options"), NULL, NULL);
+    /* output format arg group */
+    output_group = g_option_group_new ("format",
+        _("Output format options:"),
+        N_("Show output formatting options"), NULL, NULL);
 
-    g_option_group_add_entries (txt_group, text_options);
+    g_option_group_add_entries (output_group, out_options);
     g_option_group_set_parse_hooks (
-        txt_group, NULL, _text_default_options);
-    g_option_context_add_group (*context, txt_group);
+        output_group, NULL, _set_def_output_opts);
+    g_option_context_add_group (*context, output_group);
 
     g_option_context_set_help_enabled (*context, TRUE);
 }
@@ -1341,6 +1364,27 @@ _close_handles (void)
 }
 
 
+static char *
+_json_escape_path (const char *path)
+{
+    // TODO g_string_replace from glib 2.68 does it all
+
+    char *p = (char *) path;
+    gunichar c = 0;
+    GString *s = g_string_new ("");
+
+    while (*p) {
+        c = g_utf8_get_char (p);
+        if (c == 0x5C)
+            s = g_string_append (s, "\\\\");
+        else
+            s = g_string_append_unichar (s, c);
+        p = g_utf8_next_char (p);
+    }
+    return g_string_free (s, FALSE);
+}
+
+
 /**
  * @brief Print preamble and column header for TSV output
  * @param meta Pointer to metadata structure
@@ -1471,6 +1515,39 @@ _print_xml_header (metarecord *meta)
 }
 
 
+/**
+ * @brief Print preamble for JSON output
+ * @param meta Pointer to metadata structure
+ */
+static void
+_print_json_header (metarecord *meta)
+{
+    g_print ("{\n");
+    g_printf ("  \"format\": \"%s\",\n",
+        (meta->type == RECYCLE_BIN_TYPE_FILE) ? "file" : "dir");
+
+
+    if (meta->version >= 0)  /* can be found and not error */
+        g_printf ("  \"version\": %" PRId64 ",\n", meta->version);
+    else
+        g_print ("  \"version\": null,\n");
+
+    if (meta->type == RECYCLE_BIN_TYPE_FILE && meta->total_entry > 0)
+        g_printf ("  \"ever_existed\": %" PRIu32 ",\n", meta->total_entry);
+
+    // TODO need to escape path separator for json
+    {
+        char *s = g_filename_display_name (meta->filename);
+        char *rbin_path = _json_escape_path (s);
+        g_printf ("  \"path\": \"%s\",\n", rbin_path);
+        g_free (s);
+        g_free (rbin_path);
+    }
+
+    g_print ("  \"records\": [\n");
+}
+
+
 /**
  * @brief Stub routine for printing header
  * @note Calls other printing routine depending on output mode
@@ -1480,18 +1557,13 @@ _print_header (void)
 {
     if (no_heading) return;
 
-    switch (output_mode)
+    switch (output_format)
     {
-        case OUTPUT_CSV:
-            _print_csv_header (meta);
-            break;
-
-        case OUTPUT_XML:
-            _print_xml_header (meta);
-            break;
+        case FORMAT_TEXT: _print_csv_header  (meta); break;
+        case FORMAT_XML:  _print_xml_header  (meta); break;
+        case FORMAT_JSON: _print_json_header (meta); break;
 
-        default:
-            g_assert_not_reached();
+        default: g_assert_not_reached();
     }
 }
 
@@ -1523,9 +1595,9 @@ _print_record_cb (rbin_struct *record,
     out_fname = out_fname ?
         g_strdup (out_fname) : g_strdup ("???");
 
-    switch (output_mode)
+    switch (output_format)
     {
-        case OUTPUT_CSV:
+        case FORMAT_TEXT:
 
             deltime = g_date_time_format (dt, "%F %T");
 
@@ -1544,7 +1616,7 @@ _print_record_cb (rbin_struct *record,
 
             break;
 
-        case OUTPUT_XML:
+        case FORMAT_XML:
         {
             GString *s = g_string_new (NULL);
 
@@ -1573,6 +1645,46 @@ _print_record_cb (rbin_struct *record,
         }
             break;
 
+        case FORMAT_JSON:
+        {
+            GString *s = g_string_new ("    {\"index\": ");
+
+            if (meta->type == RECYCLE_BIN_TYPE_FILE) {
+                g_string_append_printf (s, "%" PRIu32, record->index_n);
+            } else {
+                g_string_append_printf (s, "\"%s\"", record->index_s);
+            }
+
+            deltime = use_localtime ? g_date_time_format (dt, "%FT%T%z"):
+                                      g_date_time_format (dt, "%FT%TZ");
+
+            g_string_append_printf (s, ", \"time\": \"%s\"", deltime);
+
+            g_string_append_printf (s, ", \"gone\": %s",
+                (record->gone == FILESTATUS_GONE  ) ? "true" :
+                (record->gone == FILESTATUS_EXISTS) ? "false":
+                                                      "null");
+
+            if ( record->filesize == G_MAXUINT64 ) /* faulty */
+                g_string_append_printf (s, ", \"size\": null");
+            else
+                g_string_append_printf (s,
+                    ", \"size\": %" PRIu64, record->filesize);
+
+            {
+                char *s = _json_escape_path (out_fname);
+                g_free (out_fname);
+                out_fname = s;
+            }
+
+            g_string_append_printf (s,
+                ", \"path\": \"%s\"},\n", out_fname);
+
+            outstr = g_string_free (s, FALSE);
+            g_print ("%s", outstr);
+        }
+            break;
+
         default:
             g_assert_not_reached();
     }
@@ -1591,16 +1703,20 @@ _print_record_cb (rbin_struct *record,
 static void
 _print_footer (void)
 {
-    switch (output_mode)
+    switch (output_format)
     {
-        case OUTPUT_CSV:
+        case FORMAT_TEXT:
             /* do nothing */
             break;
 
-        case OUTPUT_XML:
+        case FORMAT_XML:
             g_print ("%s", "</recyclebin>\n");
             break;
 
+        case FORMAT_JSON:
+            g_print ("  ]\n}\n");
+            break;
+
         default:
             g_assert_not_reached();
     }
diff --git a/src/utils.h b/src/utils.h
index e34cf96..2c70ad8 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -63,12 +63,13 @@ typedef enum
     VERSION_ME_03,
 } detected_os_ver;
 
-enum
+typedef enum _out_fmt
 {
-    OUTPUT_NONE = 0,
-    OUTPUT_CSV,
-    OUTPUT_XML
-};
+    FORMAT_UNKNOWN,
+    FORMAT_TEXT,
+    FORMAT_XML,
+    FORMAT_JSON,
+} out_fmt;
 
 /**
  * @brief Whether original trashed file still exists
diff --git a/test/cmake/cli-option.cmake b/test/cmake/cli-option.cmake
index 1428706..f317acf 100644
--- a/test/cmake/cli-option.cmake
+++ b/test/cmake/cli-option.cmake
@@ -35,11 +35,11 @@ endfunction()
 addWithFileOptTest(LongHead   --no-heading  )
 addWithFileOptTest(LongSep    --delimiter=: )
 addWithFileOptTest(LongTime   --localtime   )
-addWithFileOptTest(LongXml    --xml         )
+addWithFileOptTest(LongXml    --format xml  )
 addWithFileOptTest(ShortHead  -n            )
 addWithFileOptTest(ShortSep   -t :          )
 addWithFileOptTest(ShortTime  -z            )
-addWithFileOptTest(ShortXml   -x            )
+addWithFileOptTest(ShortXml   -f xml        )
 
 
 function(addBadBareOptTest name)
@@ -104,20 +104,24 @@ set_tests_properties(d_NullArgOptTestOut f_NullArgOptTestOut f_NullArgOptTestEnc
 add_bintype_label(d_NullArgOptTestOut f_NullArgOptTestOut f_NullArgOptTestEnc)
 
 
-function(addBadComboOptTest name)
-    add_test(NAME d_BadComboOptTest${name} COMMAND
+function(addBadComboOptTest id)
+    add_test(NAME d_BadComboOptTest${id} COMMAND
         rifiuti-vista ${ARGN} ${sample_dir}/dir-sample1)
-    add_test(NAME f_BadComboOptTest${name} COMMAND
+    add_test(NAME f_BadComboOptTest${id} COMMAND
         rifiuti       ${ARGN} ${sample_dir}/INFO2-sample1)
-    set_tests_properties(d_BadComboOptTest${name} f_BadComboOptTest${name}
+    set_tests_properties(d_BadComboOptTest${id} f_BadComboOptTest${id}
         PROPERTIES
             LABELS "arg;xfail"
-            PASS_REGULAR_EXPRESSION "can not be used in XML mode")
-    add_bintype_label(d_BadComboOptTest${name} f_BadComboOptTest${name})
+            PASS_REGULAR_EXPRESSION "Output was already set in .+ format, but later argument attempts to change to .+ format")
+    add_bintype_label(d_BadComboOptTest${id} f_BadComboOptTest${id})
 endfunction()
 
-addBadComboOptTest(1 -x -t:)
-addBadComboOptTest(2 -n -x)
+# implicit text options
+addBadComboOptTest(1 -f xml -t:)
+addBadComboOptTest(2 -n -f xml)
+# explicit option conflict
+addBadComboOptTest(3 -f tsv -f json)
+addBadComboOptTest(4 -f xml -f text)
 
 
 function(addMultiInputTest name)
@@ -144,7 +148,7 @@ function(addMissingInputTest name)
     add_bintype_label(d_MissingInputTest${name} f_MissingInputTest${name})
 endfunction()
 
-addMissingInputTest(1 -x)
+addMissingInputTest(1 -f xml)
 addMissingInputTest(2 -t :)
 addMissingInputTest(3 -z -o file1 -n)
 

From 60e552ff9a4c23abe80e4f2864e4418767f6788a Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Wed, 13 Dec 2023 23:36:16 +0000
Subject: [PATCH 2/7] refactor: Split conversion and encoding utils into their
 own file

Some function arguments are changed to break the dependency on the main utils code.
---
 CMakeLists.txt      |   2 +
 src/rifiuti-vista.c |  67 +++++----
 src/rifiuti.c       |  56 +++++--
 src/utils-conv.c    | 353 +++++++++++++++++++++++++++++++++++++++++++
 src/utils-conv.h    |  25 ++++
 src/utils.c         | 358 +-------------------------------------------
 src/utils.h         |  12 --
 7 files changed, 468 insertions(+), 405 deletions(-)
 create mode 100644 src/utils-conv.c
 create mode 100644 src/utils-conv.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 260132a..1798e51 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -62,6 +62,8 @@ foreach(bin rifiuti rifiuti-vista)
         PRIVATE
             src/utils.c
             src/utils.h
+            src/utils-conv.c
+            src/utils-conv.h
     )
     if(WIN32)
         target_sources(${bin}
diff --git a/src/rifiuti-vista.c b/src/rifiuti-vista.c
index c1eaaa0..182a9d6 100644
--- a/src/rifiuti-vista.c
+++ b/src/rifiuti-vista.c
@@ -4,10 +4,12 @@
  * Please see LICENSE file for more info.
  */
 
+#include <stdbool.h>
 #include <glib/gi18n.h>
 #include <glib/gstdio.h>
 
 #include "rifiuti-vista.h"
+#include "utils-conv.h"
 #include "utils.h"
 #ifdef G_OS_WIN32
 #  include "utils-win.h"
@@ -124,22 +126,26 @@ _validate_index_file   (const char   *filename,
 
 static rbin_struct *
 _populate_record_data  (void      *buf,
-                        uint64_t   version,
-                        gboolean   erraneous)
+                        gsize      bufsize,
+                        uint64_t   version)
 {
     rbin_struct  *record;
     size_t        read;
+    bool          erraneous = false;
 
     record = g_malloc0 (sizeof (rbin_struct));
     record->version = version;
 
-    /*
-     * In rare cases, the size of index file is 543 bytes versus (normal) 544 bytes.
-     * In such occasion file size only occupies 56 bit, not 64 bit as it ought to be.
-     * Actually this 56-bit file size is very likely wrong after all. Probably some
-     * bug inside Windows. This is observed during deletion of dd.exe from Forensic
-     * Acquisition Utilities (by George M. Garner Jr) in certain localized Vista.
-     */
+    // In rare cases, the size of index file is one byte short of
+    // (fixed) 544 bytes in Vista. Under such occasion, file size
+    // only occupies 56 bit, not 64 bit as it ought to be.
+    // Actually this 56-bit file size is very likely wrong after all.
+    // This is observed during deletion of dd.exe from Forensic
+    // Acquisition Utilities (by George M. Garner Jr)
+    // in certain localized Vista.
+    if (version == VERSION_VISTA && bufsize == VERSION1_FILE_SIZE - 1)
+        erraneous = true;
+
     memcpy (&record->filesize, buf + FILESIZE_OFFSET,
             FILETIME_OFFSET - FILESIZE_OFFSET - (int) erraneous);
     if (erraneous)
@@ -156,7 +162,7 @@ _populate_record_data  (void      *buf,
     }
 
     /* File deletion time */
-    memcpy (&record->winfiletime, buf + FILETIME_OFFSET - (int) erraneous,
+    memcpy (&record->winfiletime, buf - (int) erraneous + FILETIME_OFFSET,
             VERSION1_FILENAME_OFFSET - FILETIME_OFFSET);
     record->winfiletime = GINT64_FROM_LE (record->winfiletime);
     record->deltime = win_filetime_to_gdatetime (record->winfiletime);
@@ -165,24 +171,37 @@ _populate_record_data  (void      *buf,
     {
         case VERSION_VISTA:
             record->uni_path = conv_path_to_utf8_with_tmpl (
-                (const char *) (buf - erraneous + VERSION1_FILENAME_OFFSET),
-                NULL, "<\\u%04X>", &read, &record->error);
+                (const char *) (buf - (int) erraneous + VERSION1_FILENAME_OFFSET),
+                WIN_PATH_MAX, NULL, "<\\u%04X>", &read, &record->error);
             break;
 
         case VERSION_WIN10:
+        {
             record->uni_path = conv_path_to_utf8_with_tmpl (
                 (const char *) (buf + VERSION2_FILENAME_OFFSET),
+                bufsize - VERSION2_FILENAME_OFFSET,
                 NULL, "<\\u%04X>", &read, &record->error);
+        }
             break;
 
         default:
             g_assert_not_reached ();
     }
 
-    if (! record->uni_path)
-        g_set_error_literal (&record->error, R2_REC_ERROR,
-                R2_REC_ERROR_CONV_PATH,
-                _("Trash file path conversion failed completely"));
+    if (record->uni_path) {
+        if (g_error_matches (record->error, G_CONVERT_ERROR,
+            G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
+        {
+            g_debug ("%s", record->error->message);
+            g_clear_error (&record->error);
+            g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+                _("Path contains broken unicode character(s)"));
+        }
+    } else {
+        g_clear_error (&record->error);
+        g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+            _("Trash file path conversion failed completely"));
+    }
 
     return record;
 }
@@ -212,21 +231,7 @@ _parse_record_cb   (char *index_file,
 
     g_debug ("Start populating record for '%s'...", basename);
 
-    switch (version)
-    {
-        case VERSION_VISTA:
-            record = _populate_record_data (buf, version,
-                (bufsize == VERSION1_FILE_SIZE - 1));
-            break;
-
-        case VERSION_WIN10:
-            record = _populate_record_data (buf, version, FALSE);
-            break;
-
-        default:
-            g_assert_not_reached();
-    }
-
+    record = _populate_record_data (buf, bufsize, version);
     g_free (buf);
 
     /* Check corresponding $R.... file existance and set record->gone */
diff --git a/src/rifiuti.c b/src/rifiuti.c
index 4244e1e..f682e05 100644
--- a/src/rifiuti.c
+++ b/src/rifiuti.c
@@ -9,6 +9,7 @@
 #include <glib/gstdio.h>
 
 #include "rifiuti.h"
+#include "utils-conv.h"
 #include "utils.h"
 
 
@@ -144,10 +145,12 @@ _populate_record_data   (void     *buf,
     uint32_t        drivenum;
     size_t          read;
     char           *legacy_fname;
+    gsize           legacy_bufsize, uni_bufsize;
 
     record = g_malloc0 (sizeof (rbin_struct));
 
-    legacy_fname = g_malloc0 (RECORD_INDEX_OFFSET - LEGACY_FILENAME_OFFSET);
+    legacy_bufsize = RECORD_INDEX_OFFSET - LEGACY_FILENAME_OFFSET;
+    legacy_fname = g_malloc0 (legacy_bufsize);
     copy_field (legacy_fname, LEGACY_FILENAME_OFFSET, RECORD_INDEX_OFFSET);
 
     /* Index number associated with the record */
@@ -187,18 +190,28 @@ _populate_record_data   (void     *buf,
     record->filesize = GUINT64_FROM_LE (record->filesize);
     g_debug ("filesize=%" PRIu64, record->filesize);
 
-    /*
-     * 1. Only bother populating legacy path if users need it,
-     *    because otherwise we don't know which encoding to use
-     * 2. Enclose with angle brackets because they are not allowed
-     *    in Windows file name, therefore stands out better that
-     *    the escaped hex sequences are not part of real file name
-     */
+    // Only bother populating legacy path if users need it,
+    // because otherwise we don't know which encoding to use
     if (legacy_encoding)
     {
         record->legacy_path = conv_path_to_utf8_with_tmpl (
-            legacy_fname, legacy_encoding,
+            legacy_fname, legacy_bufsize, legacy_encoding,
             "<\\%02X>", &read, &record->error);
+        if (record->legacy_path) {
+            if (g_error_matches (record->error, G_CONVERT_ERROR,
+                G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
+            {
+                g_debug ("%s", record->error->message);
+                g_clear_error (&record->error);
+                g_set_error (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+                _("Path contains character(s) that could not be "
+                "interpreted in %s encoding"), legacy_encoding);
+            }
+        } else {
+            g_clear_error (&record->error);
+            g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+                _("Legacy path conversion failed completely"));
+        }
     }
 
     g_free (legacy_fname);
@@ -208,10 +221,28 @@ _populate_record_data   (void     *buf,
 
     /* Part below deals with unicode path only */
 
+    uni_bufsize = UNICODE_RECORD_SIZE - UNICODE_FILENAME_OFFSET;
+
     record->uni_path = conv_path_to_utf8_with_tmpl (
-        (char *) (buf + UNICODE_FILENAME_OFFSET), NULL,
+        (char *) (buf + UNICODE_FILENAME_OFFSET),
+        uni_bufsize / sizeof(gunichar2), NULL,
         "<\\u%04X>", &read, &record->error);
 
+    if (record->uni_path) {
+        if (g_error_matches (record->error, G_CONVERT_ERROR,
+            G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
+        {
+            g_debug ("%s", record->error->message);
+            g_clear_error (&record->error);
+            g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+                _("Path contains broken unicode character(s)"));
+        }
+    } else {
+        g_clear_error (&record->error);
+        g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+            _("Unicode path conversion failed completely"));
+    }
+
     /*
      * We check for junk memory filling the padding area after
      * unicode path, using it as the indicator of OS generating this
@@ -226,6 +257,11 @@ _populate_record_data   (void     *buf,
      * Looks like an ANSI codepage full path is filled in
      * legacy path field, then overwritten in place by a 8.3
      * version of path whenever applicable (which was always shorter).
+     *
+     * The 8.3 path generated from non-ascii seems to follow certain
+     * ruleset, but the exact detail is unknown:
+     * - accented latin chars transliterated to pure ASCII
+     * - first DBCS char converted to UCS2 codepoint
      */
     if (junk_detected && ! *junk_detected)
     {
diff --git a/src/utils-conv.c b/src/utils-conv.c
new file mode 100644
index 0000000..02640f9
--- /dev/null
+++ b/src/utils-conv.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (C) 2023, Abel Cheung.
+ * rifiuti2 is released under Revised BSD License.
+ * Please see LICENSE file for more info.
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include <string.h>
+#include <glib.h>
+#include <glib/gi18n.h>
+
+#include "utils-conv.h"
+
+
+/**
+ * @brief Try out if encoding is compatible to ASCII
+ * @param enc The encoding to test
+ * @param error Location to store error during trial, can be `NULL`
+ * @return `true` if compatible, `false` otherwise
+ * (including the case where encoding doesn't exist)
+ */
+bool
+enc_is_ascii_compatible    (const char   *enc,
+                            GError      **error)
+{
+    bool equal;
+    char *s;
+
+    g_return_val_if_fail (enc && *enc, false);
+
+    s = g_convert ("C:\\", -1, "UTF-8", enc, NULL, NULL, error);
+    equal = (0 == g_strcmp0 ("C:\\", (const char *)s));
+    g_free (s);
+
+    if (equal)
+        return true;
+
+    if (error != NULL && *error == NULL)
+        // Encoding is ASCII incompatible (e.g. EBCDIC). Even if trial
+        // convert doesn't fail, it would cause application error
+        // later on. Treat that as conversion error for convenience.
+        g_set_error_literal (error, G_CONVERT_ERROR,
+            G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "");
+    return false;
+}
+
+
+/**
+ * @brief Compute UCS2 string length like `wcslen()`
+ * @param str The string to check (in `char*` !)
+ * @param max_sz Maximum length to check in UCS2 characters, or
+ * use -1 to denote the string is nul-terminated
+ * @return Either number of UCS2 char for whole string,
+ * or return `max_sz` when `max_sz` param is exceeded
+ */
+static size_t
+_ucs2_strnlen   (const char   *str,
+                 ssize_t       max_sz)
+{
+    // The bound is tested before each read, so that a buffer filled
+    // up to max_sz chars without terminator is never read past its end
+
+    size_t i = 0;
+    const char *p = str;
+
+    if (str == NULL)
+        return 0;
+
+    while (max_sz < 0 || i < (size_t) max_sz)
+    {
+        if (p[0] == '\0' && p[1] == '\0')
+            break;
+        i++;
+        p += 2;
+    }
+    return i;
+}
+
+
+/**
+ * @brief Move character pointer for specified bytes
+ * @param sz Must be either 1 or 2, denoting broken byte or broken UCS2 character
+ * @param in_str Reference to input string to be converted
+ * @param read_bytes Reference to count of unread input bytes to decrement
+ * @param out_str Reference to output string to be appended
+ * @param write_bytes Reference to writable bytes count to decrement
+ * @param tmpl `printf` template to represent the broken character
+ * @note This is the core of `conv_path_to_utf8_with_tmpl()` doing
+ * error fallback, converting a single broken char to `printf` output.
+ */
+static void
+_advance_octet    (size_t       sz,
+                   char       **in_str,
+                   gsize       *read_bytes,
+                   char       **out_str,
+                   gsize       *write_bytes,
+                   const char  *tmpl)
+{
+    char *repl = NULL;
+
+    switch (sz) {
+        case 1:
+        {
+            unsigned char c = *(unsigned char *) (*in_str);
+            repl = g_strdup_printf (tmpl, c);
+        }
+            break;
+
+        case 2:
+        {
+            // Assemble LE value bytewise, input may not be 2-byte aligned
+            const unsigned char *b = (const unsigned char *) (*in_str);
+            repl = g_strdup_printf (tmpl, (uint16_t) (b[0] | (b[1] << 8)));
+        }
+            break;
+
+        default:
+            g_assert_not_reached();
+    }
+
+    (*in_str) += sz;
+    if (read_bytes != NULL)
+        (*read_bytes) -= sz;
+
+    *out_str = g_stpcpy (*out_str, (const char *) repl);
+    if (write_bytes != NULL)
+        *write_bytes -= strlen (repl);
+
+    g_free (repl);
+}
+
+
+/**
+ * @brief Replace non-printable characters with escape sequences
+ * @param str The UTF-8 string to be filtered
+ * @param tmpl `printf` template used to render each escaped character
+ * @return Newly allocated string, maybe containing escape sequences
+ * @attention The template is not checked here, caller must supply
+ * a correct one. It should format a single character of a Windows
+ * unicode path, which is in UTF-16LE encoding.
+ */
+static char *
+_filter_printable_char (const char *str,
+                        const char *tmpl)
+{
+    GString    *out = g_string_sized_new (strlen (str) * 2);
+    const char *cur;
+    const char *next;
+
+    for (cur = str; *cur != '\0'; cur = next)
+    {
+        gunichar ch = g_utf8_get_char (cur);
+        bool     keep;
+
+        next = g_utf8_next_char (cur);
+
+        /*
+         * Plain ASCII space is common in paths (e.g. Program Files)
+         * and kept verbatim; all other kinds of spaces are rare,
+         * so they are escaped like any non-graphical character.
+         */
+        keep = (ch == 0x20) || g_unichar_isgraph (ch);
+        if (keep)
+            g_string_append_len (out, cur, (gssize) (next - cur));
+        else
+            g_string_append_printf (out, tmpl, ch);
+    }
+
+    return g_string_free (out, FALSE);
+}
+
+
+/**
+ * @brief Convert path to UTF-8 encoding with customizable fallback
+ * @param path The path string to be converted
+ * @param pathlen Maximum length of `path` to read, counted in bytes for
+ * legacy encoding or in UCS2 characters for UTF-16LE; -1 to read up to nul
+ * @param from_enc Either a legacy Windows ANSI encoding, or use
+ * `NULL` to represent Windows wide char encoding (UTF-16LE)
+ * @param tmpl `printf`-style string template to represent broken
+ * character. This template should handle either single- or
+ * double-octet, namely `%u`, `%o`, `%d`, `%i`, `%x` and `%X`.
+ * @param read Reference to number of successfully read bytes
+ * @param error Location to store error upon problem, can be `NULL`
+ * @return UTF-8 encoded path with broken characters escaped, or `NULL`
+ * if arguments are invalid or the result fails UTF-8 validation
+ * @note This is very similar to `g_convert_with_fallback()`, but the
+ * fallback is a `printf`-style string instead of a fixed string, so
+ * that each output format can use its own fallback sequence.
+ * @attention 1. This routine is not for generic charset conversion.
+ * Extra transformation is intended for path display only.
+ * @attention 2. Caller is responsible for using correct template,
+ * almost no error checking is performed.
+ */
+char *
+conv_path_to_utf8_with_tmpl (const char *path,
+                             ssize_t     pathlen,
+                             const char *from_enc,
+                             const char *tmpl,
+                             size_t     *read,
+                             GError    **error)
+{
+    char *u8_path, *i_ptr, *o_ptr, *result = NULL;
+    gsize len, r_total, rbyte, wbyte, status = 0, in_ch_width, out_ch_width;
+    GIConv conv;
+
+    g_return_val_if_fail (path && *path, NULL);
+    g_return_val_if_fail (tmpl && *tmpl, NULL);
+    g_return_val_if_fail (! from_enc || *from_enc, NULL);
+    g_return_val_if_fail (! error    || ! *error , NULL);
+
+    /* try the template */
+    {
+        char *s = g_strdup_printf (tmpl, from_enc ? 0xFF : 0xFFFF);
+        /* UTF-8 character occupies at most 6 bytes */
+        out_ch_width = MAX (strlen(s), 6);
+        g_free (s);
+    }
+
+    if (from_enc != NULL) {
+        in_ch_width = sizeof (char);
+        len = strnlen (path, (size_t) pathlen);
+    } else {
+        in_ch_width = sizeof (gunichar2);
+        len = _ucs2_strnlen (path, pathlen);
+    }
+
+    rbyte   = len *  in_ch_width;
+    wbyte   = len * out_ch_width;
+    u8_path = g_malloc0 (wbyte + 1);  /* extra byte keeps result nul-terminated */
+
+    r_total = rbyte;
+    i_ptr   = (char *) path;
+    o_ptr   = u8_path;
+
+    /* Shouldn't fail, from_enc already tested upon start of prog */
+    conv = g_iconv_open ("UTF-8", from_enc ? from_enc : "UTF-16LE");
+
+    g_debug ("Initial: read=%" G_GSIZE_FORMAT ", write=%" G_GSIZE_FORMAT,
+            rbyte, wbyte);
+
+    /* Pass 1: Convert to UTF-8, all illegal seq become escaped hex */
+    while (rbyte > 0)  /* never peek at input beyond the counted length */
+    {
+        if (*i_ptr == '\0') {
+            if (from_enc   != NULL) break;
+            if (*(i_ptr+1) == '\0') break; /* utf-16: check "\0\0" */
+        }
+
+        // GNU iconv may return number of nonreversible conversions upon
+        // success; harmless here, code page to UTF-8 is always reversible
+        if ((gsize) -1 != (status = g_iconv (
+            conv, &i_ptr, &rbyte, &o_ptr, &wbyte)))
+            break;
+
+        int e = errno;  /* save at once, later calls may overwrite it */
+
+        g_debug ("r=%02" G_GSIZE_FORMAT ", w=%02" G_GSIZE_FORMAT
+            ", stt=%" G_GSIZE_FORMAT " (%s) str=%s",
+            rbyte, wbyte, status, g_strerror(e), u8_path);
+
+        switch (e) {
+            case EILSEQ:
+            case EINVAL:  // TODO Handle partial input for EINVAL
+                if (error != NULL && *error == NULL) {
+                    g_set_error (error, G_CONVERT_ERROR,
+                        G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
+                        _("Illegal sequence or partial input at offset %" G_GSIZE_FORMAT), r_total - rbyte);
+                }
+                _advance_octet (in_ch_width, &i_ptr, &rbyte, &o_ptr, &wbyte, tmpl);
+                g_iconv (conv, NULL, NULL, &o_ptr, &wbyte);  // reset state
+                break;
+            case E2BIG:  // TODO realloc instead of Kaboom!
+            default:     // any other errno would make this loop spin forever
+                g_assert_not_reached();
+        }
+    }
+
+    g_debug ("r=%02" G_GSIZE_FORMAT ", w=%02" G_GSIZE_FORMAT
+        ", stt=%" G_GSIZE_FORMAT ", str=%s", rbyte, wbyte, status, u8_path);
+
+    g_iconv_close (conv);
+
+    if (read != NULL)
+        *read = r_total - rbyte;
+
+    /* Pass 2: Convert all non-printable chars to hex */
+    if (g_utf8_validate (u8_path, -1, NULL))
+        result = _filter_printable_char (u8_path, tmpl);
+    else
+        g_critical ("%s: converted path is not valid UTF-8", G_STRFUNC);
+    g_free (u8_path);
+    return result;
+}
+
+
+/**
+ * @brief Expand backslash escape sequences found in a delimiter
+ * @param str Delimiter string as given by user, must not be empty
+ * @return Newly allocated string with recognized escapes expanded
+ * @note A reduced `g_strcompress()`: only `\\r`, `\\n`, `\\t` and
+ * `\\e` are recognized, any other backslash is copied verbatim.
+ * The result is also sent to debug log in printable form.
+ */
+char *
+filter_escapes (const char *str)
+{
+    GString    *result, *debug_str;
+    const char *cur;
+
+    g_return_val_if_fail ( (str != NULL) && (*str != '\0'), NULL);
+
+    result = g_string_new (NULL);
+    for (cur = str; *cur != '\0'; cur++)
+    {
+        char out = *cur;
+
+        /* Peek at char after backslash, consume it only if recognized */
+        if (*cur == '\\')
+        {
+            switch (cur[1])
+            {
+              case 'r':
+                out = '\r'; cur++; break;
+              case 'n':
+                out = '\n'; cur++; break;
+              case 't':
+                out = '\t'; cur++; break;
+              case 'e':
+                out = '\x1B'; cur++; break;
+              default:
+                break;  /* unknown escape, the backslash stays as is */
+            }
+        }
+        g_string_append_c (result, out);
+    }
+
+    debug_str = g_string_new ("filtered delimiter = ");
+    for (cur = result->str; *cur != '\0'; cur++)
+    {
+        unsigned char byte = (unsigned char) *cur;
+
+        if ( byte >= 0x20 && byte <= 0x7E )  /* problem during linking with g_ascii_isprint */
+            g_string_append_c (debug_str, *cur);
+        else
+            g_string_append_printf (debug_str, "\\x%02X", byte);
+    }
+    g_debug ("%s", debug_str->str);
+    g_string_free (debug_str, TRUE);
+    return g_string_free (result, FALSE);
+}
+
diff --git a/src/utils-conv.h b/src/utils-conv.h
new file mode 100644
index 0000000..ad7bb1e
--- /dev/null
+++ b/src/utils-conv.h
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2023, Abel Cheung.
+ * rifiuti2 is released under Revised BSD License.
+ * Please see LICENSE file for more info.
+ */
+
+#ifndef _RIFIUTI_UTILS_CONV_H
+#define _RIFIUTI_UTILS_CONV_H
+
+#include <stdbool.h>
+#include <glib.h>
+
+bool          enc_is_ascii_compatible     (const char       *enc,
+                                           GError          **error);
+
+char *        conv_path_to_utf8_with_tmpl (const char       *path,
+                                           ssize_t           pathlen,
+                                           const char       *from_enc,
+                                           const char       *tmpl,
+                                           size_t           *read,
+                                           GError          **error);
+
+char *        filter_escapes              (const char       *str);
+
+#endif
diff --git a/src/utils.c b/src/utils.c
index dadd67f..177c1fb 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -10,6 +10,7 @@
 #include <glib/gi18n.h>
 #include <glib/gstdio.h>
 
+#include "utils-conv.h"
 #include "utils.h"
 #ifdef G_OS_WIN32
 #  include "utils-win.h"
@@ -20,7 +21,7 @@
 
 /* Our own error domain */
 
-G_DEFINE_QUARK (rifiuti-misc-error-quark, rifiuti_fatal_error)
+G_DEFINE_QUARK (rifiuti-fatal-error-quark, rifiuti_fatal_error)
 G_DEFINE_QUARK (rifiuti-record-error-quark, rifiuti_record_error)
 
 /* Common function signature for option callbacks */
@@ -252,63 +253,6 @@ _set_opt_noheading (const gchar *opt_name,
 }
 
 
-/**
- * @brief Convert escape sequences in delimiters
- * @param str The original delimiter string
- * @return Escaped delimiter string
- * @note Similar to `g_strcompress()`, but only process a few
- * characters, unlike glib routine which converts all 8bit chars.
- * Currently handles `\\r`, `\\n`, `\\t` and `\\e`.
- */
-static char *
-_filter_escapes (const char *str)
-{
-    GString *result, *debug_str;
-    char *i = (char *) str;
-
-    g_return_val_if_fail ( (str != NULL) && (*str != '\0'), NULL);
-
-    result = g_string_new (NULL);
-    do
-    {
-        if ( *i != '\\' )
-        {
-            result = g_string_append_c (result, *i);
-            continue;
-        }
-
-        switch ( *(++i) )
-        {
-          case 'r':
-            result = g_string_append_c (result, '\r'); break;
-          case 'n':
-            result = g_string_append_c (result, '\n'); break;
-          case 't':
-            result = g_string_append_c (result, '\t'); break;
-          case 'e':
-            result = g_string_append_c (result, '\x1B'); break;
-          default:
-            result = g_string_append_c (result, '\\'); i--;
-        }
-    }
-    while ( *(++i) );
-
-    debug_str = g_string_new ("filtered delimiter = ");
-    i = result->str;
-    do
-    {
-        if ( *i >= 0x20 && *i <= 0x7E )  /* problem during linking with g_ascii_isprint */
-            debug_str = g_string_append_c (debug_str, *i);
-        else
-            g_string_append_printf (debug_str, "\\x%02X", *(unsigned char *) i);
-    }
-    while ( *(++i) );
-    g_debug ("%s", debug_str->str);
-    g_string_free (debug_str, TRUE);
-    return g_string_free (result, FALSE);
-}
-
-
 /**
  * @brief Option callback for setting field delimiter in TSV output
  * @return `FALSE` if duplicate options are found, `TRUE` otherwise
@@ -332,7 +276,7 @@ _set_opt_delim (const gchar *opt_name,
     }
     seen = TRUE;
 
-    delim = (*value) ? _filter_escapes (value) : g_strdup ("");
+    delim = (*value) ? filter_escapes (value) : g_strdup ("");
 
     return _set_out_format (FORMAT_TEXT, error);
 }
@@ -433,27 +377,14 @@ _check_legacy_encoding (const gchar *opt_name,
         return FALSE;
     }
 
+    if (enc_is_ascii_compatible (enc, &conv_err))
     {
-        char *s = g_convert ("C:\\", -1, "UTF-8", enc, NULL, NULL, &conv_err);
-        gboolean equal = ! g_strcmp0 ("C:\\", s);
-        g_free (s);
-
-        if (equal) {
-            legacy_encoding = g_strdup (enc);
-            return TRUE;
-        }
+        legacy_encoding = g_strdup (enc);
+        return TRUE;
     }
 
     /* everything below is error handling */
 
-    if (conv_err == NULL) {
-        // Encoding is ASCII incompatible (e.g. EBCDIC). Even if trial
-        // convert doesn't fail, it would cause application error
-        // later on. Treat that as conversion error for convenience.
-        g_set_error_literal (&conv_err, G_CONVERT_ERROR,
-            G_CONVERT_ERROR_ILLEGAL_SEQUENCE, "");
-    }
-
     if (g_error_matches (conv_err, G_CONVERT_ERROR, G_CONVERT_ERROR_NO_CONVERSION)) {
         g_set_error (error, G_OPTION_ERROR, G_OPTION_ERROR_BAD_VALUE,
             _("'%s' encoding is not supported by glib library "
@@ -599,256 +530,6 @@ _set_def_output_opts    (GOptionContext *context,
     return TRUE;
 }
 
-/*
- * Charset conversion routines
- */
-
-size_t
-ucs2_strnlen (const char *str, size_t max_sz)
-{
-#ifdef G_OS_WIN32
-
-    return wcsnlen_s ((const wchar_t *) str, max_sz);
-
-#else
-
-    if (str == NULL)
-        return 0;
-
-    for (size_t i = 0; i < max_sz; i++) {
-        if (*(str + i*2) == '\0' && *(str + i*2 + 1) == '\0')
-            return i;
-    }
-    return max_sz;
-
-#endif
-}
-
-
-/**
- * @brief Move character pointer for specified bytes
- * @param sz Must be either 1 or 2, denoting broken byte or broken UCS2 character
- * @param in_str Reference to input string to be converted
- * @param read_bytes Reference to already read bytes count to keep track of
- * @param out_str Reference to output string to be appended
- * @param write_bytes Reference to writable bytes count to decrement
- * @param tmpl `printf` template to represent the broken character
- * @note This is the core of `conv_path_to_utf8_with_tmpl()` doing
- * error fallback, converting a single broken char to `printf` output.
- */
-static void
-_advance_octet (size_t       sz,
-               gchar      **in_str,
-               gsize       *read_bytes,
-               gchar      **out_str,
-               gsize       *write_bytes,
-               const char  *tmpl)
-{
-    gchar *repl;
-
-    switch (sz) {
-        case 1:
-        {
-            unsigned char c = *(unsigned char *) (*in_str);
-            repl = g_strdup_printf (tmpl, c);
-        }
-            break;
-
-        case 2:
-        {
-            uint16_t c = GUINT16_FROM_LE (*(uint16_t *) (*in_str));
-            repl = g_strdup_printf (tmpl, c);
-        }
-            break;
-
-        default:
-            g_assert_not_reached();
-    }
-
-    (*in_str) += sz;
-    if (read_bytes != NULL)
-        (*read_bytes) -= sz;
-
-    *out_str = g_stpcpy (*out_str, (const char *) repl);
-    if (write_bytes != NULL)
-        *write_bytes -= strlen (repl);
-
-    g_free (repl);
-    return;
-}
-
-
-/**
- * @brief Convert non-printable characters to escape sequences
- * @param str The original string to be converted
- * @param tmpl `printf` template to represent non-printable chars
- * @return Converted string, maybe containing escape sequences
- * @attention Caller is responsible for using correct template, no
- * error checking is performed. This template should handle a single
- * Windows unicode path character, which is in UTF-16LE encoding.
- */
-static char *
-_filter_printable_char (const char *str,
-                        const char *tmpl)
-{
-    char     *p, *np;
-    gunichar  c;
-    GString  *s;
-
-    s = g_string_sized_new (strlen (str) * 2);
-    p = (char *) str;
-    while (*p)
-    {
-        c  = g_utf8_get_char  (p);
-        np = g_utf8_next_char (p);
-
-        /*
-         * ASCII space is the norm (e.g. Program Files), but
-         * all other kinds of spaces are rare, so escape them too
-         */
-        if (g_unichar_isgraph (c) || (c == 0x20))
-            s = g_string_append_len (s, p, (gssize) (np - p));
-        else
-            g_string_append_printf (s, tmpl, c);
-
-        p = np;
-    }
-
-    return g_string_free (s, FALSE);
-}
-
-
-/**
- * @brief Convert path to UTF-8 encoding with customizable fallback
- * @param path The path string to be converted
- * @param from_enc Either a legacy Windows ANSI encoding, or use
- * `NULL` to represent Windows wide char encoding (UTF-16LE)
- * @param tmpl `printf`-style string template to represent broken character
- * @param read Reference to number of successfully read bytes
- * @param error Location to store error upon problem
- * @return UTF-8 encoded path, or `NULL` if conversion error happens
- * @note This is very similar to `g_convert_with_fallback()`, but the
- * fallback is a `printf`-style string instead of a fixed string.
- * @attention 1. This routine is not for generic charset conversion.
- * Only supply encoding used in Windows ANSI code page, or use `NULL`
- * for unicode path.
- * @attention 1. Caller is responsible for using correct template, no
- * error checking is performed.
- * This template should handle either single- or double-octet, namely
- * `%u`, `%o`, `%d`, `%i`, `%x` and `%X`. `%c` is no good since byte
- * sequence concerned can't be converted to proper UTF-8 character.
- */
-char *
-conv_path_to_utf8_with_tmpl (const char *path,
-                             const char *from_enc,
-                             const char *tmpl,
-                             size_t     *read,
-                             GError    **error)
-{
-    char *u8_path, *i_ptr, *o_ptr, *result = NULL;
-    gsize len, r_total, rbyte, wbyte, status, in_ch_width, out_ch_width;
-    GIConv conv;
-    gboolean will_set_error = FALSE;  // avoid overwriting error
-
-    g_return_val_if_fail (! from_enc || *from_enc, NULL);
-    g_return_val_if_fail (tmpl && *tmpl, NULL);
-    g_return_val_if_fail (! error || ! *error, NULL);
-
-    /* try the template */
-    {
-        char *s = g_strdup_printf (tmpl, from_enc ? 0xFF : 0xFFFF);
-        /* UTF-8 character occupies at most 6 bytes */
-        out_ch_width = MAX (strlen(s), 6);
-        g_free (s);
-    }
-
-    if (from_enc != NULL) {
-        in_ch_width = sizeof (char);
-        len = strnlen (path, WIN_PATH_MAX);
-    } else {
-        in_ch_width = sizeof (gunichar2);
-        len = ucs2_strnlen (path, WIN_PATH_MAX);
-    }
-
-    if (! len)
-        return NULL;
-
-    rbyte   = len *  in_ch_width;
-    wbyte   = len * out_ch_width;
-    u8_path = g_malloc0 (wbyte);
-
-    r_total = rbyte;
-    i_ptr   = (char *) path;
-    o_ptr   = u8_path;
-
-    /* Shouldn't fail, from_enc already tested upon start of prog */
-    conv = g_iconv_open ("UTF-8", from_enc ? from_enc : "UTF-16LE");
-
-    g_debug ("Initial: read=%" G_GSIZE_FORMAT ", write=%" G_GSIZE_FORMAT,
-            rbyte, wbyte);
-
-    /* Pass 1: Convert to UTF-8, all illegal seq become escaped hex */
-    while (TRUE)
-    {
-        int e;
-
-        if (*i_ptr == '\0') {
-            if (from_enc   != NULL) break;
-            if (*(i_ptr+1) == '\0') break; /* utf-16: check "\0\0" */
-        }
-
-        status = g_iconv (conv, &i_ptr, &rbyte, &o_ptr, &wbyte);
-        e = errno;
-
-        if ( status != (gsize) -1 ) break;
-
-        g_debug ("r=%02" G_GSIZE_FORMAT ", w=%02" G_GSIZE_FORMAT
-            ", stt=%" G_GSIZE_FORMAT " (%s) str=%s",
-            rbyte, wbyte, status, g_strerror(e), u8_path);
-
-        /* XXX Should I consider the possibility of odd bytes for EINVAL? */
-        switch (e) {
-            case EILSEQ:
-            case EINVAL:
-                _advance_octet (in_ch_width, &i_ptr, &rbyte, &o_ptr, &wbyte, tmpl);
-                /* reset state, hopefully Windows don't use stateful encoding at all */
-                g_iconv (conv, NULL, NULL, &o_ptr, &wbyte);
-                will_set_error = TRUE;
-                break;
-            case E2BIG:
-                /* Should have already allocated enough buffer. Let it KABOOM! otherwise. */
-                g_assert_not_reached();
-        }
-    }
-
-    if (will_set_error)
-    {
-        if (from_enc)
-            g_set_error (error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
-                _("Path contains character(s) that could not be "
-                "interpreted in %s encoding"), from_enc);
-        else
-            g_set_error_literal (error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
-                _("Path contains broken unicode character(s)"));
-    }
-
-    g_debug ("r=%02" G_GSIZE_FORMAT ", w=%02" G_GSIZE_FORMAT
-        ", stt=%" G_GSIZE_FORMAT ", str=%s", rbyte, wbyte, status, u8_path);
-
-    g_iconv_close (conv);
-
-    if (read != NULL)
-        *read = r_total - rbyte;
-
-    /* Pass 2: Convert all non-printable chars to hex */
-    g_return_val_if_fail (g_utf8_validate (u8_path, -1, NULL), NULL);
-
-    result = _filter_printable_char (u8_path, tmpl);
-    g_free (u8_path);
-
-    return result;
-}
-
 
 /**
  * @brief Converts Windows FILETIME number to glib counterpart
@@ -1079,33 +760,6 @@ rifiuti_init (rbin_type  type,
 }
 
 
-/*!
- * Wrapper of g_utf16_to_utf8 for big endian system.
- * Always assume string is nul-terminated. (Unused now?)
- */
-char *
-utf16le_to_utf8 (const gunichar2   *str,
-                 glong             *items_read,
-                 glong             *items_written,
-                 GError           **error)
-{
-#if ((G_BYTE_ORDER) == (G_LITTLE_ENDIAN))
-    return g_utf16_to_utf8 (str, -1, items_read, items_written, error);
-#else
-
-    gunichar2 *buf;
-    char *ret;
-
-    /* should be guaranteed to succeed */
-    buf = (gunichar2 *) g_convert ((const char *) str, -1, "UTF-16BE",
-                                   "UTF-16LE", NULL, NULL, NULL);
-    ret = g_utf16_to_utf8 (buf, -1, items_read, items_written, error);
-    g_free (buf);
-    return ret;
-#endif
-}
-
-
 /**
  * @brief Wrapper of `g_mkstemp()` that returns file handle
  * @param fh Reference to `FILE` pointer to store file handle
diff --git a/src/utils.h b/src/utils.h
index 2c70ad8..673ac81 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -266,20 +266,8 @@ gboolean      rifiuti_init                (rbin_type         type,
 
 GDateTime *   win_filetime_to_gdatetime   (int64_t           win_filetime);
 
-char *        utf16le_to_utf8             (const gunichar2  *str,
-                                           glong            *items_read,
-                                           glong            *items_written,
-                                           GError          **error)
-                                           G_GNUC_UNUSED;
-
 gboolean      dump_content                (GError          **error);
 
-char *        conv_path_to_utf8_with_tmpl (const char       *str,
-                                           const char       *from_enc,
-                                           const char       *tmpl,
-                                           size_t           *read,
-                                           GError          **error);
-
 exitcode      rifiuti_handle_global_error (GError           *error);
 gboolean      rifiuti_handle_record_error (void);
 

From 0d97d8274977c02f9582f7b3d72ff6f6be88842b Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Fri, 15 Dec 2023 18:31:06 +0800
Subject: [PATCH 3/7] refactor: Allow different fallback seq in output formats

Records now store raw path data, so that invalid chars can be converted
to different formats later. However, things are not working properly for
JSON yet, as its escape sequence collides with the path separator and
causes double conversion.
---
 src/rifiuti-vista.c |  78 +++++------
 src/rifiuti.c       |  79 +++++------
 src/utils-conv.c    |  10 +-
 src/utils-conv.h    |   3 +
 src/utils.c         | 329 +++++++++++++++++++++++++-------------------
 src/utils.h         |   8 +-
 6 files changed, 271 insertions(+), 236 deletions(-)

diff --git a/src/rifiuti-vista.c b/src/rifiuti-vista.c
index 182a9d6..7d5cd19 100644
--- a/src/rifiuti-vista.c
+++ b/src/rifiuti-vista.c
@@ -94,7 +94,7 @@ _validate_index_file   (const char   *filename,
             pathlen = GUINT32_FROM_LE (pathlen);
 
             /* Header length + strlen in UTF-16 encoding */
-            expect_sz = VERSION2_FILENAME_OFFSET + pathlen * 2;
+            expect_sz = VERSION2_FILENAME_OFFSET + pathlen * sizeof(gunichar2);
             if (*bufsize != expect_sz)
             {
                 g_debug ("File size = %" G_GSIZE_FORMAT
@@ -130,22 +130,39 @@ _populate_record_data  (void      *buf,
                         uint64_t   version)
 {
     rbin_struct  *record;
-    size_t        read;
+    size_t        pathbuf_sz = 0;
+    void         *pathbuf_start = NULL;
     bool          erraneous = false;
 
+    switch (version)
+    {
+        case VERSION_VISTA:
+            // In rare cases, the size of index file is one byte short of
+            // (fixed) 544 bytes in Vista. Under such occasion, file size
+            // only occupies 56 bit, not 64 bit as it ought to be.
+            // Actually this 56-bit file size is very likely wrong after all.
+            // This is observed during deletion of dd.exe from Forensic
+            // Acquisition Utilities (by George M. Garner Jr)
+            // in certain localized Vista.
+            if (bufsize == VERSION1_FILE_SIZE - 1)
+                erraneous = true;
+
+            pathbuf_sz = WIN_PATH_MAX * sizeof(gunichar2);
+            pathbuf_start = buf - (int)erraneous + VERSION1_FILENAME_OFFSET;
+            break;
+
+        case VERSION_WIN10:
+            pathbuf_sz = bufsize - VERSION2_FILENAME_OFFSET;
+            pathbuf_start = buf + VERSION2_FILENAME_OFFSET;
+            break;
+
+        default:
+            g_assert_not_reached ();
+    }
+
     record = g_malloc0 (sizeof (rbin_struct));
     record->version = version;
 
-    // In rare cases, the size of index file is one byte short of
-    // (fixed) 544 bytes in Vista. Under such occasion, file size
-    // only occupies 56 bit, not 64 bit as it ought to be.
-    // Actually this 56-bit file size is very likely wrong after all.
-    // This is observed during deletion of dd.exe from Forensic
-    // Acquisition Utilities (by George M. Garner Jr)
-    // in certain localized Vista.
-    if (version == VERSION_VISTA && bufsize == VERSION1_FILE_SIZE - 1)
-        erraneous = true;
-
     memcpy (&record->filesize, buf + FILESIZE_OFFSET,
             FILETIME_OFFSET - FILESIZE_OFFSET - (int) erraneous);
     if (erraneous)
@@ -167,40 +184,23 @@ _populate_record_data  (void      *buf,
     record->winfiletime = GINT64_FROM_LE (record->winfiletime);
     record->deltime = win_filetime_to_gdatetime (record->winfiletime);
 
-    switch (version)
-    {
-        case VERSION_VISTA:
-            record->uni_path = conv_path_to_utf8_with_tmpl (
-                (const char *) (buf - (int) erraneous + VERSION1_FILENAME_OFFSET),
-                WIN_PATH_MAX, NULL, "<\\u%04X>", &read, &record->error);
-            break;
+    record->raw_uni_path = g_malloc0 (pathbuf_sz + sizeof(gunichar2));
+    memcpy (record->raw_uni_path, pathbuf_start, pathbuf_sz);
 
-        case VERSION_WIN10:
+    {
+        // Never set len = -1 for UCS2 source string
+        char *s = g_convert (record->raw_uni_path,
+            ucs2_strnlen (record->raw_uni_path, pathbuf_sz) * sizeof (gunichar2),
+            "UTF-8", "UTF-16LE", NULL, NULL, NULL);
+        if (s)
         {
-            record->uni_path = conv_path_to_utf8_with_tmpl (
-                (const char *) (buf + VERSION2_FILENAME_OFFSET),
-                bufsize - VERSION2_FILENAME_OFFSET,
-                NULL, "<\\u%04X>", &read, &record->error);
+            g_free (s);
         }
-            break;
-
-        default:
-            g_assert_not_reached ();
-    }
-
-    if (record->uni_path) {
-        if (g_error_matches (record->error, G_CONVERT_ERROR,
-            G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
+        else
         {
-            g_debug ("%s", record->error->message);
-            g_clear_error (&record->error);
             g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
                 _("Path contains broken unicode character(s)"));
         }
-    } else {
-        g_clear_error (&record->error);
-        g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
-            _("Trash file path conversion failed completely"));
     }
 
     return record;
diff --git a/src/rifiuti.c b/src/rifiuti.c
index f682e05..98a7807 100644
--- a/src/rifiuti.c
+++ b/src/rifiuti.c
@@ -126,7 +126,7 @@ _validate_index_file   (const char   *filename,
 
     rewind (fp);
     *infile = fp;
-    meta->version = (uint64_t) ver;
+    meta->version = ver;
     return TRUE;
 
     validation_broken:
@@ -143,15 +143,12 @@ _populate_record_data   (void     *buf,
 {
     rbin_struct    *record;
     uint32_t        drivenum;
-    size_t          read;
-    char           *legacy_fname;
-    gsize           legacy_bufsize, uni_bufsize;
 
     record = g_malloc0 (sizeof (rbin_struct));
 
-    legacy_bufsize = RECORD_INDEX_OFFSET - LEGACY_FILENAME_OFFSET;
-    legacy_fname = g_malloc0 (legacy_bufsize);
-    copy_field (legacy_fname, LEGACY_FILENAME_OFFSET, RECORD_INDEX_OFFSET);
+    // Verbatim path in ANSI code page
+    record->raw_legacy_path = g_malloc0 (RECORD_INDEX_OFFSET - LEGACY_FILENAME_OFFSET);
+    copy_field (record->raw_legacy_path, LEGACY_FILENAME_OFFSET, RECORD_INDEX_OFFSET);
 
     /* Index number associated with the record */
     copy_field (&record->index_n, RECORD_INDEX_OFFSET, DRIVE_LETTER_OFFSET);
@@ -173,10 +170,10 @@ _populate_record_data   (void     *buf,
     record->gone = FILESTATUS_EXISTS;
     // If file is not in recycle bin (restored or permanently deleted),
     // first byte will be removed from filename
-    if (!*legacy_fname)
+    if (! *record->raw_legacy_path)
     {
         record->gone = FILESTATUS_GONE;
-        *legacy_fname = record->drive;
+        *record->raw_legacy_path = record->drive;
     }
 
     /* File deletion time */
@@ -190,57 +187,44 @@ _populate_record_data   (void     *buf,
     record->filesize = GUINT64_FROM_LE (record->filesize);
     g_debug ("filesize=%" PRIu64, record->filesize);
 
-    // Only bother populating legacy path if users need it,
+    // Only bother checking legacy path when requested,
     // because otherwise we don't know which encoding to use
     if (legacy_encoding)
     {
-        record->legacy_path = conv_path_to_utf8_with_tmpl (
-            legacy_fname, legacy_bufsize, legacy_encoding,
-            "<\\%02X>", &read, &record->error);
-        if (record->legacy_path) {
-            if (g_error_matches (record->error, G_CONVERT_ERROR,
-                G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
-            {
-                g_debug ("%s", record->error->message);
-                g_clear_error (&record->error);
-                g_set_error (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
+        char *s = g_convert (record->raw_legacy_path, -1,
+            "UTF-8", legacy_encoding, NULL, NULL, NULL);
+        if (s)
+            g_free (s);
+        else
+            g_set_error (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
                 _("Path contains character(s) that could not be "
                 "interpreted in %s encoding"), legacy_encoding);
-            }
-        } else {
-            g_clear_error (&record->error);
-            g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
-                _("Legacy path conversion failed completely"));
-        }
     }
 
-    g_free (legacy_fname);
-
     if (bufsize == LEGACY_RECORD_SIZE)
         return record;
 
     /* Part below deals with unicode path only */
 
-    uni_bufsize = UNICODE_RECORD_SIZE - UNICODE_FILENAME_OFFSET;
+    gsize uni_sz = UNICODE_RECORD_SIZE - UNICODE_FILENAME_OFFSET;
 
-    record->uni_path = conv_path_to_utf8_with_tmpl (
-        (char *) (buf + UNICODE_FILENAME_OFFSET),
-        uni_bufsize / sizeof(gunichar2), NULL,
-        "<\\u%04X>", &read, &record->error);
+    record->raw_uni_path = g_malloc (uni_sz);
+    copy_field (record->raw_uni_path, UNICODE_FILENAME_OFFSET, UNICODE_RECORD_SIZE);
 
-    if (record->uni_path) {
-        if (g_error_matches (record->error, G_CONVERT_ERROR,
-            G_CONVERT_ERROR_ILLEGAL_SEQUENCE))
+    {
+        // Never set len = -1 for UCS2 source string
+        char *s = g_convert (record->raw_uni_path,
+            ucs2_strnlen (record->raw_uni_path, WIN_PATH_MAX) * sizeof (gunichar2),
+            "UTF-8", "UTF-16LE", NULL, NULL, NULL);
+        if (s)
+        {
+            g_free (s);
+        }
+        else
         {
-            g_debug ("%s", record->error->message);
-            g_clear_error (&record->error);
             g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
                 _("Path contains broken unicode character(s)"));
         }
-    } else {
-        g_clear_error (&record->error);
-        g_set_error_literal (&record->error, R2_REC_ERROR, R2_REC_ERROR_CONV_PATH,
-            _("Unicode path conversion failed completely"));
     }
 
     /*
@@ -265,18 +249,19 @@ _populate_record_data   (void     *buf,
      */
     if (junk_detected && ! *junk_detected)
     {
-        void *ptr;
+        char *p = record->raw_uni_path + ucs2_strnlen (
+            record->raw_uni_path, uni_sz) * sizeof(gunichar2);
 
-        for (ptr = buf + UNICODE_FILENAME_OFFSET + read;
-            ptr < buf + UNICODE_RECORD_SIZE; ptr++)
+        while (p < record->raw_uni_path + uni_sz * sizeof(gunichar2))
         {
-            if ( *(char *) ptr != '\0' )
+            if (*p != '\0')
             {
                 g_debug ("Junk detected at offset 0x%tx of unicode path",
-                    ptr - buf - UNICODE_FILENAME_OFFSET);
+                    p - record->raw_uni_path);
                 *junk_detected = TRUE;
                 break;
             }
+            p++;
         }
     }
 
diff --git a/src/utils-conv.c b/src/utils-conv.c
index 02640f9..aec3479 100644
--- a/src/utils-conv.c
+++ b/src/utils-conv.c
@@ -54,9 +54,9 @@ enc_is_ascii_compatible    (const char   *enc,
  * @return Either number of UCS2 char for whole string,
  * or return `max_sz` when `max_sz` param is exceeded
  */
-static size_t
-_ucs2_strnlen   (const char   *str,
-                 ssize_t       max_sz)
+size_t
+ucs2_strnlen   (const char   *str,
+                ssize_t       max_sz)
 {
     // wcsnlen_s should be equivalent except for boundary
     // cases we don't care about
@@ -221,7 +221,7 @@ conv_path_to_utf8_with_tmpl (const char *path,
         len = strnlen (path, (size_t) pathlen);
     } else {
         in_ch_width = sizeof (gunichar2);
-        len = _ucs2_strnlen (path, (size_t) pathlen);
+        len = ucs2_strnlen (path, (size_t) pathlen);
     }
 
     rbyte   = len *  in_ch_width;
@@ -264,7 +264,7 @@ conv_path_to_utf8_with_tmpl (const char *path,
         switch (e) {
             case EILSEQ:
             case EINVAL:  // TODO Handle partial input for EINVAL
-                if (*error == NULL) {
+                if (error && ! *error) {
                     g_set_error (error, G_CONVERT_ERROR,
                         G_CONVERT_ERROR_ILLEGAL_SEQUENCE,
                         _("Illegal sequence or partial input at offset %" G_GSIZE_FORMAT), rbyte);
diff --git a/src/utils-conv.h b/src/utils-conv.h
index ad7bb1e..5187079 100644
--- a/src/utils-conv.h
+++ b/src/utils-conv.h
@@ -13,6 +13,9 @@
 bool          enc_is_ascii_compatible     (const char       *enc,
                                            GError          **error);
 
+size_t        ucs2_strnlen                (const char       *str,
+                                           ssize_t           max_sz);
+
 char *        conv_path_to_utf8_with_tmpl (const char       *path,
                                            ssize_t           pathlen,
                                            const char       *from_enc,
diff --git a/src/utils.c b/src/utils.c
index 177c1fb..6f76e48 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -707,8 +707,8 @@ _free_record_cb (rbin_struct *record)
 {
     g_free (record->index_s);
     g_date_time_unref (record->deltime);
-    g_free (record->uni_path);
-    g_free (record->legacy_path);
+    g_free (record->raw_uni_path);
+    g_free (record->raw_legacy_path);
     g_clear_error (&record->error);
     g_free (record);
 }
@@ -1044,7 +1044,7 @@ _json_escape_path (const char *path)
  * @param meta Pointer to metadata structure
  */
 static void
-_print_csv_header (metarecord *meta)
+_print_text_header (const metarecord *meta)
 {
     {
         char *rbin_path = g_filename_display_name (meta->filename);
@@ -1134,7 +1134,7 @@ _print_csv_header (metarecord *meta)
  * @param meta Pointer to metadata structure
  */
 static void
-_print_xml_header (metarecord *meta)
+_print_xml_header (const metarecord *meta)
 {
     GString *result;
 
@@ -1174,7 +1174,7 @@ _print_xml_header (metarecord *meta)
  * @param meta Pointer to metadata structure
  */
 static void
-_print_json_header (metarecord *meta)
+_print_json_header (const metarecord *meta)
 {
     g_print ("{\n");
     g_printf ("  \"format\": \"%s\",\n",
@@ -1202,178 +1202,198 @@ _print_json_header (metarecord *meta)
 }
 
 
-/**
- * @brief Stub routine for printing header
- * @note Calls other printing routine depending on output mode
- */
 static void
-_print_header (void)
+_print_text_record   (rbin_struct        *record,
+                      const metarecord   *meta)
 {
-    if (no_heading) return;
+    char         *outstr;
+    char         **header;
+    GDateTime    *dt;
 
-    switch (output_format)
-    {
-        case FORMAT_TEXT: _print_csv_header  (meta); break;
-        case FORMAT_XML:  _print_xml_header  (meta); break;
-        case FORMAT_JSON: _print_json_header (meta); break;
+    g_return_if_fail (record != NULL);
 
-        default: g_assert_not_reached();
-    }
+    header = (char **) g_malloc0_n (6, sizeof(gpointer));
+
+    header[0] = (meta->type == RECYCLE_BIN_TYPE_FILE) ?
+        g_strdup_printf ("%" PRIu32, record->index_n) :
+        g_strdup (record->index_s);
+
+    dt = use_localtime ? g_date_time_to_local (record->deltime):
+                         g_date_time_ref      (record->deltime);
+    header[1] = g_date_time_format (dt, "%F %T");
+
+    header[2] =
+        (record->gone == FILESTATUS_EXISTS) ? g_strdup("FALSE") :
+        (record->gone == FILESTATUS_GONE  ) ? g_strdup("TRUE")  :
+                                              g_strdup("???")   ;
+
+    header[3] = (record->filesize == G_MAXUINT64) ?  // faulty
+        g_strdup ("???") :
+        g_strdup_printf ("%" PRIu64, record->filesize);
+
+    if (legacy_encoding)
+        header[4] = conv_path_to_utf8_with_tmpl (record->raw_legacy_path,
+            -1, legacy_encoding, "<\\%02X>", NULL, NULL);
+    else
+        header[4] = conv_path_to_utf8_with_tmpl (record->raw_uni_path,
+            -1, NULL, "<\\u%04X>", NULL, NULL);
+    if (! header[4])
+        header[4] = g_strdup ("???");
+
+    outstr = g_strjoinv (delim, header);
+    g_print ("%s\n", outstr);
+
+    g_free (outstr);
+    g_date_time_unref (dt);
+    g_strfreev (header);
 }
 
 
-/**
- * @brief Print content of each recycle bin record
- * @param record Pointer to each recycle bin record
- * @param meta Pointer to metadata structure
- */
 static void
-_print_record_cb (rbin_struct *record,
-                  const metarecord *meta)
+_print_xml_record   (rbin_struct        *record,
+                     const metarecord   *meta)
 {
-    char       *out_fname, *index, *size = NULL;
-    char       *outstr = NULL, *deltime = NULL;
-    GDateTime  *dt;
+    char         *path, *dt_str;
+    GDateTime    *dt;
+    GString      *s;
 
     g_return_if_fail (record != NULL);
 
-    index = (meta->type == RECYCLE_BIN_TYPE_FILE) ?
-        g_strdup_printf ("%u", record->index_n) :
-        g_strdup (record->index_s);
-
-    dt = use_localtime ? g_date_time_to_local (record->deltime):
-                         g_date_time_ref      (record->deltime);
+    s = g_string_new ("  <record");
 
-    out_fname = legacy_encoding ?
-        record->legacy_path : record->uni_path;
-    out_fname = out_fname ?
-        g_strdup (out_fname) : g_strdup ("???");
+    if (meta->type == RECYCLE_BIN_TYPE_FILE)
+        g_string_append_printf (s, " index=\"%" PRIu32 "\"", record->index_n);
+    else
+        g_string_append_printf (s, " index=\"%s\"", record->index_s);
 
-    switch (output_format)
+    if (use_localtime)
     {
-        case FORMAT_TEXT:
+        dt = g_date_time_to_local (record->deltime);
+        dt_str = g_date_time_format (dt, "%FT%T%z");
+    }
+    else
+    {
+        dt = g_date_time_ref (record->deltime);
+        dt_str = g_date_time_format (dt, "%FT%TZ");
+    }
+    g_string_append_printf (s, " time=\"%s\"", dt_str);
 
-            deltime = g_date_time_format (dt, "%F %T");
+    g_string_append_printf (s, " gone=\"%s\"",
+        (record->gone == FILESTATUS_GONE  ) ? "true"  :
+        (record->gone == FILESTATUS_EXISTS) ? "false" :
+                                              "unknown");
 
-            if ( record->filesize == G_MAXUINT64 ) /* faulty */
-                size = g_strdup ("???");
-            else
-                size = g_strdup_printf ("%" PRIu64, record->filesize);
+    if (record->filesize == G_MAXUINT64)  // faulty
+        g_string_append_printf (s, " size=\"-1\"");
+    else
+        g_string_append_printf (s,
+            " size=\"%" PRIu64 "\"", record->filesize);
+
+    // Still need to be converted despite using CDATA, otherwise
+    // could be writing garbage on screen or into file
+    if (legacy_encoding)
+        path = conv_path_to_utf8_with_tmpl (record->raw_legacy_path,
+            -1, legacy_encoding, "&#x%02X;", NULL, NULL);
+    else
+        path = conv_path_to_utf8_with_tmpl (record->raw_uni_path,
+            -1, NULL, "&#x%04X;", NULL, NULL);
 
-            const char *gone =
-                record->gone == FILESTATUS_EXISTS ? "FALSE" :
-                record->gone == FILESTATUS_GONE   ? "TRUE"  :
-                                                    "???"   ;
-            outstr = g_strjoin (delim, index, deltime, gone, size, out_fname, NULL);
+    if (path)
+        g_string_append_printf (s, ">\n"
+            "    <path><![CDATA[%s]]></path>\n"
+            "  </record>\n", path);
+    else
+        s = g_string_append (s, ">\n    <path/>\n  </record>\n");
 
-            g_print ("%s\n", outstr);
+    g_print ("%s", s->str);
+    g_string_free (s, TRUE);
 
-            break;
+    g_date_time_unref (dt);
+    g_free (path);
+    g_free (dt_str);
+}
 
-        case FORMAT_XML:
-        {
-            GString *s = g_string_new (NULL);
-
-            deltime = use_localtime ? g_date_time_format (dt, "%FT%T%z" ):
-                                      g_date_time_format (dt, "%FT%TZ");
-
-            g_string_printf (s,
-                "  <record index=\"%s\" time=\"%s\" gone=\"%s\"",
-                index, deltime,
-                (record->gone == FILESTATUS_GONE  ) ? "true" :
-                (record->gone == FILESTATUS_EXISTS) ? "false":
-                                                      "unknown");
-
-            if ( record->filesize == G_MAXUINT64 ) /* faulty */
-                g_string_append_printf (s, " size=\"-1\"");
-            else
-                g_string_append_printf (s,
-                    " size=\"%" PRIu64 "\"", record->filesize);
-
-            g_string_append_printf (s, ">\n"
-                "    <path><![CDATA[%s]]></path>\n"
-                "  </record>\n", out_fname);
-
-            outstr = g_string_free (s, FALSE);
-            g_print ("%s", outstr);
-        }
-            break;
 
-        case FORMAT_JSON:
-        {
-            GString *s = g_string_new ("    {\"index\": ");
+static void
+_print_json_record   (rbin_struct        *record,
+                      const metarecord   *meta)
+{
+    char         *path, *dt_str;
+    GDateTime    *dt;
+    GString      *s;
 
-            if (meta->type == RECYCLE_BIN_TYPE_FILE) {
-                g_string_append_printf (s, "%" PRIu32, record->index_n);
-            } else {
-                g_string_append_printf (s, "\"%s\"", record->index_s);
-            }
+    g_return_if_fail (record != NULL);
 
-            deltime = use_localtime ? g_date_time_format (dt, "%FT%T%z"):
-                                      g_date_time_format (dt, "%FT%TZ");
+    s = g_string_new ("    {");
 
-            g_string_append_printf (s, ", \"time\": \"%s\"", deltime);
+    if (meta->type == RECYCLE_BIN_TYPE_FILE)
+        g_string_append_printf (s, "\"index\": %" PRIu32, record->index_n);
+    else
+        g_string_append_printf (s, "\"index\": \"%s\"", record->index_s);
 
-            g_string_append_printf (s, ", \"gone\": %s",
-                (record->gone == FILESTATUS_GONE  ) ? "true" :
-                (record->gone == FILESTATUS_EXISTS) ? "false":
-                                                      "null");
+    if (use_localtime)
+    {
+        dt = g_date_time_to_local (record->deltime);
+        dt_str = g_date_time_format (dt, "%FT%T%z");
+    }
+    else
+    {
+        dt = g_date_time_ref (record->deltime);
+        dt_str = g_date_time_format (dt, "%FT%TZ");
+    }
+    g_string_append_printf (s, ", \"time\": \"%s\"", dt_str);
 
-            if ( record->filesize == G_MAXUINT64 ) /* faulty */
-                g_string_append_printf (s, ", \"size\": null");
-            else
-                g_string_append_printf (s,
-                    ", \"size\": %" PRIu64, record->filesize);
+    g_string_append_printf (s, ", \"gone\": %s",
+        (record->gone == FILESTATUS_GONE  ) ? "true" :
+        (record->gone == FILESTATUS_EXISTS) ? "false":
+                                              "null");
 
-            {
-                char *s = _json_escape_path (out_fname);
-                g_free (out_fname);
-                out_fname = s;
-            }
+    if (record->filesize == G_MAXUINT64)  // faulty
+        g_string_append_printf (s, ", \"size\": null");
+    else
+        g_string_append_printf (s,
+            ", \"size\": %" PRIu64, record->filesize);
+
+    // JSON spec doesn't even allow encoding raw byte data,
+    // so transform it like text output format
+    if (legacy_encoding)
+        path = conv_path_to_utf8_with_tmpl (record->raw_legacy_path,
+            -1, legacy_encoding, "<\\%02X>", NULL, NULL);
+    else
+        path = conv_path_to_utf8_with_tmpl (record->raw_uni_path,
+            -1, NULL, "\\u%04X", NULL, NULL);
+    {
+        // FIXME Doesn't work, it does extra level of escape for
+        // unicode escape
+        char *s = _json_escape_path (path);
+        g_free (path);
+        path = s;
+    }
 
-            g_string_append_printf (s,
-                ", \"path\": \"%s\"},\n", out_fname);
+    if (path)
+        g_string_append_printf (s, ", \"path\": \"%s\"},\n", path);
+    else
+        s = g_string_append (s, ", \"path\": null},\n");
 
-            outstr = g_string_free (s, FALSE);
-            g_print ("%s", outstr);
-        }
-            break;
+    g_print ("%s", s->str);
 
-        default:
-            g_assert_not_reached();
-    }
     g_date_time_unref (dt);
-    g_free (outstr);
-    g_free (out_fname);
-    g_free (deltime);
-    g_free (size);
-    g_free (index);
+    g_free (path);
+    g_free (dt_str);
 }
 
 
-/**
- * @brief Print footer of recycle bin data
- */
 static void
-_print_footer (void)
+_print_xml_footer (void)
 {
-    switch (output_format)
-    {
-        case FORMAT_TEXT:
-            /* do nothing */
-            break;
+    g_print ("%s", "</recyclebin>\n");
+}
 
-        case FORMAT_XML:
-            g_print ("%s", "</recyclebin>\n");
-            break;
 
-        case FORMAT_JSON:
-            g_print ("  ]\n}\n");
-            break;
-
-        default:
-            g_assert_not_reached();
-    }
+static void
+_print_json_footer (void)
+{
+    g_print ("  ]\n}\n");
 }
 
 
@@ -1387,6 +1407,9 @@ dump_content (GError **error)
 {
     FILE *tmp_fh = NULL, *prev_fh = NULL;
     char *tmp_path = NULL;
+    void (*print_header_func)(const metarecord *);
+    void (*print_record_func)(rbin_struct *, const metarecord *);
+    void (*print_footer_func)();
 
     if (output_loc)
     {
@@ -1400,9 +1423,33 @@ dump_content (GError **error)
             return FALSE;
     }
 
-    _print_header ();
-    g_ptr_array_foreach (meta->records, (GFunc) _print_record_cb, meta);
-    _print_footer ();
+    switch (output_format)
+    {
+        case FORMAT_TEXT:
+            print_header_func = no_heading ?
+                NULL : &_print_text_header;
+            print_record_func = &_print_text_record;
+            print_footer_func = NULL;
+            break;
+        case FORMAT_XML:
+            print_header_func = &_print_xml_header;
+            print_record_func = &_print_xml_record;
+            print_footer_func = &_print_xml_footer;
+            break;
+        case FORMAT_JSON:
+            print_header_func = &_print_json_header;
+            print_record_func = &_print_json_record;
+            print_footer_func = &_print_json_footer;
+            break;
+
+        default: g_assert_not_reached();
+    }
+
+    if (print_header_func != NULL)
+        (*print_header_func) (meta);
+    g_ptr_array_foreach (meta->records, (GFunc) print_record_func, meta);
+    if (print_footer_func != NULL)
+        (*print_footer_func) ();
 
     if (!tmp_path)
         return TRUE;
diff --git a/src/utils.h b/src/utils.h
index 673ac81..31ed5f3 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -166,7 +166,6 @@ typedef struct _rbin_meta
  */
 typedef struct _rbin_struct
 {
-
     /**
      * @brief version of each index file
      * @note `meta.version` keeps the global status of whole dir,
@@ -174,7 +173,6 @@ typedef struct _rbin_struct
      * @attention For `$Recycle.bin` only
      */
     uint64_t version;
-
     /**
      * @brief Chronological index number for INFO2
      * @attention For `INFO2` only
@@ -210,7 +208,8 @@ typedef struct _rbin_struct
      * @note Original path was stored in index file in UTF-16 encoding
      * since Windows 2000. The path is converted to UTF-8 encoding and stored here .
      */
-    char *uni_path;
+    char *raw_uni_path;
+
     /**
      * @brief ANSI encoded trash file original path
      * @note Until Windows 2003, index file preserves trashed file path in
@@ -218,7 +217,8 @@ typedef struct _rbin_struct
      * @attention For `INFO2` only. Can be either full path or using 8.3 format,
      * depending on Windows version and code page used.
      */
-    char *legacy_path;
+    char *raw_legacy_path;
+
     /**
      * @brief Whether original trashed file is gone
      * @note Trash file can be detected if it still exists, but via very

From cfd0930bfe903d67dfd3ca256d8205ebe118cd5f Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Fri, 15 Dec 2023 19:41:18 +0000
Subject: [PATCH 4/7] fix: Fix junk data detection due to incorrect buffer size

And write a hex dump func to aid debugging
---
 src/rifiuti.c | 23 ++++++++++++++---------
 src/utils.c   | 23 +++++++++++++++++++++++
 src/utils.h   |  3 +++
 3 files changed, 40 insertions(+), 9 deletions(-)

diff --git a/src/rifiuti.c b/src/rifiuti.c
index 98a7807..530c091 100644
--- a/src/rifiuti.c
+++ b/src/rifiuti.c
@@ -143,6 +143,7 @@ _populate_record_data   (void     *buf,
 {
     rbin_struct    *record;
     uint32_t        drivenum;
+    size_t          uni_buf_sz, null_terminator_offset;
 
     record = g_malloc0 (sizeof (rbin_struct));
 
@@ -206,15 +207,15 @@ _populate_record_data   (void     *buf,
 
     /* Part below deals with unicode path only */
 
-    gsize uni_sz = UNICODE_RECORD_SIZE - UNICODE_FILENAME_OFFSET;
-
-    record->raw_uni_path = g_malloc (uni_sz);
+    uni_buf_sz = UNICODE_RECORD_SIZE - UNICODE_FILENAME_OFFSET;
+    record->raw_uni_path = g_malloc (uni_buf_sz);
     copy_field (record->raw_uni_path, UNICODE_FILENAME_OFFSET, UNICODE_RECORD_SIZE);
+    null_terminator_offset = ucs2_strnlen (
+        record->raw_uni_path, WIN_PATH_MAX) * sizeof (gunichar2);
 
     {
-        // Never set len = -1 for UCS2 source string
-        char *s = g_convert (record->raw_uni_path,
-            ucs2_strnlen (record->raw_uni_path, WIN_PATH_MAX) * sizeof (gunichar2),
+        // Never set len = -1 for wchar source string
+        char *s = g_convert (record->raw_uni_path, null_terminator_offset,
             "UTF-8", "UTF-16LE", NULL, NULL, NULL);
         if (s)
         {
@@ -249,10 +250,11 @@ _populate_record_data   (void     *buf,
      */
     if (junk_detected && ! *junk_detected)
     {
-        char *p = record->raw_uni_path + ucs2_strnlen (
-            record->raw_uni_path, uni_sz) * sizeof(gunichar2);
+        // Beware: start pos shouldn't be previously read bytes,
+        // as it may contain invalid seq and quit prematurely.
+        char *p = record->raw_uni_path + null_terminator_offset;
 
-        while (p < record->raw_uni_path + uni_sz * sizeof(gunichar2))
+        while (p < record->raw_uni_path + uni_buf_sz)
         {
             if (*p != '\0')
             {
@@ -263,6 +265,9 @@ _populate_record_data   (void     *buf,
             }
             p++;
         }
+
+        if (*junk_detected)
+            hexdump (record->raw_uni_path, uni_buf_sz);
     }
 
     return record;
diff --git a/src/utils.c b/src/utils.c
index 6f76e48..4d8155c 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1597,3 +1597,26 @@ rifiuti_cleanup (void)
     cleanup_windows_res ();
 #endif
 }
+
+
+void
+hexdump    (void     *start,
+            size_t    size)
+{
+    GString *s = g_string_new ("");
+    size_t i = 0;
+    do
+    {
+        if (i % 16 == 0)
+        {
+            if (s->len > 0)
+            {
+                g_debug ("%s", s->str);
+                s = g_string_assign (s, "");
+            }
+            g_string_append_printf (s, "%04zX    ", i);
+        }
+        g_string_append_printf (s, "%02" PRIX8 " ", *(uint8_t *) (start+i));
+    }
+    while (i++ < size);
+}
diff --git a/src/utils.h b/src/utils.h
index 31ed5f3..4d9d8dd 100644
--- a/src/utils.h
+++ b/src/utils.h
@@ -273,4 +273,7 @@ gboolean      rifiuti_handle_record_error (void);
 
 void          rifiuti_cleanup             (void);
 
+void          hexdump                     (void             *start,
+                                           size_t            size);
+
 #endif

From de809ef33398ded32edbf2e4b041be5b15718bab Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Sat, 16 Dec 2023 11:33:35 +0000
Subject: [PATCH 5/7] fix: Workaround for json escape problem with unicode and
 pathsep

---
 src/utils-conv.c | 23 +++++++++++++++++++++++
 src/utils-conv.h |  2 ++
 src/utils.c      | 48 ++++++++++++++----------------------------------
 3 files changed, 39 insertions(+), 34 deletions(-)

diff --git a/src/utils-conv.c b/src/utils-conv.c
index aec3479..7c9159f 100644
--- a/src/utils-conv.c
+++ b/src/utils-conv.c
@@ -351,3 +351,26 @@ filter_escapes (const char *str)
     return g_string_free (result, FALSE);
 }
 
+
+char *
+json_escape_path (const char *path)
+{
+    // TODO g_string_replace from glib 2.68 does it all
+
+    char *p = (char *) path;
+    gunichar c = 0;
+    GString *s = g_string_new ("");
+
+    while (*p) {
+        c = g_utf8_get_char (p);
+        if (c == '\\')
+            s = g_string_append (s, "\\\\");
+        else if (c == '*')
+            s = g_string_append_c (s, '\\');
+        else
+            s = g_string_append_unichar (s, c);
+        p = g_utf8_next_char (p);
+    }
+    return g_string_free (s, FALSE);
+}
+
diff --git a/src/utils-conv.h b/src/utils-conv.h
index 5187079..1bb1be8 100644
--- a/src/utils-conv.h
+++ b/src/utils-conv.h
@@ -25,4 +25,6 @@ char *        conv_path_to_utf8_with_tmpl (const char       *path,
 
 char *        filter_escapes              (const char       *str);
 
+char *        json_escape_path            (const char       *path);
+
 #endif
diff --git a/src/utils.c b/src/utils.c
index 4d8155c..c42283a 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1018,27 +1018,6 @@ _close_handles (void)
 }
 
 
-static char *
-_json_escape_path (const char *path)
-{
-    // TODO g_string_replace from glib 2.68 does it all
-
-    char *p = (char *) path;
-    gunichar c = 0;
-    GString *s = g_string_new ("");
-
-    while (*p) {
-        c = g_utf8_get_char (p);
-        if (c == 0x5C)
-            s = g_string_append (s, "\\\\");
-        else
-            s = g_string_append_unichar (s, c);
-        p = g_utf8_next_char (p);
-    }
-    return g_string_free (s, FALSE);
-}
-
-
 /**
  * @brief Print preamble and column header for TSV output
  * @param meta Pointer to metadata structure
@@ -1189,10 +1168,9 @@ _print_json_header (const metarecord *meta)
     if (meta->type == RECYCLE_BIN_TYPE_FILE && meta->total_entry > 0)
         g_printf ("  \"ever_existed\": %" PRIu32 ",\n", meta->total_entry);
 
-    // TODO need to escape path separator for json
     {
         char *s = g_filename_display_name (meta->filename);
-        char *rbin_path = _json_escape_path (s);
+        char *rbin_path = json_escape_path (s);
         g_printf ("  \"path\": \"%s\",\n", rbin_path);
         g_free (s);
         g_free (rbin_path);
@@ -1318,7 +1296,7 @@ static void
 _print_json_record   (rbin_struct        *record,
                       const metarecord   *meta)
 {
-    char         *path, *dt_str;
+    char         *tmp, *path, *dt_str;
     GDateTime    *dt;
     GString      *s;
 
@@ -1354,21 +1332,22 @@ _print_json_record   (rbin_struct        *record,
         g_string_append_printf (s,
             ", \"size\": %" PRIu64, record->filesize);
 
-    // JSON spec doesn't even allow encoding raw byte data,
-    // so transform it like text output format
     if (legacy_encoding)
-        path = conv_path_to_utf8_with_tmpl (record->raw_legacy_path,
+    {
+        // JSON spec doesn't even allow encoding raw byte data,
+        // so transform it like text output format
+        tmp = conv_path_to_utf8_with_tmpl (record->raw_legacy_path,
             -1, legacy_encoding, "<\\%02X>", NULL, NULL);
+    }
     else
-        path = conv_path_to_utf8_with_tmpl (record->raw_uni_path,
-            -1, NULL, "\\u%04X", NULL, NULL);
     {
-        // FIXME Doesn't work, it does extra level of escape for
-        // unicode escape
-        char *s = _json_escape_path (path);
-        g_free (path);
-        path = s;
+        // HACK \u sequence collides with path separator, which
+        // will be processed in json escaping routine. Use a temp
+        // char to avoid collision and convert it back later
+        tmp = conv_path_to_utf8_with_tmpl (record->raw_uni_path,
+            -1, NULL, "*u%04X", NULL, NULL);
     }
+    path = json_escape_path (tmp);
 
     if (path)
         g_string_append_printf (s, ", \"path\": \"%s\"},\n", path);
@@ -1378,6 +1357,7 @@ _print_json_record   (rbin_struct        *record,
     g_print ("%s", s->str);
 
     g_date_time_unref (dt);
+    g_free (tmp);
     g_free (path);
     g_free (dt_str);
 }

From 07c9e8327ece5a0bba34bc919d331e9a3b1d850b Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Sat, 16 Dec 2023 11:34:18 +0000
Subject: [PATCH 6/7] test: Create schema for json output format

---
 .editorconfig            |  8 +---
 test/rifiuti-schema.json | 83 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 84 insertions(+), 7 deletions(-)
 create mode 100644 test/rifiuti-schema.json

diff --git a/.editorconfig b/.editorconfig
index 7cf2d65..74b98f9 100644
--- a/.editorconfig
+++ b/.editorconfig
@@ -9,11 +9,5 @@ indent_style = space
 [*.{c,h,txt,cmake}]
 indent_size = 4
 
-[configure.ac]
-indent_size = 4
-
-[*.{md,yml}]
+[*.{md,yml,json}]
 indent_size = 2
-
-[{Makefile.am,*.mk}]
-indent_style = tab
diff --git a/test/rifiuti-schema.json b/test/rifiuti-schema.json
new file mode 100644
index 0000000..b309517
--- /dev/null
+++ b/test/rifiuti-schema.json
@@ -0,0 +1,83 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://raw.githubusercontent.com/abelcheung/rifiuti2/0.8.1/test/rifiuti-schema.json",
+  "title": "rifiuti",
+  "description": "JSON schema for rifiuti json formatted output",
+  "type": "object",
+  "definitions": {
+    "nonNegativeInteger": {
+      "type": "integer",
+      "minimum": 0
+    }
+  },
+  "properties": {
+    "format": {
+      "description": "Recycle bin format",
+      "type": "string"
+    },
+    "version": {
+      "description": "Version embedded in index file header, or null when it could not be determined",
+      "anyOf": [
+        { "$ref": "#/definitions/nonNegativeInteger" }, { "type": "null" }
+      ]
+    },
+    "ever_existed": {
+      "allOf": [
+        { "$ref": "#/definitions/nonNegativeInteger" },
+        { "description": "Total number of items that have ever existed in the recycle bin" }
+      ]
+    },
+    "path": {
+      "description": "Location of recycle bin",
+      "type": "string"
+    },
+    "records": {
+      "description": "All recycle bin records",
+      "type": "array",
+      "uniqueItems": true,
+      "minItems": 0,
+      "items": {
+        "type": "object",
+        "properties": {
+          "index": {
+            "anyOf": [
+              { "$ref": "#/definitions/nonNegativeInteger" },
+              { "type": "string" }
+            ]
+          },
+          "time": {
+            "type": "string"
+          },
+          "gone": {
+            "anyOf": [
+              { "type": "boolean" },
+              { "type": "null" }
+            ]
+          },
+          "size": {
+            "anyOf": [
+              { "$ref": "#/definitions/nonNegativeInteger" },
+              { "type": "null" }
+            ]
+          },
+          "path": {
+            "type": "string"
+          }
+        },
+        "required": [
+          "index",
+          "time",
+          "gone",
+          "size",
+          "path"
+        ]
+      }
+    }
+  },
+  "required": [
+    "format",
+    "version",
+    "path",
+    "records"
+  ]
+}

From 8059c7064515c1ed42dbe686158cede519d21d8b Mon Sep 17 00:00:00 2001
From: Abel Cheung <abelcheung@gmail.com>
Date: Sat, 16 Dec 2023 17:15:57 +0000
Subject: [PATCH 7/7] test: Add tests for json output format

---
 src/utils.c                              | 10 +++---
 test/CMakeLists.txt                      |  1 +
 test/cmake/_try_encoding.cmake           |  4 +++
 test/cmake/encoding.cmake                | 31 ++++++++++++++++++
 test/cmake/json.cmake                    | 41 ++++++++++++++++++++++++
 test/samples/INFO-95-ja-1-in-cp1255.json | 19 +++++++++++
 test/samples/INFO-95-ja-1.json           | 19 +++++++++++
 test/samples/INFO2-empty.json            |  7 ++++
 test/samples/INFO2-sample1.json          | 23 +++++++++++++
 test/samples/INFO2-sample2.json          | 14 ++++++++
 test/samples/dir-2019-uncpath.json       | 10 ++++++
 test/samples/dir-sample1.json            | 22 +++++++++++++
 test/samples/dir-win10-01.json           | 14 ++++++++
 13 files changed, 209 insertions(+), 6 deletions(-)
 create mode 100644 test/cmake/json.cmake
 create mode 100644 test/samples/INFO-95-ja-1-in-cp1255.json
 create mode 100644 test/samples/INFO-95-ja-1.json
 create mode 100644 test/samples/INFO2-empty.json
 create mode 100644 test/samples/INFO2-sample1.json
 create mode 100644 test/samples/INFO2-sample2.json
 create mode 100644 test/samples/dir-2019-uncpath.json
 create mode 100644 test/samples/dir-sample1.json
 create mode 100644 test/samples/dir-win10-01.json

diff --git a/src/utils.c b/src/utils.c
index c42283a..d7d4bc3 100644
--- a/src/utils.c
+++ b/src/utils.c
@@ -1155,23 +1155,21 @@ _print_xml_header (const metarecord *meta)
 static void
 _print_json_header (const metarecord *meta)
 {
-    g_print ("{\n");
-    g_printf ("  \"format\": \"%s\",\n",
+    g_print ("{\n  \"format\": \"%s\",\n",
         (meta->type == RECYCLE_BIN_TYPE_FILE) ? "file" : "dir");
 
-
     if (meta->version >= 0)  /* can be found and not error */
-        g_printf ("  \"version\": %" PRId64 ",\n", meta->version);
+        g_print ("  \"version\": %" PRId64 ",\n", meta->version);
     else
         g_print ("  \"version\": null,\n");
 
     if (meta->type == RECYCLE_BIN_TYPE_FILE && meta->total_entry > 0)
-        g_printf ("  \"ever_existed\": %" PRIu32 ",\n", meta->total_entry);
+        g_print ("  \"ever_existed\": %" PRIu32 ",\n", meta->total_entry);
 
     {
         char *s = g_filename_display_name (meta->filename);
         char *rbin_path = json_escape_path (s);
-        g_printf ("  \"path\": \"%s\",\n", rbin_path);
+        g_print ("  \"path\": \"%s\",\n", rbin_path);
         g_free (s);
         g_free (rbin_path);
     }
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index bb718a1..989df1e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -221,6 +221,7 @@ target_link_directories   (test_glib_iconv PRIVATE ${GLIB_LIBRARY_DIRS})
 include(cli-option)
 include(crafted)
 include(encoding)
+include(json)
 include(parse-info2)
 include(parse-rdir)
 include(read-write)
diff --git a/test/cmake/_try_encoding.cmake b/test/cmake/_try_encoding.cmake
index f4604d9..9c569d8 100644
--- a/test/cmake/_try_encoding.cmake
+++ b/test/cmake/_try_encoding.cmake
@@ -34,6 +34,10 @@ set(args -l ${encoding} ${INFO2})
 if(DEFINED OUTFILE)
     list(APPEND args -o ${OUTFILE})
 endif()
+if(DEFINED EXTRA_ARGS)
+    string(REPLACE "|" ";" EXTRA_ARGS "${EXTRA_ARGS}")
+    list(APPEND args ${EXTRA_ARGS})
+endif()
 execute_process(
     COMMAND ${RIFIUTI} ${args}
     # COMMAND_ECHO STDOUT
diff --git a/test/cmake/encoding.cmake b/test/cmake/encoding.cmake
index 2f2a2fb..317816b 100644
--- a/test/cmake/encoding.cmake
+++ b/test/cmake/encoding.cmake
@@ -132,3 +132,34 @@ add_encoding_test_with_cwd(f_LegacyUNC_Prep
 
 generate_simple_comparison_test("LegacyUNC" 1
     "" "INFO2-2k-tw-uncpath.txt" "encoding")
+
+#
+# JSON output
+#
+
+add_encoding_test_with_cwd(f_JsonInfo2Win95_Prep
+    ${sample_dir}
+    -DINFO2=INFO-95-ja-1
+    -DCHOICES=CP932|Windows-932|IBM-943|SJIS|JIS_X0208|SHIFT_JIS|SHIFT-JIS
+    -DOUTFILE=${bindir}/f_JsonInfo2Win95.output
+    -DEXTRA_ARGS=-f|json
+)
+
+generate_simple_comparison_test("JsonInfo2Win95" 1
+    "" "INFO-95-ja-1.json" "encoding|json")
+
+
+add_encoding_test_with_cwd(f_JsonWin95WrongEnc_Prep
+    ${sample_dir}
+    -DINFO2=INFO-95-ja-1
+    -DCHOICES=CP1255|MS-HEBR|WINDOWS-1255|HEBREW|ISO-8859-8|ISO-IR-138|ISO8859-8|ISO_8859-8|ISO_8859-8:1988|CSISOLATINHEBREW
+    -DOUTFILE=${bindir}/f_JsonWin95WrongEnc.output
+    -DEXTRA_ARGS=-f|json
+)
+
+set_tests_properties(f_JsonWin95WrongEnc_Prep
+    PROPERTIES
+    PASS_REGULAR_EXPRESSION "could not be interpreted in .+ encoding")
+
+generate_simple_comparison_test("JsonWin95WrongEnc" 1
+    "" "INFO-95-ja-1-in-cp1255.json" "encoding|xfail|json")
diff --git a/test/cmake/json.cmake b/test/cmake/json.cmake
new file mode 100644
index 0000000..1183a3f
--- /dev/null
+++ b/test/cmake/json.cmake
@@ -0,0 +1,41 @@
+# Copyright (C) 2023, Abel Cheung
+# rifiuti2 is released under Revised BSD License.
+# Please see LICENSE file for more info.
+
+#
+# Verify JSON output works as intended
+#
+
+function(createJsonOutputTests)
+# Registers one JSON comparison test per sample; ids, files and encs are parallel lists walked together.
+set(ids
+    "JsonInfo2Empty" "JsonInfo2WinXP" "JsonInfo2Win98"
+    "JsonRdirVista" "JsonRdirWin10" "JsonRdirUNC19"
+)
+
+set(files
+    "INFO2-empty" "INFO2-sample1" "INFO2-sample2"
+    "dir-sample1" "dir-win10-01" "dir-2019-uncpath"
+)
+# Encoding passed with -l for each sample; "" means the -l option is omitted for that sample.
+set(encs
+    "" "" "CP1252" "" "" ""
+)
+
+foreach(id file enc IN ZIP_LISTS ids files encs)
+    if (IS_DIRECTORY ${sample_dir}/${file})
+        set(is_info2 0)
+    else()
+        set(is_info2 1)
+    endif()
+    set(args -f json)
+    if(enc)
+        list(APPEND args -l ${enc})
+    endif()
+    generate_simple_comparison_test(${id} ${is_info2}
+        ${file} ${file}.json "parse|json" ${args})
+endforeach()
+
+endfunction()
+
+createJsonOutputTests()
diff --git a/test/samples/INFO-95-ja-1-in-cp1255.json b/test/samples/INFO-95-ja-1-in-cp1255.json
new file mode 100644
index 0000000..ea842d3
--- /dev/null
+++ b/test/samples/INFO-95-ja-1-in-cp1255.json
@@ -0,0 +1,19 @@
+{
+  "format": "file",
+  "version": 0,
+  "ever_existed": 16,
+  "path": "INFO-95-ja-1",
+  "records": [
+    {"index": 1, "time": "2015-05-11T05:59:49Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ֳ<\\DE>½¸ִ¯ּ<\\DF>\\The Microsoft Network ‚ּ¾¯ִ±¯ּ<\\DF>.lnk"},
+    {"index": 2, "time": "2015-05-11T06:00:25Z", "gone": false, "size": 950272, "path": "D:\\WINDOWS\\ֳ<\\DE>½¸ִ¯ּ<\\DF>\\<\\90>V‹Kֻ<\\DE>¯ִֿ¯ּ<\\DF> ²ׂ°¼<\\DE>.bmp"},
+    {"index": 3, "time": "2015-05-11T07:19:25Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ֳ<\\DE>½¸ִ¯ּ<\\DF>\\<\\90>V‹Kֳ·½ִ•¶<\\8F>‘.txt"},
+    {"index": 4, "time": "2015-05-11T09:48:21Z", "gone": false, "size": 589824, "path": "D:\\My Documents\\DirectX-V8.0a\\bda.cab"},
+    {"index": 5, "time": "2015-05-11T09:48:21Z", "gone": false, "size": 589824, "path": "D:\\My Documents\\DirectX-V8.0a\\bdant.cab"},
+    {"index": 6, "time": "2015-05-11T09:48:21Z", "gone": false, "size": 65536, "path": "D:\\My Documents\\DirectX-V8.0a\\cfgmgr32.dll"},
+    {"index": 11, "time": "2015-05-11T09:48:23Z", "gone": false, "size": 163840, "path": "D:\\My Documents\\DirectX-V8.0a\\dxsetup.exe"},
+    {"index": 12, "time": "2015-05-11T09:48:23Z", "gone": false, "size": 360448, "path": "D:\\My Documents\\DirectX-V8.0a\\setupapi.dll"},
+    {"index": 13, "time": "2015-05-11T09:59:19Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ֳ<\\DE>½¸ִ¯ּ<\\DF>\\Connect to the Internet.LNK"},
+    {"index": 14, "time": "2015-05-11T09:59:22Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ֳ<\\DE>½¸ִ¯ּ<\\DF>\\Outlook Express.lnk"},
+    {"index": 15, "time": "2015-05-18T00:45:09Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ֳ<\\DE>½¸ִ¯ּ<\\DF>\\<\\90>V‹Kֳ·½ִ•¶<\\8F>‘.txt"},
+  ]
+}
diff --git a/test/samples/INFO-95-ja-1.json b/test/samples/INFO-95-ja-1.json
new file mode 100644
index 0000000..95066d4
--- /dev/null
+++ b/test/samples/INFO-95-ja-1.json
@@ -0,0 +1,19 @@
+{
+  "format": "file",
+  "version": 0,
+  "ever_existed": 16,
+  "path": "INFO-95-ja-1",
+  "records": [
+    {"index": 1, "time": "2015-05-11T05:59:49Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ﾃﾞｽｸﾄｯﾌﾟ\\The Microsoft Network のｾｯﾄｱｯﾌﾟ.lnk"},
+    {"index": 2, "time": "2015-05-11T06:00:25Z", "gone": false, "size": 950272, "path": "D:\\WINDOWS\\ﾃﾞｽｸﾄｯﾌﾟ\\新規ﾋﾞｯﾄﾏｯﾌﾟ ｲﾒｰｼﾞ.bmp"},
+    {"index": 3, "time": "2015-05-11T07:19:25Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ﾃﾞｽｸﾄｯﾌﾟ\\新規ﾃｷｽﾄ文書.txt"},
+    {"index": 4, "time": "2015-05-11T09:48:21Z", "gone": false, "size": 589824, "path": "D:\\My Documents\\DirectX-V8.0a\\bda.cab"},
+    {"index": 5, "time": "2015-05-11T09:48:21Z", "gone": false, "size": 589824, "path": "D:\\My Documents\\DirectX-V8.0a\\bdant.cab"},
+    {"index": 6, "time": "2015-05-11T09:48:21Z", "gone": false, "size": 65536, "path": "D:\\My Documents\\DirectX-V8.0a\\cfgmgr32.dll"},
+    {"index": 11, "time": "2015-05-11T09:48:23Z", "gone": false, "size": 163840, "path": "D:\\My Documents\\DirectX-V8.0a\\dxsetup.exe"},
+    {"index": 12, "time": "2015-05-11T09:48:23Z", "gone": false, "size": 360448, "path": "D:\\My Documents\\DirectX-V8.0a\\setupapi.dll"},
+    {"index": 13, "time": "2015-05-11T09:59:19Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ﾃﾞｽｸﾄｯﾌﾟ\\Connect to the Internet.LNK"},
+    {"index": 14, "time": "2015-05-11T09:59:22Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ﾃﾞｽｸﾄｯﾌﾟ\\Outlook Express.lnk"},
+    {"index": 15, "time": "2015-05-18T00:45:09Z", "gone": false, "size": 32768, "path": "D:\\WINDOWS\\ﾃﾞｽｸﾄｯﾌﾟ\\新規ﾃｷｽﾄ文書.txt"},
+  ]
+}
diff --git a/test/samples/INFO2-empty.json b/test/samples/INFO2-empty.json
new file mode 100644
index 0000000..a3b201a
--- /dev/null
+++ b/test/samples/INFO2-empty.json
@@ -0,0 +1,7 @@
+{
+  "format": "file",
+  "version": 5,
+  "path": "INFO2-empty",
+  "records": [
+  ]
+}
diff --git a/test/samples/INFO2-sample1.json b/test/samples/INFO2-sample1.json
new file mode 100644
index 0000000..0e13815
--- /dev/null
+++ b/test/samples/INFO2-sample1.json
@@ -0,0 +1,23 @@
+{
+  "format": "file",
+  "version": 5,
+  "path": "INFO2-sample1",
+  "records": [
+    {"index": 44, "time": "2008-10-28T15:53:42Z", "gone": false, "size": 4096, "path": "C:\\Documents and Settings\\All Users\\Desktop\\有道桌面词典.lnk"},
+    {"index": 45, "time": "2008-11-03T15:01:59Z", "gone": false, "size": 4096, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\wongsir_url.txt"},
+    {"index": 46, "time": "2008-11-06T09:20:58Z", "gone": false, "size": 2912256, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\dd-wrt.v24_mini_wrt54g.bin"},
+    {"index": 47, "time": "2008-11-13T12:08:39Z", "gone": false, "size": 765952, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\theme\\.svn"},
+    {"index": 48, "time": "2008-11-13T12:11:33Z", "gone": false, "size": 5812224, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\Config Client"},
+    {"index": 49, "time": "2008-11-13T12:11:36Z", "gone": false, "size": 1847296, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\Config Client.7z"},
+    {"index": 50, "time": "2008-11-19T04:42:04Z", "gone": false, "size": 4096, "path": "C:\\Documents and Settings\\All Users\\Desktop\\Wireshark.lnk"},
+    {"index": 57, "time": "2008-11-19T05:07:15Z", "gone": false, "size": 2727936, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\GetDataBackforFAT-v3.63_PConline.rar"},
+    {"index": 64, "time": "2008-11-19T05:07:35Z", "gone": true, "size": 2727936, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\GetDataBackforFAT-v3.63_PConline"},
+    {"index": 65, "time": "2008-11-19T05:17:12Z", "gone": false, "size": 4096, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\360保险箱.lnk"},
+    {"index": 66, "time": "2008-11-19T05:21:37Z", "gone": false, "size": 2732032, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\gdb"},
+    {"index": 67, "time": "2008-11-19T05:21:37Z", "gone": false, "size": 2723840, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\gdb.zip"},
+    {"index": 68, "time": "2008-11-19T11:34:23Z", "gone": false, "size": 0, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\recovered files"},
+    {"index": 69, "time": "2008-11-19T18:51:45Z", "gone": false, "size": 2727936, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\GetDataBackforFAT-v3.63_PConline"},
+    {"index": 70, "time": "2008-11-19T18:51:45Z", "gone": false, "size": 5169152, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\Uneraser_Setup(2).exe"},
+    {"index": 71, "time": "2008-11-19T18:51:45Z", "gone": false, "size": 5169152, "path": "C:\\Documents and Settings\\Administrator\\Desktop\\Uneraser_Setup.exe"},
+  ]
+}
diff --git a/test/samples/INFO2-sample2.json b/test/samples/INFO2-sample2.json
new file mode 100644
index 0000000..44619e7
--- /dev/null
+++ b/test/samples/INFO2-sample2.json
@@ -0,0 +1,14 @@
+{
+  "format": "file",
+  "version": 4,
+  "path": "INFO2-sample2",
+  "records": [
+    {"index": 0, "time": "2015-04-20T00:07:36Z", "gone": false, "size": 32768, "path": "C:\\WINDOWS\\All Users\\Desktop\\Connect to the Internet.LNK"},
+    {"index": 1, "time": "2015-04-20T00:07:42Z", "gone": false, "size": 32768, "path": "C:\\WINDOWS\\Desktop\\Online Services"},
+    {"index": 2, "time": "2015-04-20T00:09:43Z", "gone": true, "size": 524288, "path": "C:\\WINDOWS\\Desktop\\IE9-WindowsVista-x64-enu.exe"},
+    {"index": 3, "time": "2015-04-20T01:04:33Z", "gone": false, "size": 32768, "path": "C:\\My Documents\\Résumé.txt.txt"},
+    {"index": 4, "time": "2015-04-20T01:05:01Z", "gone": false, "size": 6258688, "path": "C:\\WINDOWS\\Desktop\\winzip100.exe"},
+    {"index": 5, "time": "2015-04-20T01:05:41Z", "gone": true, "size": 32768, "path": "C:\\WINDOWS\\Desktop\\111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111"},
+    {"index": 6, "time": "2015-04-20T01:06:12Z", "gone": false, "size": 32768, "path": "C:\\WINDOWS\\Desktop\\1234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345"},
+  ]
+}
diff --git a/test/samples/dir-2019-uncpath.json b/test/samples/dir-2019-uncpath.json
new file mode 100644
index 0000000..a4268b0
--- /dev/null
+++ b/test/samples/dir-2019-uncpath.json
@@ -0,0 +1,10 @@
+{
+  "format": "dir",
+  "version": 2,
+  "path": "dir-2019-uncpath",
+  "records": [
+    {"index": "$IW0RYW0.rtf", "time": "2019-05-07T20:56:04Z", "gone": true, "size": 7, "path": "\\\\WIN-163RLA0PH3N\\somewhere\\hahaha.rtf"},
+    {"index": "$I4OZLXW.bmp", "time": "2019-05-07T21:01:01Z", "gone": true, "size": 1714662, "path": "\\\\WIN-163RLA0PH3N\\somewhere\\পরীক্ষা.bmp"},
+    {"index": "$IYDW1CC.rtf", "time": "2019-05-07T21:08:55Z", "gone": true, "size": 7, "path": "\\\\WIN-163RLA0PH3N\\somewhere\\hahaha.rtf"},
+  ]
+}
diff --git a/test/samples/dir-sample1.json b/test/samples/dir-sample1.json
new file mode 100644
index 0000000..61c7e6a
--- /dev/null
+++ b/test/samples/dir-sample1.json
@@ -0,0 +1,22 @@
+{
+  "format": "dir",
+  "version": 1,
+  "path": "dir-sample1",
+  "records": [
+    {"index": "$IUVFB0M.rtf", "time": "2007-09-21T06:32:46Z", "gone": false, "size": 155, "path": "C:\\Users\\student\\Desktop\\New Rich Text Document.rtf"},
+    {"index": "$I0JGHX7", "time": "2007-09-21T06:47:49Z", "gone": true, "size": 0, "path": "C:\\Users\\student\\Desktop\\New Folder 1"},
+    {"index": "$I1IS2OK.txt", "time": "2007-09-21T06:48:13Z", "gone": false, "size": 0, "path": "C:\\Users\\student\\Desktop\\New Text Document blah.txt"},
+    {"index": "$IYAR1YY.exe", "time": "2007-09-21T07:54:23Z", "gone": true, "size": null, "path": "C:\\dd.exe"},
+    {"index": "$I95CUKU", "time": "2007-09-21T08:02:59Z", "gone": true, "size": 4096, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3)\\fau\\FAU.x86\\sparsefile"},
+    {"index": "$IHMU3NR.zip", "time": "2007-09-21T08:17:19Z", "gone": true, "size": 5025829, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3).zip"},
+    {"index": "$I7FV8IY.exe", "time": "2007-09-21T08:23:18Z", "gone": true, "size": 153478296, "path": "C:\\Users\\student\\Downloads\\VMware-server-installer-1.0.4-56528.exe"},
+    {"index": "$IMG2SSB", "time": "2007-09-21T08:28:57Z", "gone": true, "size": 0, "path": "C:\\Users\\student\\Desktop\\123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012"},
+    {"index": "$IZK01YL.txt", "time": "2007-09-21T08:31:35Z", "gone": true, "size": 11, "path": "C:\\Users\\student\\Desktop\\123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012\\1234567.txt"},
+    {"index": "$I1TDH1G.exe", "time": "2007-09-21T08:38:30Z", "gone": true, "size": 704512, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3)\\fau\\FAU.x86\\nc.exe"},
+    {"index": "$IEQWWMF.exe", "time": "2007-09-21T08:38:30Z", "gone": true, "size": 679936, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3)\\fau\\FAU.x86\\fmdata.exe"},
+    {"index": "$IFRN1CZ.exe", "time": "2007-09-21T08:38:30Z", "gone": true, "size": 110592, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3)\\fau\\FAU.x86\\wipe.exe"},
+    {"index": "$IW527XU.exe", "time": "2007-09-21T08:38:30Z", "gone": true, "size": 331776, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3)\\fau\\FAU.x86\\volume_dump.exe"},
+    {"index": "$IC6GEAW.exe", "time": "2007-09-21T08:50:16Z", "gone": true, "size": null, "path": "C:\\Users\\student\\Downloads\\fau-1.3.0.2355(rc3)\\fau\\FAU.x86\\dd.exe"},
+    {"index": "$IZUFRX4.vmdk", "time": "2007-09-21T09:22:25Z", "gone": true, "size": 10737418240, "path": "C:\\Virtual Machines\\Windows XP Professional\\Windows XP Professional-flat.vmdk"},
+  ]
+}
diff --git a/test/samples/dir-win10-01.json b/test/samples/dir-win10-01.json
new file mode 100644
index 0000000..6549def
--- /dev/null
+++ b/test/samples/dir-win10-01.json
@@ -0,0 +1,14 @@
+{
+  "format": "dir",
+  "version": 2,
+  "path": "dir-win10-01",
+  "records": [
+    {"index": "$IKEGS1G", "time": "2015-04-04T17:19:52Z", "gone": false, "size": 0, "path": "C:\\Users\\tester\\12345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890123456789012345678901234567890"},
+    {"index": "$IQ7LAXT.png", "time": "2015-04-04T17:20:01Z", "gone": false, "size": 6455, "path": "C:\\Users\\tester\\Pictures\\web-canvas.png"},
+    {"index": "$I7R52EG.txt", "time": "2015-04-04T17:24:09Z", "gone": false, "size": 14, "path": "C:\\Temp\\foobat.txt.txt"},
+    {"index": "$IBBFODN", "time": "2015-04-07T23:19:35Z", "gone": true, "size": 7, "path": "C:\\Temp\\𨳊𨶙閪邨鰂"},
+    {"index": "$IHO61YT", "time": "2015-04-07T23:32:07Z", "gone": true, "size": 12884901888, "path": "C:\\Temp\\largesparsefile"},
+    {"index": "$IROMPZ0.exe", "time": "2015-04-19T10:49:59Z", "gone": true, "size": 1761792, "path": "C:\\Temp\\FAU\\FAU.x64\\dd.exe"},
+    {"index": "$IDNLPD4.exe", "time": "2015-04-19T10:50:51Z", "gone": true, "size": 872448, "path": "C:\\Temp\\FAU\\FAU.x86\\dd.exe"},
+  ]
+}