Skip to content

Commit 2cd67ab

Browse files
committed
ondemand: able to iterate large json files
- fixed typo bug in ValueIterator.get_type() which was causing mysterious problems beyond just number parsing - created recursive_iterate_json() in debug_main.zig (not in repo) which is getting through twitter.json and a large package-lock.json file - add build flag -Dondemand which chooses the ondemand parser for validation - ondemand validation matches dom validation except for 3 big integer values
1 parent 06d0b58 commit 2cd67ab

File tree

8 files changed

+164
-61
lines changed

8 files changed

+164
-61
lines changed

.vscode/launch.json

Lines changed: 5 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -4,37 +4,18 @@
44
// For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387
55
"version": "0.2.0",
66
"configurations": [
7-
// {
8-
// "name": "(gdb) Launch",
9-
// "type": "cppdbg",
10-
// "request": "launch",
11-
// "program": "${workspaceFolder}/zig-out/bin/zimdjson",
12-
// "args": ["test/test.json"],
13-
// "stopAtEntry": true,
14-
// "cwd": "${workspaceFolder}",
15-
// "environment": [],
16-
// "externalConsole": false,
17-
// "MIMode": "gdb",
18-
// "setupCommands": [
19-
// {
20-
// "description": "Enable pretty-printing for gdb",
21-
// "text": "-enable-pretty-printing",
22-
// "ignoreFailures": true
23-
// }
24-
// ]
25-
// },
267
{
278
"type": "lldb",
289
"request": "launch",
2910
"name": "Debug",
30-
"program": "${workspaceFolder}/zig-out/bin/zimdjson",
31-
// "args": ["test/test2.json"],
11+
"program": "${workspaceFolder}/zig-out/bin/dbg",
3212
"args": [
33-
// "../JSONTestSuite/test_parsing/y_string_1_2_3_bytes_UTF-8_sequences.json",
34-
"../JSONTestSuite/test_parsing//y_string_utf8.json",
13+
// "../JSONTestSuite/test_parsing/i_structure_500_nested_arrays.json",
14+
"../JSONTestSuite/test_parsing/y_array_arraysWithSpaces.json",
15+
// "../../c/simdjson/jsonexamples/twitter.json",
3516
],
3617
"cwd": "${workspaceFolder}",
37-
"preLaunchTask": "zig build",
18+
"preLaunchTask": "zig build debug",
3819
}
3920
]
4021
}

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,9 +5,9 @@ This is a port of [simdjson](https://github.com/simdjson/simdjson), a high perfo
55

66

77
# requirements
8-
A CPU with both AVX2 and CLMUL is required (Haswell from 2013 onwards should do for Intel, for AMD a Ryzen/EPYC CPU (Q1 2017) should be sufficient). Macos is (not yet) supported.
8+
A CPU with both AVX2 and CLMUL is required (Haswell from 2013 onwards should do for Intel, for AMD a Ryzen/EPYC CPU (Q1 2017) should be sufficient).
99

10-
No fallback for unsupported CPUs is provided.
10+
No fallback for unsupported CPUs is provided (yet).
1111

1212
# usage
1313
```console

build.zig

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,12 @@
11
const std = @import("std");
22

3-
fn setup(step: *std.build.LibExeObjStep, mode: std.builtin.Mode, target: anytype, step_size: u8) void {
3+
fn setup(step: *std.build.LibExeObjStep, mode: std.builtin.Mode, target: anytype, step_size: u8, ondemand: bool) void {
44
step.addCSourceFile("src/utils.c", &[_][]const u8{ "-Wall", "-Wextra", "-Werror", "-O3" });
55
step.setTarget(target);
66
step.linkLibC();
77
step.setBuildMode(mode);
88
step.addBuildOption(u8, "step_size", step_size);
9+
step.addBuildOption(bool, "ondemand", ondemand);
910
}
1011

1112
pub fn build(b: *std.build.Builder) void {
@@ -20,19 +21,25 @@ pub fn build(b: *std.build.Builder) void {
2021
"how many bytes of input to read at per StructuralIndexer step. must be either 64 or 128",
2122
) orelse 64;
2223

24+
const ondemand = b.option(
25+
bool,
26+
"ondemand",
27+
"use the ondemand parser for validation",
28+
) orelse false;
29+
2330
const lib = b.addStaticLibrary("simdjzon", "src/simdjzon.zig");
24-
setup(lib, mode, target, step_size);
31+
setup(lib, mode, target, step_size, ondemand);
2532
lib.install();
2633

2734
var main_tests = b.addTest("src/tests.zig");
28-
setup(main_tests, mode, target, step_size);
29-
// main_tests.setFilter("ondemand");
35+
setup(main_tests, mode, target, step_size, ondemand);
36+
// main_tests.setFilter("ondemand array iteration");
3037

3138
const test_step = b.step("test", "Run tests");
3239
test_step.dependOn(&main_tests.step);
3340

3441
const exe = b.addExecutable("simdjzon", "src/main.zig");
35-
setup(exe, mode, target, step_size);
42+
setup(exe, mode, target, step_size, ondemand);
3643
exe.install();
3744

3845
const run_cmd = exe.run();

src/Logger.zig

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,16 @@
11
const std = @import("std");
22
const mem = std.mem;
3-
const main = @import("main.zig");
3+
const common = @import("common.zig");
44
const Ondemand = @import("ondemand.zig");
5-
const println = main.println;
6-
const print = main.print;
5+
const println = common.println;
6+
const print = common.print;
77
usingnamespace @import("llvm_intrinsics.zig");
88

99
depth: u8 = 0,
1010

1111
const Logger = @This();
1212

13+
pub const MAX_DEPTH = 30;
1314
const LOG_EVENT_LEN = 20;
1415
const LOG_BUFFER_LEN = 30;
1516
const LOG_SMALL_BUFFER_LEN = 10;
@@ -22,15 +23,15 @@ fn pad_with(comptime s: []const u8, comptime pad_byte: u8, comptime len: u8) [le
2223
}
2324

2425
fn pad_with_alloc(s: []const u8, pad_byte: u8, len: u8, allocator: *mem.Allocator) []const u8 {
25-
var buf = allocator.alloc(u8, len) catch unreachable;
26+
var buf = allocator.alloc(u8, len) catch return s;
2627
std.mem.set(u8, buf, pad_byte);
2728
std.mem.copy(u8, buf, s[0..std.math.min(s.len, buf.len)]);
2829
return buf;
2930
}
3031

3132
pub fn start(log: *Logger, iter: anytype) void {
3233
_ = iter;
33-
if (main.debug) {
34+
if (common.debug) {
3435
log.depth = 0;
3536
const event_txt = pad_with("Event", ' ', LOG_EVENT_LEN);
3637
const buffer_txt = pad_with("Buffer", ' ', LOG_BUFFER_LEN);
@@ -51,18 +52,22 @@ fn printable_char(c: u8) u8 {
5152

5253
pub fn line_fmt(log: *Logger, iter: anytype, title_prefix: []const u8, title: []const u8, comptime detail_fmt: []const u8, detail_args: anytype) void {
5354
var buf: [0x100]u8 = undefined;
54-
log.line(iter, title_prefix, title, std.fmt.bufPrint(&buf, detail_fmt, detail_args) catch &buf);
55+
log.line(iter, title_prefix, title, std.fmt.bufPrint(&buf, detail_fmt, detail_args) catch return);
5556
}
5657

57-
// TODO: remove catch unreachables
5858
pub fn line(log: *Logger, iter: anytype, title_prefix: []const u8, title: []const u8, detail: []const u8) void {
59+
if (iter.depth >= Logger.MAX_DEPTH) return;
5960
var log_buf: [0x100]u8 = undefined;
6061
var log_buf2: [LOG_BUFFER_LEN]u8 = undefined;
61-
if (!main.debug) return;
62+
if (!common.debug) return;
6263

6364
var log_fba = std.heap.FixedBufferAllocator.init(&log_buf);
6465
const depth_padding = pad_with_alloc("", ' ', @intCast(u8, if (log.depth < 0x0f) log.depth * 2 else 0xff), &log_fba.allocator);
65-
const titles = std.fmt.allocPrint(&log_fba.allocator, "{s}{s}{s}", .{ depth_padding, title_prefix, title }) catch unreachable;
66+
const titles = std.fmt.allocPrint(
67+
&log_fba.allocator,
68+
"{s}{s}{s}",
69+
.{ depth_padding, title_prefix, title },
70+
) catch return;
6671
const p1 = pad_with_alloc(titles, ' ', LOG_EVENT_LEN, &log_fba.allocator);
6772
print("| {s} ", .{p1});
6873
const current_index = if (iter.at_beginning()) null else iter.next_structural() - 1;
@@ -112,7 +117,7 @@ pub fn line(log: *Logger, iter: anytype, title_prefix: []const u8, title: []cons
112117
if (current_index) |ci| {
113118
print("| {s} ", .{
114119
pad_with_alloc(
115-
std.fmt.bufPrint(&log_buf2, "{}", .{ci[0]}) catch unreachable,
120+
std.fmt.bufPrint(&log_buf2, "{}", .{ci[0]}) catch return,
116121
' ',
117122
LOG_INDEX_LEN,
118123
&log_fba.allocator,
@@ -135,22 +140,22 @@ pub fn value2(log: *Logger, iter: anytype, typ: []const u8, detail: []const u8,
135140

136141
pub fn start_value(log: *Logger, iter: anytype, typ: []const u8) void {
137142
log.line(iter, "+", typ, "");
138-
if (main.debug) log.depth = sat_add_u8(log.depth, 1);
143+
if (common.debug) log.depth = sat_add_u8(log.depth, 1);
139144
}
140145
pub fn end_value(log: *Logger, iter: anytype, typ: []const u8) void {
141-
if (main.debug) log.depth = sat_sub_u8(log.depth, 1);
146+
if (common.debug) log.depth = sat_sub_u8(log.depth, 1);
142147
log.line(iter, "-", typ, "");
143148
}
144149

145150
pub fn err(log: *Logger, iter: anytype, err_msg: []const u8) void {
146151
_ = iter;
147152
_ = log;
148-
if (main.debug) std.log.err("{s}", .{err_msg});
153+
if (common.debug) std.log.err("{s}", .{err_msg});
149154
}
150155
pub fn err_fmt(log: *Logger, iter: anytype, comptime fmt: []const u8, args: anytype) void {
151156
_ = iter;
152157
_ = log;
153-
if (main.debug) std.log.err(fmt, args);
158+
if (common.debug) std.log.err(fmt, args);
154159
}
155160

156161
pub fn event(log: *Logger, iter: anytype, typ: []const u8, detail: []const u8, delta: i32, depth_delta: i32) void {

src/main.zig

Lines changed: 102 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,17 +2,17 @@ const build_options = @import("build_options");
22
const std = @import("std");
33

44
const os = std.os;
5-
usingnamespace @import("common.zig");
5+
const common = @import("common.zig");
66
const simdjzon = @import("simdjzon.zig");
77
const dom = simdjzon.dom;
8+
const ondemand = simdjzon.ondemand;
89
pub const step_size = build_options.step_size;
910

10-
pub fn main() !u8 {
11+
pub fn domMain() !u8 {
1112
var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
1213
defer arena.deinit();
1314
const allocator = &arena.allocator;
1415
var parser: dom.Parser = undefined;
15-
// debug = true;
1616

1717
if (os.argv.len == 1) {
1818
var stdin = std.io.getStdIn().reader();
@@ -37,3 +37,102 @@ pub fn main() !u8 {
3737
std.log.debug("parse valid", .{});
3838
return 0;
3939
}
40+
41+
/// Validates one JSON document with the ondemand parser.
///
/// Input is read from stdin when no argument is given, or from the file named
/// by the single command line argument. Every value in the document is
/// visited via `recursive_iterate_json()`.
///
/// Returns 0 when the document is valid. Returns 1 when too many arguments
/// are given, when iteration fails, or when the last visited position is not
/// the last document position. Errors from reading the input or from
/// initializing the parser are propagated to the caller.
pub fn ondemandMain() !u8 {
    var arena = std.heap.ArenaAllocator.init(std.heap.page_allocator);
    defer arena.deinit();
    const allocator = &arena.allocator;

    // `src` is declared at function scope because its address is handed to
    // `Parser.init()` and the parser is used after the branches below have
    // ended. Declaring it inside a branch would leave the parser holding a
    // pointer to a dead stack slot.
    var src: std.io.StreamSource = undefined;

    // Set only once a file has really been opened, so the cleanup below never
    // inspects an uninitialized parser on the early-exit paths.
    var opened_file: ?std.fs.File = null;
    defer if (opened_file) |file| file.close();

    var parser: ondemand.Parser = undefined;
    if (os.argv.len == 1) {
        var stdin = std.io.getStdIn().reader();
        const input = try stdin.readAllAlloc(allocator, std.math.maxInt(u32));
        src = std.io.StreamSource{ .buffer = std.io.fixedBufferStream(input) };
        parser = try ondemand.Parser.init(&src, allocator, "<stdin>", .{});
    } else if (os.argv.len == 2) {
        const filepath = std.mem.span(os.argv[1]);
        const file = try std.fs.cwd().openFile(filepath, .{ .read = true });
        opened_file = file;

        src = std.io.StreamSource{ .file = file };
        parser = try ondemand.Parser.init(&src, allocator, filepath, .{});
    } else {
        std.log.err("Too many arguments. Please provide input via filename or stdin", .{});
        return 1;
    }

    // Registered after `opened_file`'s defer, so the parser is torn down
    // before the file is closed (defers run in reverse order).
    defer parser.deinit();

    var doc = try parser.iterate();
    // common.debug = true;
    var string_buf: [0x1000]u8 = undefined;
    const end_index = recursive_iterate_json(&doc, 1, parser.parser.max_depth, &string_buf) catch |err| {
        std.log.err("parse failed. {s}", .{@errorName(err)});
        return 1;
    };

    // Anything left after the root value means trailing content.
    if (end_index != doc.iter.last_document_position()[0]) {
        std.log.err("More than one JSON value at the root of the document, or extra characters at the end of the JSON!", .{});
        return 1;
    }
    std.log.debug("parse valid", .{});
    return 0;
}
84+
85+
/// Walks `element` and everything nested inside it, reading each value once so
/// that the ondemand parser is forced to validate the whole document.
///
/// `element` must be a `*ondemand.Document` or a `*ondemand.Value`; any other
/// type reaches `unreachable`.
/// `depth` is the current nesting level and `max_depth` the limit: the call
/// fails with `error.DEPTH_ERROR` as soon as `depth >= max_depth`.
/// `string_buf` is the scratch buffer handed to `get_string()`.
///
/// Returns the `u32` stored just before the iterator's current token index.
/// NOTE(review): presumably this is the source offset of the last consumed
/// structural token; the caller compares it with
/// `last_document_position()[0]` — confirm.
inline fn recursive_iterate_json(element: anytype, depth: u16, max_depth: u16, string_buf: []u8) common.Error!u32 {
86+
if (depth >= max_depth) return error.DEPTH_ERROR;
87+
// Pick the underlying iterator; a Value reaches it through one more `.iter`.
// NOTE(review): if these fields are struct values rather than pointers, `iter`
// is a snapshot taken before the value is consumed — confirm that the final
// `return` below reads the intended position.
var iter = switch (@TypeOf(element)) {
88+
*ondemand.Document => element.iter,
89+
*ondemand.Value => element.iter.iter,
90+
else => unreachable,
91+
};
92+
switch (try element.get_type()) {
93+
.array => {
94+
var arr = try element.get_array();
95+
var it = arr.iterator();
96+
// Recurse into every child; the child's own return value is not needed.
while (try it.next()) |*child| {
97+
_ = try recursive_iterate_json(child, depth + 1, max_depth, string_buf);
98+
}
99+
return (it.iter.iter.token.index - 1)[0];
100+
},
101+
.object => {
102+
var obj = try element.get_object();
103+
var it = obj.iterator();
104+
// Scratch buffer passed to `it.next()` for each field (0x1000 bytes per object level).
var key: [0x1000]u8 = undefined;
105+
while (try it.next(&key)) |*field| {
106+
_ = try recursive_iterate_json(&field.value, depth + 1, max_depth, string_buf);
107+
}
108+
return (it.iter.iter.token.index - 1)[0];
109+
},
110+
.number => {
111+
// FIXME: clean this up to match dom behavior of big int values
112+
// failing to match JSONTestSuite on:
113+
// i_number_too_big_neg_int.json
114+
// i_number_too_big_pos_int.json
115+
// i_number_very_big_negative_int.json
116+
117+
// Look at the first byte of the number: a leading '-' is tried as i64,
// anything else as u64. If the integer read fails for any reason, fall back
// to reading the value as a double.
switch ((try iter.peek_delta(0, 1))[0]) {
118+
'-' => _ = element.get_int(i64) catch {
119+
_ = try element.get_double();
120+
},
121+
else => _ = element.get_int(u64) catch {
122+
_ = try element.get_double();
123+
},
124+
}
125+
},
126+
.string => _ = try element.get_string(string_buf),
127+
.bool => _ = try element.get_bool(),
128+
// A value typed as null must actually read as null.
.nul => if (!(try element.is_null())) return error.INCORRECT_TYPE,
129+
}
130+
// Scalars fall through to here; arrays and objects returned above.
return (iter.token.index - 1)[0];
131+
}
132+
133+
/// Program entry point.
///
/// Dispatches to the validator chosen by the `ondemand` build option:
/// `ondemandMain()` when it is set, `domMain()` otherwise. The selected
/// function's exit code or error is returned unchanged.
pub fn main() !u8 {
    if (build_options.ondemand) {
        return ondemandMain();
    } else {
        return domMain();
    }
}

src/number_parsing.zig

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,14 @@ const mem = std.mem;
33
usingnamespace @import("vector_types.zig");
44
usingnamespace @import("llvm_intrinsics.zig");
55
const main = @import("main.zig");
6-
const Error = main.Error;
7-
const println = main.println;
86
const CharUtils = @import("string_parsing.zig").CharUtils;
97
const dom = @import("dom.zig");
108
const Iterator = dom.Iterator;
119
const TapeBuilder = dom.TapeBuilder;
1210
const TapeType = dom.TapeType;
1311
const common = @import("common.zig");
12+
const Error = common.Error;
13+
const println = common.println;
1414

1515
fn INVALID_NUMBER(src: [*]const u8) Error {
1616
_ = src;
@@ -1829,7 +1829,7 @@ pub fn parse_integer(src: [*]const u8) !u64 {
18291829
} else if (src[0] != '1' or i <= std.math.maxInt(i64)) return error.INCORRECT_TYPE;
18301830
}
18311831

1832-
return if (negative) (~i + 1) else i;
1832+
return if (negative) (~i +% 1) else i;
18331833
}
18341834

18351835
pub fn parse_double(src_: [*]const u8) !f64 {

0 commit comments

Comments
 (0)