From 143a1c663374b8db9c4718d48c9714aaabeb0c28 Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Tue, 5 Nov 2024 15:51:08 -0800 Subject: [PATCH 1/4] Add regression test for merging large doc with small doc. --- go/store/prolly/tree/json_diff_test.go | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/go/store/prolly/tree/json_diff_test.go b/go/store/prolly/tree/json_diff_test.go index 4d95573175f..45d0e05e9b2 100644 --- a/go/store/prolly/tree/json_diff_test.go +++ b/go/store/prolly/tree/json_diff_test.go @@ -287,10 +287,13 @@ func largeJsonDiffTests(t *testing.T) []jsonDiffTest { ctx := sql.NewEmptyContext() ns := NewTestNodeStore() + emptyDocument := types.JSONDocument{Val: types.JsonObject{}} + insert := func(document types.MutableJSON, path string, val interface{}) types.MutableJSON { jsonVal, inRange, err := types.JSON.Convert(val) require.NoError(t, err) require.True(t, (bool)(inRange)) + document = document.Clone(ctx).(types.MutableJSON) newDoc, changed, err := document.Insert(ctx, path, jsonVal.(sql.JSONWrapper)) require.NoError(t, err) require.True(t, changed) @@ -400,6 +403,18 @@ func largeJsonDiffTests(t *testing.T) []jsonDiffTest { }, }, }, + { + // This is a regression test. + // If: + // - One document fits in a single chunk and the other doesn't + // - The location of the chunk boundary in the larger document is also present in the smaller document + // - The chunk boundary doesn't fall at the beginning of value. + // Then the differ would fail to advance the prolly tree cursor and would incorrectly see the larger document as corrupt. + // The values in this test case are specifically chosen to meet these conditions. + name: "no error when diffing large doc with small doc", + from: largeObject, + to: insert(emptyDocument, "$.level6", insert(emptyDocument, "$.level4", lookup(largeObject, "$.level6.level4"))), + }, } } @@ -473,9 +488,11 @@ func runTest(t *testing.T, test jsonDiffTest) { return cmp == 0 } - require.Equal(t, len(test.expectedDiffs), len(actualDiffs)) - for i, expected := range test.expectedDiffs { - actual := actualDiffs[i] - require.True(t, diffsEqual(expected, actual), fmt.Sprintf("Expected: %v\nActual: %v", expected, actual)) + if test.expectedDiffs != nil { + require.Equal(t, len(test.expectedDiffs), len(actualDiffs)) + for i, expected := range test.expectedDiffs { + actual := actualDiffs[i] + require.True(t, diffsEqual(expected, actual), fmt.Sprintf("Expected: %v\nActual: %v", expected, actual)) + } } } From cf28c794d0eaf76ef767ab10d72f7d94f8b9f8ec Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Tue, 5 Nov 2024 15:51:15 -0800 Subject: [PATCH 2/4] Fix merging large doc with small doc. --- go/store/prolly/tree/indexed_json_diff.go | 83 +++++++++++++---------- 1 file changed, 47 insertions(+), 36 deletions(-) diff --git a/go/store/prolly/tree/indexed_json_diff.go b/go/store/prolly/tree/indexed_json_diff.go index bf4bc2e427a..87ec31a12f7 100644 --- a/go/store/prolly/tree/indexed_json_diff.go +++ b/go/store/prolly/tree/indexed_json_diff.go @@ -44,61 +44,72 @@ func NewIndexedJsonDiffer(ctx context.Context, from, to IndexedJsonDocument) (*I differ.fromStop = differ.fromStop.parent differ.toStop = differ.toStop.parent - if differ.from == nil || differ.to == nil { + var currentFromCursor, currentToCursor *JsonCursor + if differ.from == nil { // This can happen when either document fits in a single chunk. // We don't use the chunk differ in this case, and instead we create the cursors without it. diffKey := []byte{byte(startOfValue)} - currentFromCursor, err := newJsonCursorAtStartOfChunk(ctx, from.m.NodeStore, from.m.Root, diffKey) + currentFromCursor, err = newJsonCursorAtStartOfChunk(ctx, from.m.NodeStore, from.m.Root, diffKey) if err != nil { return nil, err } - currentToCursor, err := newJsonCursorAtStartOfChunk(ctx, to.m.NodeStore, to.m.Root, diffKey) + err = advanceCursor(ctx, ¤tFromCursor) + if err != nil { + return nil, err + } + } + + if differ.to == nil { + // This can happen when either document fits in a single chunk. + // We don't use the chunk differ in this case, and instead we create the cursors without it. + diffKey := []byte{byte(startOfValue)} + currentToCursor, err = newJsonCursorAtStartOfChunk(ctx, to.m.NodeStore, to.m.Root, diffKey) + if err != nil { + return nil, err + } + err = advanceCursor(ctx, ¤tToCursor) if err != nil { return nil, err } - return &IndexedJsonDiffer{ - differ: differ, - from: from, - to: to, - currentFromCursor: currentFromCursor, - currentToCursor: currentToCursor, - }, nil } return &IndexedJsonDiffer{ - differ: differ, - from: from, - to: to, + differ: differ, + from: from, + to: to, + currentFromCursor: currentFromCursor, + currentToCursor: currentToCursor, }, nil } +func advanceCursor(ctx context.Context, jCur **JsonCursor) (err error) { + if (*jCur).jsonScanner.atEndOfChunk() { + err = (*jCur).cur.advance(ctx) + if err != nil { + return err + } + *jCur = nil + } else { + err = (*jCur).jsonScanner.AdvanceToNextLocation() + if err != nil { + return err + } + } + return nil +} + // Next computes the next diff between the two JSON documents. // To accomplish this, it uses the underlying Differ to find chunks that have changed, and uses a pair of JsonCursors // to walk corresponding chunks. func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error) { // helper function to advance a JsonCursor and set it to nil if it reaches the end of a chunk - advanceCursor := func(jCur **JsonCursor) (err error) { - if (*jCur).jsonScanner.atEndOfChunk() { - err = (*jCur).cur.advance(ctx) - if err != nil { - return err - } - *jCur = nil - } else { - err = (*jCur).jsonScanner.AdvanceToNextLocation() - if err != nil { - return err - } - } - return nil - } newAddedDiff := func(key []byte) (JsonDiff, error) { addedValue, err := jd.currentToCursor.NextValue(ctx) if err != nil { return JsonDiff{}, err } - err = advanceCursor(&jd.currentToCursor) + err = advanceCursor(ctx, &jd.currentToCursor) if err != nil { return JsonDiff{}, err } @@ -114,7 +125,7 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error if err != nil { return JsonDiff{}, err } - err = advanceCursor(&jd.currentFromCursor) + err = advanceCursor(ctx, &jd.currentFromCursor) if err != nil { return JsonDiff{}, err } @@ -158,7 +169,7 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error return JsonDiff{}, err } - err = advanceCursor(&jd.currentFromCursor) + err = advanceCursor(ctx, &jd.currentFromCursor) if err != nil { return JsonDiff{}, err } @@ -172,7 +183,7 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error return JsonDiff{}, err } - err = advanceCursor(&jd.currentToCursor) + err = advanceCursor(ctx, &jd.currentToCursor) if err != nil { return JsonDiff{}, err } @@ -209,11 +220,11 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error if compareJsonLocations(fromCurrentLocation, toCurrentLocation) != 0 { return JsonDiff{}, jsonParseError } - err = advanceCursor(&jd.currentFromCursor) + err = advanceCursor(ctx, &jd.currentFromCursor) if err != nil { return JsonDiff{}, err } - err = advanceCursor(&jd.currentToCursor) + err = advanceCursor(ctx, &jd.currentToCursor) if err != nil { return JsonDiff{}, err } @@ -230,11 +241,11 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error // Otherwise, compare them and possibly return a modification. if (fromScanner.current() == '{' && toScanner.current() == '{') || (fromScanner.current() == '[' && toScanner.current() == '[') { - err = advanceCursor(&jd.currentFromCursor) + err = advanceCursor(ctx, &jd.currentFromCursor) if err != nil { return JsonDiff{}, err } - err = advanceCursor(&jd.currentToCursor) + err = advanceCursor(ctx, &jd.currentToCursor) if err != nil { return JsonDiff{}, err } From bc9836320f4b014d041a011b368ba479480cf0dc Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Tue, 5 Nov 2024 16:36:57 -0800 Subject: [PATCH 3/4] Make json diff error message less scary. --- go/store/prolly/tree/json_scanner.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/go/store/prolly/tree/json_scanner.go b/go/store/prolly/tree/json_scanner.go index 8a7ffcf28f7..9478d93a378 100644 --- a/go/store/prolly/tree/json_scanner.go +++ b/go/store/prolly/tree/json_scanner.go @@ -38,7 +38,7 @@ type JsonScanner struct { valueOffset int } -var jsonParseError = fmt.Errorf("an error occurred while reading or writing JSON to/from the database. This is most likely a bug, but could indicate database corruption") +var jsonParseError = fmt.Errorf("encountered invalid JSON while reading JSON from the database, or while preparing to write JSON to the database. This is most likely a bug in JSON diffing") func (j JsonScanner) Clone() JsonScanner { return JsonScanner{ From ad631c6a79aa11dced20d36c71b5a6c8bb5927dd Mon Sep 17 00:00:00 2001 From: Nick Tobey Date: Tue, 5 Nov 2024 16:49:12 -0800 Subject: [PATCH 4/4] Add comments to json differ. --- go/store/prolly/tree/indexed_json_diff.go | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/go/store/prolly/tree/indexed_json_diff.go b/go/store/prolly/tree/indexed_json_diff.go index 87ec31a12f7..1976b7a3d63 100644 --- a/go/store/prolly/tree/indexed_json_diff.go +++ b/go/store/prolly/tree/indexed_json_diff.go @@ -46,13 +46,14 @@ func NewIndexedJsonDiffer(ctx context.Context, from, to IndexedJsonDocument) (*I var currentFromCursor, currentToCursor *JsonCursor if differ.from == nil { - // This can happen when either document fits in a single chunk. - // We don't use the chunk differ in this case, and instead we create the cursors without it. + // The "from" document fits inside a single chunk. + // We can't create the "from" cursor from the differ, so we create it here instead. diffKey := []byte{byte(startOfValue)} currentFromCursor, err = newJsonCursorAtStartOfChunk(ctx, from.m.NodeStore, from.m.Root, diffKey) if err != nil { return nil, err } + // Advance the cursor past the "beginning of document" location, so that it aligns with the "to" cursor no matter what. err = advanceCursor(ctx, ¤tFromCursor) if err != nil { return nil, err @@ -60,13 +61,14 @@ func NewIndexedJsonDiffer(ctx context.Context, from, to IndexedJsonDocument) (*I } if differ.to == nil { - // This can happen when either document fits in a single chunk. - // We don't use the chunk differ in this case, and instead we create the cursors without it. + // The "to" document fits inside a single chunk. + // We can't create the "from" cursor from the differ, so we create it here instead. diffKey := []byte{byte(startOfValue)} currentToCursor, err = newJsonCursorAtStartOfChunk(ctx, to.m.NodeStore, to.m.Root, diffKey) if err != nil { return nil, err } + // Advance the cursor past the "beginning of document" location, so that it aligns with the "from" cursor no matter what. err = advanceCursor(ctx, ¤tToCursor) if err != nil { return nil, err @@ -161,9 +163,12 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error return JsonDiff{}, err } } else if jd.currentFromCursor == nil { - // We exhausted the current `from` chunk but not the `to` chunk. Since the chunk boundaries don't align on + // We exhausted the current `from` chunk but not the current `to` chunk. Since the chunk boundaries don't align on // the same key, we need to continue into the next chunk. + // Alternatively, the "to" cursor was created during construction because the "to" document fit in a single chunk, + // and the "from" cursor hasn't been created yet. + jd.currentFromCursor, err = newJsonCursorFromCursor(ctx, jd.differ.from) if err != nil { return JsonDiff{}, err @@ -175,9 +180,12 @@ func (jd *IndexedJsonDiffer) Next(ctx context.Context) (diff JsonDiff, err error } continue } else if jd.currentToCursor == nil { - // We exhausted the current `to` chunk but not the `from` chunk. Since the chunk boundaries don't align on + // We exhausted the current `to` chunk but not the current `from` chunk. Since the chunk boundaries don't align on // the same key, we need to continue into the next chunk. + // Alternatively, the "from" cursor was created during construction because the "from" document fit in a single chunk, + // and the "to" cursor hasn't been created yet. + jd.currentToCursor, err = newJsonCursorFromCursor(ctx, jd.differ.to) if err != nil { return JsonDiff{}, err