Skip to content

Commit 2f2db16

Browse files
authored
store DateTime as nanoseconds in doc store (quickwit-oss#2486)
* store DateTime as nanoseconds in doc store. The doc store DateTime was previously truncated to microseconds. This removes that truncation while still keeping backwards compatibility. This is done by adding the trait `ConfigurableBinarySerializable`, which works like `BinarySerializable` but takes a config that currently allows de/serializing with different date time precisions. Bump the version format to 7. Add a compat test to check the date time truncation. * remove configurable binary serialize, add enum for doc store version * test doc store version ord
1 parent d152e29 commit 2f2db16

22 files changed

+246
-89
lines changed

common/src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ pub use datetime::{DateTime, DateTimePrecision};
2020
pub use group_by::GroupByIteratorExtended;
2121
pub use json_path_writer::JsonPathWriter;
2222
pub use ownedbytes::{OwnedBytes, StableDeref};
23-
pub use serialize::{BinarySerializable, DeserializeFrom, FixedSize};
23+
pub use serialize::*;
2424
pub use vint::{
2525
read_u32_vint, read_u32_vint_no_advance, serialize_vint_u32, write_u32_vint, VInt, VIntU128,
2626
};

common/src/serialize.rs

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -74,14 +74,14 @@ impl FixedSize for () {
7474

7575
impl<T: BinarySerializable> BinarySerializable for Vec<T> {
7676
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
77-
VInt(self.len() as u64).serialize(writer)?;
77+
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
7878
for it in self {
7979
it.serialize(writer)?;
8080
}
8181
Ok(())
8282
}
8383
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Vec<T>> {
84-
let num_items = VInt::deserialize(reader)?.val();
84+
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
8585
let mut items: Vec<T> = Vec::with_capacity(num_items as usize);
8686
for _ in 0..num_items {
8787
let item = T::deserialize(reader)?;
@@ -236,12 +236,12 @@ impl FixedSize for bool {
236236
impl BinarySerializable for String {
237237
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
238238
let data: &[u8] = self.as_bytes();
239-
VInt(data.len() as u64).serialize(writer)?;
239+
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
240240
writer.write_all(data)
241241
}
242242

243243
fn deserialize<R: Read>(reader: &mut R) -> io::Result<String> {
244-
let string_length = VInt::deserialize(reader)?.val() as usize;
244+
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
245245
let mut result = String::with_capacity(string_length);
246246
reader
247247
.take(string_length as u64)
@@ -253,12 +253,12 @@ impl BinarySerializable for String {
253253
impl<'a> BinarySerializable for Cow<'a, str> {
254254
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
255255
let data: &[u8] = self.as_bytes();
256-
VInt(data.len() as u64).serialize(writer)?;
256+
BinarySerializable::serialize(&VInt(data.len() as u64), writer)?;
257257
writer.write_all(data)
258258
}
259259

260260
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, str>> {
261-
let string_length = VInt::deserialize(reader)?.val() as usize;
261+
let string_length = <VInt as BinarySerializable>::deserialize(reader)?.val() as usize;
262262
let mut result = String::with_capacity(string_length);
263263
reader
264264
.take(string_length as u64)
@@ -269,18 +269,18 @@ impl<'a> BinarySerializable for Cow<'a, str> {
269269

270270
impl<'a> BinarySerializable for Cow<'a, [u8]> {
271271
fn serialize<W: Write + ?Sized>(&self, writer: &mut W) -> io::Result<()> {
272-
VInt(self.len() as u64).serialize(writer)?;
272+
BinarySerializable::serialize(&VInt(self.len() as u64), writer)?;
273273
for it in self.iter() {
274-
it.serialize(writer)?;
274+
BinarySerializable::serialize(it, writer)?;
275275
}
276276
Ok(())
277277
}
278278

279279
fn deserialize<R: Read>(reader: &mut R) -> io::Result<Cow<'a, [u8]>> {
280-
let num_items = VInt::deserialize(reader)?.val();
280+
let num_items = <VInt as BinarySerializable>::deserialize(reader)?.val();
281281
let mut items: Vec<u8> = Vec::with_capacity(num_items as usize);
282282
for _ in 0..num_items {
283-
let item = u8::deserialize(reader)?;
283+
let item = <u8 as BinarySerializable>::deserialize(reader)?;
284284
items.push(item);
285285
}
286286
Ok(Cow::Owned(items))

src/compat_tests.rs

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -44,8 +44,19 @@ fn test_format_6() {
4444
assert_date_time_precision(&index, DateTimePrecision::Microseconds);
4545
}
4646

47+
/// feature flag quickwit uses a different dictionary type
48+
#[test]
49+
#[cfg(not(feature = "quickwit"))]
50+
fn test_format_7() {
51+
let path = path_for_version("7");
52+
53+
let index = Index::open_in_dir(path).expect("Failed to open index");
54+
// dates are not truncated in v7 in the docstore
55+
assert_date_time_precision(&index, DateTimePrecision::Nanoseconds);
56+
}
57+
4758
#[cfg(not(feature = "quickwit"))]
48-
fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
59+
fn assert_date_time_precision(index: &Index, doc_store_precision: DateTimePrecision) {
4960
use collector::TopDocs;
5061
let reader = index.reader().expect("Failed to create reader");
5162
let searcher = reader.searcher();
@@ -75,6 +86,6 @@ fn assert_date_time_precision(index: &Index, precision: DateTimePrecision) {
7586
.as_datetime()
7687
.unwrap();
7788

78-
let expected = DateTime::from_timestamp_nanos(123456).truncate(precision);
89+
let expected = DateTime::from_timestamp_nanos(123456).truncate(doc_store_precision);
7990
assert_eq!(date_value, expected,);
8091
}

src/lib.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -232,7 +232,7 @@ pub use crate::indexer::{IndexWriter, SingleSegmentIndexWriter};
232232
pub use crate::schema::{Document, TantivyDocument, Term};
233233

234234
/// Index format version.
235-
pub const INDEX_FORMAT_VERSION: u32 = 6;
235+
pub const INDEX_FORMAT_VERSION: u32 = 7;
236236
/// Oldest index format version this tantivy version can read.
237237
pub const INDEX_FORMAT_OLDEST_SUPPORTED_VERSION: u32 = 4;
238238

src/schema/document/de.rs

Lines changed: 74 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@ use super::se::BinaryObjectSerializer;
2222
use super::{OwnedValue, Value};
2323
use crate::schema::document::type_codes;
2424
use crate::schema::{Facet, Field};
25+
use crate::store::DocStoreVersion;
2526
use crate::tokenizer::PreTokenizedString;
2627

2728
#[derive(Debug, thiserror::Error, Clone)]
@@ -45,6 +46,9 @@ pub enum DeserializeError {
4546
#[error("{0}")]
4647
/// A custom error message.
4748
Custom(String),
49+
#[error("Version {0}, Max version supported: {1}")]
50+
/// Unsupported version error.
51+
UnsupportedVersion(u32, u32),
4852
}
4953

5054
impl DeserializeError {
@@ -291,19 +295,24 @@ pub trait ObjectAccess<'de> {
291295
pub struct BinaryDocumentDeserializer<'de, R> {
292296
length: usize,
293297
position: usize,
298+
doc_store_version: DocStoreVersion,
294299
reader: &'de mut R,
295300
}
296301

297302
impl<'de, R> BinaryDocumentDeserializer<'de, R>
298303
where R: Read
299304
{
300305
/// Attempts to create a new document deserializer from a given reader.
301-
pub(crate) fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
306+
pub(crate) fn from_reader(
307+
reader: &'de mut R,
308+
doc_store_version: DocStoreVersion,
309+
) -> Result<Self, DeserializeError> {
302310
let length = VInt::deserialize(reader)?;
303311

304312
Ok(Self {
305313
length: length.val() as usize,
306314
position: 0,
315+
doc_store_version,
307316
reader,
308317
})
309318
}
@@ -329,8 +338,8 @@ where R: Read
329338
}
330339

331340
let field = Field::deserialize(self.reader).map_err(DeserializeError::from)?;
332-
333-
let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
341+
let deserializer =
342+
BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
334343
let value = V::deserialize(deserializer)?;
335344

336345
self.position += 1;
@@ -344,13 +353,17 @@ where R: Read
344353
pub struct BinaryValueDeserializer<'de, R> {
345354
value_type: ValueType,
346355
reader: &'de mut R,
356+
doc_store_version: DocStoreVersion,
347357
}
348358

349359
impl<'de, R> BinaryValueDeserializer<'de, R>
350360
where R: Read
351361
{
352362
/// Attempts to create a new value deserializer from a given reader.
353-
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
363+
fn from_reader(
364+
reader: &'de mut R,
365+
doc_store_version: DocStoreVersion,
366+
) -> Result<Self, DeserializeError> {
354367
let type_code = <u8 as BinarySerializable>::deserialize(reader)?;
355368

356369
let value_type = match type_code {
@@ -391,7 +404,11 @@ where R: Read
391404
}
392405
};
393406

394-
Ok(Self { value_type, reader })
407+
Ok(Self {
408+
value_type,
409+
reader,
410+
doc_store_version,
411+
})
395412
}
396413

397414
fn validate_type(&self, expected_type: ValueType) -> Result<(), DeserializeError> {
@@ -438,7 +455,16 @@ where R: Read
438455

439456
fn deserialize_datetime(self) -> Result<DateTime, DeserializeError> {
440457
self.validate_type(ValueType::DateTime)?;
441-
<DateTime as BinarySerializable>::deserialize(self.reader).map_err(DeserializeError::from)
458+
match self.doc_store_version {
459+
DocStoreVersion::V1 => {
460+
let timestamp_micros = <i64 as BinarySerializable>::deserialize(self.reader)?;
461+
Ok(DateTime::from_timestamp_micros(timestamp_micros))
462+
}
463+
DocStoreVersion::V2 => {
464+
let timestamp_nanos = <i64 as BinarySerializable>::deserialize(self.reader)?;
465+
Ok(DateTime::from_timestamp_nanos(timestamp_nanos))
466+
}
467+
}
442468
}
443469

444470
fn deserialize_facet(self) -> Result<Facet, DeserializeError> {
@@ -514,11 +540,13 @@ where R: Read
514540
visitor.visit_pre_tokenized_string(val)
515541
}
516542
ValueType::Array => {
517-
let access = BinaryArrayDeserializer::from_reader(self.reader)?;
543+
let access =
544+
BinaryArrayDeserializer::from_reader(self.reader, self.doc_store_version)?;
518545
visitor.visit_array(access)
519546
}
520547
ValueType::Object => {
521-
let access = BinaryObjectDeserializer::from_reader(self.reader)?;
548+
let access =
549+
BinaryObjectDeserializer::from_reader(self.reader, self.doc_store_version)?;
522550
visitor.visit_object(access)
523551
}
524552
#[allow(deprecated)]
@@ -537,7 +565,8 @@ where R: Read
537565

538566
let out_rc = std::rc::Rc::new(out);
539567
let mut slice: &[u8] = &out_rc;
540-
let access = BinaryObjectDeserializer::from_reader(&mut slice)?;
568+
let access =
569+
BinaryObjectDeserializer::from_reader(&mut slice, self.doc_store_version)?;
541570

542571
visitor.visit_object(access)
543572
}
@@ -551,19 +580,24 @@ pub struct BinaryArrayDeserializer<'de, R> {
551580
length: usize,
552581
position: usize,
553582
reader: &'de mut R,
583+
doc_store_version: DocStoreVersion,
554584
}
555585

556586
impl<'de, R> BinaryArrayDeserializer<'de, R>
557587
where R: Read
558588
{
559589
/// Attempts to create a new array deserializer from a given reader.
560-
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
590+
fn from_reader(
591+
reader: &'de mut R,
592+
doc_store_version: DocStoreVersion,
593+
) -> Result<Self, DeserializeError> {
561594
let length = <VInt as BinarySerializable>::deserialize(reader)?;
562595

563596
Ok(Self {
564597
length: length.val() as usize,
565598
position: 0,
566599
reader,
600+
doc_store_version,
567601
})
568602
}
569603

@@ -587,7 +621,8 @@ where R: Read
587621
return Ok(None);
588622
}
589623

590-
let deserializer = BinaryValueDeserializer::from_reader(self.reader)?;
624+
let deserializer =
625+
BinaryValueDeserializer::from_reader(self.reader, self.doc_store_version)?;
591626
let value = V::deserialize(deserializer)?;
592627

593628
// Advance the position cursor.
@@ -610,8 +645,11 @@ impl<'de, R> BinaryObjectDeserializer<'de, R>
610645
where R: Read
611646
{
612647
/// Attempts to create a new object deserializer from a given reader.
613-
fn from_reader(reader: &'de mut R) -> Result<Self, DeserializeError> {
614-
let inner = BinaryArrayDeserializer::from_reader(reader)?;
648+
fn from_reader(
649+
reader: &'de mut R,
650+
doc_store_version: DocStoreVersion,
651+
) -> Result<Self, DeserializeError> {
652+
let inner = BinaryArrayDeserializer::from_reader(reader, doc_store_version)?;
615653
Ok(Self { inner })
616654
}
617655
}
@@ -819,6 +857,7 @@ mod tests {
819857
use crate::schema::document::existing_type_impls::JsonObjectIter;
820858
use crate::schema::document::se::BinaryValueSerializer;
821859
use crate::schema::document::{ReferenceValue, ReferenceValueLeaf};
860+
use crate::store::DOC_STORE_VERSION;
822861

823862
fn serialize_value<'a>(value: ReferenceValue<'a, &'a serde_json::Value>) -> Vec<u8> {
824863
let mut writer = Vec::new();
@@ -829,9 +868,19 @@ mod tests {
829868
writer
830869
}
831870

871+
fn serialize_owned_value<'a>(value: ReferenceValue<'a, &'a OwnedValue>) -> Vec<u8> {
872+
let mut writer = Vec::new();
873+
874+
let mut serializer = BinaryValueSerializer::new(&mut writer);
875+
serializer.serialize_value(value).expect("Serialize value");
876+
877+
writer
878+
}
879+
832880
fn deserialize_value(buffer: Vec<u8>) -> crate::schema::OwnedValue {
833881
let mut cursor = Cursor::new(buffer);
834-
let deserializer = BinaryValueDeserializer::from_reader(&mut cursor).unwrap();
882+
let deserializer =
883+
BinaryValueDeserializer::from_reader(&mut cursor, DOC_STORE_VERSION).unwrap();
835884
crate::schema::OwnedValue::deserialize(deserializer).expect("Deserialize value")
836885
}
837886

@@ -1010,6 +1059,17 @@ mod tests {
10101059
assert_eq!(value, expected_val);
10111060
}
10121061

1062+
#[test]
1063+
fn test_nested_date_precision() {
1064+
let object = OwnedValue::Object(vec![(
1065+
"my-date".into(),
1066+
OwnedValue::Date(DateTime::from_timestamp_nanos(323456)),
1067+
)]);
1068+
let result = serialize_owned_value((&object).as_value());
1069+
let value = deserialize_value(result);
1070+
assert_eq!(value, object);
1071+
}
1072+
10131073
#[test]
10141074
fn test_nested_serialize() {
10151075
let mut object = serde_json::Map::new();

0 commit comments

Comments
 (0)