Skip to content

Commit e7daf69

Browse files
committed
use usize in bitpacker
Use usize in the bitpacker to enable larger columns in the columnar store.
Godbolt comparison of u32 vs. u64 for `get` access: https://godbolt.org/z/cjf7nenYP
Add a mini-tool to inspect columnar files created by tantivy (very basic functionality, which can be extended later).
1 parent 876a579 commit e7daf69

File tree

6 files changed

+95
-7
lines changed

6 files changed

+95
-7
lines changed

bitpacker/src/bitpacker.rs

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -94,14 +94,14 @@ impl BitUnpacker {
9494

9595
#[inline]
9696
pub fn get(&self, idx: u32, data: &[u8]) -> u64 {
97-
let addr_in_bits = idx * self.num_bits;
98-
let addr = (addr_in_bits >> 3) as usize;
97+
let addr_in_bits = idx as usize * self.num_bits as usize;
98+
let addr = addr_in_bits >> 3;
9999
if addr + 8 > data.len() {
100100
if self.num_bits == 0 {
101101
return 0;
102102
}
103103
let bit_shift = addr_in_bits & 7;
104-
return self.get_slow_path(addr, bit_shift, data);
104+
return self.get_slow_path(addr, bit_shift as u32, data);
105105
}
106106
let bit_shift = addr_in_bits & 7;
107107
let bytes: [u8; 8] = (&data[addr..addr + 8]).try_into().unwrap();
Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[package]
2+
name = "tantivy-columnar-inspect"
3+
version = "0.1.0"
4+
edition = "2021"
5+
license = "MIT"
6+
7+
[dependencies]
8+
tantivy = {path="../..", package="tantivy"}
9+
columnar = {path="../", package="tantivy-columnar"}
10+
common = {path="../../common", package="tantivy-common"}
11+
12+
[workspace]
13+
members = []
14+
15+
[profile.release]
16+
debug = true
17+
#debug-assertions = true
18+
#overflow-checks = true
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
use columnar::ColumnarReader;
2+
use common::file_slice::{FileSlice, WrapFile};
3+
use std::io;
4+
use std::path::Path;
5+
use tantivy::directory::footer::Footer;
6+
7+
fn main() -> io::Result<()> {
8+
println!("Opens a columnar file written by tantivy and validates it.");
9+
let path = std::env::args().nth(1).unwrap();
10+
11+
let path = Path::new(&path);
12+
println!("Reading {:?}", path);
13+
let _reader = open_and_validate_columnar(path.to_str().unwrap())?;
14+
15+
Ok(())
16+
}
17+
18+
pub fn validate_columnar_reader(reader: &ColumnarReader) {
19+
let num_rows = reader.num_rows();
20+
println!("num_rows: {}", num_rows);
21+
let columns = reader.list_columns().unwrap();
22+
println!("num columns: {:?}", columns.len());
23+
for (col_name, dynamic_column_handle) in columns {
24+
let col = dynamic_column_handle.open().unwrap();
25+
match col {
26+
columnar::DynamicColumn::Bool(_)
27+
| columnar::DynamicColumn::I64(_)
28+
| columnar::DynamicColumn::U64(_)
29+
| columnar::DynamicColumn::F64(_)
30+
| columnar::DynamicColumn::IpAddr(_)
31+
| columnar::DynamicColumn::DateTime(_)
32+
| columnar::DynamicColumn::Bytes(_) => {}
33+
columnar::DynamicColumn::Str(str_column) => {
34+
let num_vals = str_column.ords().values.num_vals();
35+
let num_terms_dict = str_column.num_terms() as u64;
36+
let max_ord = str_column.ords().values.iter().max().unwrap_or_default();
37+
println!("{col_name:35} num_vals {num_vals:10} \t num_terms_dict {num_terms_dict:8} max_ord: {max_ord:8}",);
38+
for ord in str_column.ords().values.iter() {
39+
assert!(ord < num_terms_dict);
40+
}
41+
}
42+
}
43+
}
44+
}
45+
46+
/// Opens a columnar file that was written by tantivy and validates it.
47+
pub fn open_and_validate_columnar(path: &str) -> io::Result<ColumnarReader> {
48+
let wrap_file = WrapFile::new(std::fs::File::open(path)?)?;
49+
let slice = FileSlice::new(std::sync::Arc::new(wrap_file));
50+
let (_footer, slice) = Footer::extract_footer(slice.clone()).unwrap();
51+
let reader = ColumnarReader::open(slice).unwrap();
52+
validate_columnar_reader(&reader);
53+
Ok(reader)
54+
}

common/src/file_slice.rs

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
use std::fs::File;
22
use std::ops::{Deref, Range, RangeBounds};
3+
use std::path::Path;
34
use std::sync::Arc;
45
use std::{fmt, io};
56

@@ -177,6 +178,12 @@ fn combine_ranges<R: RangeBounds<usize>>(orig_range: Range<usize>, rel_range: R)
177178
}
178179

179180
impl FileSlice {
181+
/// Creates a FileSlice from a path.
182+
pub fn open(path: &Path) -> io::Result<FileSlice> {
183+
let wrap_file = WrapFile::new(File::open(path)?)?;
184+
Ok(FileSlice::new(Arc::new(wrap_file)))
185+
}
186+
180187
/// Wraps a FileHandle.
181188
pub fn new(file_handle: Arc<dyn FileHandle>) -> Self {
182189
let num_bytes = file_handle.len();

src/directory/footer.rs

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,9 @@
1+
//! The footer is a small metadata structure that is appended at the end of every file.
2+
//!
3+
//! The footer is used to store a checksum of the file content.
4+
//! The footer also stores the version of the index format.
5+
//! This version is used to detect incompatibility between the index and the library version.
6+
17
use std::io;
28
use std::io::Write;
39

@@ -20,20 +26,22 @@ type CrcHashU32 = u32;
2026
/// A Footer is appended to every file
2127
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
2228
pub struct Footer {
29+
/// The version of the index format
2330
pub version: Version,
31+
/// The crc32 hash of the body
2432
pub crc: CrcHashU32,
2533
}
2634

2735
impl Footer {
28-
pub fn new(crc: CrcHashU32) -> Self {
36+
pub(crate) fn new(crc: CrcHashU32) -> Self {
2937
let version = crate::VERSION.clone();
3038
Footer { version, crc }
3139
}
3240

33-
pub fn crc(&self) -> CrcHashU32 {
41+
pub(crate) fn crc(&self) -> CrcHashU32 {
3442
self.crc
3543
}
36-
pub fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
44+
pub(crate) fn append_footer<W: io::Write>(&self, mut write: &mut W) -> io::Result<()> {
3745
let mut counting_write = CountingWriter::wrap(&mut write);
3846
counting_write.write_all(serde_json::to_string(&self)?.as_ref())?;
3947
let footer_payload_len = counting_write.written_bytes();
@@ -42,6 +50,7 @@ impl Footer {
4250
Ok(())
4351
}
4452

53+
/// Extracts the tantivy Footer from the file and returns the footer and the rest of the file
4554
pub fn extract_footer(file: FileSlice) -> io::Result<(Footer, FileSlice)> {
4655
if file.len() < 4 {
4756
return Err(io::Error::new(

src/directory/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ mod mmap_directory;
66
mod directory;
77
mod directory_lock;
88
mod file_watcher;
9-
mod footer;
9+
pub mod footer;
1010
mod managed_directory;
1111
mod ram_directory;
1212
mod watch_event_router;

0 commit comments

Comments
 (0)