From 70e69cfd0e6e695d29821a9ad48feb38066cba99 Mon Sep 17 00:00:00 2001 From: Luca Cappelletti Date: Mon, 15 Apr 2024 00:03:58 +0200 Subject: [PATCH] Added serde, made rayon optional, introduced GitHub actions CI, fixed many code smells. (#25) * Added serde, rendered rayon optional, removed dead code * Added clone and fixed more code smells * Resolved more code smells * Resolved another code smell * Now using README as actual part of documentation * Added github action for clippy and tests * Added derive for debug * Added derives for mem dbg, so to track memory usage --------- Co-authored-by: Sho Nakatani --- .github/workflows/clippy.yml | 36 +++++ Cargo.toml | 11 +- README.md | 3 +- src/fid.rs | 30 +++-- src/fid/chunk.rs | 1 - src/fid/chunks.rs | 81 +++++++++--- src/fid/{fid.rs => fid_impl.rs} | 11 +- src/internal_data_structure/popcount_table.rs | 12 +- src/internal_data_structure/raw_bit_vector.rs | 4 +- src/lib.rs | 124 +----------------- 10 files changed, 156 insertions(+), 157 deletions(-) create mode 100644 .github/workflows/clippy.yml rename src/fid/{fid.rs => fid_impl.rs} (98%) diff --git a/.github/workflows/clippy.yml b/.github/workflows/clippy.yml new file mode 100644 index 00000000..947eac1e --- /dev/null +++ b/.github/workflows/clippy.yml @@ -0,0 +1,36 @@ +name: Clippy + + +on: + push: + branches: ["master"] + pull_request: + branches: ["master"] + +env: + CARGO_TERM_COLOR: always + +jobs: + clippy: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v2 + - name: Install Clippy + run: + rustup toolchain install nightly --component clippy + - name: Set up Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: nightly + override: true + - name: Run clippy + run: cargo clippy --all-features + - name: Run clippy without rayon + run: cargo clippy --no-default-features --features="serde" + - name: Run tests + run: cargo test --all-features + - name: Run tests without rayon + run: cargo test --no-default-features --features="serde" + - name: Run tests release + run: cargo test --release --all-features diff --git a/Cargo.toml b/Cargo.toml index 5d110d52..3810ac17 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,7 +12,13 @@ categories = ["compression", "data-structures"] edition = "2018" [dependencies] -rayon = "1.0" +# Rayon is an optional feature, which is enabled by default. +# It is used to crate the Chunks collection in parallel. +rayon = { version = "1.5", optional = true } +# Serde is another optional feature, which can be enabled by setting `serde` feature. +# It is used to serialize and deserialize the FID structure. +serde = { version = "1.0", optional = true, features = ["derive"] } +mem_dbg = {version = "0.1.4", optional = true} [dev-dependencies] criterion = "0.2" @@ -26,3 +32,6 @@ harness = false tag-prefix = "v" pre-release-hook = ["emacs", "CHANGELOG.md"] # Finally, I found this hook so effective!! disable-publish = true + +[features] +default = ["rayon"] diff --git a/README.md b/README.md index 28dbfbc2..d12c29bd 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,8 @@ High performance FID (Fully Indexable Dictionary) library. | [Changelog](https://github.com/laysakura/fid-rs/blob/master/CHANGELOG.md) -[![Build Status](https://travis-ci.com/laysakura/fid-rs.svg?branch=master)](https://travis-ci.com/laysakura/fid-rs) +[![GitHub Actions Status](https://github.com/laysakura/fid-rs/actions/workflows/clippy.yml/badge.svg)](https://github.com/laysakura/fid-rs/actions) +[![Travis Status](https://travis-ci.com/laysakura/fid-rs.svg?branch=master)](https://travis-ci.com/laysakura/fid-rs) [![Crates.io Version](https://img.shields.io/crates/v/fid-rs.svg)](https://crates.io/crates/fid-rs) [![Crates.io Downloads](https://img.shields.io/crates/d/fid-rs.svg)](https://crates.io/crates/fid-rs) [![Minimum rustc version](https://img.shields.io/badge/rustc-1.33+-lightgray.svg)](https://github.com/laysakura/fid-rs#rust-version-supports) diff --git a/src/fid.rs b/src/fid.rs index 934a221d..6479a8dd 100644 --- a/src/fid.rs +++ b/src/fid.rs @@ -2,11 +2,17 @@ mod block; mod blocks; mod chunk; mod chunks; -mod fid; +mod fid_impl; mod fid_iter; use super::internal_data_structure::popcount_table::PopcountTable; +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "mem_dbg")] +use mem_dbg::{MemDbg, MemSize}; + /// FID (Fully Indexable Dictionary). /// /// This class can handle bit sequence of virtually **arbitrary length.** @@ -94,7 +100,9 @@ use super::internal_data_structure::popcount_table::PopcountTable; /// In summary: /// /// _rank() = (value of left chunk) + (value of left block) + (value of table keyed by inner block bits)_. -#[derive(Clone)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))] pub struct Fid { /// Raw data. byte_vec: Vec, @@ -119,6 +127,9 @@ pub struct FidIter<'iter> { #[derive(Clone)] /// Collection of Chunk. +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))] struct Chunks { chunks: Vec, chunks_cnt: u64, @@ -127,17 +138,18 @@ struct Chunks { /// Total popcount of _[0, last bit of the chunk]_ of a bit vector. /// /// Each chunk takes _2^64_ at max (when every bit is '1' for Fid of length of _2^64_). -#[derive(Clone)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))] struct Chunk { value: u64, // popcount blocks: Blocks, - - #[allow(dead_code)] - length: u16, } /// Collection of Block in a Chunk. -#[derive(Clone)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))] struct Blocks { blocks: Vec, blocks_cnt: u16, @@ -146,7 +158,9 @@ struct Blocks { /// Total popcount of _[_first bit of the chunk which the block belongs to_, _last bit of the block_]_ of a bit vector. /// /// Each block takes (log 2^64)^2 = 64^2 = 2^16 at max (when every bit in a chunk is 1 for Fid of length of 2^64) -#[derive(Clone)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))] struct Block { value: u16, // popcount length: u8, diff --git a/src/fid/chunk.rs b/src/fid/chunk.rs index 89551b45..3ef25275 100644 --- a/src/fid/chunk.rs +++ b/src/fid/chunk.rs @@ -7,7 +7,6 @@ impl super::Chunk { let blocks = Blocks::new(rbv, i_chunk, length); Chunk { value, - length, blocks, } } diff --git a/src/fid/chunks.rs b/src/fid/chunks.rs index 842abb2d..bb539a30 100644 --- a/src/fid/chunks.rs +++ b/src/fid/chunks.rs @@ -1,4 +1,4 @@ -extern crate rayon; +#[cfg(feature = "rayon")] use rayon::prelude::*; use super::{Chunk, Chunks}; @@ -6,22 +6,22 @@ use crate::internal_data_structure::raw_bit_vector::RawBitVector; impl super::Chunks { /// Constructor. + #[cfg(feature = "rayon")] pub fn new(rbv: &RawBitVector) -> Chunks { let n = rbv.len(); let chunk_size: u16 = Chunks::calc_chunk_size(n); - let chunks_cnt: u64 = Chunks::calc_chunks_cnt(n); + let chunks_cnt: usize = Chunks::calc_chunks_cnt(n) as usize; // In order to use chunks.par_iter_mut(), chunks should have len first. // So fill meaning less None value. - let mut opt_chunks: Vec> = vec![None; chunks_cnt as usize]; + let mut chunks: Vec = Vec::with_capacity(chunks_cnt); // Parallel - Each chunk has its popcount. // Actually, chunk should have total popcount from index 0 but it is calculated later in sequential manner. - opt_chunks - .par_iter_mut() - .enumerate() - .for_each(|(i_chunk, chunk)| { - let this_chunk_size: u16 = if i_chunk as u64 == chunks_cnt - 1 { + (0..chunks_cnt) + .into_par_iter() + .map(|number_of_chunk| { + let this_chunk_size: u16 = if number_of_chunk == chunks_cnt - 1 { // When `chunk_size == 6`: // // 000 111 000 11 : rbv @@ -39,27 +39,76 @@ impl super::Chunks { chunk_size }; - let chunk_rbv = - rbv.clone_sub(i_chunk as u64 * chunk_size as u64, this_chunk_size as u64); + let chunk_rbv = rbv.clone_sub( + number_of_chunk as u64 * chunk_size as u64, + this_chunk_size as u64, + ); let popcnt_in_chunk = chunk_rbv.popcount(); - *chunk = Some(Chunk::new( + Chunk::new( popcnt_in_chunk, this_chunk_size, rbv, - i_chunk as u64, - )); - }); + number_of_chunk as u64, + ) + }) + .collect_into_vec(&mut chunks); // Sequential - Each chunk has total popcount from index 0. - let mut chunks: Vec = opt_chunks.into_iter().map(|v| v.unwrap()).collect(); - for i_chunk in 0..(chunks_cnt as usize) { + for i_chunk in 0..chunks_cnt { chunks[i_chunk].value += if i_chunk == 0 { 0 } else { chunks[i_chunk - 1].value } } + Chunks { + chunks, + chunks_cnt: chunks_cnt as u64, + } + } + + /// Constructor. + #[cfg(not(feature = "rayon"))] + pub fn new(rbv: &RawBitVector) -> Chunks { + let n = rbv.len(); + let chunk_size: u16 = Chunks::calc_chunk_size(n); + let chunks_cnt: u64 = Chunks::calc_chunks_cnt(n); + + let mut chunks: Vec = Vec::with_capacity(chunks_cnt as usize); + let mut comulative_popcount = 0; + + for i_chunk in 0..chunks_cnt { + let this_chunk_size: u16 = if i_chunk == chunks_cnt - 1 { + // When `chunk_size == 6`: + // + // 000 111 000 11 : rbv + // | | | : chunks + // + // Here, when `i_chunk == 1` (targeting on last '00011' chunk), + // `this_chunk_size == 5` + let chunk_size_or_0 = (n % chunk_size as u64) as u16; + if chunk_size_or_0 == 0 { + chunk_size + } else { + chunk_size_or_0 + } + } else { + chunk_size + }; + + let chunk_rbv = rbv.clone_sub(i_chunk * chunk_size as u64, this_chunk_size as u64); + + let popcnt_in_chunk = chunk_rbv.popcount(); + comulative_popcount += popcnt_in_chunk; + chunks.push(Chunk::new( + comulative_popcount, + this_chunk_size, + rbv, + i_chunk, + )); + } + Chunks { chunks, chunks_cnt } } diff --git a/src/fid/fid.rs b/src/fid/fid_impl.rs similarity index 98% rename from src/fid/fid.rs rename to src/fid/fid_impl.rs index db850cbc..c4ba9d33 100644 --- a/src/fid/fid.rs +++ b/src/fid/fid_impl.rs @@ -209,7 +209,7 @@ impl Fid { let n = self.len(); assert!(num <= n); - if num == 0 || num == 1 && self[0] == true { + if num == 0 || num == 1 && self[0] { return Some(0); } if self.rank(n - 1) < num { @@ -237,7 +237,7 @@ impl Fid { let n = self.bit_len; assert!(num <= n); - if num == 0 || num == 1 && self[0] == false { + if num == 0 || num == 1 && !self[0] { return Some(0); } if self.rank0(n - 1) < num { @@ -262,6 +262,11 @@ impl Fid { self.bit_len } + /// Returns whether the FID is empty. + pub fn is_empty(&self) -> bool { + self.bit_len == 0 + } + fn rbv(&self) -> RawBitVector { let last_byte_len_or_0 = (self.bit_len % 8) as u8; RawBitVector::new( @@ -362,7 +367,7 @@ mod from_slice_failure_tests { #[test] #[should_panic] fn empty() { - Fid::from(&[][..]); + let _ = Fid::from(&[][..]); } } diff --git a/src/internal_data_structure/popcount_table.rs b/src/internal_data_structure/popcount_table.rs index 43c1938b..8b5624ce 100644 --- a/src/internal_data_structure/popcount_table.rs +++ b/src/internal_data_structure/popcount_table.rs @@ -1,5 +1,13 @@ +#[cfg(feature = "serde")] +use serde::{Deserialize, Serialize}; + +#[cfg(feature = "mem_dbg")] +use mem_dbg::{MemDbg, MemSize}; + /// Cache table of `popcount` results. -#[derive(Clone)] +#[derive(Clone, Debug)] +#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))] +#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))] pub struct PopcountTable { bit_length: u8, @@ -19,7 +27,7 @@ impl PopcountTable { /// When `bit_length` is out of [1, 64]. pub fn new(bit_length: u8) -> PopcountTable { assert!( - 1 <= bit_length && bit_length <= 64, + (1..=64).contains(&bit_length), "bit_length (= {}) must be in [1, 64]", bit_length ); diff --git a/src/internal_data_structure/raw_bit_vector.rs b/src/internal_data_structure/raw_bit_vector.rs index fa9bfd06..232d16b8 100644 --- a/src/internal_data_structure/raw_bit_vector.rs +++ b/src/internal_data_structure/raw_bit_vector.rs @@ -97,7 +97,7 @@ impl<'s> RawBitVector<'s> { // remove 1s in the left of first_byte_offset let left_1s_byte = match self.first_byte_offset { - 0 => 0b00000000 & self.byte_slice[0], + 0 => 0, 1 => 0b10000000 & self.byte_slice[0], 2 => 0b11000000 & self.byte_slice[0], 3 => 0b11100000 & self.byte_slice[0], @@ -120,7 +120,7 @@ impl<'s> RawBitVector<'s> { 4 => 0b00000111 & last_byte, 5 => 0b00000011 & last_byte, 6 => 0b00000001 & last_byte, - 7 => 0b00000000 & last_byte, + 7 => 0, _ => panic!("never happen"), }; popcnt -= right_1s_byte.count_ones() as u64; diff --git a/src/lib.rs b/src/lib.rs index 4aef242e..41397445 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,126 +1,4 @@ -//! High performance FID (Fully Indexable Dictionary) library. -//! -//! [Master API Docs](https://laysakura.github.io/fid-rs/fid_rs/) -//! | -//! [Released API Docs](https://docs.rs/crate/fid-rs) -//! | -//! [Benchmark Results](https://laysakura.github.io/fid-rs/criterion/report/) -//! | -//! [Changelog](https://github.com/laysakura/fid-rs/blob/master/CHANGELOG.md) -//! -//! [![Build Status](https://travis-ci.com/laysakura/fid-rs.svg?branch=master)](https://travis-ci.com/laysakura/fid-rs) -//! [![Crates.io Version](https://img.shields.io/crates/v/fid-rs.svg)](https://crates.io/crates/fid-rs) -//! [![Crates.io Downloads](https://img.shields.io/crates/d/fid-rs.svg)](https://crates.io/crates/fid-rs) -//! [![Minimum rustc version](https://img.shields.io/badge/rustc-1.33+-lightgray.svg)](https://github.com/laysakura/fid-rs#rust-version-supports) -//! [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://github.com/laysakura/fid-rs/blob/master/LICENSE-MIT) -//! [![License: Apache 2.0](https://img.shields.io/badge/license-Apache_2.0-blue.svg)](https://github.com/laysakura/fid-rs/blob/master/LICENSE-APACHE) -//! -//! # Quickstart -//! -//! To use fid-rs, add the following to your `Cargo.toml` file: -//! -//! ```toml -//! [dependencies] -//! fid-rs = "0.1" # NOTE: Replace to latest minor version. -//! ``` -//! -//! ## Usage Overview -//! -//! ```rust -//! use fid_rs::Fid; -//! -//! let fid = Fid::from("0100_1"); // Tips: Fid::from::<&str>() ignores '_'. -//! -//! // Basic operations --------------------- -//! assert_eq!(fid[0], false); // [0]1001; 0th bit is '0' (false) -//! assert_eq!(fid[1], true); // 0[1]001; 1st bit is '1' (true) -//! assert_eq!(fid[4], true); // 0100[1]; 4th bit is '1' (true) -//! -//! assert_eq!(fid.rank(0), 0); // [0]1001; Range [0, 0] has no '1' -//! assert_eq!(fid.rank(3), 1); // [0100]1; Range [0, 3] has 1 '1' -//! assert_eq!(fid.rank(4), 2); // [01001]; Range [0, 4] has 2 '1's -//! -//! assert_eq!(fid.select(0), Some(0)); // []01001; Minimum i where range [0, i] has 0 '1's is i=0 -//! assert_eq!(fid.select(1), Some(1)); // 0[1]001; Minimum i where range [0, i] has 1 '1's is i=1 -//! assert_eq!(fid.select(2), Some(4)); // 0100[1]; Minimum i where range [0, i] has 2 '1's is i=4 -//! assert_eq!(fid.select(3), None); // There is no i where range [0, i] has 3 '1's -//! -//! // rank0, select0 ----------------------- -//! assert_eq!(fid.rank0(0), 1); // [0]1001; Range [0, 0] has no '0' -//! assert_eq!(fid.rank0(3), 3); // [0100]1; Range [0, 3] has 3 '0's -//! assert_eq!(fid.rank0(4), 3); // [01001]; Range [0, 4] has 3 '0's -//! -//! assert_eq!(fid.select0(0), Some(0)); // []01001; Minimum i where range [0, i] has 0 '0's is i=0 -//! assert_eq!(fid.select0(1), Some(0)); // [0]1001; Minimum i where range [0, i] has 1 '0's is i=0 -//! assert_eq!(fid.select0(2), Some(2)); // 01[0]01; Minimum i where range [0, i] has 2 '0's is i=2 -//! assert_eq!(fid.select0(4), None); // There is no i where range [0, i] has 4 '0's -//! ``` -//! -//! ## Constructors -//! -//! ```rust -//! use fid_rs::Fid; -//! -//! // Most human-friendly way: Fid::from::<&str>() -//! let fid = Fid::from("0100_1"); -//! -//! // Complex construction in simple way: Fid::from::<&[bool]>() -//! let mut arr = [false; 5]; -//! arr[1] = true; -//! arr[4] = true; -//! let fid = Fid::from(&arr[..]); -//! ``` -//! -//! ## Iterator -//! -//! ```rust -//! use fid_rs::Fid; -//! -//! let fid = Fid::from("0100_1"); -//! -//! for bit in fid.iter() { -//! println!("{}", bit); -//! } -//! // => -//! // false -//! // true -//! // false -//! // false -//! // true -//! ``` -//! -//! ## Utility Methods -//! -//! ```rust -//! use fid_rs::Fid; -//! -//! let fid = Fid::from("0100_1"); -//! -//! assert_eq!(fid.len(), 5); -//! ``` -//! -//! # Features -//! -//! - **Arbitrary length support with minimum working memory**: fid-rs provides virtually _arbitrary size_ of FID. It is carefully designed to use as small memory space as possible. -//! - **Parallel build of FID**: Build operations (`Fid::from()`) takes _O(N)_ time. It is parallelized and achieves nearly optimal scale-out. -//! - **No memory copy while/after build operations**: After internally creating bit vector representation, any operation does not do memory copy. -//! - **Latest benchmark results are always accessible**: fid-rs is continuously benchmarked in Travis CI using [Criterion.rs](https://crates.io/crates/criterion). Graphical benchmark results are published [here](https://laysakura.github.io/fid-rs/criterion/report/). -//! -//! ## Complexity -//! -//! When the length of a `Fid` is _N_: -//! -//! | Operation | Time-complexity | Space-complexity | -//! |-----------|-----------------|------------------| -//! | [Fid::from::<&str>()](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#implementations) | _O(N)_ | _N + o(N)_ | -//! | [Fid::from::<&[bool]>()](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#implementations) | _O(N)_ | _N + o(N)_ | -//! | [Index<u64>](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#impl-Index) | _O(1)_ | _0_ | -//! | [Fid::rank()](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#method.rank) | _O(1)_ | _O(1)_ | -//! | [Fid::rank0()](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#method.rank0) | _O(1)_ | _O(1)_ | -//! | [Fid::select()](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#method.select) | _O(log N)_ | _O(1)_ | -//! | [Fid::select0()](https://laysakura.github.io/fid-rs/fid_rs/fid/struct.Fid.html#method.select0) | _O(log N)_ | _O(1)_ | -//! -//! (Actually, `select()`'s time-complexity can be _O(1)_ with complex implementation but fid-rs, like many other libraries, uses binary search of `rank()`'s result). +#![doc = include_str!("../README.md")] pub use fid::Fid;