Skip to content

Commit

Permalink
Merge branch 'master' of ssh://github.com/laysakura/fid-rs
Browse files Browse the repository at this point in the history
  • Loading branch information
laysakura committed Apr 14, 2024
2 parents 6934dfc + 70e69cf commit 4039dc2
Show file tree
Hide file tree
Showing 10 changed files with 156 additions and 157 deletions.
36 changes: 36 additions & 0 deletions .github/workflows/clippy.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
# Lints and tests the crate on the nightly toolchain, both with all features
# and with the default `rayon` feature disabled.
name: Clippy

on:
  push:
    branches: ["master"]
  pull_request:
    branches: ["master"]

env:
  CARGO_TERM_COLOR: always

jobs:
  clippy:
    runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
      # A single rustup step installs nightly together with Clippy and pins it
      # for this checkout, so no separate toolchain action is needed.
      - name: Install nightly toolchain with Clippy
        run: |
          rustup toolchain install nightly --component clippy
          rustup override set nightly
      - name: Run clippy
        run: cargo clippy --all-features
      - name: Run clippy without rayon
        run: cargo clippy --no-default-features --features="serde"
      - name: Run tests
        run: cargo test --all-features
      - name: Run tests without rayon
        run: cargo test --no-default-features --features="serde"
      - name: Run tests release
        run: cargo test --release --all-features
11 changes: 10 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,21 @@ categories = ["compression", "data-structures"]
edition = "2018"

[dependencies]
rayon = "1.0"
# Rayon is an optional feature, which is enabled by default.
# It is used to create the Chunks collection in parallel.
rayon = { version = "1.5", optional = true }
# Serde is another optional feature, which can be enabled by setting `serde` feature.
# It is used to serialize and deserialize the FID structure.
serde = { version = "1.0", optional = true, features = ["derive"] }
mem_dbg = {version = "0.1.4", optional = true}

[dev-dependencies]
criterion = "0.5"
rand = "0.8"

[features]
default = ["rayon"]

[[bench]]
name = "bench"
harness = false
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,8 @@ High performance FID (Fully Indexable Dictionary) library.
|
[Changelog](https://github.com/laysakura/fid-rs/blob/master/CHANGELOG.md)

[![Build Status](https://travis-ci.com/laysakura/fid-rs.svg?branch=master)](https://travis-ci.com/laysakura/fid-rs)
[![GitHub Actions Status](https://github.com/laysakura/fid-rs/actions/workflows/clippy.yml/badge.svg)](https://github.com/laysakura/fid-rs/actions)
[![Travis Status](https://travis-ci.com/laysakura/fid-rs.svg?branch=master)](https://travis-ci.com/laysakura/fid-rs)
[![Crates.io Version](https://img.shields.io/crates/v/fid-rs.svg)](https://crates.io/crates/fid-rs)
[![Crates.io Downloads](https://img.shields.io/crates/d/fid-rs.svg)](https://crates.io/crates/fid-rs)
[![Minimum rustc version](https://img.shields.io/badge/rustc-1.33+-lightgray.svg)](https://github.com/laysakura/fid-rs#rust-version-supports)
Expand Down
30 changes: 22 additions & 8 deletions src/fid.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,17 @@ mod block;
mod blocks;
mod chunk;
mod chunks;
mod fid;
mod fid_impl;
mod fid_iter;

use super::internal_data_structure::popcount_table::PopcountTable;

#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "mem_dbg")]
use mem_dbg::{MemDbg, MemSize};

/// FID (Fully Indexable Dictionary).
///
/// This struct can handle bit sequences of virtually **arbitrary length.**
Expand Down Expand Up @@ -94,7 +100,9 @@ use super::internal_data_structure::popcount_table::PopcountTable;
/// In summary:
///
/// _rank() = (value of left chunk) + (value of left block) + (value of table keyed by inner block bits)_.
#[derive(Clone)]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
pub struct Fid {
/// Raw data.
byte_vec: Vec<u8>,
Expand All @@ -119,6 +127,9 @@ pub struct FidIter<'iter> {

#[derive(Clone)]
/// Collection of Chunk.
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
struct Chunks {
chunks: Vec<Chunk>,
chunks_cnt: u64,
Expand All @@ -127,17 +138,18 @@ struct Chunks {
/// Total popcount of _[0, <u>last bit of the chunk</u>]_ of a bit vector.
///
/// Each chunk takes _2^64_ at max (when every bit is '1' for Fid of length of _2^64_).
#[derive(Clone)]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
struct Chunk {
value: u64, // popcount
blocks: Blocks,

#[allow(dead_code)]
length: u16,
}

/// Collection of Block in a Chunk.
#[derive(Clone)]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
struct Blocks {
blocks: Vec<Block>,
blocks_cnt: u16,
Expand All @@ -146,7 +158,9 @@ struct Blocks {
/// Total popcount of _[_first bit of the chunk which the block belongs to_, _last bit of the block_]_ of a bit vector.
///
/// Each block takes (log 2^64)^2 = 64^2 = 2^16 at max (when every bit in a chunk is 1 for Fid of length of 2^64)
#[derive(Clone)]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
struct Block {
value: u16, // popcount
length: u8,
Expand Down
1 change: 0 additions & 1 deletion src/fid/chunk.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ impl super::Chunk {
let blocks = Blocks::new(rbv, i_chunk, length);
Chunk {
value,
length,
blocks,
}
}
Expand Down
81 changes: 65 additions & 16 deletions src/fid/chunks.rs
Original file line number Diff line number Diff line change
@@ -1,27 +1,27 @@
extern crate rayon;
#[cfg(feature = "rayon")]
use rayon::prelude::*;

use super::{Chunk, Chunks};
use crate::internal_data_structure::raw_bit_vector::RawBitVector;

impl super::Chunks {
/// Constructor.
#[cfg(feature = "rayon")]
pub fn new(rbv: &RawBitVector) -> Chunks {
let n = rbv.len();
let chunk_size: u16 = Chunks::calc_chunk_size(n);
let chunks_cnt: u64 = Chunks::calc_chunks_cnt(n);
let chunks_cnt: usize = Chunks::calc_chunks_cnt(n) as usize;

// In order to use chunks.par_iter_mut(), chunks should have len first.
// So fill meaning less None value.
let mut opt_chunks: Vec<Option<Chunk>> = vec![None; chunks_cnt as usize];
let mut chunks: Vec<Chunk> = Vec::with_capacity(chunks_cnt);

// Parallel - Each chunk has its popcount.
// Actually, chunk should have total popcount from index 0 but it is calculated later in sequential manner.
opt_chunks
.par_iter_mut()
.enumerate()
.for_each(|(i_chunk, chunk)| {
let this_chunk_size: u16 = if i_chunk as u64 == chunks_cnt - 1 {
(0..chunks_cnt)
.into_par_iter()
.map(|number_of_chunk| {
let this_chunk_size: u16 = if number_of_chunk == chunks_cnt - 1 {
// When `chunk_size == 6`:
//
// 000 111 000 11 : rbv
Expand All @@ -39,27 +39,76 @@ impl super::Chunks {
chunk_size
};

let chunk_rbv =
rbv.clone_sub(i_chunk as u64 * chunk_size as u64, this_chunk_size as u64);
let chunk_rbv = rbv.clone_sub(
number_of_chunk as u64 * chunk_size as u64,
this_chunk_size as u64,
);

let popcnt_in_chunk = chunk_rbv.popcount();
*chunk = Some(Chunk::new(
Chunk::new(
popcnt_in_chunk,
this_chunk_size,
rbv,
i_chunk as u64,
));
});
number_of_chunk as u64,
)
})
.collect_into_vec(&mut chunks);

// Sequential - Each chunk has total popcount from index 0.
let mut chunks: Vec<Chunk> = opt_chunks.into_iter().map(|v| v.unwrap()).collect();
for i_chunk in 0..(chunks_cnt as usize) {
for i_chunk in 0..chunks_cnt {
chunks[i_chunk].value += if i_chunk == 0 {
0
} else {
chunks[i_chunk - 1].value
}
}
Chunks {
chunks,
chunks_cnt: chunks_cnt as u64,
}
}

/// Constructor (sequential variant).
///
/// Compiled only when the `rayon` feature is disabled. Walks the chunks of `rbv`
/// in order and gives each `Chunk` the running popcount of every bit from
/// index 0 through the last bit of that chunk.
#[cfg(not(feature = "rayon"))]
pub fn new(rbv: &RawBitVector) -> Chunks {
    let bit_len = rbv.len();
    let chunk_size: u16 = Chunks::calc_chunk_size(bit_len);
    let chunks_cnt: u64 = Chunks::calc_chunks_cnt(bit_len);

    let mut chunks: Vec<Chunk> = Vec::with_capacity(chunks_cnt as usize);
    // Popcount accumulated over all chunks visited so far.
    let mut cumulative_popcount = 0;

    for chunk_idx in 0..chunks_cnt {
        let is_last_chunk = chunk_idx + 1 == chunks_cnt;
        let this_chunk_size: u16 = if is_last_chunk {
            // Only the last chunk may be shorter than `chunk_size`.
            // E.g. with `chunk_size == 6` and an 11-bit vector, the last chunk
            // (index 1) holds `11 % 6 == 5` bits. A remainder of 0 means the
            // vector length is a multiple of `chunk_size`, so the chunk is full.
            match (bit_len % u64::from(chunk_size)) as u16 {
                0 => chunk_size,
                remainder => remainder,
            }
        } else {
            chunk_size
        };

        let first_bit = chunk_idx * u64::from(chunk_size);
        cumulative_popcount += rbv
            .clone_sub(first_bit, u64::from(this_chunk_size))
            .popcount();

        chunks.push(Chunk::new(
            cumulative_popcount,
            this_chunk_size,
            rbv,
            chunk_idx,
        ));
    }

    Chunks { chunks, chunks_cnt }
}

Expand Down
11 changes: 8 additions & 3 deletions src/fid/fid.rs → src/fid/fid_impl.rs
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ impl Fid {
let n = self.len();
assert!(num <= n);

if num == 0 || num == 1 && self[0] == true {
if num == 0 || num == 1 && self[0] {
return Some(0);
}
if self.rank(n - 1) < num {
Expand Down Expand Up @@ -237,7 +237,7 @@ impl Fid {
let n = self.bit_len;
assert!(num <= n);

if num == 0 || num == 1 && self[0] == false {
if num == 0 || num == 1 && !self[0] {
return Some(0);
}
if self.rank0(n - 1) < num {
Expand All @@ -262,6 +262,11 @@ impl Fid {
self.bit_len
}

/// Returns whether the FID is empty.
///
/// The FID is considered empty when its stored bit length (`bit_len`) is zero.
pub fn is_empty(&self) -> bool {
self.bit_len == 0
}

fn rbv(&self) -> RawBitVector {
let last_byte_len_or_0 = (self.bit_len % 8) as u8;
RawBitVector::new(
Expand Down Expand Up @@ -362,7 +367,7 @@ mod from_slice_failure_tests {
#[test]
#[should_panic]
fn empty() {
Fid::from(&[][..]);
let _ = Fid::from(&[][..]);
}
}

Expand Down
12 changes: 10 additions & 2 deletions src/internal_data_structure/popcount_table.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,13 @@
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};

#[cfg(feature = "mem_dbg")]
use mem_dbg::{MemDbg, MemSize};

/// Cache table of `popcount` results.
#[derive(Clone)]
#[derive(Clone, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "mem_dbg", derive(MemDbg, MemSize))]
pub struct PopcountTable {
bit_length: u8,

Expand All @@ -19,7 +27,7 @@ impl PopcountTable {
/// When `bit_length` is out of [1, 64].
pub fn new(bit_length: u8) -> PopcountTable {
assert!(
1 <= bit_length && bit_length <= 64,
(1..=64).contains(&bit_length),
"bit_length (= {}) must be in [1, 64]",
bit_length
);
Expand Down
4 changes: 2 additions & 2 deletions src/internal_data_structure/raw_bit_vector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ impl<'s> RawBitVector<'s> {

// remove 1s in the left of first_byte_offset
let left_1s_byte = match self.first_byte_offset {
0 => 0b00000000 & self.byte_slice[0],
0 => 0,
1 => 0b10000000 & self.byte_slice[0],
2 => 0b11000000 & self.byte_slice[0],
3 => 0b11100000 & self.byte_slice[0],
Expand All @@ -120,7 +120,7 @@ impl<'s> RawBitVector<'s> {
4 => 0b00000111 & last_byte,
5 => 0b00000011 & last_byte,
6 => 0b00000001 & last_byte,
7 => 0b00000000 & last_byte,
7 => 0,
_ => panic!("never happen"),
};
popcnt -= right_1s_byte.count_ones() as u64;
Expand Down
Loading

0 comments on commit 4039dc2

Please sign in to comment.