Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: make SQ build & preprocess faster #596

Merged
merged 2 commits into from
Sep 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 11 additions & 11 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

64 changes: 31 additions & 33 deletions crates/base/src/index.rs
Original file line number Diff line number Diff line change
Expand Up @@ -104,51 +104,49 @@ pub struct IndexOptions {
}

impl IndexOptions {
/// Checks that a quantization setting is compatible with this index's vector kind.
///
/// Returns `Ok(())` when `quantization` is `None`. When it holds any of the
/// `Scalar`, `Product` or `Rabitq` variants, returns a `ValidationError` unless
/// `self.vector.v` is `VectorKind::Vecf32` or `VectorKind::Vecf16` (the kinds
/// the error message refers to as dense vectors).
fn validate_self_quantization(
&self,
quantization: &Option<QuantizationOptions>,
) -> Result<(), ValidationError> {
match quantization {
// No quantization requested: nothing to check.
None => Ok(()),
Some(
QuantizationOptions::Scalar(_)
| QuantizationOptions::Product(_)
| QuantizationOptions::Rabitq(_),
) => {
// All listed quantizers share the same restriction on the vector kind.
if !matches!(self.vector.v, VectorKind::Vecf32 | VectorKind::Vecf16) {
return Err(ValidationError::new(
"quantization is not support for vectors that are not dense vectors",
));
}
Ok(())
}
}
}
fn validate_self(&self) -> Result<(), ValidationError> {
match &self.indexing {
IndexingOptions::Flat(FlatIndexingOptions { quantization }) => {
self.validate_self_quantization(quantization)?;
if quantization.is_some()
&& !matches!(self.vector.v, VectorKind::Vecf32 | VectorKind::Vecf16)
{
return Err(ValidationError::new(
"quantization is only supported for dense vectors",
));
}
}
IndexingOptions::Ivf(IvfIndexingOptions { quantization, .. }) => {
if !matches!(self.vector.v, VectorKind::Vecf32 | VectorKind::Vecf16) {
return Err(ValidationError::new(
"ivf is not support for vectors that are not dense vectors",
"ivf is only supported for dense vectors",
));
}
if quantization.is_some()
&& !matches!(self.vector.v, VectorKind::Vecf32 | VectorKind::Vecf16)
{
return Err(ValidationError::new(
"quantization is only supported for dense vectors",
));
}
self.validate_self_quantization(quantization)?;
}
IndexingOptions::Hnsw(HnswIndexingOptions { quantization, .. }) => {
self.validate_self_quantization(quantization)?;
}
IndexingOptions::InvertedIndex(_) => {
if !matches!(self.vector.d, DistanceKind::Dot) {
if quantization.is_some()
&& !matches!(self.vector.v, VectorKind::Vecf32 | VectorKind::Vecf16)
{
return Err(ValidationError::new(
"inverted_index is not support for distance that is not negative dot product",
"quantization is only supported for dense vectors",
));
}
}
IndexingOptions::SparseInvertedIndex(_) => {
if !matches!(self.vector.v, VectorKind::SVecf32) {
return Err(ValidationError::new(
"inverted_index is not support for vectors that are not sparse vectors",
"sparse_inverted_index is only supported for sparse vectors",
));
}
if !matches!(self.vector.d, DistanceKind::Dot) {
return Err(ValidationError::new(
"sparse_inverted_index is only supported for dot distance",
));
}
}
Expand Down Expand Up @@ -284,7 +282,7 @@ pub enum IndexingOptions {
Flat(FlatIndexingOptions),
Ivf(IvfIndexingOptions),
Hnsw(HnswIndexingOptions),
InvertedIndex(InvertedIndexingOptions),
SparseInvertedIndex(SparseInvertedIndexIndexingOptions),
}

impl IndexingOptions {
Expand Down Expand Up @@ -320,16 +318,16 @@ impl Validate for IndexingOptions {
Self::Flat(x) => x.validate(),
Self::Ivf(x) => x.validate(),
Self::Hnsw(x) => x.validate(),
Self::InvertedIndex(x) => x.validate(),
Self::SparseInvertedIndex(x) => x.validate(),
}
}
}

#[derive(Debug, Clone, Serialize, Deserialize, Validate)]
#[serde(deny_unknown_fields)]
pub struct InvertedIndexingOptions {}
pub struct SparseInvertedIndexIndexingOptions {}

impl Default for InvertedIndexingOptions {
impl Default for SparseInvertedIndexIndexingOptions {
/// Builds the default value; the struct literal has no fields to initialise.
fn default() -> Self {
Self {}
}
Expand Down
1 change: 0 additions & 1 deletion crates/base/src/operator/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,4 @@ pub trait Operator: Copy + 'static + Send + Sync {
fn distance(lhs: Borrowed<'_, Self>, rhs: Borrowed<'_, Self>) -> Distance;
}

pub type Owned<T> = <T as Operator>::Vector;
pub type Borrowed<'a, T> = <<T as Operator>::Vector as VectorOwned>::Borrowed<'a>;
4 changes: 2 additions & 2 deletions crates/flat/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ impl<O: OperatorFlat, Q: Quantizer<O>> Flat<O, Q> {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
source: &(impl Vectors<O::Vector> + Collection + Source + Sync),
) -> Self {
let remapped = RemappedCollection::from_source(source);
from_nothing(path, options, &remapped)
Expand Down Expand Up @@ -83,7 +83,7 @@ impl<O: OperatorFlat, Q: Quantizer<O>> Flat<O, Q> {
fn from_nothing<O: OperatorFlat, Q: Quantizer<O>>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &(impl Vectors<Owned<O>> + Collection + Sync),
collection: &(impl Vectors<O::Vector> + Collection + Sync),
) -> Flat<O, Q> {
create_dir(path.as_ref()).unwrap();
let flat_indexing_options = options.indexing.clone().unwrap_flat();
Expand Down
6 changes: 3 additions & 3 deletions crates/hnsw/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ impl<O: OperatorHnsw, Q: Quantizer<O>> Hnsw<O, Q> {
pub fn create(
path: impl AsRef<Path>,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
source: &(impl Vectors<O::Vector> + Collection + Source + Sync),
) -> Self {
let remapped = RemappedCollection::from_source(source);
if let Some(main) = source.get_main::<Self>() {
Expand Down Expand Up @@ -116,7 +116,7 @@ impl<O: OperatorHnsw, Q: Quantizer<O>> Hnsw<O, Q> {
fn from_nothing<O: OperatorHnsw, Q: Quantizer<O>>(
path: impl AsRef<Path>,
options: IndexOptions,
collection: &(impl Vectors<Owned<O>> + Collection + Sync),
collection: &(impl Vectors<O::Vector> + Collection + Sync),
) -> Hnsw<O, Q> {
create_dir(path.as_ref()).unwrap();
let HnswIndexingOptions {
Expand Down Expand Up @@ -198,7 +198,7 @@ fn from_nothing<O: OperatorHnsw, Q: Quantizer<O>>(
fn from_main<O: OperatorHnsw, Q: Quantizer<O>>(
path: impl AsRef<Path>,
options: IndexOptions,
remapped: &RemappedCollection<Owned<O>, impl Vectors<Owned<O>> + Collection + Sync>,
remapped: &RemappedCollection<O::Vector, impl Vectors<O::Vector> + Collection + Sync>,
main: &Hnsw<O, Q>,
) -> Hnsw<O, Q> {
create_dir(path.as_ref()).unwrap();
Expand Down
4 changes: 2 additions & 2 deletions crates/index/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -316,7 +316,7 @@ impl<O: Op> Index<O> {
}
pub fn create_sealed_segment(
&self,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
source: &(impl Vectors<O::Vector> + Collection + Source + Sync),
sealed_segment_ids: &[NonZeroU128],
growing_segment_ids: &[NonZeroU128],
) -> Option<Arc<SealedSegment<O>>> {
Expand Down Expand Up @@ -444,7 +444,7 @@ impl<O: Op> IndexView<O> {
}
pub fn insert(
&self,
vector: Owned<O>,
vector: O::Vector,
pointer: Pointer,
) -> Result<Result<(), OutdatedError>, InsertError> {
if self.options.vector.dims != vector.as_borrowed().dims() {
Expand Down
16 changes: 8 additions & 8 deletions crates/index/src/optimizing/index_source.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ use crate::delete::Delete;
use crate::Op;
use crate::{GrowingSegment, SealedSegment};
use base::index::IndexOptions;
use base::operator::{Borrowed, Owned};
use base::operator::Borrowed;
use base::search::*;
use std::any::Any;
use std::fmt::Debug;
Expand All @@ -17,7 +17,7 @@ pub struct IndexSource<V, O: Op> {
_phantom: PhantomData<fn(V) -> V>,
}

impl<O: Op> IndexSource<Owned<O>, O> {
impl<O: Op> IndexSource<O::Vector, O> {
pub fn new(
options: IndexOptions,
sealed: Option<Arc<SealedSegment<O>>>,
Expand All @@ -34,7 +34,7 @@ impl<O: Op> IndexSource<Owned<O>, O> {
}
}

impl<O: Op> Vectors<Owned<O>> for IndexSource<Owned<O>, O> {
impl<O: Op> Vectors<O::Vector> for IndexSource<O::Vector, O> {
/// Returns the value stored in the `dims` field.
fn dims(&self) -> u32 {
self.dims
}
Expand All @@ -61,7 +61,7 @@ impl<O: Op> Vectors<Owned<O>> for IndexSource<Owned<O>, O> {
}
}

impl<O: Op> Collection for IndexSource<Owned<O>, O> {
impl<O: Op> Collection for IndexSource<O::Vector, O> {
fn payload(&self, mut index: u32) -> Payload {
for x in self.sealed.iter() {
if index < x.len() {
Expand All @@ -79,7 +79,7 @@ impl<O: Op> Collection for IndexSource<Owned<O>, O> {
}
}

impl<O: Op> Source for IndexSource<Owned<O>, O> {
impl<O: Op> Source for IndexSource<O::Vector, O> {
fn get_main<T: Any>(&self) -> Option<&T> {
let x = self.sealed.as_ref()?;
Some(
Expand All @@ -104,7 +104,7 @@ pub struct RoGrowingCollection<V, O: Op> {
_phantom: PhantomData<fn(V) -> V>,
}

impl<O: Op> Debug for RoGrowingCollection<Owned<O>, O> {
impl<O: Op> Debug for RoGrowingCollection<O::Vector, O> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
f.debug_struct("RoGrowingCollection")
.field("growing", &self.growing)
Expand All @@ -113,7 +113,7 @@ impl<O: Op> Debug for RoGrowingCollection<Owned<O>, O> {
}
}

impl<O: Op> Vectors<Owned<O>> for RoGrowingCollection<Owned<O>, O> {
impl<O: Op> Vectors<O::Vector> for RoGrowingCollection<O::Vector, O> {
/// Returns the value stored in the `dims` field.
fn dims(&self) -> u32 {
self.dims
}
Expand All @@ -133,7 +133,7 @@ impl<O: Op> Vectors<Owned<O>> for RoGrowingCollection<Owned<O>, O> {
}
}

impl<O: Op> Collection for RoGrowingCollection<Owned<O>, O> {
impl<O: Op> Collection for RoGrowingCollection<O::Vector, O> {
fn payload(&self, mut index: u32) -> Payload {
for x in self.growing.iter() {
if index < x.len() {
Expand Down
5 changes: 2 additions & 3 deletions crates/index/src/optimizing/indexing.rs
Original file line number Diff line number Diff line change
@@ -1,14 +1,13 @@
use crate::optimizing::index_source::IndexSource;
use crate::Index;
use crate::Op;
use base::operator::Owned;
use std::sync::Arc;

pub fn scan<O: Op>(
index: Arc<Index<O>>,
capacity: u32,
delete_threshold: f64,
) -> Option<IndexSource<Owned<O>, O>> {
) -> Option<IndexSource<O::Vector, O>> {
let (sealed, growing) = 'a: {
let protect = index.protect.lock();
// approach 1: merge small segments to a big segment
Expand Down Expand Up @@ -87,7 +86,7 @@ pub fn scan<O: Op>(
))
}

pub fn make<O: Op>(index: Arc<Index<O>>, source: IndexSource<Owned<O>, O>) {
pub fn make<O: Op>(index: Arc<Index<O>>, source: IndexSource<O::Vector, O>) {
let _ = index.create_sealed_segment(
&source,
&source.sealed.iter().map(|x| x.id()).collect::<Vec<_>>(),
Expand Down
2 changes: 1 addition & 1 deletion crates/index/src/segment/sealed.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ impl<O: Op> SealedSegment<O> {
path: PathBuf,
id: NonZeroU128,
options: IndexOptions,
source: &(impl Vectors<Owned<O>> + Collection + Source + Sync),
source: &(impl Vectors<O::Vector> + Collection + Source + Sync),
) -> Arc<Self> {
let indexing = SealedIndexing::create(&path, options, source);
Arc::new(Self {
Expand Down
2 changes: 1 addition & 1 deletion crates/indexing/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ base = { path = "../base" }
# algorithms
flat = { path = "../flat" }
hnsw = { path = "../hnsw" }
inverted = { path = "../inverted" }
ivf = { path = "../ivf" }
quantization = { path = "../quantization" }
sparse_inverted_index = { path = "../sparse_inverted_index" }

[lints]
workspace = true
6 changes: 3 additions & 3 deletions crates/indexing/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,16 @@ use quantization::rabitq::OperatorRabitqQuantization;
pub use sealed::SealedIndexing;

use base::operator::Operator;
use inverted::operator::OperatorInvertedIndex;
use ivf::operator::OperatorIvf;
use quantization::product::OperatorProductQuantization;
use quantization::scalar::OperatorScalarQuantization;
use sparse_inverted_index::operator::OperatorSparseInvertedIndex;

pub trait OperatorIndexing
where
Self: Operator,
Self: OperatorIvf,
Self: OperatorInvertedIndex,
Self: OperatorSparseInvertedIndex,
Self: OperatorScalarQuantization,
Self: OperatorProductQuantization,
Self: OperatorRabitqQuantization,
Expand All @@ -24,7 +24,7 @@ impl<T> OperatorIndexing for T
where
Self: Operator,
Self: OperatorIvf,
Self: OperatorInvertedIndex,
Self: OperatorSparseInvertedIndex,
Self: OperatorScalarQuantization,
Self: OperatorProductQuantization,
Self: OperatorRabitqQuantization,
Expand Down
Loading