Skip to content

Commit

Permalink
Fixed two bugs:
Browse files Browse the repository at this point in the history
1. n_jobs is set to None by default to avoid an error.
2. Fix the unexpected-keyword-argument error for 'allow_empty_party'.

Add readme and examples.
  • Loading branch information
JerryLife committed Dec 17, 2023
1 parent b2a55af commit 79cbbfd
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 13 deletions.
54 changes: 54 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,57 @@ pip install vertibench

## Getting Started

This example walks through the split-and-evaluate pipeline. First,
load your own dataset or generate a synthetic one.

```python
from sklearn.datasets import make_classification

# Generate a large dataset
X, y = make_classification(n_samples=10000, n_features=10)
```

To split the dataset by importance,

```python
from vertibench.Splitter import ImportanceSplitter

imp_splitter = ImportanceSplitter(num_parties=4, weights=[1, 1, 1, 3])
Xs = imp_splitter.split(X)
```

To split the dataset by correlation,

```python
from vertibench.Splitter import CorrelationSplitter

corr_splitter = CorrelationSplitter(num_parties=4)
Xs = corr_splitter.fit_split(X)
```

To evaluate a feature split `Xs` in terms of party importance,

```python
from vertibench.Evaluator import ImportanceEvaluator
from sklearn.linear_model import LogisticRegression
import numpy as np

model = LogisticRegression()
X = np.concatenate(Xs, axis=1)
model.fit(X, y)
imp_evaluator = ImportanceEvaluator()
imp_scores = imp_evaluator.evaluate(Xs, model.predict)
alpha = imp_evaluator.evaluate_alpha(scores=imp_scores)
print(f"Importance scores: {imp_scores}, alpha: {alpha}")
```

To evaluate a feature split in terms of correlation,

```python
from vertibench.Evaluator import CorrelationEvaluator

corr_evaluator = CorrelationEvaluator()
corr_scores = corr_evaluator.fit_evaluate(Xs)
beta = corr_evaluator.evaluate_beta()
print(f"Correlation scores: {corr_scores}, beta: {beta}")
```
9 changes: 0 additions & 9 deletions example/EvaluatorExample.py

This file was deleted.

32 changes: 32 additions & 0 deletions example/SplitEvaluateExample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Example: split a dataset vertically and evaluate the resulting split.

Pipeline: generate synthetic data -> split features across parties
(by importance, then by correlation) -> evaluate each split
(importance scores + alpha, correlation scores + beta).
"""
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from vertibench.Evaluator import ImportanceEvaluator, CorrelationEvaluator
from vertibench.Splitter import ImportanceSplitter, CorrelationSplitter

# Generate a large synthetic classification dataset.
X, y = make_classification(n_samples=10000, n_features=10)

# Split by importance: the weights control how the informative features
# are allocated across the 4 parties (the last party gets a larger share).
imp_splitter = ImportanceSplitter(num_parties=4, weights=[1, 1, 1, 3])
Xs = imp_splitter.split(X)

# Evaluate the split by importance.
# Fit the model on the concatenation of the party slices (not the original
# X) so the feature order seen by the model matches the order the evaluator
# uses when it probes model.predict with per-party features. This mirrors
# the README example.
model = LogisticRegression()
X_cat = np.concatenate(Xs, axis=1)
model.fit(X_cat, y)
imp_evaluator = ImportanceEvaluator()
imp_scores = imp_evaluator.evaluate(Xs, model.predict)
alpha = imp_evaluator.evaluate_alpha(scores=imp_scores)
print(f"Importance scores: {imp_scores}, alpha: {alpha}")

# Split by correlation (fit_split both fits the splitter to X and returns
# the per-party feature matrices).
corr_splitter = CorrelationSplitter(num_parties=4)
Xs = corr_splitter.fit_split(X)

# Evaluate the split by correlation; beta summarizes the inter-party
# correlation of the fitted split.
corr_evaluator = CorrelationEvaluator()
corr_scores = corr_evaluator.fit_evaluate(Xs)
beta = corr_evaluator.evaluate_beta()
print(f"Correlation scores: {corr_scores}, beta: {beta}")

Empty file removed example/SplitExample.py
Empty file.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ authors = [
{ name = "Junyi Hou" },
{ name = "Bingsheng He" }
]
version = "0.1.1a1"
version = "0.1.1"
description = "A tool for benchmarking vertical federated learning algorithms, containing synthetic data split and data evaluation."
readme = "README.md"
license = { file = "LICENSE" } # If you have a LICENSE file
Expand Down
2 changes: 1 addition & 1 deletion src/vertibench/Evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class CorrelationEvaluator:
Correlation evaluator for VFL datasets
"""

def __init__(self, corr_func='spearmanr', gpu_id=None, svd_algo='auto', n_jobs=1, **kwargs):
def __init__(self, corr_func='spearmanr', gpu_id=None, svd_algo='auto', n_jobs=None, **kwargs):
"""
:param corr_func: [str] function to calculate the correlation between two features
:param gamma: [float] weight of the inner-party correlation score
Expand Down
4 changes: 2 additions & 2 deletions src/vertibench/Splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def split_indices(self, X, allow_empty_party=False):

class CorrelationSplitter(Splitter):

def __init__(self, num_parties: int, evaluator: CorrelationEvaluator = None, seed=None, gpu_id=None, n_jobs=1):
def __init__(self, num_parties: int, evaluator: CorrelationEvaluator = None, seed=None, gpu_id=None, n_jobs=None):
"""
Split a 2D dataset by feature correlation (assuming the features are equally important).
:param num_parties: [int] number of parties
Expand Down Expand Up @@ -273,7 +273,7 @@ def fit(self, X, **kwargs):
self.max_icor = self.evaluator.max_icor

def split_indices(self, X, n_elites=20, n_offsprings=70, n_mutants=10, n_gen=100, bias=0.7, verbose=False,
beta=0.5, term_tol=1e-4, term_period=10):
beta=0.5, term_tol=1e-4, term_period=10, **kwargs):
"""
Use BRKGA to find the best order of features that minimizes the difference between the mean of icor and the
target. split() assumes that the min and max icor have been calculated by fit().
Expand Down

0 comments on commit 79cbbfd

Please sign in to comment.