Skip to content

Commit

Permalink
Fixed two bugs:
Browse files Browse the repository at this point in the history
1. n_jobs is set to None by default to avoid an error.
2. Fix the unexpected-keyword-argument error for 'allow_empty_party'.

Add readme and examples.
  • Loading branch information
JerryLife committed Dec 17, 2023
1 parent b2a55af commit 79cbbfd
Show file tree
Hide file tree
Showing 7 changed files with 90 additions and 13 deletions.
54 changes: 54 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,3 +17,57 @@ pip install vertibench

## Getting Started

This example walks through the split-and-evaluate pipeline. First,
load your own dataset or generate a synthetic one.

```python
from sklearn.datasets import make_classification

# Generate a large dataset
X, y = make_classification(n_samples=10000, n_features=10)
```

To split the dataset by importance,

```python
from vertibench.Splitter import ImportanceSplitter

imp_splitter = ImportanceSplitter(num_parties=4, weights=[1, 1, 1, 3])
Xs = imp_splitter.split(X)
```

To split the dataset by correlation,

```python
from vertibench.Splitter import CorrelationSplitter

corr_splitter = CorrelationSplitter(num_parties=4)
Xs = corr_splitter.fit_split(X)
```

To evaluate a feature split `Xs` in terms of party importance,

```python
from vertibench.Evaluator import ImportanceEvaluator
from sklearn.linear_model import LogisticRegression
import numpy as np

model = LogisticRegression()
X = np.concatenate(Xs, axis=1)
model.fit(X, y)
imp_evaluator = ImportanceEvaluator()
imp_scores = imp_evaluator.evaluate(Xs, model.predict)
alpha = imp_evaluator.evaluate_alpha(scores=imp_scores)
print(f"Importance scores: {imp_scores}, alpha: {alpha}")
```

To evaluate a feature split in terms of correlation,

```python
from vertibench.Evaluator import CorrelationEvaluator

corr_evaluator = CorrelationEvaluator()
corr_scores = corr_evaluator.fit_evaluate(Xs)
beta = corr_evaluator.evaluate_beta()
print(f"Correlation scores: {corr_scores}, beta: {beta}")
```
9 changes: 0 additions & 9 deletions example/EvaluatorExample.py

This file was deleted.

32 changes: 32 additions & 0 deletions example/SplitEvaluateExample.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""Example: split a dataset vertically and evaluate the resulting split.

Pipeline: generate synthetic data -> split features across parties
(by importance, then by correlation) -> evaluate each split
(importance scores + alpha, correlation scores + beta).
"""
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

from vertibench.Evaluator import ImportanceEvaluator, CorrelationEvaluator
from vertibench.Splitter import ImportanceSplitter, CorrelationSplitter

# Generate a large synthetic classification dataset.
X, y = make_classification(n_samples=10000, n_features=10)

# Split by importance: the weights control how the informative features
# are allocated across the 4 parties (the last party gets a larger share).
imp_splitter = ImportanceSplitter(num_parties=4, weights=[1, 1, 1, 3])
Xs = imp_splitter.split(X)

# Evaluate the split by importance.
# Fit the model on the concatenation of the party slices (not the original
# X) so the feature order seen by the model matches the order the evaluator
# uses when it probes model.predict with per-party features. This mirrors
# the README example.
model = LogisticRegression()
X_cat = np.concatenate(Xs, axis=1)
model.fit(X_cat, y)
imp_evaluator = ImportanceEvaluator()
imp_scores = imp_evaluator.evaluate(Xs, model.predict)
alpha = imp_evaluator.evaluate_alpha(scores=imp_scores)
print(f"Importance scores: {imp_scores}, alpha: {alpha}")

# Split by correlation (fit_split both fits the splitter to X and returns
# the per-party feature matrices).
corr_splitter = CorrelationSplitter(num_parties=4)
Xs = corr_splitter.fit_split(X)

# Evaluate the split by correlation; beta summarizes the inter-party
# correlation of the fitted split.
corr_evaluator = CorrelationEvaluator()
corr_scores = corr_evaluator.fit_evaluate(Xs)
beta = corr_evaluator.evaluate_beta()
print(f"Correlation scores: {corr_scores}, beta: {beta}")

Empty file removed example/SplitExample.py
Empty file.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ authors = [
{ name = "Junyi Hou" },
{ name = "Bingsheng He" }
]
version = "0.1.1a1"
version = "0.1.1"
description = "A tool for benchmarking vertical federated learning algorithms, containing synthetic data split and data evaluation."
readme = "README.md"
license = { file = "LICENSE" } # If you have a LICENSE file
Expand Down
2 changes: 1 addition & 1 deletion src/vertibench/Evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ class CorrelationEvaluator:
Correlation evaluator for VFL datasets
"""

def __init__(self, corr_func='spearmanr', gpu_id=None, svd_algo='auto', n_jobs=1, **kwargs):
def __init__(self, corr_func='spearmanr', gpu_id=None, svd_algo='auto', n_jobs=None, **kwargs):
"""
:param corr_func: [str] function to calculate the correlation between two features
:param gamma: [float] weight of the inner-party correlation score
Expand Down
4 changes: 2 additions & 2 deletions src/vertibench/Splitter.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def split_indices(self, X, allow_empty_party=False):

class CorrelationSplitter(Splitter):

def __init__(self, num_parties: int, evaluator: CorrelationEvaluator = None, seed=None, gpu_id=None, n_jobs=1):
def __init__(self, num_parties: int, evaluator: CorrelationEvaluator = None, seed=None, gpu_id=None, n_jobs=None):
"""
Split a 2D dataset by feature correlation (assuming the features are equally important).
:param num_parties: [int] number of parties
Expand Down Expand Up @@ -273,7 +273,7 @@ def fit(self, X, **kwargs):
self.max_icor = self.evaluator.max_icor

def split_indices(self, X, n_elites=20, n_offsprings=70, n_mutants=10, n_gen=100, bias=0.7, verbose=False,
beta=0.5, term_tol=1e-4, term_period=10):
beta=0.5, term_tol=1e-4, term_period=10, **kwargs):
"""
Use BRKGA to find the best order of features that minimizes the difference between the mean of icor and the
target. split() assumes that the min and max icor have been calculated by fit().
Expand Down

0 comments on commit 79cbbfd

Please sign in to comment.