Merge pull request #61 from JuliaAI/dev
For 0.1.1 release
EssamWisam authored Oct 4, 2023
2 parents 581e16e + bc3a569 commit bff9803
Showing 180 changed files with 109,207 additions and 6,486 deletions.
16 changes: 16 additions & 0 deletions .github/workflows/CompatHelper.yml
@@ -0,0 +1,16 @@
name: CompatHelper
on:
schedule:
- cron: 0 0 * * *
workflow_dispatch:
jobs:
CompatHelper:
runs-on: ubuntu-latest
steps:
- name: Pkg.add("CompatHelper")
run: julia -e 'using Pkg; Pkg.add("CompatHelper")'
- name: CompatHelper.main()
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COMPATHELPER_PRIV: ${{ secrets.DOCUMENTER_KEY }}
run: julia -e 'using CompatHelper; CompatHelper.main()'
5 changes: 4 additions & 1 deletion .github/workflows/Documenter.yml
@@ -7,12 +7,15 @@ on:
pull_request:
jobs:
build:
permissions:
contents: write
statuses: write
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v2
- uses: julia-actions/setup-julia@latest
with:
version: '1.8'
version: '1.6'
- name: Install dependencies
run: julia --project=docs/ -e 'using Pkg; Pkg.develop(PackageSpec(path=pwd())); Pkg.instantiate()'
- name: Build and deploy
31 changes: 31 additions & 0 deletions .github/workflows/TagBot.yml
@@ -0,0 +1,31 @@
name: TagBot
on:
issue_comment:
types:
- created
workflow_dispatch:
inputs:
lookback:
default: 3
permissions:
actions: read
checks: read
contents: write
deployments: read
issues: read
discussions: read
packages: read
pages: read
pull-requests: read
repository-projects: read
security-events: read
statuses: read
jobs:
TagBot:
if: github.event_name == 'workflow_dispatch' || github.actor == 'JuliaTagBot'
runs-on: ubuntu-latest
steps:
- uses: JuliaRegistries/TagBot@v1
with:
token: ${{ secrets.GITHUB_TOKEN }}
ssh: ${{ secrets.DOCUMENTER_KEY }}
2 changes: 2 additions & 0 deletions .github/workflows/formatter.yml
@@ -12,6 +12,8 @@ jobs:
julia -e 'import Pkg; Pkg.add("JuliaFormatter")'
julia -e 'using JuliaFormatter; format(".")'
# https://github.com/marketplace/actions/create-pull-request
# https://github.com/peter-evans/create-pull-request#reference-example
- name: Create Pull Request
id: cpr
uses: peter-evans/create-pull-request@v3
4 changes: 2 additions & 2 deletions .gitignore
@@ -31,6 +31,6 @@ Transforms.ipynb
dataset.csv
1.8/Project.toml
.CondaPkg
examples/*ipynb

examples/python/*.ipynb

docs/src/examples/__pycache__/convert.cpython-39.pyc
36 changes: 19 additions & 17 deletions Project.toml
@@ -1,11 +1,12 @@
name = "Imbalance"
uuid = "c709b415-507b-45b7-9a3d-1767c89fde68"
authors = ["Essam <[email protected]>"]
version = "0.1.0"
authors = ["Essam Wisam <[email protected]>", "Anthony Blaom <[email protected]> and contributors"]
version = "0.1.1"

[deps]
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
CategoricalDistributions = "af321ab8-2d2e-40a6-b165-3d674595d28e"
Clustering = "aaaa29a8-35af-508c-8bc3-b662a17a0fe5"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MLJModelInterface = "e80e1ace-859a-464e-9ed9-23947d8ae3ea"
@@ -23,21 +24,22 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c"
TransformsBase = "28dd2a49-a57a-4bfb-84ca-1a49db9b96b8"

[compat]
CategoricalArrays = "0.10.8"
CategoricalDistributions = "0.1.10"
Distances = "0.10.9"
MLJModelInterface = "1.9.2"
MLJTestInterface = "0.2.2"
NearestNeighbors = "0.4.13"
OrderedCollections = "1.6.2"
StatsBase = "0.34.0"
ScientificTypes = "3.0.2"
Tables = "1.10.1"
TableOperations = "1.2.0"
TableTransforms = "1.10.0"
TransformsBase = "1.2.0"
ProgressMeter = "1.8.0"
julia = "1.8"
CategoricalArrays = "0.10"
CategoricalDistributions = "0.1"
Clustering = "0.15"
Distances = "0.10"
MLJModelInterface = "1.9"
MLJTestInterface = "0.2"
NearestNeighbors = "0.4"
OrderedCollections = "1.6"
ProgressMeter = "1.8"
ScientificTypes = "3.0"
StatsBase = "0.34"
TableOperations = "1.2"
TableTransforms = "1.10"
Tables = "1.10"
TransformsBase = "1.2"
julia = "1.6"


[extras]
81 changes: 57 additions & 24 deletions README.md
@@ -1,6 +1,7 @@
# Imbalance.jl

![Imbalance](https://i.imgur.com/C34ilSZ.png)
![Imbalance](https://i.imgur.com/CP97JdN.png)


A Julia package with resampling methods to correct for class imbalance in a wide variety of classification settings.

@@ -16,27 +17,47 @@ import Pkg;
Pkg.add("Imbalance")
```

## ✨ Implemented Methods

The package implements the following resampling algorithms:

- Random Oversampling
- Random Walk Oversampling (RWO)
- Random Oversampling Examples (ROSE)
- Synthetic Minority Oversampling Technique (SMOTE)
- Borderline SMOTE1
- SMOTE-Nominal (SMOTE-N)
- SMOTE-Nominal Categorical (SMOTE-NC)
- Random Undersampling
- Cluster Undersampling
- Edited Nearest Neighbors Undersampling
- Tomek Links Undersampling
- Balanced Bagging Classifier (from `MLJBalancing.jl`)

Interested in contributing more? Check [this](https://juliaai.github.io/Imbalance.jl/dev/contributing/).

## 🚀 Quick Start

We will illustrate using the package to oversample with `SMOTE`; however, all other implemented oversampling methods follow the same pattern.


### 🔵 Standard API
All methods by default support a pure functional interface.
```julia
using Imbalance

# Set dataset properties then generate imbalanced data
probs = [0.5, 0.2, 0.3] # probability of each class
class_probs = [0.5, 0.2, 0.3] # probability of each class
num_rows, num_continuous_feats = 100, 5
X, y = generate_imbalanced_data(num_rows, num_continuous_feats; probs, rng=42)
X, y = generate_imbalanced_data(num_rows, num_continuous_feats; class_probs, rng=42)

# Apply SMOTE to oversample the classes
Xover, yover = smote(X, y; k=5, ratios=Dict(0=>1.0, 1=> 0.9, 2=>0.8), rng=42)

```
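As an aside on the `ratios` keyword used above: each ratio is interpreted relative to the size of the majority class. The bookkeeping can be sketched in plain Julia (the names `counts` and `targets` are illustrative only, not part of the package API):

```julia
# Illustrative only: how per-class target sizes follow from `ratios`.
counts = Dict(0 => 50, 1 => 20, 2 => 30)     # observed class counts
ratios = Dict(0 => 1.0, 1 => 0.9, 2 => 0.8)  # desired size as a fraction of the majority size
majority = maximum(values(counts))           # size of the largest class
targets = Dict(k => round(Int, r * majority) for (k, r) in ratios)
# targets == Dict(0 => 50, 1 => 45, 2 => 40)
```

With these targets, oversampling only generates points for classes whose count falls short of the target, so the majority class (ratio `1.0`) is left unchanged.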

### 🤖 MLJ Interface
All methods support the `MLJ` interface over tables where instead of directly calling the method, one instantiates a model for the method while optionally passing the keyword parameters found in the functional interface then wraps the model in a `machine` and follows by calling `transform` on the machine and data.
All methods support the [`MLJ` interface](https://alan-turing-institute.github.io/MLJ.jl/dev/) where, instead of calling the method directly, one instantiates a model for it (optionally passing the keyword parameters found in the functional interface), wraps the model in a `machine`, and then calls `transform` on the machine and data.
```julia
using MLJ

@@ -54,47 +75,59 @@ Xover, yover = transform(mach, X, y)
```
All implemented oversampling methods are considered static transforms; hence, no `fit` is required.

If `MLJBalancing` is also used, an arbitrary number of resampling methods from `Imbalance.jl` can be wrapped with a classification model from `MLJ` to form a unified model in which resampling automatically takes place on the given data before the model is trained (and is bypassed during prediction).

```julia
using MLJBalancing

# grab one more resampler and a classifier
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels verbosity=0
TomekUndersampler = @load TomekUndersampler pkg=Imbalance verbosity=0

undersampler = TomekUndersampler(min_ratios=0.5, rng=42)
logistic_model = LogisticClassifier()

# wrap the oversampler, undersampler, and classification model together
balanced_model = BalancedModel(model=logistic_model,
balancer1=oversampler, balancer2=undersampler)

# behaves like a single model
mach = machine(balanced_model, X, y);
fit!(mach, verbosity=0)
predict(mach, X)
```

### 🏓 Table Transforms Interface
This interface operates on single tables; it assumes that `y` is one of the columns of the given table. Thus, it follows a similar pattern to the `MLJ` interface except that the index of `y` is a required argument while instantiating the model and the data to be transformed via `apply` is only one table `Xy`.
The [`TableTransforms` interface](https://juliaml.github.io/TableTransforms.jl/stable/transforms/) operates on single tables; it assumes that `y` is one of the columns of the given table. It thus follows a similar pattern to the `MLJ` interface, except that the index of `y` is a required argument when instantiating the model, and the data to be transformed via `apply` is a single table `Xy`.
```julia
using Imbalance
using Imbalance.TableTransforms
using TableTransforms

# Generate imbalanced data
num_rows = 200
num_features = 5
y_ind = 3
Xy, _ = generate_imbalanced_data(num_rows, num_features;
probs=[0.5, 0.2, 0.3], insert_y=y_ind, rng=42)
class_probs=[0.5, 0.2, 0.3], insert_y=y_ind, rng=42)

# Initiate SMOTE model
oversampler = SMOTE(y_ind; k=5, ratios=Dict(0=>1.0, 1=> 0.9, 2=>0.8), rng=42)
Xyover = Xy |> oversampler # can chain with other table transforms
# equivalently, if TableTransforms is used
Xyover, cache = TableTransforms.apply(oversampler, Xy)
```
The `reapply(oversampler, Xy, cache)` method from `TableTransforms` simply falls back to `apply(oversampler, Xy)`, and the `revert(oversampler, Xy, cache)` method reverts the transform by removing the oversampled observations from the table.


## 🎨 Features
- Provides some of the most sought oversampling algorithms in machine learning and is still under development
- Supports multi-class classification and both nominal and continuous features
- Generic by supporting table input/output formats as well as matrices
- Supports multi-class variants of the algorithms and both nominal and continuous features
- Provides `MLJ` and `TableTransforms` interfaces aside from the default pure functional interface
- Generic by supporting table input/output formats as well as matrices
- Supports tables regardless of whether the target is a separate column or one of the columns
- Supports automatic encoding and decoding of nominal features


## 📝 Methods

The package so far provides five oversampling algorithms that all work in multi-class settings and with options for handling continuous and nominal features. In particular, it implements:

* Basic Random Oversampling
* Random Oversampling Examples (ROSE)
* Synthetic Minority Oversampling Technique (SMOTE)
* SMOTE-Nominal (SMOTE-N)
* SMOTE-Nominal Categorical (SMOTE-NC)


## 📜 Rationale
Most, if not all, machine learning algorithms can be viewed as a form of empirical risk minimization, where the objective is to find the parameters $\theta$ that, for some loss function $L$, minimize

@@ -106,11 +139,11 @@ In a multi-class setting with $K$ classes, one can write

$$\hat{\theta} = \arg\min_{\theta} \left( \frac{1}{N_1} \sum_{i \in C_1} L(f_{\theta}(x_i), y_i) + \frac{1}{N_2} \sum_{i \in C_2} L(f_{\theta}(x_i), y_i) + \ldots + \frac{1}{N_K} \sum_{i \in C_K} L(f_{\theta}(x_i), y_i) \right)$$

Class imbalance occurs when some classes have much fewer examples than other classes. In this case, the corresponding terms contribute minimally to the sum which makes it easier for any learning algorithm to find an approximate solution to the empirical risk that mostly only minimizes the over the significant sums. This yields a hypothesis $f_\theta$ that may be very different from the true target $f$ with respect to the minority classes which may be the most important for the application in question.
Class imbalance occurs when some classes have far fewer examples than others. In this case, the terms corresponding to the smaller classes contribute minimally to the sum, making it possible for a learning algorithm to settle on an approximate solution that mostly minimizes only the dominant sums. This yields a hypothesis $f_\theta$ that may differ substantially from the true target $f$ on the minority classes, which may be the most important ones for the application in question.

One obvious possible remedy is to weight the smaller sums so that a learning algorithm more easily avoids approximate solutions that exploit their insignificance which can be seen to be equivalent to repeating examples of the observations in minority classes. This can be achieved by naive random oversampling which is offered by this package along with other more advanced oversampling methods.
One obvious remedy is to weight the smaller sums so that a learning algorithm cannot as easily settle for approximate solutions that exploit their insignificance; this weighting can be seen to be equivalent to repeating observations from the minority classes. Naive random oversampling, offered by this package, does exactly that; the more advanced oversampling methods instead generate synthetic data, which ideally approximates the most plausible remedy of all: collecting more data.
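The weighting argument can be sketched in symbols (an illustrative formulation, not taken from the package, with weights chosen relative to the largest class of size $N_{\max}$):

$$\hat{\theta} = \arg\min_{\theta} \sum_{k=1}^{K} w_k \sum_{i \in C_k} L(f_{\theta}(x_i), y_i), \qquad w_k = \frac{N_{\max}}{N_k}$$

When $w_k$ is a positive integer, multiplying the sum for class $k$ by $w_k$ is identical to including each of its observations $w_k$ times in the dataset, which is precisely the effect of naive random oversampling with a ratio of $1.0$ for every class.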

To our knowledge, there are no existing maintained Julia packages that implement oversampling algorithms for multi-class classification problems or that handle both nominal and continuous features. This has served as a primary motivation for the creation of this package.
To our knowledge, there are no existing maintained Julia packages that implement resampling algorithms for multi-class classification problems or that handle both nominal and continuous features. This has served as a primary motivation for the creation of this package.

## 👥 Credits
This package was created by [Essam Wisam](https://github.com/JuliaAI) as a Google Summer of Code project, under the mentorship of [Anthony Blaom](https://ablaom.github.io). Additionally, [Rik Huijzer](https://github.com/rikhuijzer) and his binary `SMOTE` implementation in `Resample.jl` have also been helpful.
This package was created by [Essam Wisam](https://github.com/JuliaAI) as a Google Summer of Code project, under the mentorship of [Anthony Blaom](https://ablaom.github.io). Special thanks also go to [Rik Huijzer](https://github.com/rikhuijzer) for his friendliness and the binary `SMOTE` implementation in `Resample.jl`.
31 changes: 31 additions & 0 deletions docs/Project.toml
@@ -1,7 +1,38 @@
[deps]
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Documenter = "e30172f5-a6a5-5a46-863b-614d45cd2de4"
DocumenterTools = "35a29f4d-8980-5a13-9543-d66fff28ecb8"
Imbalance = "c709b415-507b-45b7-9a3d-1767c89fde68"
Impute = "f7bf1975-0170-51b9-8c5f-a992d46b9575"
LIBSVM = "b1bec4e5-fd48-53fe-b0cb-9723c09d164b"
MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7"
MLJBalancing = "45f359ea-796d-4f51-95a5-deb1a414c586"
MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d"
MLJFlux = "094fc8d1-fd35-5302-93ea-dabda2abf845"
MLJLIBSVMInterface = "61c7150f-6c77-4bb1-949c-13197eac2a52"
MLJNaiveBayesInterface = "33e4bacb-b9e2-458e-9a13-5d9a90b235fa"
MLJScikitLearnInterface = "5ae90465-5518-4432-b9d2-8a1def2f0cab"
MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54"
Measures = "442fdcdd-2543-5da2-b0f3-8c86c306513e"
NaiveBayes = "9bbee03b-0db5-5f46-924f-b5c9c21b8c60"
OneRule = "90484964-6d6a-4979-af09-8657dbed84ff"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
ScientificTypes = "321657f4-b219-11e9-178b-2701a2544e81"
TableTransforms = "0d432bfd-3ee1-4ac1-886a-39f05cc69a3e"

[compat]
CSV = "0.10"
CategoricalArrays = "0.10"
DataFrames = "1.6"
Documenter = "1.0"
DocumenterTools = "0.1"
Imbalance = "0.1"
MLJ = "0.19"
MLJBase = "0.21"
MLUtils = "0.4"
Plots = "1.39"
ScientificTypes = "3.0"
TableTransforms = "1.10"
