From a381924e91d3f593e41142d778a6400b80526e95 Mon Sep 17 00:00:00 2001 From: Essam Date: Wed, 11 Oct 2023 04:30:51 +0300 Subject: [PATCH 1/8] Update Project.toml --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 4230b084..5fdc6f5c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Imbalance" uuid = "c709b415-507b-45b7-9a3d-1767c89fde68" authors = ["Essam Wisam ", "Anthony Blaom and contributors"] -version = "0.1.1" +version = "0.1.2" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597" From 354b1239c7ebdb0c54a6258f62d4b893af2c46c4 Mon Sep 17 00:00:00 2001 From: Essam Date: Wed, 11 Oct 2023 06:00:26 +0300 Subject: [PATCH 2/8] =?UTF-8?q?=E2=9C=85=20Small=20improvement=20developer?= =?UTF-8?q?=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/src/contributing.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/docs/src/contributing.md b/docs/src/contributing.md index e4a6e256..df165c0f 100644 --- a/docs/src/contributing.md +++ b/docs/src/contributing.md @@ -24,7 +24,13 @@ Any method resampling method implemented in the `oversampling_methods` or `under │ └── resample_method.jl # implements the method itself (pure functional interface) ``` -# Adding New Resampling Methods +# Contribution + + +## Reporting Problems or Seeking Support +- Do not hesitate to post a Github issue with your question or problem. 
+ +## Adding New Resampling Methods - Make a new folder `resample_method` for the method in the `oversampling_methods` or `undersampling_methods` - Implement in `resample_method/resample_method.jl` the method over matrices for one minority class - Use `generic_oversample.jl` to generalize it to work on the whole data @@ -42,10 +48,13 @@ Surely, you can ignore ignore the third step if the algorithm you are implementi - `BorderlineSMOTE2`: A small modification of the `BorderlineSMOTE1` condition - `RepeatedENNUndersampler`: Simply repeats `ENNUndersampler` multiple times -# Adding New Tutorials + +## Adding New Tutorials - Make a new notebook with the tutorial in the `examples` folder found in `docs/src/examples` - Run the notebook so that the output is shown below each cell - If the notebook produces visuals then save and load them in the notebook - Convert it to markdown by using Python to run `from convert import convert_to_md; convert_to_md('')` - Set a title, description, image and links for it in the dictionary found in `docs/examples.jl` -- For the colab link, you do not need to upload anything just follow the link pattern in the file \ No newline at end of file +- For the colab link, you do not need to upload anything just follow the link pattern in the file + + From 3a76dbdeb345addf711c6a95bd381ad276b5d6b7 Mon Sep 17 00:00:00 2001 From: Essam Date: Wed, 11 Oct 2023 15:29:36 +0300 Subject: [PATCH 3/8] =?UTF-8?q?=F0=9F=93=9D=20Retouch=20Colab=20documentat?= =?UTF-8?q?ion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/src/examples/Colab.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/src/examples/Colab.md b/docs/src/examples/Colab.md index 0a36e850..6e33d9ac 100644 --- a/docs/src/examples/Colab.md +++ b/docs/src/examples/Colab.md @@ -1,6 +1,6 @@ # Google Colab -It is possible to run tutorials found in the examples section or API documentation on Google colab. 
It should be evident how so by launching the notebook. This section describes what happens under the hood. +It is possible to run tutorials found in the examples section or API documentation on Google Colab (using the provided link or icon). How to do so should be evident once the notebook is launched. This section describes what happens under the hood. - The first cell runs the following bash script to install Julia: From 78c84b259dbc3262030f3c9e24817e5c77cad2e7 Mon Sep 17 00:00:00 2001 From: Essam Date: Thu, 12 Oct 2023 00:04:44 +0300 Subject: [PATCH 4/8] =?UTF-8?q?=F0=9F=93=9D=20Reference=20correction?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/src/algorithms/implementation_notes.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/src/algorithms/implementation_notes.md b/docs/src/algorithms/implementation_notes.md index 4e408130..101dff7e 100644 --- a/docs/src/algorithms/implementation_notes.md +++ b/docs/src/algorithms/implementation_notes.md @@ -5,4 +5,6 @@ Papers often propose the resampling algorithm for the case of binary classificat ### Generalizing to Real Ratios Papers often proposes the resampling algorithm using integer ratios. For instance, a ratio of `2` would mean to double the amount of data in a class and a ratio of $2.2$ is not allowed or will be rounded. In `Imbalance.jl` any appropriate real ratio can be used and the ratio is relative to the size of the majority or minority class depending on whether the algorithm is oversampling or undersampling. The generalization occurs by randomly choosing points instead of looping on each point. That is, if a $2.2$ ratio corresponds to $227$ examples then $227$ examples are chosen randomly by replacement then applying resampling logic to each. Given an integer ratio $k$, this falls back to be on average equivalent to looping on the points $k$ times. -[1] López, V., Fernández, A., Moreno-Torres, J.G., & Herrera, F. (2012). 
Analysis of preprocessing vs. cost-sensitive learning for imbalanced classification. Open problems on intrinsic data characteristics. Expert Systems with Applications, 39(7), 6585-6608. \ No newline at end of file +[1] Fernández, A., López, V., Galar, M., Del Jesus, M. J., and Herrera, F. (2013). Analysing the classification +of imbalanced data-sets with multiple classes: Binarization techniques and ad-hoc approaches. +Knowledge-Based Systems, 42:97–110. \ No newline at end of file From b5b37ceee46314f44168cb4acbca5e96a7c71f53 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 21 Nov 2023 15:38:25 -0600 Subject: [PATCH 5/8] =?UTF-8?q?=E2=9E=95=20Add=20link=20for=20MLJBalancing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 98baec56..2b19adbe 100644 --- a/README.md +++ b/README.md @@ -76,7 +76,7 @@ Xover, yover = transform(mach, X, y) All implemented oversampling methods are considered static transforms and hence, no `fit` is required. #### Pipelining Models -If `MLJBalancing` is also used, an arbitrary number of resampling methods from `Imbalance.jl` can be wrapped with a classification model from `MLJ` to function as a unified model where resampling automatically takes place on given data before training the model (and is bypassed during prediction). +If [MLJBalancing](https://github.com/JuliaAI/MLJBalancing.jl) is also used, an arbitrary number of resampling methods from `Imbalance.jl` can be wrapped with a classification model from `MLJ` to function as a unified model where resampling automatically takes place on given data before training the model (and is bypassed during prediction). 
```julia using MLJBalancing @@ -147,4 +147,4 @@ One obvious possible remedy is to weight the smaller sums so that a learning alg To our knowledge, there are no existing maintained Julia packages that implement resampling algorithms for multi-class classification problems or that handle both nominal and continuous features. This has served as a primary motivation for the creation of this package. ## 👥 Credits -This package was created by [Essam Wisam](https://github.com/JuliaAI) as a Google Summer of Code project, under the mentorship of [Anthony Blaom](https://ablaom.github.io). Special thanks also go to [Rik Huijzer](https://github.com/rikhuijzer) for his friendliness and the binary `SMOTE` implementation in `Resample.jl`. \ No newline at end of file +This package was created by [Essam Wisam](https://github.com/JuliaAI) as a Google Summer of Code project, under the mentorship of [Anthony Blaom](https://ablaom.github.io). Special thanks also go to [Rik Huijzer](https://github.com/rikhuijzer) for his friendliness and the binary `SMOTE` implementation in `Resample.jl`. 
From 562aba43d057913a886beb4942f2cd2856b3f6c2 Mon Sep 17 00:00:00 2001 From: Essam Date: Tue, 5 Dec 2023 21:01:20 -0600 Subject: [PATCH 6/8] Update Project.toml --- Project.toml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Project.toml b/Project.toml index 5fdc6f5c..45a7be0a 100644 --- a/Project.toml +++ b/Project.toml @@ -24,6 +24,9 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" TransformsBase = "28dd2a49-a57a-4bfb-84ca-1a49db9b96b8" [compat] +LinearAlgebra="1.6" +Random="1.6" +Statistics="1.6" CategoricalArrays = "0.10" CategoricalDistributions = "0.1" Clustering = "0.15" From 142b84f7caba74425bc94cf2a487bdc04b2ae96d Mon Sep 17 00:00:00 2001 From: Essam Date: Fri, 5 Jan 2024 22:29:21 -0600 Subject: [PATCH 7/8] =?UTF-8?q?=F0=9F=90=9B=20Fix=20freezing=20dict=20in?= =?UTF-8?q?=20group=5Finds=20#73?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/common/utils.jl | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/common/utils.jl b/src/common/utils.jl index 97bab48d..8e74e140 100644 --- a/src/common/utils.jl +++ b/src/common/utils.jl @@ -35,7 +35,6 @@ where that value occurs. 
""" function group_inds(categorical_array::AbstractVector{T}) where {T} result = LittleDict{T,AbstractVector{Int}}() - freeze(result) for (i, v) in enumerate(categorical_array) # Make a new entry in the dict if it doesn't exist if !haskey(result, v) @@ -44,6 +43,6 @@ function group_inds(categorical_array::AbstractVector{T}) where {T} # It exists, so push the index belonging to the class push!(result[v], i) end - return result + return freeze(result) end From c4f45e9916e1df1bcb2b0f013427d7119ed78abf Mon Sep 17 00:00:00 2001 From: Essam Date: Fri, 5 Jan 2024 22:49:34 -0600 Subject: [PATCH 8/8] =?UTF-8?q?=E2=AC=86=EF=B8=8F=20Bump=20version=20Proje?= =?UTF-8?q?ct.toml?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 45a7be0a..9a92791e 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "Imbalance" uuid = "c709b415-507b-45b7-9a3d-1767c89fde68" authors = ["Essam Wisam ", "Anthony Blaom and contributors"] -version = "0.1.2" +version = "0.1.3" [deps] CategoricalArrays = "324d7699-5711-5eae-9e2f-1d82baa6b597"