Add Wine Quality dataset, #6

christianversloot · christianversloot · commit 093443097c23 · 2020-12-01T09:26:47.000+01:00
diff --git a/README.md b/README.md
@@ -31,6 +31,7 @@ _The names TensorFlow, Keras, as well as related names, marks, emblems and image
   * [SVHN-Extra](#svhn-extra)
   * [STL-10](#stl-10)
   * [Iris](#iris)
+  * [Wine Quality dataset](#wine-quality-dataset)
 - [Contributors and other references](#contributors-and-other-references)
 - [License](#license)
 
@@ -42,7 +43,7 @@ This package makes use of the TensorFlow 2.x package and specifically `tensorflo
 * `pip install tensorflow`
 
 ### Installation procedure
-Installing is really easy, and can be done with [PIP](https://pypi.org/project/extra-keras-datasets/): `pip install extra-keras-datasets`.
+Installing is really easy, and can be done with [PIP](https://pypi.org/project/extra-keras-datasets/): `pip install extra-keras-datasets`. The package depends on `numpy`, `scipy`, `pandas` and `scikit-learn`, which will be automatically installed.
 
 ## Datasets
 
@@ -192,6 +193,21 @@ from extra_keras_datasets import iris
 
 ---
 
+### Wine Quality dataset
+This dataset presents wine qualities related to red and white vinho verde wine samples, from the north of Portugal. According to the creators, "[the] goal is to model wine quality based on physicochemical tests". Various chemical properties of the wine are available (`inputs`), as well as the quality score (`targets`) for the wine.
+
+* Input structure: (fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, wine type)
+* Target structure: quality score between 0 and 10
+
+```
+from extra_keras_datasets import wine_quality
+(input_train, target_train), (input_test, target_test) = wine_quality.load_data(which_data='both', test_split=0.2, shuffle=True)
+```
+
+<a href="./assets/wine_quality.jpg"><img src="./assets/wine_quality.jpg" width="100%" style="border: 3px solid #f6f8fa;" /></a>
+
+---
+
 ## Contributors and other references
 * **EMNIST dataset:**
   * Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017). EMNIST: an extension of MNIST to handwritten letters. Retrieved from http://arxiv.org/abs/1702.05373
@@ -204,6 +220,8 @@ from extra_keras_datasets import iris
   * Coates, A., Ng, A., & Lee, H. (2011, June). An analysis of single-layer networks in unsupervised feature learning. In Proceedings of the fourteenth international conference on artificial intelligence and statistics (pp. 215-223). Retrieved from http://cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf
 * **Iris dataset:**
   * Fisher,R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950).
+* **Wine Quality dataset:**
+  * P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
 
 ## License
 The licenseable parts of this repository are licensed under a [MIT License](./LICENSE), so you're free to use this repo in your machine learning projects / blogs / exercises, and so on. Happy engineering! 🚀
diff --git a/assets/wine_quality.jpg b/assets/wine_quality.jpg
diff --git a/extra_keras_datasets/__init__.py b/extra_keras_datasets/__init__.py
@@ -7,5 +7,6 @@
 from . import svhn
 from . import stl10
 from . import iris
+from . import wine_quality
 
-__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris']
+__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality']
diff --git a/extra_keras_datasets/wine_quality.py b/extra_keras_datasets/wine_quality.py
@@ -0,0 +1,117 @@
+"""
+  Import the Wine Quality dataset
+  Source: https://archive.ics.uci.edu/ml/datasets/wine+quality
+  Description: Two datasets are included, related to red and white vinho
+  verde wine samples, from the north of Portugal.
+  The goal is to model wine quality based on physicochemical tests.
+
+  ~~~ Important note ~~~
+  Please cite the following paper when using or referencing the dataset:
+  P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
+  Modeling wine preferences by data mining from physicochemical
+  properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
+"""
+
+from tensorflow.keras.utils import get_file
+import numpy as np
+import logging
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def warn_citation():
+    """Warns about citation requirements
+    # Returns
+      Void
+    """
+    logging.warning(("Please cite the following paper when using or"
+                     " referencing this Extra Keras Dataset:"))
+    logging.warning(
+        ("P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. "
+         "Modeling wine preferences by data mining from physicochemical "
+         "properties. In Decision Support Systems, Elsevier, "
+         "47(4):547-553, 2009.")
+      )
+
+
+def load_data(
+    path_red="wine-quality-red.csv",
+    path_white="wine-quality-white.csv",
+    test_split=0.2,
+    which_data='both',
+    shuffle=True
+):
+    """Loads the Wine Quality dataset.
+    # Arguments
+        path_red: path where to cache the red wines dataset locally
+            (relative to ~/.keras/datasets).
+        path_white: path where to cache the white wines dataset locally
+            (relative to ~/.keras/datasets).
+        test_split: percentage of data to use for testing (by default 20%)
+        which_data: wine type to return. Can be 'white', 'red', or 'both'.
+        shuffle: whether to shuffle the data when generating train/test
+            split.
+    # Returns
+        Tuple of Numpy arrays: `(input_train, target_train),
+                                  (input_test, target_test)`.
+        Input structure: (fixed acidity, volatile acidity, citric acid,
+                          residual sugar, chlorides, free sulfur diox-
+                          ide, total sulfur dioxide, density, pH, sul-
+                          phates, alcohol, wine type)
+        Target structure: quality score between 0 and 10
+
+    """
+    # Log about loading
+    logging.basicConfig(level=logging.INFO)
+    logging.info('Loading dataset = wine_quality')
+
+    # Assert data
+    assert which_data in ['red', 'white', 'both']
+
+    # Download data
+    path_white = get_file(
+        path_white,
+        origin=("https://archive.ics.uci.edu/ml/machine-learning-"
+                "databases/wine-quality/winequality-white.csv")
+    )
+    path_red = get_file(
+        path_red,
+        origin=("https://archive.ics.uci.edu/ml/machine-learning-"
+                "databases/wine-quality/winequality-red.csv")
+    )
+
+    # Process white data
+    white = pd.read_csv(path_white, header=0, delimiter=';')
+    white.insert(11, 'type', 'white')
+    white = white.to_numpy()
+    white_samples = white[:, 0:12]
+    white_targets = white[:, 12]
+
+    # Process red data
+    red = pd.read_csv(path_red, header=0, delimiter=';')
+    red.insert(11, 'type', 'red')
+    red = red.to_numpy()
+    red_samples = red[:, 0:12]
+    red_targets = red[:, 12]
+
+    # Specify dataset before train/test split generation
+    if which_data == 'red':
+        samples = red_samples
+        targets = red_targets
+    elif which_data == 'white':
+        samples = white_samples
+        targets = white_targets
+    else:
+        samples = np.concatenate((red_samples, white_samples))
+        targets = np.concatenate((red_targets, white_targets))
+
+    # Generate train/test split
+    input_train, input_test, target_train, target_test = \
+        train_test_split(samples, targets, test_size=test_split,
+                         shuffle=shuffle)
+
+    # Warn about citation
+    warn_citation()
+
+    # Return data
+    return (input_train, target_train), (input_test, target_test)
diff --git a/setup.py b/setup.py
@@ -13,7 +13,7 @@
     download_url=("https://github.com/christianversloot/"
                   "extra_keras_datasets/archive/{{VERSION}}.tar.gz"),
     keywords=["keras", "datasets", "machine learning"],
-    install_requires=["numpy", "scipy"],
+    install_requires=["numpy", "scipy", "pandas", "scikit-learn"],
     classifiers=[
         "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Developers",