diff --git a/README.md b/README.md index f9f32d7..24fd3f8 100644 --- a/README.md +++ b/README.md @@ -31,6 +31,7 @@ _The names TensorFlow, Keras, as well as related names, marks, emblems and image * [SVHN-Extra](#svhn-extra) * [STL-10](#stl-10) * [Iris](#iris) + * [Wine Quality dataset](#wine-quality-dataset) - [Contributors and other references](#contributors-and-other-references) - [License](#license) @@ -42,7 +43,7 @@ This package makes use of the TensorFlow 2.x package and specifically `tensorflo * `pip install tensorflow` ### Installation procedure -Installing is really easy, and can be done with [PIP](https://pypi.org/project/extra-keras-datasets/): `pip install extra-keras-datasets`. +Installing is really easy, and can be done with [PIP](https://pypi.org/project/extra-keras-datasets/): `pip install extra-keras-datasets`. The package depends on `numpy`, `scipy`, `pandas` and `scikit-learn`, which will be automatically installed. ## Datasets @@ -192,6 +193,21 @@ from extra_keras_datasets import iris --- +### Wine Quality dataset +This dataset presents wine qualities related to red and white vinho verde wine samples, from the north of Portugal. According to the creators, "[the] goal is to model wine quality based on physicochemical tests". Various chemical properties of the wine are available (`inputs`), as well as the quality score (`targets`) for the wine. + +* Input structure: (fixed acidity, volatile acidity, citric acid, residual sugar, chlorides, free sulfur dioxide, total sulfur dioxide, density, pH, sulphates, alcohol, wine type) +* Target structure: quality score between 0 and 10 + +``` +from extra_keras_datasets import wine_quality +(input_train, target_train), (input_test, target_test) = wine_quality.load_data(which_data='both', test_split=0.2, shuffle=True) +``` + + + +--- + ## Contributors and other references * **EMNIST dataset:** * Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017). 
EMNIST: an extension of MNIST to handwritten letters. Retrieved from http://arxiv.org/abs/1702.05373 @@ -204,6 +220,8 @@ from extra_keras_datasets import iris * Coates, A., Ng, A., & Lee, H. (2011, June). An analysis of single-layer networks in unsupervised feature learning. In Proceedings of the fourteenth international conference on artificial intelligence and statistics (pp. 215-223). Retrieved from http://cs.stanford.edu/~acoates/papers/coatesleeng_aistats_2011.pdf * **Iris dataset:** * Fisher,R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950). +* **Wine Quality dataset:** + * P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. ## License The licenseable parts of this repository are licensed under a [MIT License](./LICENSE), so you're free to use this repo in your machine learning projects / blogs / exercises, and so on. Happy engineering! 🚀 diff --git a/assets/wine_quality.jpg b/assets/wine_quality.jpg new file mode 100644 index 0000000..ee02514 Binary files /dev/null and b/assets/wine_quality.jpg differ diff --git a/extra_keras_datasets/__init__.py b/extra_keras_datasets/__init__.py index 11a43e5..0918210 100644 --- a/extra_keras_datasets/__init__.py +++ b/extra_keras_datasets/__init__.py @@ -7,5 +7,6 @@ from . import svhn from . import stl10 from . import iris +from . 
import wine_quality
 
-__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris']
+__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality']
diff --git a/extra_keras_datasets/wine_quality.py b/extra_keras_datasets/wine_quality.py
new file mode 100644
index 0000000..01496e8
--- /dev/null
+++ b/extra_keras_datasets/wine_quality.py
@@ -0,0 +1,124 @@
+"""
+  Import the Wine Quality dataset
+  Source: https://archive.ics.uci.edu/ml/datasets/wine+quality
+  Description: Two datasets are included, related to red and white vinho
+  verde wine samples, from the north of Portugal.
+  The goal is to model wine quality based on physicochemical tests.
+
+  ~~~ Important note ~~~
+  Please cite the following paper when using or referencing the dataset:
+  P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis.
+  Modeling wine preferences by data mining from physicochemical
+  properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
+"""
+
+from tensorflow.keras.utils import get_file
+import numpy as np
+import logging
+import pandas as pd
+from sklearn.model_selection import train_test_split
+
+
+def warn_citation():
+    """Warns about citation requirements
+    # Returns
+        Void
+    """
+    logging.warning(("Please cite the following paper when using or"
+                     " referencing this Extra Keras Dataset:"))
+    logging.warning(
+        ("P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. "
+         "Modeling wine preferences by data mining from physicochemical "
+         "properties. In Decision Support Systems, Elsevier, "
+         "47(4):547-553, 2009.")
+    )
+
+
+def _load_wine_type(path, origin, wine_type):
+    """Downloads and parses the data file of a single wine type.
+    # Arguments
+        path: path where to cache the file locally
+            (relative to ~/.keras/datasets).
+        origin: URL to download the file from.
+        wine_type: value stored in the inserted `type` column.
+    # Returns
+        Tuple of Numpy arrays: `(samples, targets)`.
+    """
+    data = pd.read_csv(get_file(path, origin=origin), header=0,
+                       delimiter=';')
+    # The wine type becomes the twelfth input column (index 11); the
+    # quality score that used to sit there moves to index 12.
+    data.insert(11, 'type', wine_type)
+    data = data.to_numpy()
+    return data[:, 0:12], data[:, 12]
+
+
+def load_data(
+    path_red="wine-quality-red.csv",
+    path_white="wine-quality-white.csv",
+    test_split=0.2,
+    which_data='both',
+    shuffle=True
+):
+    """Loads the Wine Quality dataset.
+    # Arguments
+        path_red: path where to cache the red wines dataset locally
+            (relative to ~/.keras/datasets).
+        path_white: path where to cache the white wines dataset locally
+            (relative to ~/.keras/datasets).
+        test_split: fraction of the data to use for testing (by default
+            0.2, i.e. 20%).
+        which_data: wine type to return. Can be 'white', 'red', or 'both'.
+        shuffle: whether to shuffle the data when generating train/test
+            split.
+    # Returns
+        Tuple of Numpy arrays: `(input_train, target_train),
+        (input_test, target_test)`.
+        Input structure: (fixed acidity, volatile acidity, citric acid,
+            residual sugar, chlorides, free sulfur dioxide, total
+            sulfur dioxide, density, pH, sulphates, alcohol, wine type)
+        Target structure: quality score between 0 and 10
+        Note: the wine type is the string 'red' or 'white', so the
+        returned arrays have dtype `object`.
+    # Raises
+        ValueError: if `which_data` is not 'red', 'white' or 'both'.
+    """
+    # Log about loading
+    logging.basicConfig(level=logging.INFO)
+    logging.info('Loading dataset = wine_quality')
+
+    # Validate with an explicit raise: an assert is stripped under
+    # `python -O`, so it cannot guard against invalid input.
+    if which_data not in ('red', 'white', 'both'):
+        raise ValueError(
+            "which_data must be 'red', 'white' or 'both', "
+            "got {!r}".format(which_data)
+        )
+
+    # Download and process only the requested wine type(s); red wines
+    # precede white wines when both are requested.
+    origin = ("https://archive.ics.uci.edu/ml/machine-learning-"
+              "databases/wine-quality/winequality-{}.csv")
+    cache_paths = {'red': path_red, 'white': path_white}
+    wine_types = ['red', 'white'] if which_data == 'both' else [which_data]
+    sample_parts = []
+    target_parts = []
+    for wine_type in wine_types:
+        type_samples, type_targets = _load_wine_type(
+            cache_paths[wine_type], origin.format(wine_type), wine_type
+        )
+        sample_parts.append(type_samples)
+        target_parts.append(type_targets)
+    samples = np.concatenate(sample_parts)
+    targets = np.concatenate(target_parts)
+
+    # Generate train/test split
+    input_train, input_test, target_train, target_test = \
+        train_test_split(samples, targets, test_size=test_split,
+                         shuffle=shuffle)
+
+    # Warn about citation
+    warn_citation()
+
+    # Return data
+    return (input_train, target_train), (input_test, target_test)
diff --git a/setup.py b/setup.py
index c99dc47..4a48d25 100644
--- a/setup.py
+++ b/setup.py
@@ -13,7 +13,7 @@
     download_url=("https://github.com/christianversloot/"
                   "extra_keras_datasets/archive/{{VERSION}}.tar.gz"),
     keywords=["keras", "datasets", "machine learning"],
-    install_requires=["numpy", "scipy"],
+    install_requires=["numpy", "scipy", "pandas", "scikit-learn"],
     classifiers=[
         "Development Status :: 5 - Production/Stable",
         "Intended Audience :: Developers",