|
| 1 | +""" |
| 2 | + Import the Wine Quality dataset |
| 3 | + Source: https://archive.ics.uci.edu/ml/datasets/wine+quality |
| 4 | + Description: Two datasets are included, related to red and white vinho |
| 5 | + verde wine samples, from the north of Portugal. |
| 6 | + The goal is to model wine quality based on physicochemical tests. |
| 7 | +
|
| 8 | + ~~~ Important note ~~~ |
| 9 | + Please cite the following paper when using or referencing the dataset: |
| 10 | + P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. |
| 11 | + Modeling wine preferences by data mining from physicochemical |
| 12 | + properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009. |
| 13 | +""" |
| 14 | + |
| 15 | +from tensorflow.keras.utils import get_file |
| 16 | +import numpy as np |
| 17 | +import logging |
| 18 | +import pandas as pd |
| 19 | +from sklearn.model_selection import train_test_split |
| 20 | + |
| 21 | + |
def warn_citation():
    """Logs the citation request for the Wine Quality dataset.

    Two warnings are sent to the root logger: a short notice followed by
    the full reference of the paper to cite.
    # Returns
        None
    """
    citation_lines = (
        "Please cite the following paper when using or referencing this "
        "Extra Keras Dataset:",
        "P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. "
        "Modeling wine preferences by data mining from physicochemical "
        "properties. In Decision Support Systems, Elsevier, "
        "47(4):547-553, 2009.",
    )
    # One warning per line, in order: notice first, reference second.
    for citation_line in citation_lines:
        logging.warning(citation_line)
| 35 | + |
| 36 | + |
class _InvalidWineTypeError(ValueError, AssertionError):
    """Raised when `which_data` is not 'red', 'white' or 'both'.

    Also derives from AssertionError so that callers which caught the
    failure of the former `assert` statement keep working.
    """


def _fetch_wine_table(cache_path, origin, wine_type):
    """Downloads (or reuses the cached copy of) one wine CSV and splits it.
    # Arguments
        cache_path: path where to cache the file locally
            (relative to ~/.keras/datasets).
        origin: URL the file is downloaded from.
        wine_type: value written into the added 'type' column
            ('red' or 'white').
    # Returns
        Tuple of Numpy arrays `(samples, targets)`: the first twelve
        columns (features plus wine type) and the thirteenth column
        (quality).
    """
    local_path = get_file(cache_path, origin=origin)
    frame = pd.read_csv(local_path, header=0, delimiter=';')
    # Insert the wine type at position 11, i.e. after the eleven
    # physicochemical columns, so the target ends up at index 12.
    frame.insert(11, 'type', wine_type)
    table = frame.to_numpy()
    return table[:, 0:12], table[:, 12]


def load_data(
    path_red="wine-quality-red.csv",
    path_white="wine-quality-white.csv",
    test_split=0.2,
    which_data='both',
    shuffle=True
):
    """Loads the Wine Quality dataset.

    Only the file(s) needed for the requested wine type are downloaded
    and parsed.
    # Arguments
        path_red: path where to cache the red wines dataset locally
            (relative to ~/.keras/datasets).
        path_white: path where to cache the white wines dataset locally
            (relative to ~/.keras/datasets).
        test_split: share of the data to use for testing; passed as
            `test_size` to `train_test_split` (by default 0.2, i.e. 20%).
        which_data: wine type to return. Can be 'white', 'red', or 'both'.
            With 'both', red samples precede white samples before the
            split.
        shuffle: whether to shuffle the data when generating train/test
            split.
    # Returns
        Tuple of Numpy arrays: `(input_train, target_train),
                                (input_test, target_test)`.
        Input structure: (fixed acidity, volatile acidity, citric acid,
                          residual sugar, chlorides, free sulfur dioxide,
                          total sulfur dioxide, density, pH, sulphates,
                          alcohol, wine type)
        Target structure: quality score between 0 and 10
        Note: wine type is the string 'red' or 'white', so the arrays
        come out of the pandas conversion with dtype `object`.
    # Raises
        ValueError: if `which_data` is not 'red', 'white' or 'both'
            (the raised error is also an AssertionError for backward
            compatibility).
    """
    # Log about loading
    logging.basicConfig(level=logging.INFO)
    logging.info('Loading dataset = wine_quality')

    # Validate explicitly: a bare `assert` is stripped under `python -O`,
    # which would let an invalid value silently behave like 'both'.
    if which_data not in ('red', 'white', 'both'):
        raise _InvalidWineTypeError(
            "which_data must be 'red', 'white' or 'both', got "
            "{!r}".format(which_data)
        )

    # Fetch only what was asked for; red goes first so that the 'both'
    # ordering (red, then white) is preserved.
    tables = []
    if which_data in ('red', 'both'):
        tables.append(_fetch_wine_table(
            path_red,
            ("https://archive.ics.uci.edu/ml/machine-learning-"
             "databases/wine-quality/winequality-red.csv"),
            'red'
        ))
    if which_data in ('white', 'both'):
        tables.append(_fetch_wine_table(
            path_white,
            ("https://archive.ics.uci.edu/ml/machine-learning-"
             "databases/wine-quality/winequality-white.csv"),
            'white'
        ))

    # Specify dataset before train/test split generation
    samples = np.concatenate([table[0] for table in tables])
    targets = np.concatenate([table[1] for table in tables])

    # Generate train/test split
    input_train, input_test, target_train, target_test = \
        train_test_split(samples, targets, test_size=test_split,
                         shuffle=shuffle)

    # Warn about citation
    warn_citation()

    # Return data
    return (input_train, target_train), (input_test, target_test)
0 commit comments