Merge pull request #22 from christianversloot/dataset-release
Dataset release
christianversloot authored Dec 2, 2020
2 parents 9f705a4 + d0ba71a commit b171535
Showing 4 changed files with 123 additions and 1 deletion.
18 changes: 18 additions & 0 deletions README.md
@@ -30,6 +30,7 @@ _The names TensorFlow, Keras, as well as related names, marks, emblems and image
* [STL-10](#stl-10)
* [Iris](#iris)
* [Wine Quality dataset](#wine-quality-dataset)
* [USPS Handwritten Digits Dataset](#usps-handwritten-digits-dataset)
- [Contributors and other references](#contributors-and-other-references)
- [License](#license)

@@ -206,6 +207,21 @@ from extra_keras_datasets import wine_quality

---

### USPS Handwritten Digits Dataset
This dataset contains thousands of 16x16 grayscale images of handwritten digits, scanned from real mail processed by the USPS.

* Input structure: 16x16 image
* Target structure: digit label in the range 0.0 - 9.0 describing the input

```
from extra_keras_datasets import usps
(input_train, target_train), (input_test, target_test) = usps.load_data()
```
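Once loaded, the arrays follow the input and target structure listed above. Below is a minimal sketch of inspecting and visualizing a sample; it assumes `matplotlib` is installed, which is not a dependency of this package.

```
from extra_keras_datasets import usps
import matplotlib.pyplot as plt

(input_train, target_train), (input_test, target_test) = usps.load_data()

# Shapes follow the structure above: (n_samples, 16, 16) inputs, (n_samples,) targets
print(input_train.shape, target_train.shape)

# Show the first training sample together with its digit label
plt.imshow(input_train[0], cmap='gray')
plt.title(f"Digit: {target_train[0]}")
plt.show()
```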

<a href="./assets/usps.png"><img src="./assets/usps.png" width="100%" style="border: 3px solid #f6f8fa;" /></a>

---

## Contributors and other references
* **EMNIST dataset:**
* Cohen, G., Afshar, S., Tapson, J., & van Schaik, A. (2017). EMNIST: an extension of MNIST to handwritten letters. Retrieved from http://arxiv.org/abs/1702.05373
@@ -220,6 +236,8 @@ from extra_keras_datasets import wine_quality
* Fisher,R.A. "The use of multiple measurements in taxonomic problems" Annual Eugenics, 7, Part II, 179-188 (1936); also in "Contributions to Mathematical Statistics" (John Wiley, NY, 1950).
* **Wine Quality dataset:**
* P. Cortez, A. Cerdeira, F. Almeida, T. Matos and J. Reis. Modeling wine preferences by data mining from physicochemical properties. In Decision Support Systems, Elsevier, 47(4):547-553, 2009.
* **USPS Handwritten Digits Dataset:**
* Hull, J. J. (1994). A database for handwritten text recognition research. IEEE Transactions on pattern analysis and machine intelligence, 16(5), 550-554.

## License
The licenseable parts of this repository are licensed under a [MIT License](./LICENSE), so you're free to use this repo in your machine learning projects / blogs / exercises, and so on. Happy engineering! 🚀
Binary file added assets/usps.png
3 changes: 2 additions & 1 deletion extra_keras_datasets/__init__.py
@@ -8,5 +8,6 @@
from . import stl10
from . import iris
from . import wine_quality
from . import usps

__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality']
__all__ = ['emnist', 'kmnist', 'svhn', 'stl10', 'iris', 'wine_quality', 'usps']
103 changes: 103 additions & 0 deletions extra_keras_datasets/usps.py
@@ -0,0 +1,103 @@
"""
Import the USPS Handwritten Digits Dataset
Source: https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/datasets/
multiclass.html#usps
(and: https://ieeexplore.ieee.org/document/291440)
Description: Handwritten text recognition image database.
~~~ Important note ~~~
Please cite the following paper when using or referencing the dataset:
Hull, J. J. (1994). A database for handwritten text recognition
research. IEEE Transactions on pattern analysis and machine
intelligence, 16(5), 550-554.
"""

from tensorflow.keras.utils import get_file
import logging
from sklearn.datasets import load_svmlight_file
import bz2


def warn_citation():
    """Warns about citation requirements
    # Returns
      Void
    """
    logging.warning(("Please cite the following paper when using or"
                     " referencing this Extra Keras Dataset:"))
    logging.warning(
        ("Hull, J. J. (1994). A database for handwritten text "
         "recognition research. IEEE Transactions on pattern analysis and "
         "machine intelligence, 16(5), 550-554.")
    )


def decompress(path):
    """Decompresses BZ2 data into another file"""
    with bz2.BZ2File(path) as bz_zip:
        decompressed_data = bz_zip.read()
    new_path = path[:-4]
    with open(new_path, 'wb') as output_file:
        output_file.write(decompressed_data)
    return new_path


def load_to_numpy(path):
    """Loads LIBSVM data into NumPy arrays"""
    data = load_svmlight_file(path)
    return (data[0].toarray(), data[1])


def load_data(
    path="usps.bz2",
    path_testing="usps-testing.bz2"
):
    """Loads the USPS Handwritten Digits Dataset.
    # Arguments
        path: path where to cache the USPS data locally
            (relative to ~/.keras/datasets).
        path_testing: path where to cache the USPS testing data locally
            (relative to ~/.keras/datasets).
    # Returns
        Tuple of NumPy arrays: `(input_train, target_train),
            (input_test, target_test)`.
        Input structure: 16x16 image with a digit
        Target structure: number in the 0.0 - 9.0 range
    """
    # Log about loading
    logging.basicConfig(level=logging.INFO)
    logging.info('Loading dataset = usps')

    # Download data
    path = get_file(
        path,
        origin=("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/"
                "datasets/multiclass/usps.bz2")
    )
    path_testing = get_file(
        path_testing,
        origin=("https://www.csie.ntu.edu.tw/~cjlin/libsvmtools/"
                "datasets/multiclass/usps.t.bz2")
    )

    # Decompress data
    decompress_train = decompress(path)
    decompress_test = decompress(path_testing)

    # Load LIBSVM data into NumPy arrays
    (input_train, target_train) = load_to_numpy(decompress_train)
    (input_test, target_test) = load_to_numpy(decompress_test)

    # Reshape data into 16x16 images
    input_train = input_train.reshape(input_train.shape[0], 16, 16)
    input_test = input_test.reshape(input_test.shape[0], 16, 16)

    # Correct targets: the LIBSVM labels run from 1.0 to 10.0,
    # so subtract 1 to map them back to the digits 0.0 - 9.0
    target_train = target_train - 1
    target_test = target_test - 1

    # Warn about citation
    warn_citation()

    # Return data
    return (input_train, target_train), (input_test, target_test)
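As the docstring above notes, the `path` and `path_testing` arguments only set the filenames used to cache the downloads under `~/.keras/datasets`. A minimal usage sketch with custom cache names follows; the filenames are arbitrary examples, and the shapes in the comments reflect the standard USPS split of 7,291 training and 2,007 test images.

```
from extra_keras_datasets import usps

# Cache the downloads under custom filenames in ~/.keras/datasets
(input_train, target_train), (input_test, target_test) = usps.load_data(
    path="usps-train.bz2",
    path_testing="usps-test.bz2"
)

print(input_train.shape)  # (7291, 16, 16)
print(input_test.shape)   # (2007, 16, 16)
```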
