Skip to content

Commit

Permalink
Moved display functions into their own class, with specific output fo…
Browse files Browse the repository at this point in the history
…r ipython vs. jupyter vs. non-interactive environments
  • Loading branch information
mubdi committed Mar 14, 2018
1 parent 4de886b commit 6c633a3
Show file tree
Hide file tree
Showing 4 changed files with 171 additions and 110 deletions.
3 changes: 2 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ KnowYourData requires:

* Python (>=2.6 or >=3.4)
* Numpy (>=1.10.0)
* ipython

### User Installation
The easiest way to install KnowYourData is with `pip`:
Expand Down Expand Up @@ -46,7 +47,7 @@ Development To Do:
* ~~Add basic statistics (mean, std deviation, median, quartiles)~~

### Display
* Have output specific for ipython environment vs. jupyter environment vs. non-interactive environment
* ~~Have output specific for ipython environment vs. jupyter environment vs. non-interactive environment~~
* ~~For memory size, convert to human readable units~~
* Create simple options for graphs and the like

Expand Down
3 changes: 1 addition & 2 deletions docs/index.rst
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Welcome to KnowYourData
=======================
`KnowYourData` is a rapid and lightweight module to describe the statistics and structure of data arrays for interactive use. This project was started in 2018 and currently maintained by Mubdi Rahman.
`KnowYourData` is a rapid and lightweight module to describe the statistics and structure of data arrays for interactive use. This project was started in 2018 and is currently maintained by Mubdi Rahman. This module arose from the regular need to display properties of data arrays while conducting data exploration or diagnostics, for instance, to set min and max values for plotting, or when looking at the first few values in an array doesn't provide a fair representation of the data.

This module provides a quick way of displaying such information as the mean, median, confidence intervals, and size and shape of the data array.

Expand Down Expand Up @@ -28,5 +28,4 @@ Indices and tables
==================

* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
272 changes: 166 additions & 106 deletions knowyourdata/kyd.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,86 +14,33 @@

import sys
import numpy as np
from IPython.display import display


class KYD(object):
"""The Central Class for KYD"""

# Variable for Data Vector
data = None

# Initial Flags
f_allfinite = False
f_allnonfinite = False
f_hasnan = False
f_hasinf = False
class KYD_data_summary(object):
"""A class to store and display the summary information"""

# Initialized Numbers
num_nan = 0
num_inf = 0
text_repr = ""
html_repr = ""

# Display Settings
col_width = 10
precision = 4

def check_finite(self):
"""Checking to see if all elements are finite and setting flags"""
if np.all(np.isfinite(self.data)):
self.filt_data = self.data
self.f_allfinite = True
else:
finite_inds = np.where(np.isfinite(self.data))

self.filt_data = self.data[finite_inds]

if self.filt_data.size == 0:
self.f_allnonfinite = True

if np.any(np.isnan(self.data)):
self.f_hasnan = True
self.num_nan = np.sum(np.isnan(self.data))

if np.any(np.isinf(self.data)):
self.f_hasinf = True
self.num_inf = np.sum(np.isinf(self.data))

def check_struct(self):
"""Determining the Structure of the Numpy Array"""
self.dtype = self.data.dtype
self.ndim = self.data.ndim
self.shape = self.data.shape
self.size = self.data.size
self.memsize = sys.getsizeof(self.data)
self.human_memsize = sizeof_fmt(self.memsize)

def get_basic_stats(self):
"""Get basic statistics about array"""

if self.f_allnonfinite:
self.min = self.max = self.range = np.nan
self.mean = self.std = self.median = np.nan
self.firstquartile = self.thirdquartile = np.nan
self.ci_68 = self.ci_95 = self.ci_99 = np.array([np.nan, np.nan])

return
def __repr__(self):
"""
The Plain String Representation of the Data Summary
"""
return self.text_repr

self.min = np.float_(np.min(self.filt_data))
self.max = np.float_(np.max(self.filt_data))
self.range = self.max - self.min
self.mean = np.mean(self.filt_data)
self.std = np.std(self.filt_data)
self.median = np.float_(np.median(self.filt_data))
self.firstquartile = np.float_(np.percentile(self.filt_data, 25))
self.thirdquartile = np.float_(np.percentile(self.filt_data, 75))
self.ci_99 = np.float_(
np.percentile(self.filt_data, np.array([0.5, 99.5])))
self.ci_95 = np.float_(
np.percentile(self.filt_data, np.array([2.5, 97.5])))
self.ci_68 = np.float_(
np.percentile(self.filt_data, np.array([16.0, 84.0])))
def _repr_html_(self):
"""
The HTML Representation of the Data Summary
"""
return self.html_repr

def display_basic_stats_new(self):
"""Display Basic Statistics"""
def make_txt_basic_stats(self):
"""Make Text Representation of Basic Statistics"""
pstr_list = []

pstr_struct_header1 = "Basic Statistics "
Expand All @@ -112,26 +59,39 @@ def display_basic_stats_new(self):

tmp_data = [
[
"Mean:", "{self.mean:.{self.precision}}".format(self=self),
"Mean:", "{kyd_class.mean:.{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"",
"Std Dev:", "{self.std:.{self.precision}}".format(self=self)
"Std Dev:", "{kyd_class.std:.{kyd_class.precision}}".format(
kyd_class=self.kyd_class)
],
["Min:", "1Q:", "Median:", "3Q:", "Max:"],
[
"{self.min: .{self.precision}}".format(self=self),
"{self.firstquartile: .{self.precision}}".format(self=self),
"{self.median: .{self.precision}}".format(self=self),
"{self.thirdquartile: .{self.precision}}".format(self=self),
"{self.max: .{self.precision}}".format(self=self),
"{kyd_class.min: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.firstquartile: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.median: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.thirdquartile: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.max: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
],
['-99 CI:', '-95 CI:', '-68 CI:', '+68 CI:', '+95 CI:', '+99 CI:'],
[
"{self.ci_99[0]: .{self.precision}}".format(self=self),
"{self.ci_95[0]: .{self.precision}}".format(self=self),
"{self.ci_68[0]: .{self.precision}}".format(self=self),
"{self.ci_68[1]: .{self.precision}}".format(self=self),
"{self.ci_95[1]: .{self.precision}}".format(self=self),
"{self.ci_99[1]: .{self.precision}}".format(self=self),
"{kyd_class.ci_99[0]: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.ci_95[0]: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.ci_68[0]: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.ci_68[1]: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.ci_95[1]: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
"{kyd_class.ci_99[1]: .{kyd_class.precision}}".format(
kyd_class=self.kyd_class),
],
]

Expand All @@ -158,8 +118,8 @@ def display_basic_stats_new(self):

return pstr_list

def display_struct(self):
"""Display information about array structure"""
def make_txt_struct(self):
"""Make Text Representation of Array"""

pstr_list = []

Expand All @@ -175,54 +135,55 @@ def display_struct(self):

pstr_n_dim = (
"Number of Dimensions:\t"
"{self.ndim}").format(
self=self)
"{kyd_class.ndim}").format(
kyd_class=self.kyd_class)
pstr_list.append(pstr_n_dim)

pstr_shape = (
"Shape of Dimensions:\t"
"{self.shape}").format(
self=self)
"{kyd_class.shape}").format(
kyd_class=self.kyd_class)
pstr_list.append(pstr_shape)

pstr_dtype = (
"Array Data Type:\t"
"{self.dtype}").format(
self=self)
"{kyd_class.dtype}").format(
kyd_class=self.kyd_class)
pstr_list.append(pstr_dtype)

pstr_memsize = (
"Memory Size:\t\t"
"{self.human_memsize}").format(
self=self)
"{kyd_class.human_memsize}").format(
kyd_class=self.kyd_class)
pstr_list.append(pstr_memsize)

pstr_spacer = ("")
pstr_list.append(pstr_spacer)

pstr_numnan = (
"Number of NaN:\t"
"{self.num_nan}").format(
self=self)
"{kyd_class.num_nan}").format(
kyd_class=self.kyd_class)
pstr_list.append(pstr_numnan)

pstr_numinf = (
"Number of Inf:\t"
"{self.num_inf}").format(
self=self)
"{kyd_class.num_inf}").format(
kyd_class=self.kyd_class)
pstr_list.append(pstr_numinf)

return pstr_list

def display(self, short=False):
"""Displaying all relevant statistics"""
def make_text_repr(self):
"""Making final text string for plain text representation"""

if short:
pass
tmp_text_repr = ""

tmp_text_repr += "\n"

pstr_basic = self.make_txt_basic_stats()
pstr_struct = self.make_txt_struct()

print("")
pstr_basic = self.display_basic_stats_new()
pstr_struct = self.display_struct()
n_basic = len(pstr_basic)
n_struct = len(pstr_struct)

Expand All @@ -249,15 +210,113 @@ def display(self, short=False):
tmp_str += ''.ljust(r_colwidth)
tmp_str += '\t|'

print(tmp_str)
tmp_text_repr += tmp_str + "\n"

tmp_text_repr += "\n"
self.text_repr = tmp_text_repr

print("")
def __init__(self, kyd_class):
super(KYD_data_summary, self).__init__()
self.kyd_class = kyd_class
self.make_text_repr()


class KYD(object):
"""The Central Class for KYD"""

# Variable for Data Vector
data = None

# Initial Flags
f_allfinite = False
f_allnonfinite = False
f_hasnan = False
f_hasinf = False

# Initialized Numbers
num_nan = 0
num_inf = 0

# Display Settings
col_width = 10
precision = 4

def check_finite(self):
"""Checking to see if all elements are finite and setting flags"""
if np.all(np.isfinite(self.data)):
self.filt_data = self.data
self.f_allfinite = True
else:
finite_inds = np.where(np.isfinite(self.data))

self.filt_data = self.data[finite_inds]

if self.filt_data.size == 0:
self.f_allnonfinite = True

if np.any(np.isnan(self.data)):
self.f_hasnan = True
self.num_nan = np.sum(np.isnan(self.data))

if np.any(np.isinf(self.data)):
self.f_hasinf = True
self.num_inf = np.sum(np.isinf(self.data))

def check_struct(self):
    """Record the array's dtype, dimensions, shape, size and memory use."""
    arr = self.data
    self.dtype, self.ndim = arr.dtype, arr.ndim
    self.shape, self.size = arr.shape, arr.size

    # NOTE(review): sys.getsizeof only counts what the object itself
    # accounts for -- confirm this is the intended figure for arrays that
    # do not own their data buffer.
    mem_bytes = sys.getsizeof(arr)
    self.memsize = mem_bytes
    self.human_memsize = sizeof_fmt(mem_bytes)

def get_basic_stats(self):
"""Get basic statistics about array"""

if self.f_allnonfinite:
self.min = self.max = self.range = np.nan
self.mean = self.std = self.median = np.nan
self.firstquartile = self.thirdquartile = np.nan
self.ci_68 = self.ci_95 = self.ci_99 = np.array([np.nan, np.nan])

return

self.min = np.float_(np.min(self.filt_data))
self.max = np.float_(np.max(self.filt_data))
self.range = self.max - self.min
self.mean = np.mean(self.filt_data)
self.std = np.std(self.filt_data)
self.median = np.float_(np.median(self.filt_data))
self.firstquartile = np.float_(np.percentile(self.filt_data, 25))
self.thirdquartile = np.float_(np.percentile(self.filt_data, 75))
self.ci_99 = np.float_(
np.percentile(self.filt_data, np.array([0.5, 99.5])))
self.ci_95 = np.float_(
np.percentile(self.filt_data, np.array([2.5, 97.5])))
self.ci_68 = np.float_(
np.percentile(self.filt_data, np.array([16.0, 84.0])))

def make_summary(self):
    """Build the display summary for this instance as ``data_summary``."""
    # The summary object reads its values from this instance's attributes.
    summary = KYD_data_summary(self)
    self.data_summary = summary

def clear_memory(self):
"""Ensuring the Numpy Array does not exist in memory"""
del self.data
del self.filt_data

def display(self, short=False):
"""Displaying all relevant statistics"""

if short:
pass
try:
get_ipython
display(self.data_summary)
except NameError:
print(self.data_summary)

def __init__(self, data):
super(KYD, self).__init__()

Expand All @@ -271,6 +330,7 @@ def __init__(self, data):
self.check_struct()
self.get_basic_stats()
self.clear_memory()
self.make_summary()


def sizeof_fmt(num, suffix='B'):
Expand Down
Loading

0 comments on commit 6c633a3

Please sign in to comment.