diff --git a/README.md b/README.md index 43b242a..0c37c34 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ KnowYourData requires: * Python (>=2.6 or >=3.4) * Numpy (>=1.10.0) +* ipython ### User Installation The easiest way to install KnowYourData is with `pip`: @@ -46,7 +47,7 @@ Development To Do: * ~~Add basic statistics (mean, std deviation, median, quartiles)~~ ### Display -* Have output specific for ipython environment vs. jupyter environment vs. non-interactive environment +* ~~Have output specific for ipython environment vs. jupyter environment vs. non-interactive environment~~ * ~~For memory size, convert to human readable units~~ * Create simple options for graphs and the like diff --git a/docs/index.rst b/docs/index.rst index 912faef..244a805 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -1,6 +1,6 @@ Welcome to KnowYourData ======================= -`KnowYourData` is a rapid and lightweight module to describe the statistics and structure of data arrays for interactive use. This project was started in 2018 and currently maintained by Mubdi Rahman. +`KnowYourData` is a rapid and lightweight module to describe the statistics and structure of data arrays for interactive use. This project was started in 2018 and currently maintained by Mubdi Rahman. This module arose from the regular need to display properties of data arrays while conducting data exploration or diagnostics, for instance, to set min and max values for plotting, or when looking at the first few values in an array don't provide a fair representation of the data. This module provides a quick way of displaying such information as the mean, median, confidence intervals, and size and shape of the data array. @@ -28,5 +28,4 @@ Indices and tables ================== * :ref:`genindex` -* :ref:`modindex` * :ref:`search` diff --git a/knowyourdata/kyd.py b/knowyourdata/kyd.py index 4ce5018..8c8f856 100644 --- a/knowyourdata/kyd.py +++ b/knowyourdata/kyd.py @@ -14,86 +14,33 @@ import sys import numpy as np +from IPython.display import display -class KYD(object): - """The Central Class for KYD""" - - # Variable for Data Vector - data = None - - # Initial Flags - f_allfinite = False - f_allnonfinite = False - f_hasnan = False - f_hasinf = False +class KYD_data_summary(object): + """A class to store and display the summary information""" - # Initialized Numbers - num_nan = 0 - num_inf = 0 + text_repr = "" + html_repr = "" # Display Settings col_width = 10 precision = 4 - def check_finite(self): - """Checking to see if all elements are finite and setting flags""" - if np.all(np.isfinite(self.data)): - self.filt_data = self.data - self.f_allfinite = True - else: - finite_inds = np.where(np.isfinite(self.data)) - - self.filt_data = self.data[finite_inds] - - if self.filt_data.size == 0: - self.f_allnonfinite = True - - if np.any(np.isnan(self.data)): - self.f_hasnan = True - self.num_nan = np.sum(np.isnan(self.data)) - - if np.any(np.isinf(self.data)): - self.f_hasinf = True - self.num_inf = np.sum(np.isinf(self.data)) - - def check_struct(self): - """Determining the Structure of the Numpy Array""" - self.dtype = self.data.dtype - self.ndim = self.data.ndim - self.shape = self.data.shape - self.size = self.data.size - self.memsize = sys.getsizeof(self.data) - self.human_memsize = sizeof_fmt(self.memsize) - - def get_basic_stats(self): - """Get basic statistics about array""" - - if self.f_allnonfinite: - self.min = self.max = self.range = np.nan - self.mean = self.std = self.median = np.nan - self.firstquartile = self.thirdquartile = np.nan - self.ci_68 = self.ci_95 = self.ci_99 = np.array([np.nan, np.nan]) - - return + def __repr__(self): + """ + The Plain String Representation of the Data Summary + """ + return self.text_repr - self.min = np.float_(np.min(self.filt_data)) - self.max = np.float_(np.max(self.filt_data)) - self.range = self.max - self.min - self.mean = np.mean(self.filt_data) - self.std = np.std(self.filt_data) - self.median = np.float_(np.median(self.filt_data)) - self.firstquartile = np.float_(np.percentile(self.filt_data, 25)) - self.thirdquartile = np.float_(np.percentile(self.filt_data, 75)) - self.ci_99 = np.float_( - np.percentile(self.filt_data, np.array([0.5, 99.5]))) - self.ci_95 = np.float_( - np.percentile(self.filt_data, np.array([2.5, 97.5]))) - self.ci_68 = np.float_( - np.percentile(self.filt_data, np.array([16.0, 84.0]))) + def _repr_html_(self): + """ + The HTML Representation of the Data Summary + """ + return self.html_repr - def display_basic_stats_new(self): - """Display Basic Statistics""" + def make_txt_basic_stats(self): + """Make Text Representation of Basic Statistics""" pstr_list = [] pstr_struct_header1 = "Basic Statistics " @@ -112,26 +59,39 @@ def display_basic_stats_new(self): tmp_data = [ [ - "Mean:", "{self.mean:.{self.precision}}".format(self=self), + "Mean:", "{kyd_class.mean:.{kyd_class.precision}}".format( + kyd_class=self.kyd_class), "", - "Std Dev:", "{self.std:.{self.precision}}".format(self=self) + "Std Dev:", "{kyd_class.std:.{kyd_class.precision}}".format( + kyd_class=self.kyd_class) ], ["Min:", "1Q:", "Median:", "3Q:", "Max:"], [ - "{self.min: .{self.precision}}".format(self=self), - "{self.firstquartile: .{self.precision}}".format(self=self), - "{self.median: .{self.precision}}".format(self=self), - "{self.thirdquartile: .{self.precision}}".format(self=self), - "{self.max: .{self.precision}}".format(self=self), + "{kyd_class.min: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.firstquartile: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.median: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.thirdquartile: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.max: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), ], ['-99 CI:', '-95 CI:', '-68 CI:', '+68 CI:', '+95 CI:', '+99 CI:'], [ - "{self.ci_99[0]: .{self.precision}}".format(self=self), - "{self.ci_95[0]: .{self.precision}}".format(self=self), - "{self.ci_68[0]: .{self.precision}}".format(self=self), - "{self.ci_68[1]: .{self.precision}}".format(self=self), - "{self.ci_95[1]: .{self.precision}}".format(self=self), - "{self.ci_99[1]: .{self.precision}}".format(self=self), + "{kyd_class.ci_99[0]: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.ci_95[0]: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.ci_68[0]: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.ci_68[1]: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.ci_95[1]: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), + "{kyd_class.ci_99[1]: .{kyd_class.precision}}".format( + kyd_class=self.kyd_class), ], ] @@ -158,8 +118,8 @@ def display_basic_stats_new(self): return pstr_list - def display_struct(self): - """Display information about array structure""" + def make_txt_struct(self): + """Make Text Representation of Array""" pstr_list = [] @@ -175,26 +135,26 @@ def display_struct(self): pstr_n_dim = ( "Number of Dimensions:\t" - "{self.ndim}").format( - self=self) + "{kyd_class.ndim}").format( + kyd_class=self.kyd_class) pstr_list.append(pstr_n_dim) pstr_shape = ( "Shape of Dimensions:\t" - "{self.shape}").format( - self=self) + "{kyd_class.shape}").format( + kyd_class=self.kyd_class) pstr_list.append(pstr_shape) pstr_dtype = ( "Array Data Type:\t" - "{self.dtype}").format( - self=self) + "{kyd_class.dtype}").format( + kyd_class=self.kyd_class) pstr_list.append(pstr_dtype) pstr_memsize = ( "Memory Size:\t\t" - "{self.human_memsize}").format( - self=self) + "{kyd_class.human_memsize}").format( + kyd_class=self.kyd_class) pstr_list.append(pstr_memsize) pstr_spacer = ("") @@ -202,27 +162,28 @@ def display_struct(self): pstr_numnan = ( "Number of NaN:\t" - "{self.num_nan}").format( - self=self) + "{kyd_class.num_nan}").format( + kyd_class=self.kyd_class) pstr_list.append(pstr_numnan) pstr_numinf = ( "Number of Inf:\t" - "{self.num_inf}").format( - self=self) + "{kyd_class.num_inf}").format( + kyd_class=self.kyd_class) pstr_list.append(pstr_numinf) return pstr_list - def display(self, short=False): - """Displaying all relevant statistics""" + def make_text_repr(self): + """Making final text string for plain text representation""" - if short: - pass + tmp_text_repr = "" + + tmp_text_repr += "\n" + + pstr_basic = self.make_txt_basic_stats() + pstr_struct = self.make_txt_struct() - print("") - pstr_basic = self.display_basic_stats_new() - pstr_struct = self.display_struct() n_basic = len(pstr_basic) n_struct = len(pstr_struct) @@ -249,15 +210,113 @@ def display(self, short=False): tmp_str += ''.ljust(r_colwidth) tmp_str += '\t|' - print(tmp_str) + tmp_text_repr += tmp_str + "\n" + + tmp_text_repr += "\n" + self.text_repr = tmp_text_repr - print("") + def __init__(self, kyd_class): + super(KYD_data_summary, self).__init__() + self.kyd_class = kyd_class + self.make_text_repr() + + +class KYD(object): + """The Central Class for KYD""" + + # Variable for Data Vector + data = None + + # Initial Flags + f_allfinite = False + f_allnonfinite = False + f_hasnan = False + f_hasinf = False + + # Initialized Numbers + num_nan = 0 + num_inf = 0 + + # Display Settings + col_width = 10 + precision = 4 + + def check_finite(self): + """Checking to see if all elements are finite and setting flags""" + if np.all(np.isfinite(self.data)): + self.filt_data = self.data + self.f_allfinite = True + else: + finite_inds = np.where(np.isfinite(self.data)) + + self.filt_data = self.data[finite_inds] + + if self.filt_data.size == 0: + self.f_allnonfinite = True + + if np.any(np.isnan(self.data)): + self.f_hasnan = True + self.num_nan = np.sum(np.isnan(self.data)) + + if np.any(np.isinf(self.data)): + self.f_hasinf = True + self.num_inf = np.sum(np.isinf(self.data)) + + def check_struct(self): + """Determining the Structure of the Numpy Array""" + self.dtype = self.data.dtype + self.ndim = self.data.ndim + self.shape = self.data.shape + self.size = self.data.size + self.memsize = sys.getsizeof(self.data) + self.human_memsize = sizeof_fmt(self.memsize) + + def get_basic_stats(self): + """Get basic statistics about array""" + + if self.f_allnonfinite: + self.min = self.max = self.range = np.nan + self.mean = self.std = self.median = np.nan + self.firstquartile = self.thirdquartile = np.nan + self.ci_68 = self.ci_95 = self.ci_99 = np.array([np.nan, np.nan]) + + return + + self.min = np.float_(np.min(self.filt_data)) + self.max = np.float_(np.max(self.filt_data)) + self.range = self.max - self.min + self.mean = np.mean(self.filt_data) + self.std = np.std(self.filt_data) + self.median = np.float_(np.median(self.filt_data)) + self.firstquartile = np.float_(np.percentile(self.filt_data, 25)) + self.thirdquartile = np.float_(np.percentile(self.filt_data, 75)) + self.ci_99 = np.float_( + np.percentile(self.filt_data, np.array([0.5, 99.5]))) + self.ci_95 = np.float_( + np.percentile(self.filt_data, np.array([2.5, 97.5]))) + self.ci_68 = np.float_( + np.percentile(self.filt_data, np.array([16.0, 84.0]))) + + def make_summary(self): + """Making Data Summary""" + self.data_summary = KYD_data_summary(self) def clear_memory(self): """Ensuring the Numpy Array does not exist in memory""" del self.data del self.filt_data + def display(self, short=False): + """Displaying all relevant statistics""" + + if short: + pass + try: + get_ipython + display(self.data_summary) + except NameError: + print(self.data_summary) + def __init__(self, data): super(KYD, self).__init__() @@ -271,6 +330,7 @@ def __init__(self, data): self.check_struct() self.get_basic_stats() self.clear_memory() + self.make_summary() def sizeof_fmt(num, suffix='B'): diff --git a/requirements.txt b/requirements.txt index 152d9da..e9a13f1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ -numpy>=1.10.0 \ No newline at end of file +numpy>=1.10.0 +ipython \ No newline at end of file