Initial commit
elliewix committed May 1, 2016
0 parents commit f81ef92
Showing 1,020 changed files with 488,601 additions and 0 deletions.
181 changes: 181 additions & 0 deletions .ipynb_checkpoints/data_profile-checkpoint.ipynb
@@ -0,0 +1,181 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"from os.path import isfile, join\n",
"import csv\n",
"import datetime\n",
"import pandas as pd\n",
"from __future__ import division\n",
"import markdown"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def getFiles(path):\n",
" \"\"\"Function to return a list of all files within a folder\"\"\"\n",
" files = [ f for f in os.listdir(path) if isfile(join(path,f)) and f[0] != '.' ]\n",
" return files\n",
"\n",
"def basic_stats(file):\n",
" stats = os.stat(f)\n",
" size = stats.st_size\n",
" last_modified = datetime.datetime.fromtimestamp(stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')\n",
" last_access = datetime.datetime.fromtimestamp(stats.st_atime).strftime('%Y-%m-%d %H:%M:%S')\n",
" return {'filename': file, 'size': size, 'last_access': last_access, 'last_modified': last_modified}\n",
"\n",
"def review_csv(file, mode = 'rt', headers = True, index_row = True, missing = ''):\n",
" with open(file, mode) as fin:\n",
" fin = csv.reader(fin)\n",
" if headers:\n",
" col_names = next(fin)\n",
" data = [r for r in fin]\n",
" else:\n",
" data = [r for r in fin]\n",
"\n",
" if index_row:\n",
" ids = [r[0] for r in data]\n",
" else:\n",
" ids = \"None declared\"\n",
"\n",
" num_rows = len(data)\n",
" data = map(list, zip(*data))\n",
" \n",
"\n",
" num_columns = len(col_names)\n",
" col_info = {'csv_basic': {'num_rows': num_rows, 'num_columns': num_columns}, 'cols': {}}\n",
" for i, col in enumerate(col_names):\n",
" info = {}\n",
" info['unique_values'] = len(set(data[i]))\n",
" #print data[i]\n",
" info['missing'] = data[i].count(missing)\n",
" info['percent_missing'] = \"{:.0%}\".format(info['missing'] / len(data[i]))\n",
" info['percent_digit'] = \"{:.0%}\".format(len([d for d in data[i] if d.isdigit()]) / \\\n",
" len([d for d in data[i] if len(d) > 0]))\n",
" digits = [d for d in data[i] if d.isdigit()]\n",
" if len(digits) > 0:\n",
" info['min_digit'] = min(digits)\n",
" info['max_digit'] = max(digits)\n",
" else:\n",
" info['min_digit'] = \"no digits\"\n",
" info['max_digit'] = \"no digits\"\n",
" if headers:\n",
" col_info['cols'][col] = info\n",
" else:\n",
" col_info['cols']['col_' + str(i)] = info\n",
" #print col_info\n",
" return col_info\n",
" \n",
"def make_md(file_data, print_me = True, make_md = True, make_html = False):\n",
" dt = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())\n",
" for f, f_data in file_data.iteritems():\n",
" md = \"\"\n",
" md += \"Data Profile for \" + f + \"\\n\"\n",
" md += \"Generated on: \" + dt + \"\\n\"\n",
" md += \"\\n\\n\"\n",
" basic = f_data['csv_basic']\n",
" md += \"Number of columns: \" + str(basic['num_columns']) + \"\\n\"\n",
" md += \"Number of rows: \" + str(basic['num_rows']) + \"\\n\"\n",
" md += \"\\n\"\n",
" info = [f_data['columns'] for f in file_data.keys()][0]\n",
" for key, data in info.iteritems():\n",
" md += \"**\" + key + \"**\" + \"\\n\"\n",
" md += \"-\" * (len(key) + 2) + \"\\n\"\n",
" md += \"* Description of column: \\n\"\n",
" md += \"* Collection methods: \\n\"\n",
" md += \"* Description of data values and units: \\n\"\n",
" md += \"* Reason for missing values: \\n\"\n",
" md += \"\\n\"\n",
" for column, val in data.iteritems():\n",
" md += \"* \" + column + \": \" + str(val) + \"\\n\"\n",
" md += \"\\n\"\n",
" if print_me:\n",
" print md#return md\n",
" if make_md:\n",
" write_name = f.split('/')[-1].split('.')[0] + '_DataProfile.md'\n",
" with open(write_name, 'wt') as fout:\n",
" fout.write(md)\n",
" if make_html:\n",
" write_name = f.split('/')[-1].split('.')[0] + '_DataProfile.html'\n",
" with open(write_name, 'wt') as fout:\n",
" fout.write(markdown.markdown(md))\n",
" \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"source = 'vagrants/'\n",
"\n",
"files = [source + f for f in getFiles(source)]\n",
"\n",
"file_data = {}\n",
"\n",
"for f in files:\n",
" if f.endswith('.csv'):\n",
" finfo = basic_stats(f)\n",
" csvinfo = review_csv(f, mode = 'rU', missing = '[unknown]')\n",
" file_data[f] = ({'file_metadata': finfo, \\\n",
" 'csv_basic': csvinfo['csv_basic'], \\\n",
" 'columns': csvinfo['cols']})\n",
"\n",
" \n",
"make_md(file_data, print_me = False, make_md = True, make_html = True)\n",
"#print md\n",
"\n",
"#print file_data\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
104 changes: 104 additions & 0 deletions .ipynb_checkpoints/make_fake_data-checkpoint.ipynb
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import random\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def make_headers(num_cols):\n",
" headers = []\n",
" for c in range(num_cols):\n",
" headers.append(\"\".join([random.choice(string.lowercase) for l in range(random.randint(5,10))]))\n",
" return headers\n",
"\n",
"def make_csv(file_path, num_rows, num_cols, headers):\n",
" main_list = [headers]\n",
" ints = range(1000)\n",
" for r in range(num_rows):\n",
" row = []\n",
" for c in range(num_cols):\n",
" row.append(random.choice(ints))\n",
" main_list.append(row)\n",
" with open(file_path, 'wt') as fout:\n",
" fout = csv.writer(fout)\n",
" fout.writerows(main_list)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"numcols = 10\n",
"min_rows = 5\n",
"max_rows = 200\n",
"num_fake_files = 1000\n",
"dump = \"fakedata/\""
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"head = make_headers(10)\n",
"\n",
"for i in range(num_fake_files):\n",
" path = dump + str(i) + '.csv'\n",
" make_csv(path, random.randint(min_rows, max_rows), numcols, head)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}