Initial commit
elliewix committed May 1, 2016
0 parents commit f81ef92
Showing 1,020 changed files with 488,601 additions and 0 deletions.
181 changes: 181 additions & 0 deletions .ipynb_checkpoints/data_profile-checkpoint.ipynb
@@ -0,0 +1,181 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 81,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import os\n",
"from os.path import isfile, join\n",
"import csv\n",
"import datetime\n",
"import pandas as pd\n",
"from __future__ import division\n",
"import markdown"
]
},
{
"cell_type": "code",
"execution_count": 92,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def getFiles(path):\n",
" \"\"\"Function to return a list of all files within a folder\"\"\"\n",
" files = [ f for f in os.listdir(path) if isfile(join(path,f)) and f[0] != '.' ]\n",
" return files\n",
"\n",
"def basic_stats(file):\n",
" stats = os.stat(f)\n",
" size = stats.st_size\n",
" last_modified = datetime.datetime.fromtimestamp(stats.st_mtime).strftime('%Y-%m-%d %H:%M:%S')\n",
" last_access = datetime.datetime.fromtimestamp(stats.st_atime).strftime('%Y-%m-%d %H:%M:%S')\n",
" return {'filename': file, 'size': size, 'last_access': last_access, 'last_modified': last_modified}\n",
"\n",
"def review_csv(file, mode = 'rt', headers = True, index_row = True, missing = ''):\n",
" with open(file, mode) as fin:\n",
" fin = csv.reader(fin)\n",
" if headers:\n",
" col_names = next(fin)\n",
" data = [r for r in fin]\n",
" else:\n",
" data = [r for r in fin]\n",
"\n",
" if index_row:\n",
" ids = [r[0] for r in data]\n",
" else:\n",
" ids = \"None declared\"\n",
"\n",
" num_rows = len(data)\n",
" data = map(list, zip(*data))\n",
" \n",
"\n",
" num_columns = len(col_names)\n",
" col_info = {'csv_basic': {'num_rows': num_rows, 'num_columns': num_columns}, 'cols': {}}\n",
" for i, col in enumerate(col_names):\n",
" info = {}\n",
" info['unique_values'] = len(set(data[i]))\n",
" #print data[i]\n",
" info['missing'] = data[i].count(missing)\n",
" info['percent_missing'] = \"{:.0%}\".format(info['missing'] / len(data[i]))\n",
" info['percent_digit'] = \"{:.0%}\".format(len([d for d in data[i] if d.isdigit()]) / \\\n",
" len([d for d in data[i] if len(d) > 0]))\n",
" digits = [d for d in data[i] if d.isdigit()]\n",
" if len(digits) > 0:\n",
" info['min_digit'] = min(digits)\n",
" info['max_digit'] = max(digits)\n",
" else:\n",
" info['min_digit'] = \"no digits\"\n",
" info['max_digit'] = \"no digits\"\n",
" if headers:\n",
" col_info['cols'][col] = info\n",
" else:\n",
" col_info['cols']['col_' + str(i)] = info\n",
" #print col_info\n",
" return col_info\n",
" \n",
"def make_md(file_data, print_me = True, make_md = True, make_html = False):\n",
" dt = '{:%Y-%b-%d %H:%M:%S}'.format(datetime.datetime.now())\n",
" for f, f_data in file_data.iteritems():\n",
" md = \"\"\n",
" md += \"Data Profile for \" + f + \"\\n\"\n",
" md += \"Generated on: \" + dt + \"\\n\"\n",
" md += \"\\n\\n\"\n",
" basic = f_data['csv_basic']\n",
" md += \"Number of columns: \" + str(basic['num_columns']) + \"\\n\"\n",
" md += \"Number of rows: \" + str(basic['num_rows']) + \"\\n\"\n",
" md += \"\\n\"\n",
" info = [f_data['columns'] for f in file_data.keys()][0]\n",
" for key, data in info.iteritems():\n",
" md += \"**\" + key + \"**\" + \"\\n\"\n",
" md += \"-\" * (len(key) + 2) + \"\\n\"\n",
" md += \"* Description of column: \\n\"\n",
" md += \"* Collection methods: \\n\"\n",
" md += \"* Description of data values and units: \\n\"\n",
" md += \"* Reason for missing values: \\n\"\n",
" md += \"\\n\"\n",
" for column, val in data.iteritems():\n",
" md += \"* \" + column + \": \" + str(val) + \"\\n\"\n",
" md += \"\\n\"\n",
" if print_me:\n",
" print md#return md\n",
" if make_md:\n",
" write_name = f.split('/')[-1].split('.')[0] + '_DataProfile.md'\n",
" with open(write_name, 'wt') as fout:\n",
" fout.write(md)\n",
" if make_html:\n",
" write_name = f.split('/')[-1].split('.')[0] + '_DataProfile.html'\n",
" with open(write_name, 'wt') as fout:\n",
" fout.write(markdown.markdown(md))\n",
" \n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 95,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"source = 'vagrants/'\n",
"\n",
"files = [source + f for f in getFiles(source)]\n",
"\n",
"file_data = {}\n",
"\n",
"for f in files:\n",
" if f.endswith('.csv'):\n",
" finfo = basic_stats(f)\n",
" csvinfo = review_csv(f, mode = 'rU', missing = '[unknown]')\n",
" file_data[f] = ({'file_metadata': finfo, \\\n",
" 'csv_basic': csvinfo['csv_basic'], \\\n",
" 'columns': csvinfo['cols']})\n",
"\n",
" \n",
"make_md(file_data, print_me = False, make_md = True, make_html = True)\n",
"#print md\n",
"\n",
"#print file_data\n",
"\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
104 changes: 104 additions & 0 deletions .ipynb_checkpoints/make_fake_data-checkpoint.ipynb
@@ -0,0 +1,104 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import csv\n",
"import random\n",
"import string"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def make_headers(num_cols):\n",
" headers = []\n",
" for c in range(num_cols):\n",
" headers.append(\"\".join([random.choice(string.lowercase) for l in range(random.randint(5,10))]))\n",
" return headers\n",
"\n",
"def make_csv(file_path, num_rows, num_cols, headers):\n",
" main_list = [headers]\n",
" ints = range(1000)\n",
" for r in range(num_rows):\n",
" row = []\n",
" for c in range(num_cols):\n",
" row.append(random.choice(ints))\n",
" main_list.append(row)\n",
" with open(file_path, 'wt') as fout:\n",
" fout = csv.writer(fout)\n",
" fout.writerows(main_list)"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"numcols = 10\n",
"min_rows = 5\n",
"max_rows = 200\n",
"num_fake_files = 1000\n",
"dump = \"fakedata/\""
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"head = make_headers(10)\n",
"\n",
"for i in range(num_fake_files):\n",
" path = dump + str(i) + '.csv'\n",
" make_csv(path, random.randint(min_rows, max_rows), numcols, head)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}