Case Master Word Counts.py
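# NOTE: This script targets Python 2 (print statements, two-argument str.translate).
# It reads a Case Master CSV export, gathers the 'No Commas' descriptions for
# Connections and Data Sources cases, cleans the text, counts word occurrences
# with NLTK, and writes the counts out to a CSV file.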
from __future__ import division
import pandas as pd
import nltk
import csv
import string
from string import punctuation
from string import digits
from nltk.corpus import stopwords
from collections import Counter
# Load in data from Case Master csv export
df = pd.read_csv('C:/Users/nmannheimer/Desktop/DataScience/Text Analytics/CaseMaster.csv')
case_type_df = df.loc[df['Case Type Detail'].isin(['Connections', 'Data Sources'])]
case_type_df = case_type_df.reset_index(drop=True)
length = len(case_type_df)
print length
# Save Descriptions to a single string
descs = ""
for index, row in case_type_df.iterrows():
    descs += row['No Commas']
    print index/length * 100
print 'Text Blob Created'
# Remove those pesky non-ascii characters
printable = set(string.printable)
descs = filter(lambda x: x in printable, descs)
print 'Invalid Characters Removed'
# Remove digits from the string
descs = descs.translate(None, digits)
print 'Digits Removed'
# Remove punctuation from the string
descs = descs.translate(None, punctuation)
print 'Punctuation Removed'
# Title-case all words so 'Tableau' and 'tableau' are counted together
descs = descs.title()
print 'Title Case Applied'
# Tokenize the cleaned string into a list of individual words
tokens = nltk.word_tokenize(descs)
print 'Tokens Created'
# Remove stop words like 'the' or 'and'
# The stopwords also need to be title-cased to match the tokens
# From this point on, operations are much faster because we're working with lists rather than one large string
cachedStopWords = stopwords.words("english")
cachedStopWords = [word.title() for word in cachedStopWords]
tokens = [word for word in tokens if word not in cachedStopWords]
print 'Tokens Cleaned'
# Count the words by occurrences
counts = Counter(tokens)
print 'Counter Complete'
# Save word counts as a csv
with open('C:/Users/nmannheimer/Desktop/DataScience/Text Analytics/DataSourcesandConnections.csv', 'wb') as csv_file:
    writer = csv.writer(csv_file)
    for key, value in counts.items():
        writer.writerow([key, value])
print 'Completed csv Saved'
print 'Done!'