### Coding challenge: Construction data science
import os
import string

import pandas as pd
import numpy as np
import matplotlib.pyplot as mpl
import nltk

from nltk.corpus import stopwords  # may require a one-time nltk.download('stopwords')
ignore_words = stopwords.words('english')

os.getcwd()  # check the working directory that City.csv is read from
# Data briefing
dt = pd.read_csv('City.csv')
print(dt.iloc[:3])
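# A slightly fuller briefing (optional sketch): column dtypes and missing-value counts,
# since rows without an OBJECTID or description get dropped in the pipeline below.
dt.info()
print(dt[['OBJECTID', 'description']].isna().sum())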
# Parse data into a list of (id, tokens) tuples, dropping unnecessary categorical columns;
# keep OBJECTID as the primary-key attribute along with the description text
def pipeline_df(dt):
    data = []
    # remove NA values and do general cleanup
    dt_new = dt.dropna(subset=['OBJECTID', 'description'])
    for i, row in dt_new.iterrows():
        id_city = row['OBJECTID']
        # lowercase all text, remove punctuation, split on whitespace
        ds_city = row['description'].lower().translate(str.maketrans('', '', string.punctuation)).split(' ')
        # keep only tokens that are not stopwords
        ds_city_clean = [s for s in ds_city if s not in ignore_words]
        data.append((id_city, ds_city_clean))
    # Retrieve all unique words to be used as features
    words_u, words_ct, words = get_features(data)
    # Bag-of-words matrix (M entries/rows and N features)
    X = build_feature_vector(data, words_u)
    return X, words_u
# Return the array of unique words (our features), their counts, and the full token array
def get_features(data):
    all_words = np.array([])
    for entry in data:
        all_words = np.concatenate((all_words, entry[1]))  # append this entry's tokens
    all_words_u, all_words_count = np.unique(all_words, return_counts=True)  # remove duplicates
    return all_words_u, all_words_count, all_words
def build_feature_vector(data, f_vector):
    M, N = len(data), len(f_vector)  # M is the number of descriptions, N the number of features
    X = np.zeros((M, N))  # start with a matrix of zeros
    for ii, entry in enumerate(data):  # enumerate gives a row index for each description
        bag_vector = np.zeros(len(f_vector))
        for word_entry in entry[1]:  # for each word in the description
            for i, word_f in enumerate(f_vector):
                if word_entry == word_f:
                    bag_vector[i] = bag_vector[i] + 1
        X[ii, :] = bag_vector
    return X
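# Quick sanity check of the bag-of-words construction on two hypothetical toy descriptions
# (made-up tokens, not from City.csv) to show the shape and contents of the matrix.
_toy = [(1, ['large', 'development', 'development']),
        (2, ['residential', 'development'])]
_toy_features, _, _ = get_features(_toy)                # ['development', 'large', 'residential']
_toy_X = build_feature_vector(_toy, _toy_features)
# _toy_X is a 2x3 count matrix:
# [[2., 1., 0.],   row 1: 'development' twice, 'large' once
#  [1., 0., 1.]]   row 2: 'development' once, 'residential' once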
# Feed the data into the pipeline and get the output matrix
X_tmp, features = pipeline_df(dt)
# Drop the first feature column (the empty-string token left over from splitting on spaces)
X = X_tmp[:, 1:]
features = features[1:]
X_dt = pd.DataFrame(X, columns=features)
X_dt.head()
# X_dt indexes the descriptions in order as rows; each column counts how many times that
# unique word appears in the respective description. Numeric tokens (e.g. square footages)
# also appear in the list of features.
### Plotting barplots of the frequency of key words we're interested in, to help
# evaluate what the best model might be for the task at hand and move on from there.
# Barplots are one way to visualize the text data after it has been quantified for this problem;
# observing frequencies, patterns, distributions, etc. may give us insight into what model and
# direction to head toward. Other statistical/mathematical plots could also be used to visualize the data.
X_dt['residential'].value_counts()[1:].plot(kind='bar', title='residential')
# If a project description contains "residential", odds are it will be dropped for relevancy,
# because P(residential project | description contains "residential") ~ 1 is a safe assumption here
# Rows containing both "large" and "development" (note: X_dt['large' and 'development'] would
# only select the 'development' column, since `and` returns its second operand)
X_dt.loc[(X_dt['large'] > 0) & (X_dt['development'] > 0), 'development'].value_counts().plot(kind='bar', title='Large and Development')
X_dt['large'].value_counts()[1:].plot(kind='bar', title='large')
# Extract numeric tokens from the features to check the distribution of (likely) square footages
intfeat = [int(i) for i in features if i.isdigit()]
mpl.hist(intfeat, range=(50000, 1000000), bins=30)  # most numbers > 50,000 fall below 200,000
# these numbers are most likely the square footage for the project
mpl.hist(intfeat, range=(50000, 200000), bins=30)
# We could come up with more creative methods of analyzing word frequencies with a more
# complicated model, trying to identify which words indicate large-scale development
# (zoning, permits, etc.), but for this exercise we can use the word frequencies to
# weight the conditions given for the contractor and use those weights to produce a relevancy score.
# The word "scale" never appears after the filtering, so its frequency cannot be searched;
# the best indicator of large-scale development I could find is "large" together with "development".
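# As a quick check on that claim, count how many cleaned descriptions contain "large",
# "development", or both (uses the X_dt built above).
n_large = int((X_dt['large'] > 0).sum())
n_dev = int((X_dt['development'] > 0).sum())
n_both = int(((X_dt['large'] > 0) & (X_dt['development'] > 0)).sum())
print(f'large: {n_large}, development: {n_dev}, both: {n_both}')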
### The metric I came up with for giving each project description a relevancy score
# is a value in [0, 10], where 10 indicates most relevant and 0 indicates not relevant.
# If a description contains the word "residential", it gets classified as not relevant
# and receives a relevancy score of 0. I think it is a reasonable assumption that if the
# description contains "residential", the construction project is a residential one,
# which the contractor does not do.
# If the description contains the words "large" and "development", it gets 3 points of
# relevancy score. It is difficult and tricky to decide how much to weight the conditions
# for this problem, so the numbers are a gut feeling of an appropriate weight; they are
# probably inaccurate for extensive modeling.
# If a description contains a number between 50,000 and 10,000,000, it gets relevancy points
# based on how high the number is. I assume this number is the square footage, and I believe
# that is a safe assumption. I could have also required a numeric value in that range to be
# accompanied by some variation of "sqft", but there are too many ways "sqft" is written and
# hard-coding the variations is not worth it. Jaccard similarity or shingling methods could
# help, but I believe that is overthinking the problem (which I admit I fell victim to).
# The contribution to the relevancy score is sqft / 15,000; the denominator of this fraction
# depends on what square footage the City of SF considers the threshold for large-scale
# development, information I don't know. In other words, if sqft > some threshold counts as
# large-scale development, the denominator should be chosen based on that threshold.
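# A few illustrative values for the sqft term described above (hypothetical square footages;
# the 15,000 denominator is the guess noted in the comment):
for sqft in (60000, 150000, 1000000):
    print(sqft, '->', round(sqft / 15000, 1))  # 60,000 -> 4.0; 150,000 -> 10.0; 1,000,000 -> 66.7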
X_dt_matrix = X_dt.values
rel_score = []
for row in X_dt_matrix:
    score = 0  # running score for this row, starting at 0
    words = []  # list of all words that appear in this row
    for ii, col in enumerate(row):
        if col > 0.0:
            # the word appears in this description (count > 0), so add it to the list
            words.append(features[ii])
    # Relevancy score creation
    if 'large' in words or 'development' in words:
        score = score + 1
    if 'large' in words and 'development' in words:
        score = score + 3
    for word in words:
        if word.isdigit():
            num = float(word)
            if num > 50000 and num < 10000000:
                score = score + num / 15000
    # residential projects are classified as not relevant, so the score is forced to 0;
    # this check comes last so the sqft points above cannot be added back on top
    if 'residential' in words:
        score = 0
    rel_score.append(score)
dt_temp = dt.dropna(subset=['OBJECTID', 'description'])
dt_temp = dt_temp.loc[:, ['OBJECTID', 'description']].copy()
MVPdt = dt_temp
MVPdt['Relevance_Score'] = rel_score  # combine the relevance score with the ID and description
# This is the minimum viable product.
# There are some outliers in the relevance score, but that is not a big problem; what matters more
# is that we tried to scale the relevance to be out of 10. Problems with the sqft metric cause some
# scores to run high because of how the sqft points were calculated.
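# One possible follow-up (a sketch, not part of the MVP above; the capped column name is
# illustrative): clip the raw scores to the intended [0, 10] range so that the sqft
# outliers described above do not dominate.
MVPdt['Relevance_Score_Capped'] = MVPdt['Relevance_Score'].clip(upper=10)
MVPdt.sort_values('Relevance_Score', ascending=False).head(10)  # most relevant projects first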