-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathKL_divergence_brides.py
93 lines (83 loc) · 3.43 KB
/
KL_divergence_brides.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import nltk
import pandas as pd#gives us DataFrames
import matplotlib.pyplot as plt #For graphics
import wordcloud #Makes word clouds
import numpy as np #For divergences/distances
import scipy #For divergences/distances
import seaborn as sns #makes our plots look nicer
import sklearn.manifold
import lucem_illud
import analysis
def kl_divergence(X, Y):
    """Return the Kullback-Leibler divergence of X's distribution from Y's.

    X and Y are single-column DataFrames of counts indexed by label.
    The inputs are copied, so the caller's column names are untouched.

    Rows are aligned with a left join on Y's index: labels that occur
    only in X are dropped, and labels that occur only in Y count as
    zero in X. ``scipy.stats.entropy`` normalises both vectors before
    computing the divergence.
    """
    observed = X.copy()
    reference = Y.copy()
    observed.columns = ['P']
    reference.columns = ['Q']

    # Left join keeps exactly the reference's labels; gaps in P become 0.
    aligned = reference.join(observed).fillna(0)
    return scipy.stats.entropy(aligned['P'], aligned['Q'])
def chi2_divergence(X, Y):
    """Return the chi-square statistic of X's counts against Y's counts.

    X supplies the observed frequencies and Y the expected ones. Both
    are single-column DataFrames indexed by label; they are copied
    before their columns are renamed.

    Rows are aligned with a left join on Y's index, so labels found
    only in X are dropped and labels missing from X are observed as 0.

    NOTE(review): recent SciPy releases appear to raise ValueError when
    the observed and expected totals differ -- confirm against the
    installed version.
    """
    obs = X.copy()
    exp = Y.copy()
    obs.columns = ['P']
    exp.columns = ['Q']

    table = exp.join(obs).fillna(0)
    outcome = scipy.stats.chisquare(table['P'], table['Q'])
    return outcome.statistic
def Divergence(corpus1, corpus2, difference="KL"):
    """Measure how far apart the token distributions of two corpora are.

    Parameters
    ----------
    corpus1, corpus2 : iterable of hashable tokens
        Each corpus is reduced to a table of token counts.
    difference : str
        Which measure to compute:
        ``"KL"``          Kullback-Leibler divergence (``kl_divergence``),
        ``"Chi2"``        chi-square statistic (``chi2_divergence``),
        ``"KS"``          two-sample Kolmogorov-Smirnov statistic,
        ``"Wasserstein"`` Wasserstein distance (``"Wass"`` is accepted
                          as an alias).

    Returns
    -------
    The value of the chosen measure. "KS" and "Wasserstein" compare the
    two collections of count values and ignore which token each count
    belongs to.

    Raises
    ------
    ValueError
        If ``difference`` is not one of the names above.
    """
    # Local import: Counter is the stdlib base class of nltk.FreqDist and
    # produces the same counts in the same (first-seen) key order.
    from collections import Counter

    known = ("KL", "Chi2", "KS", "Wasserstein", "Wass")
    if difference not in known:
        # Fail loudly instead of silently returning None.
        raise ValueError(
            "difference must be one of {}, got {!r}".format(known, difference)
        )

    freqP = Counter(corpus1)
    P = pd.DataFrame(list(freqP.values()), columns=['frequency'],
                     index=list(freqP.keys()))
    freqQ = Counter(corpus2)
    Q = pd.DataFrame(list(freqQ.values()), columns=['frequency'],
                     index=list(freqQ.keys()))

    if difference == "KL":
        return kl_divergence(P, Q)
    if difference == "Chi2":
        return chi2_divergence(P, Q)
    if difference == "KS":
        result = scipy.stats.ks_2samp(P['frequency'], Q['frequency'])
        # Fall back to the raw result if it has no ``statistic`` attribute,
        # as the original try/except did, without catching unrelated errors.
        return getattr(result, 'statistic', result)
    # "Wasserstein" / "Wass": wasserstein_distance already returns a plain
    # number, so it is computed once and returned directly.
    return scipy.stats.wasserstein_distance(P['frequency'], Q['frequency'])
def heat_map():
    """Show a heat map of pairwise KL divergence between yearly ad corpora.

    For each year in ``fileids`` the matching CSV under
    ``data/brides_data/`` is passed to ``analysis.tokenize``; the
    ``tokenized_ads`` column of the result (expected to hold one token
    sequence per ad) is flattened into a single token list for that year.
    ``Divergence(..., difference='KL')`` is then evaluated for every
    ordered pair of years and the resulting square matrix is drawn with
    seaborn. Row labels are the first argument's year, column labels the
    second's.

    Takes no arguments and returns nothing; the figure is displayed via
    ``plt.show()``.
    """
    # One list drives both the input file names and the axis labels, so the
    # two can no longer drift apart. 2004 is not in the list.
    fileids = [2001, 2002, 2003, 2005, 2006, 2007, 2008, 2009, 2010, 2011,
               2012, 2013, 2014]

    corpora = []
    for year in fileids:
        ads = analysis.tokenize(
            'data/brides_data/brides-wanted_{}.csv'.format(year)
        )
        # Flatten the per-ad token sequences into one token list per year.
        corpora.append(
            [token for ad in ads['tokenized_ads'].tolist() for token in ad]
        )

    # Full square matrix: every ordered pair is evaluated.
    M = np.array([
        [Divergence(p, q, difference='KL') for q in corpora]
        for p in corpora
    ])

    plt.figure()
    div = pd.DataFrame(M, columns=fileids, index=fileids)
    sns.heatmap(div)
    plt.title('KL Divergence between bride seeking ads across years')
    plt.show()
# Draw the heat map only when the file is executed as a script, not on import.
if __name__ == "__main__":
    heat_map()