-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathRecord_linkage_helper.py
92 lines (71 loc) · 2.93 KB
/
Record_linkage_helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
"""
MACS 30122: Final Project - Record Linkage
"""
import pandas as pd
import os
# Names of the 50 states plus DC and Puerto Rico (52 entries).
# NOTE(review): presumably these are the base names of the per-state JSON
# files (see read_in_one_governor_tweets, which strips ".json" from a file
# name to obtain the state name) -- confirm against the data directory.
# The spelling "Massachussetts" and the ordering are kept exactly as-is,
# since they may have to match those external file names.
all_json_state_names = [
    "Alaska", "Alabama", "Arkansas", "Arizona",
    "California", "Colorado", "Connecticut",
    "DC", "Delaware",
    "Florida",
    "Georgia",
    "Hawaii",
    "Iowa", "Idaho", "Illinois", "Indiana",
    "Kansas", "Kentucky",
    "Louisiana",
    "Massachussetts", "Maryland", "Maine", "Michigan",
    "Minnesota", "Missouri", "Mississippi", "Montana",
    "Nevada", "North Carolina", "North Dakota", "Nebraska",
    "New Hampshire", "New Jersey", "New Mexico", "New York",
    "Rhode Island",
    "Ohio", "Oklahoma", "Oregon",
    "Pennsylvania", "Puerto Rico",
    "South Carolina", "South Dakota",
    "Tennessee", "Texas",
    "Utah",
    "Virginia", "Vermont",
    "Washington", "Wisconsin", "West Virginia", "Wyoming",
]
def remove_u2069(t):
    '''
    Strip the pop directional isolate character (U+2069) from crawled text.

    The cleaning works on the repr() of the input rather than on the input
    itself, so besides dropping U+2069 it also:
      - removes every single and double quote character (including the
        ones that were part of the original text, e.g. apostrophes);
      - leaves any other character that repr() escapes (newlines, tabs,
        backslashes, other non-printable code points) as its literal
        backslash escape sequence.

    Input (str):
        a tweet, possibly containing in-line pop directional isolate
        characters

    Output (str):
        the cleaned text as described above
    '''
    cleaned = repr(t)
    # Order matters only in that all three removals are applied to the
    # escaped representation, one after another.
    for unwanted in ('\\u2069', '"', "'"):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned
def timestamp_to_str(t):
    '''
    Format a timestamp as a "yyyy/mm/dd" string, discarding the time part.

    Input (timestamp):
        an object exposing .date() (e.g. a datetime or pandas Timestamp)

    Output (str):
        the calendar date formatted as yyyy/mm/dd
    '''
    calendar_day = t.date()
    return calendar_day.strftime('%Y/%m/%d')
def read_in_one_governor_tweets(input_path, file_to_link):
    '''
    Read in and construct a dataframe for tweets from one state governor.

    Input:
        input_path (str): the directory containing the JSON-lines file
            with tweets from one state governor
        file_to_link (str): the name of the JSON file (with the suffix
            of ".json"); the name without the suffix is used as the
            state name

    Output (dataframe):
        a dataframe with the columns 'governor_tweet' (tweet text passed
        through remove_u2069), 'date' (string, yyyy/mm/dd) and
        'province_state' (the state name), with duplicate
        (tweet, date) rows dropped and a fresh 0..n-1 index
    '''
    # Strip only a trailing ".json" so that the same substring elsewhere
    # in the file name is left alone.
    suffix = ".json"
    if file_to_link.endswith(suffix):
        state_name = file_to_link[:-len(suffix)]
    else:
        state_name = file_to_link

    # Read in the governor's Twitter data (one JSON object per line).
    raw_df = pd.read_json(os.path.join(input_path, file_to_link),
                          lines=True, encoding='utf-8')

    # Work on an explicit copy of the two needed columns so that later
    # assignments never write into a view of the raw frame.
    tweets_df = raw_df[['tweet', 'date']].copy()

    # Plain column assignment replaces the column outright. Writing through
    # `.loc[:, 'date'] = ...` sets values into the existing datetime column
    # on pandas >= 2.0, which can parse the strings back into timestamps
    # and silently undo the conversion.
    tweets_df['date'] = tweets_df['date'].apply(timestamp_to_str)

    # De-duplicate on (raw tweet text, date string), keeping the first hit.
    tweets_df = tweets_df.drop_duplicates(keep='first').reset_index(drop=True)

    tweets_df['province_state'] = state_name
    tweets_df['tweet'] = tweets_df['tweet'].apply(remove_u2069)
    return tweets_df.rename(columns={'tweet': 'governor_tweet'})
def replace_separator(t):
    '''
    Swap every "-" in a date string for "/" so dates can be merged on.

    Input (str):
        date in format yyyy-mm-dd

    Output (str):
        date in format yyyy/mm/dd
    '''
    pieces = t.split("-")
    return "/".join(pieces)