Commit

Update supervisord to uwsgi, index full month/date in elasticsearch (#100)

* Update supervisord to uwsgi, index full month/date in elasticsearch

* Update elasticsearch indexing

* include date_dt field into elasticsearch

* Update metadata and script for fetching events

* Update file format in all backend script

* Update data, fix typos in fetch event script

* Update data, supervisord script for serving GROBID

* Quick fix of pandas warning

* Missing unidecode in requirements.txt

* Adding events from Wharton Stats and GRASP, update data

* fix data queries on landing page

* fix selected-events pages

* fix recommendations page

* fix no-op state update

* fix search page

* fix infinite loading on select-events and recommendations pages

* remove id from EventItem props
titipata authored Oct 18, 2019
1 parent 616f497 commit f0080c6
Showing 19 changed files with 2,056 additions and 2,072 deletions.
3 changes: 2 additions & 1 deletion backend/create_events_features.py
@@ -133,7 +133,8 @@ def generate_owner_candidate(event):
def create_events_features():
# produce topic vectors using tf-idf and Latent Semantic Analysis (LSA) and search candidate list
print('Compute LSA vectors...')
events_df = pd.DataFrame(json.loads(open(PATH_DATA, 'r').read()))
events_json = json.loads(open(PATH_DATA, 'r').read())
events_df = pd.DataFrame(events_json['data'])
events_text = [' '.join([e[1] for e in r.items()])
for _, r in events_df[['title', 'description', 'location', 'owner']].iterrows()]
events_preprocessed_text = [preprocess(text) for text in events_text]
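
For context, the comment above references tf-idf and Latent Semantic Analysis; below is a minimal sketch of that step, assuming scikit-learn (the helper name and vector size are illustrative — the actual pipeline sits in the part of this file not shown in this hunk):

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD

def lsa_vectors(texts, n_components=30):
    # sparse tf-idf document-term matrix
    tfidf = TfidfVectorizer(stop_words='english')
    X = tfidf.fit_transform(texts)
    # project onto the top LSA components to get dense topic vectors
    svd = TruncatedSVD(n_components=n_components)
    return svd.fit_transform(X)
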
1,800 changes: 854 additions & 946 deletions backend/data/events.json

Large diffs are not rendered by default.

1,790 changes: 846 additions & 944 deletions backend/data/events_vector.json

Large diffs are not rendered by default.

257 changes: 203 additions & 54 deletions backend/fetch_events.py
@@ -96,25 +96,57 @@ def find_startend_time(s):
return starttime, endtime


def read_json(file_path):
class NoIndent:
def __init__(self, o):
self.o = o


class EventEncoder(json.JSONEncoder):
"""
Read collected file from path
Class to save JSON where each dictionary is stored on a single line
ref: https://stackoverflow.com/questions/58327845/save-dictionary-of-list-and-key-to-json-where-one-dictionary-is-stored-per-lin
"""
if not os.path.exists(file_path):
events = []
return events
else:
with open(file_path, 'r') as fp:
events = [json.loads(line) for line in fp]
return events

def __init__(self, *args, **kwargs):
super(EventEncoder, self).__init__(*args, **kwargs)
self._literal = []

def default(self, o):
if isinstance(o, NoIndent):
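# remember the compact one-line JSON and return a unique placeholder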
i = len(self._literal)
self._literal.append(json.dumps(o.o))
return '__%d__' % i
else:
return super(EventEncoder, self).default(o)

def save_json(ls, file_path):
def encode(self, o):
s = super(EventEncoder, self).encode(o)
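# swap each quoted placeholder back for its compact one-line literal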
for i, literal in enumerate(self._literal):
s = s.replace('"__%d__"' % i, literal)
return s


def save_json(events_json, file_path):
"""
Save list of dictionary to JSON
Save a dictionary whose keys hold metadata and a ``data`` list of event dictionaries, in the following format
events_json = {
'refresh_count': 1,
'fetch_date': '10-10-2019',
'modified_date': '',
'data': [
{'date': '10-10-2019', 'title': 'title1', ...},
{'date': '11-10-2019', 'title': 'title2', ...},
]
}
where each dictionary in ``data`` is saved on its own line
"""
events_json['data'] = [NoIndent(d) for d in events_json['data']]
s = json.dumps(events_json, indent=2, cls=EventEncoder)
with open(file_path, 'w') as fp:
fp.write('[\n ' + ',\n '.join(json.dumps(i) for i in ls) + '\n]')
fp.write(s)
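
For illustration, a minimal usage sketch of save_json (the sample dict and file name are hypothetical, following the docstring format above):

sample = {
    'refresh_count': 1,
    'fetch_date': '10-10-2019',
    'modified_date': '',
    'data': [
        {'date': '10-10-2019', 'title': 'title1'},
        {'date': '11-10-2019', 'title': 'title2'},
    ]
}
save_json(sample, 'events_sample.json')
# each dict in 'data' is wrapped in NoIndent, so EventEncoder emits it on
# one line while the top-level keys stay pretty-printed with indent=2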


def convert_event_to_dict(event):
@@ -145,25 +177,6 @@ def convert_event_to_dict(event):
return event_dict


def fetch_events():
"""
Saving Penn events to JSON format
"""
base_url = 'http://www.upenn.edu/calendar-export/?showndays=50'
page = requests.get(base_url)
tree = html.fromstring(page.content)
events = tree.findall('event')

events_list = []
for event in events:
try:
event_dict = convert_event_to_dict(event)
events_list.append(event_dict)
except:
pass
return events_list


def stringify_children(node):
"""
Filters and removes possible Nones in texts and tails
@@ -2303,27 +2316,37 @@ def fetch_events_gse(base_url='https://www.gse.upenn.edu/event'):
year_next, month_next = date_next.year, date_next.month
for (y, m) in [(year, month), (year_next, month_next)]:
event_extension = '?date={}-{}'.format(y, m)
page_soup = BeautifulSoup(requests.get(base_url + event_extension).content, 'html.parser')
event_page = page_soup.find('div', attrs={'class': 'region region-content'})
event_content = event_page.find_all('div', attrs={'class': 'view-content'})[1]
all_events = event_content.find_all('div', attrs={'class': 'views-row'})
page_soup = BeautifulSoup(requests.get(
base_url + event_extension
).content, 'html.parser')
event_page = page_soup.find(
'div', attrs={'class': 'region region-content'})
event_content = event_page.find_all(
'div', attrs={'class': 'view-content'})[1]
all_events = event_content.find_all(
'div', attrs={'class': 'views-row'})

for event_post in all_events:
title = event_post.find('span', attrs={'class': '_summary'})
title = title.text.strip() if title is not None else ''
description = event_post.find('span', attrs={'class': '_description'})
description = event_post.find(
'span', attrs={'class': '_description'})
description = description.text.strip() if description is not None else ''
date = event_post.find('span', attrs={'class': '_start'})
date = date.text.split(' ')[0] if date is not None else ''
starttime = event_post.find('span', attrs={'class': 'date-display-start'})
starttime = event_post.find(
'span', attrs={'class': 'date-display-start'})
starttime = starttime.text.strip() if starttime is not None else ''
endtime = event_post.find('span', attrs={'class': 'date-display-end'})
endtime = event_post.find(
'span', attrs={'class': 'date-display-end'})
endtime = endtime.text.strip() if endtime is not None else ''
speaker = event_post.find('span', attrs={'class': '_organizer'})
speaker = speaker.text.strip() if speaker is not None else ''
location = event_post.find('div', attrs={'class': 'views-field-field-location-1'})
location = event_post.find(
'div', attrs={'class': 'views-field-field-location-1'})
location = location.text.strip() if location is not None else ''
event_url_match = re.findall('https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', description)
event_url_match = re.findall(
'https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+', description)
if len(event_url_match) >= 1:
event_url = event_url_match[0]
else:
@@ -2342,13 +2365,121 @@ def fetch_events_gse(base_url='https://www.gse.upenn.edu/event'):
return events


def fetch_events_grasp(base_url='https://www.grasp.upenn.edu'):
"""
Fetch events from General Robotics, Automation,
Sensing & Perception laboratory (GRASP) seminar
"""
events = []
event_page = requests.get(urljoin(base_url, 'events'))
event_soup = BeautifulSoup(event_page.content, 'html.parser')

event_panel = event_soup.find('div', attrs={'class': 'view-content'})
all_events = event_panel.find_all('div', attrs={'class': 'views-row'})
for event in all_events:
title = event.find('div', attrs={'class': 'field-title'})
title = title.text.strip() if title is not None else ''
date = event.find(
'div', attrs={'class': 'calendar-tile'}).attrs['content'].split('T')[0]
event_url = urljoin(base_url, event.find('div').attrs.get('about'))
if ':' in title:
speaker = title.split(':')[-1]
else:
speaker = ''

starttime, endtime = '', ''  # defaults so both fields exist when no time is listed
start_end_time = event.find(
'span', attrs={'class': 'date-display-single'})
if start_end_time is not None:
starttime = start_end_time.find(
'span', attrs={'class': 'date-display-start'})
starttime = starttime.text.strip() if starttime is not None else ''
endtime = start_end_time.find(
'span', attrs={'class': 'date-display-end'})
endtime = endtime.text.strip() if endtime is not None else ''

event_detail_page = requests.get(event_url)
event_detail_soup = BeautifulSoup(
event_detail_page.content, 'html.parser')
description = event_detail_soup.find(
'div', attrs={'class': 'field-body'})
description = description.text.strip() if description is not None else ''
location = event_detail_soup.find(
'div', attrs={'class': 'street-block'})
location = location.text.strip() if location is not None else ''

events.append({
'title': title,
'date': date,
'location': location,
'description': description,
'starttime': starttime,
'endtime': endtime,
'speaker': speaker,
'url': event_url,
'owner': 'General Robotics, Automation, Sensing & Perception laboratory (GRASP)'
})
return events


def fetch_events_wharton_stats(base_url='https://statistics.wharton.upenn.edu/research/seminars-conferences/'):
"""
Fetch events from Wharton Statistics Seminars
"""
events = []
event_page = requests.get(base_url)
event_soup = BeautifulSoup(event_page.content, 'html.parser')
table = event_soup.find('table')
if table is not None:
all_events = table.find_all('tr')[1::]
for event_tr in all_events:
event_loc, speaker, title_desc = event_tr.find_all('td')
speaker = speaker.text.strip() if speaker is not None else ''
title = title_desc.text.strip() if title_desc is not None else ''
event_url = title_desc.find('a')
event_url = event_url.attrs['href'] if event_url is not None else base_url

if event_url is not base_url:
event_soup = BeautifulSoup(requests.get(
event_url).content, 'html.parser')
try:
description = event_soup.find(
'div', attrs={'class': 'wpb_wrapper'}).find_all('p')
description = ' '.join([p.text.strip()
for p in description])
except:
description = ''
else:
description = ''

if len(event_loc.text.split('\n')) == 3:
date, start_end_time, location = event_loc.text.split('\n')
start_end_time = start_end_time.replace(
'Time: ', '').replace('*', '')
starttime, endtime = start_end_time.split('-')
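# the raw start time has no AM/PM marker of its own, so borrow it from
# the end time (e.g. '3:30 - 4:30 PM' -> starttime '3:30 PM')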
starttime = starttime + endtime.split(' ')[-1]
location = location.replace('Location: ', '').replace('*', '')
events.append({
'title': title,
'date': date,
'location': location,
'description': description,
'starttime': starttime,
'endtime': endtime,
'speaker': speaker,
'url': event_url,
'owner': 'Wharton Statistics Seminars'
})
return events


def drop_duplicate_events(df):
"""
Merge a group of duplicate rows: take all fields from the latest row
but keep the ``event_index`` from the first one
"""
df = df.sort_values('event_index', na_position='last')
event_index = df.event_index.iloc[0]
r = df.iloc[-1]
r = df.iloc[-1].to_dict()
r['event_index'] = event_index
return pd.Series(r)
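
A tiny worked example of the merge rule (column values are illustrative, and the call assumes this module's drop_duplicate_events is in scope):

import numpy as np
import pandas as pd

# Two fetches of the same event: keep event_index 3 from the first fetch,
# but take the fresher fields (here, location) from the latest row.
group = pd.DataFrame([
    {'event_index': 3, 'title': 'Seminar', 'location': 'TBD'},
    {'event_index': np.nan, 'title': 'Seminar', 'location': 'Room 101'},
])
merged = drop_duplicate_events(group)
# merged['event_index'] -> 3.0 (cast to int later), merged['location'] -> 'Room 101'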

@@ -2370,7 +2501,8 @@ def fetch_all_events():
fetch_events_annenberg, fetch_events_religious_studies, fetch_events_AHEAD,
fetch_events_SPP, fetch_events_ortner_center, fetch_events_penn_today,
fetch_events_mins, fetch_events_mindcore, fetch_events_seas,
fetch_events_vet, fetch_events_gse
fetch_events_vet, fetch_events_gse, fetch_events_grasp,
fetch_events_wharton_stats
]
for f in tqdm(fetch_fns):
try:
@@ -2383,28 +2515,45 @@ def fetch_all_events():
events_df.loc[:, 'starttime'] = events_df.apply(clean_starttime, axis=1)
if len(events_df.loc[events_df.endtime == '']) > 0:
events_df.loc[events_df.endtime == '', 'endtime'] = events_df.loc[events_df.endtime == ''].apply(
clean_endtime, axis=1)
clean_endtime, axis=1
)

# save data
# save data to JSON if there is no existing data in the ``data`` folder
group_columns = ['owner', 'title', 'url', 'date', 'starttime']
if not os.path.exists(PATH_DATA):
events_df = events_df.drop_duplicates(
subset=group_columns, keep='first')
events_df['event_index'] = np.arange(len(events_df))
save_json(events_df.to_dict(orient='records'), PATH_DATA)
events_df['event_index'] = list(range(len(events_df)))

events_json = {}
events_json['name'] = "Penn Events"
events_json['refresh_count'] = 1
events_json['fetch_date'] = datetime.now().strftime('%d-%m-%Y')
events_json['modified_date'] = ''
events_json['data'] = events_df.to_dict(orient='records')
save_json(events_json, PATH_DATA)
# if data already exists, append newly fetched data to the existing data
else:
events_former_df = pd.DataFrame(
json.loads(open(PATH_DATA, 'r').read()))
events_json = json.loads(open(PATH_DATA, 'r').read())
events_former_df = pd.DataFrame(events_json['data'])
events_df = pd.concat(
(events_former_df, events_df), axis=0, sort=False)
events_df = events_df.groupby(group_columns, as_index=False, level=0).apply(drop_duplicate_events)
(events_former_df, events_df), axis=0, sort=False
)
events_df = events_df.groupby(
group_columns, as_index=False, level=0
).apply(drop_duplicate_events)
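# previously seen rows keep their original event_index; newly fetched rows
# have NaN here and receive fresh consecutive indices below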
events_df.sort_values('event_index', na_position='last', inplace=True)
event_idx_begin = events_former_df['event_index'].max() + 1
event_idx_end = event_idx_begin + events_df.event_index.isnull().sum()
events_df.loc[pd.isnull(events_df.event_index), 'event_index'] = np.arange(
event_idx_begin, event_idx_end)
events_df.loc[:, 'event_index'] = events_df.loc[:, 'event_index'].astype(int)
save_json(events_df.fillna('').to_dict(orient='records'), PATH_DATA)
events_df.loc[pd.isnull(events_df.event_index), 'event_index'] = list(
range(event_idx_begin, event_idx_end)
)
events_df.loc[:, 'event_index'] = events_df.loc[:, 'event_index'].astype(int)

events_json['refresh_count'] = events_json['refresh_count'] + 1
events_json['modified_date'] = datetime.now().strftime('%d-%m-%Y')
events_json['data'] = events_df.to_dict(orient='records')
save_json(events_json, PATH_DATA)


if __name__ == '__main__':
17 changes: 11 additions & 6 deletions backend/hug_api.py
@@ -1,3 +1,4 @@
import os
import hug
import json
import numpy as np
@@ -13,11 +14,13 @@
api.http.add_middleware(hug.middleware.CORSMiddleware(api))


path_data, path_vector = 'data/events.json', 'data/events_vector.json'
path_data, path_vector = os.path.join('data', 'events.json'), os.path.join('data', 'events_vector.json')
event_vectors = json.load(open(path_vector, 'r'))
events = json.load(open(path_data, 'r'))
event_vectors_map = {e['event_index']: e['event_vector']
for e in event_vectors}
events = json.load(open(path_data, 'r'))['data']
event_vectors_map = {
e['event_index']: e['event_vector']
for e in event_vectors
}


# elasticsearch
@@ -91,14 +94,16 @@ def query(search_query: hug.types.text):
query=search_query,
fields=fields
)
search_responses = responses[0:n_results].execute().to_dict()['hits']['hits']
search_responses = responses[0:n_results].execute()
search_responses = search_responses.to_dict()['hits']['hits']

# return future events for a given query
future_events_indices, relevances = [], []
for response in filter(lambda r: get_future_event(r['_source']['date_dt']), search_responses):
future_events_indices.append(response['_id'])
relevances.append(response['_score'])
relevances = [int(100 * (r / max(relevances))) for r in relevances] # normalize to range 0 - 100
relevances = [int(100 * (r / max(relevances)))
for r in relevances] # normalize by the maximum relevance
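# e.g. raw scores [8.0, 4.0, 1.0] -> [100, 50, 12]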
search_relevances = [{'event_index': int(i), 'relevance': r}
for i, r in zip(future_events_indices, relevances)]

