-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscript.py
87 lines (69 loc) · 2.63 KB
/
script.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
Scrapes a headline from The Daily Pennsylvanian website and saves it to a
JSON file that tracks headlines over time.
"""
import os
import sys
import daily_event_monitor
import bs4
import requests
import loguru
def scrape_data_point():
    """
    Scrape the main headline from The Daily Pennsylvanian home page.

    Returns:
        str: The headline text if found, otherwise an empty string
        (including when the HTTP request itself fails).
    """
    # A timeout keeps a stalled server from hanging the scraper forever.
    req = requests.get("https://www.thedp.com", timeout=30)
    loguru.logger.info(f"Request URL: {req.url}")
    loguru.logger.info(f"Request status code: {req.status_code}")

    if not req.ok:
        # The original implicitly returned None here, contradicting the
        # documented contract; return the documented empty string instead.
        loguru.logger.warning(f"Request failed with status {req.status_code}")
        return ""

    soup = bs4.BeautifulSoup(req.text, "html.parser")
    # The front-page headline is the first anchor carrying this class.
    target_element = soup.find("a", class_="frontpage-link")
    data_point = "" if target_element is None else target_element.text
    loguru.logger.info(f"Data point: {data_point}")
    return data_point
if __name__ == "__main__":
    # Set up logger to track runtime; rotate the log file daily.
    loguru.logger.add("scrape.log", rotation="1 day")

    # Create data dir if needed.
    loguru.logger.info("Creating data directory if it does not exist")
    try:
        os.makedirs("data", exist_ok=True)
    except Exception as e:
        loguru.logger.error(f"Failed to create data directory: {e}")
        sys.exit(1)

    # Load daily event monitor backed by the headlines JSON file.
    loguru.logger.info("Loading daily event monitor")
    dem = daily_event_monitor.DailyEventMonitor(
        "data/daily_pennsylvanian_headlines.json"
    )

    # Run scrape; a failure is logged rather than fatal so the run still
    # finishes its reporting steps below.
    loguru.logger.info("Starting scrape")
    try:
        data_point = scrape_data_point()
    except Exception as e:
        loguru.logger.error(f"Failed to scrape data point: {e}")
        data_point = None

    # Save data only when the scrape produced a value.
    if data_point is not None:
        dem.add_today(data_point)
        dem.save()
        loguru.logger.info("Saved daily event monitor")

    def print_tree(directory, ignore_dirs=None):
        """Log an indented tree of files/dirs rooted at *directory*,
        skipping any directory named in *ignore_dirs*."""
        # Avoid the mutable-default-argument pitfall: bind the default
        # list inside the call instead of in the signature.
        if ignore_dirs is None:
            ignore_dirs = [".git", "__pycache__"]
        loguru.logger.info(f"Printing tree of files/dirs at {directory}")
        for root, dirs, files in os.walk(directory):
            # Prune ignored directories in place so os.walk skips them.
            dirs[:] = [d for d in dirs if d not in ignore_dirs]
            level = root.replace(directory, "").count(os.sep)
            indent = " " * 4 * level
            loguru.logger.info(f"{indent}+--{os.path.basename(root)}/")
            sub_indent = " " * 4 * (level + 1)
            for file in files:
                loguru.logger.info(f"{sub_indent}+--{file}")

    print_tree(os.getcwd())

    loguru.logger.info("Printing contents of data file {}".format(dem.file_path))
    # Guard the read: when the scrape failed, dem.save() never ran and the
    # file may not exist yet — the original crashed with FileNotFoundError.
    if os.path.exists(dem.file_path):
        with open(dem.file_path, "r") as f:
            loguru.logger.info(f.read())
    else:
        loguru.logger.warning(f"Data file not found: {dem.file_path}")

    # Finish
    loguru.logger.info("Scrape complete")
    loguru.logger.info("Exiting")