diff --git a/.env_template b/.env_template
index 49ea0bd..56b5add 100644
--- a/.env_template
+++ b/.env_template
@@ -2,3 +2,4 @@ YOUTUBE_API_KEY=
 MONGO_URI=
 MONGO_DB=
 STAGE=
+DAILY_SUN_URL=
\ No newline at end of file
diff --git a/README.md b/README.md
index 5df5eb3..839e973 100644
--- a/README.md
+++ b/README.md
@@ -22,4 +22,6 @@ To start the project, run the following command in the terminal
 
 ## Setting up the database
 
-Add /graphql to the url to access the interactive GraphQL platform
\ No newline at end of file
+Create a Mongo database named `score_db` and another named `daily_sun_db`. A partnership with the Daily Sun gives us access to their articles, which we copy into `daily_sun_db` and paginate for the frontend.
+
+Add /graphql to the URL to access the interactive GraphQL platform
diff --git a/app.py b/app.py
index 5debd7e..860dc89 100644
--- a/app.py
+++ b/app.py
@@ -7,6 +7,8 @@ from src.schema import Query, Mutation
 from src.scrapers.games_scraper import fetch_game_schedule
 from src.scrapers.youtube_stats import fetch_videos
+from src.scrapers.daily_sun_scrape import fetch_news
+from src.services.article_service import ArticleService
 from src.utils.team_loader import TeamLoader
 
 app = Flask(__name__)
 
@@ -42,6 +44,11 @@ def parse_args():
         action="store_true",
         help="Skips scraping tasks if set, useful for frontend development.",
     )
+    parser.add_argument(
+        "--no-daily-sun",
+        action="store_true",
+        help="Skips the Daily Sun scraping tasks if set.",
+    )
     return parser.parse_args()
 
 args = parse_args()
@@ -52,7 +59,7 @@ def scrape_schedules():
     logging.info("Scraping game schedules...")
     fetch_game_schedule()
 
-@scheduler.task("interval", id="scrape_schedules", seconds=43200)
+@scheduler.task("interval", id="scrape_schedules", seconds=43200)  # 12 hours
 def scrape_videos():
     logging.info("Scraping YouTube videos...")
     fetch_videos()
@@ -60,5 +67,20 @@
 scrape_schedules()
 scrape_videos()
 
+if not args.no_daily_sun:
+    @scheduler.task("interval", id="scrape_daily_sun", seconds=3600)
+    def scrape_daily_sun():
+        logging.info("Getting Daily Sun Sports News...")
+        fetch_news()
+
+    @scheduler.task("interval", id="cleanse_daily_sun_db", seconds=604800)  # 1 week
+    def cleanse_daily_sun_db():
+        logging.info("Removing old articles from the Daily Sun database...")
+        ArticleService.cleanse_old_articles()
+
+    scrape_daily_sun()
+    cleanse_daily_sun_db()
+
+
 if __name__ == "__main__":
     app.run(debug=True, host="0.0.0.0", port=8000)
\ No newline at end of file
diff --git a/src/database.py b/src/database.py
index 19801e0..b9c25e5 100644
--- a/src/database.py
+++ b/src/database.py
@@ -6,7 +6,7 @@
 if os.getenv("STAGE") == "local":
     file_name = "ca-certificate.crt"
-    use_tls = os.getenv("MONGO_URI") != "mongodb://localhost:27017/"
+    use_tls = "localhost" not in os.getenv("MONGO_URI")
 else:
     file_name = "/etc/ssl/ca-certificate.crt"
     use_tls = True
 
@@ -17,3 +17,4 @@
 client = MongoClient(os.getenv("MONGO_URI"))
 
 db = client[os.getenv("MONGO_DB", "score_db")]
+daily_sun_db = client[os.getenv("DAILY_SUN_DB", "daily_sun_db")]
diff --git a/src/models/__init__.py b/src/models/__init__.py
index ab83d25..efbf4e5 100644
--- a/src/models/__init__.py
+++ b/src/models/__init__.py
@@ -1,3 +1,4 @@
 from .game import Game
 from .team import Team
-from .youtube_video import YoutubeVideo
\ No newline at end of file
+from .youtube_video import YoutubeVideo
+from .article import Article
\ No newline at end of file
diff --git a/src/models/article.py b/src/models/article.py
new file mode 100644
index 0000000..bfcb8e1
--- /dev/null
+++ b/src/models/article.py
@@ -0,0 +1,56 @@
+from bson.objectid import ObjectId
+from datetime import datetime
+
+class Article:
+    """
+    A model representing a news article.
+
+    Attributes:
+    - title: The title of the article
+    - image: The filename of the article's main image
+    - sports_type: The specific sport category
+    - published_at: The publication date
+    - url: The URL to the full article
+    - slug: Unique identifier from the source
+    - created_at: When the article was added to our DB
+    """
+    def __init__(self, title, sports_type, published_at, url, slug, image=None, id=None, created_at=None):
+        self.id = id if id else str(ObjectId())
+        self.title = title
+        self.image = image
+        self.sports_type = sports_type
+        self.published_at = published_at
+        self.url = url
+        self.slug = slug
+        self.created_at = created_at if created_at else datetime.now()
+
+    def to_dict(self):
+        """
+        Converts the Article object to a dictionary format for MongoDB storage.
+        """
+        return {
+            "_id": self.id,
+            "title": self.title,
+            "image": self.image,
+            "sports_type": self.sports_type,
+            "published_at": self.published_at,
+            "url": self.url,
+            "slug": self.slug,
+            "created_at": self.created_at
+        }
+
+    @staticmethod
+    def from_dict(data):
+        """
+        Converts a MongoDB document to an Article object.
+        """
+        return Article(
+            id=data.get("_id"),
+            title=data.get("title"),
+            image=data.get("image"),
+            sports_type=data.get("sports_type"),
+            published_at=data.get("published_at"),
+            url=data.get("url"),
+            slug=data.get("slug"),
+            created_at=data.get("created_at")
+        )
\ No newline at end of file
diff --git a/src/mutations/__init__.py b/src/mutations/__init__.py
index 3fd3a8a..3df8e4d 100644
--- a/src/mutations/__init__.py
+++ b/src/mutations/__init__.py
@@ -1,3 +1,4 @@
 from .create_game import CreateGame
 from .create_team import CreateTeam
-from .create_youtube_video import CreateYoutubeVideo
\ No newline at end of file
+from .create_youtube_video import CreateYoutubeVideo
+from .create_article import CreateArticle
\ No newline at end of file
diff --git a/src/mutations/create_article.py b/src/mutations/create_article.py
new file mode 100644
index 0000000..1e0a03b
--- /dev/null
+++ b/src/mutations/create_article.py
@@ -0,0 +1,27 @@
+from graphene import Mutation, String, Field
+from src.types import ArticleType
+from src.services.article_service import ArticleService
+from datetime import datetime
+
+class CreateArticle(Mutation):
+    class Arguments:
+        title = String(required=True)
+        sports_type = String(required=True)
+        published_at = String(required=True)
+        url = String(required=True)
+        slug = String(required=True)
+        image = String(required=False)
+
+    article = Field(lambda: ArticleType)
+
+    def mutate(self, info, title, sports_type, published_at, url, slug, image=None):
+        article_data = {
+            "title": title,
+            "sports_type": sports_type,
+            "published_at": datetime.fromisoformat(published_at),
+            "url": url,
+            "slug": slug,
+            "image": image
+        }
+        new_article = ArticleService.create_article(article_data)
+        return CreateArticle(article=new_article)
\ No newline at end of file
diff --git a/src/queries/__init__.py b/src/queries/__init__.py
index f345409..fdf2f41 100644
--- a/src/queries/__init__.py
+++ b/src/queries/__init__.py
@@ -1,3 +1,4 @@
 from .game_query import GameQuery
 from .team_query import TeamQuery
 from .youtube_video_query import YoutubeVideoQuery
+from .article_query import ArticleQuery
\ No newline at end of file
diff --git a/src/queries/article_query.py b/src/queries/article_query.py
new file mode 100644
index 0000000..52e6cbc
--- /dev/null
+++ b/src/queries/article_query.py
@@ -0,0 +1,12 @@
+from graphene import ObjectType, List, String
+from src.services.article_service import ArticleService
+from src.types import ArticleType
+
+class ArticleQuery(ObjectType):
+    articles = List(ArticleType, sports_type=String())
+
+    def resolve_articles(self, info, sports_type=None):
+        """
+        Resolver for retrieving news articles, optionally filtered by sports_type.
+        """
+        return ArticleService.get_articles(sports_type)
\ No newline at end of file
diff --git a/src/repositories/__init__.py b/src/repositories/__init__.py
index 1c18bb7..f9c6252 100644
--- a/src/repositories/__init__.py
+++ b/src/repositories/__init__.py
@@ -1,3 +1,4 @@
 from .game_repository import GameRepository
 from .team_repository import TeamRepository
 from .youtube_video_repository import YoutubeVideoRepository
+from .article_repository import ArticleRepository
\ No newline at end of file
diff --git a/src/repositories/article_repository.py b/src/repositories/article_repository.py
new file mode 100644
index 0000000..1a30dc5
--- /dev/null
+++ b/src/repositories/article_repository.py
@@ -0,0 +1,69 @@
+from src.database import daily_sun_db
+from src.models.article import Article
+from pymongo import UpdateOne
+from datetime import datetime, timedelta
+
+class ArticleRepository:
+    @staticmethod
+    def upsert(article):
+        """
+        Upsert an article into the 'news_articles' collection in MongoDB.
+        """
+        article_collection = daily_sun_db["news_articles"]
+        article_collection.update_one(
+            {"slug": article.slug},
+            {"$set": article.to_dict()},
+            upsert=True
+        )
+
+    @staticmethod
+    def bulk_upsert(articles):
+        """
+        Bulk upsert articles into the 'news_articles' collection based on slug.
+        """
+        if not articles:
+            return
+
+        article_collection = daily_sun_db["news_articles"]
+        operations = [
+            UpdateOne(
+                {"slug": article.slug},
+                {"$set": article.to_dict()},
+                upsert=True
+            )
+            for article in articles
+        ]
+        if operations:
+            article_collection.bulk_write(operations)
+
+    @staticmethod
+    def find_recent(limit_days=3):
+        """
+        Retrieve articles from the last N days, sorted by published_at descending.
+        """
+        article_collection = daily_sun_db["news_articles"]
+        query = {"published_at": {"$gte": datetime.now() - timedelta(days=limit_days)}}
+        articles = article_collection.find(query).sort("published_at", -1)
+        return [Article.from_dict(article) for article in articles]
+
+    @staticmethod
+    def find_by_sports_type(sports_type, limit_days=3):
+        """
+        Retrieve articles by sports_type from the last N days, sorted by published_at descending.
+        """
+        article_collection = daily_sun_db["news_articles"]
+        query = {
+            "sports_type": sports_type,
+            "published_at": {"$gte": datetime.now() - timedelta(days=limit_days)}
+        }
+        articles = article_collection.find(query).sort("published_at", -1)
+        return [Article.from_dict(article) for article in articles]
+
+    @staticmethod
+    def delete_not_recent(limit_days=3):
+        """
+        Delete articles older than N days.
+ """ + article_collection = daily_sun_db["news_articles"] + query = {"published_at": {"$lt": datetime.now() - timedelta(days=limit_days)}} + article_collection.delete_many(query) \ No newline at end of file diff --git a/src/schema.py b/src/schema.py index 2cbbe69..0f3ae99 100644 --- a/src/schema.py +++ b/src/schema.py @@ -1,9 +1,9 @@ from graphene import ObjectType, Schema, Mutation -from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo -from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery +from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo, CreateArticle +from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery, ArticleQuery -class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ObjectType): +class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ArticleQuery, ObjectType): pass @@ -11,6 +11,7 @@ class Mutation(ObjectType): create_game = CreateGame.Field(description="Creates a new game.") create_team = CreateTeam.Field(description="Creates a new team.") create_youtube_video = CreateYoutubeVideo.Field(description="Creates a new youtube video.") + create_article = CreateArticle.Field(description="Creates a new article.") schema = Schema(query=Query, mutation=Mutation) diff --git a/src/scrapers/daily_sun_scrape.py b/src/scrapers/daily_sun_scrape.py new file mode 100644 index 0000000..cd11c5a --- /dev/null +++ b/src/scrapers/daily_sun_scrape.py @@ -0,0 +1,59 @@ +import os +import requests +from datetime import datetime, timedelta +from dotenv import load_dotenv +from ..services import ArticleService +import logging + +load_dotenv() + + +def fetch_news(): + try: + url = os.getenv("DAILY_SUN_URL") + response = requests.get( + url, + headers={ + "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + } + ) + response.raise_for_status() + data = response.json() + + # Current date and 3-day threshold + current_date = datetime.now() + three_days_ago = current_date - timedelta(days=3) + + # Process articles + articles_to_store = [] + for article in data.get("articles", []): + published_at = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S") + + if published_at >= three_days_ago: + sports_type = next( + (tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]), + "General" + ) + article_url = f"https://cornellsun.com/article/{article['slug']}" + + article_doc = { + "title": article["headline"], + "image": article["dominantMedia"]["title"] if article["dominantMedia"] else None, + "sports_type": sports_type, + "published_at": published_at, + "url": article_url, + "slug": article["slug"], + "created_at": datetime.now() + } + articles_to_store.append(article_doc) + + if articles_to_store: + ArticleService.create_articles_bulk(articles_to_store) + logging.info(f"Stored/Updated {len(articles_to_store)} recent articles") + else: + logging.info("No recent articles to store") + return True + + except Exception as e: + logging.error(f"Error fetching news: {str(e)}") + return False diff --git a/src/services/__init__.py b/src/services/__init__.py index 2ed3e7a..29b5c31 100644 --- a/src/services/__init__.py +++ b/src/services/__init__.py @@ -1,3 +1,4 @@ from .game_service import GameService from .team_service import TeamService -from .youtube_video_service import YoutubeVideoService \ No newline at end of file +from .youtube_video_service import YoutubeVideoService +from .article_service import ArticleService \ No newline at end of file diff 
new file mode 100644
index 0000000..77da243
--- /dev/null
+++ b/src/services/article_service.py
@@ -0,0 +1,74 @@
+from src.database import daily_sun_db
+from src.models.article import Article
+from src.repositories.article_repository import ArticleRepository
+from datetime import datetime, timedelta
+import logging
+
+class ArticleService:
+    @staticmethod
+    def get_articles(sports_type=None):
+        """
+        Retrieve all articles from the last 3 days, optionally filtered by sports_type, sorted by published_at descending.
+        """
+        try:
+            if sports_type:
+                return ArticleRepository.find_by_sports_type(sports_type)
+            return ArticleRepository.find_recent()
+        except Exception as e:
+            logging.error(f"Error retrieving articles: {str(e)}")
+            return []
+
+    @staticmethod
+    def create_article(article_data):
+        """
+        Create or update a single article, store it in MongoDB, and return it.
+        """
+        try:
+            article = Article(
+                title=article_data["title"],
+                sports_type=article_data["sports_type"],
+                published_at=article_data["published_at"],
+                url=article_data["url"],
+                slug=article_data["slug"],
+                image=article_data.get("image")
+            )
+            ArticleRepository.upsert(article)
+            return article
+        except Exception as e:
+            logging.error(f"Error creating article: {str(e)}")
+            return None
+
+    @staticmethod
+    def create_articles_bulk(articles_data):
+        """
+        Create or update multiple articles in bulk and store them in MongoDB.
+        """
+        try:
+            if not articles_data:
+                return
+            articles = [
+                Article(
+                    title=data["title"],
+                    sports_type=data["sports_type"],
+                    published_at=data["published_at"],
+                    url=data["url"],
+                    slug=data["slug"],
+                    image=data.get("image")
+                )
+                for data in articles_data
+            ]
+            ArticleRepository.bulk_upsert(articles)
+        except Exception as e:
+            logging.error(f"Error creating articles in bulk: {str(e)}")
+            raise
+
+    @staticmethod
+    def cleanse_old_articles():
+        """
+        Remove articles older than the retention window from the database.
+        """
+        try:
+            ArticleRepository.delete_not_recent(limit_days=5)  # provide a buffer from the 3-day threshold
+        except Exception as e:
+            logging.error(f"Error cleansing old articles: {str(e)}")
+            raise
\ No newline at end of file
diff --git a/src/types.py b/src/types.py
index 830e1e2..88ceb36 100644
--- a/src/types.py
+++ b/src/types.py
@@ -1,5 +1,5 @@
 from graphene import ObjectType, Field, String, List, Int
-from src.services import TeamService
+from datetime import datetime
 
 class TeamType(ObjectType):
     """
@@ -159,4 +159,29 @@ class YoutubeVideoType(ObjectType):
 
     def __init__(self, **kwargs):
         for key, value in kwargs.items():
-            setattr(self, key, value)
\ No newline at end of file
+            setattr(self, key, value)
+
+class ArticleType(ObjectType):
+    """
+    A GraphQL type representing a news article.
+
+    Attributes:
+    - title: The title of the article
+    - image: The filename of the article's main image
+    - sports_type: The specific sport category
+    - published_at: The publication date
+    - url: The URL to the full article
+    """
+    id = String()
+    title = String(required=True)
+    image = String()
+    sports_type = String(required=True)
+    published_at = String(required=True)
+    url = String(required=True)
+
+    def __init__(self, **kwargs):
+        for key, value in kwargs.items():
+            if key == "published_at" and isinstance(value, datetime):
+                setattr(self, key, value.isoformat())
+            else:
+                setattr(self, key, value)
\ No newline at end of file
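
Usage sketch (not part of the diff): the new articles query can be exercised directly through the graphene schema, assuming the package is importable and `MONGO_URI` points at a reachable MongoDB instance. Graphene's default auto-camelcasing renames `sports_type` to `sportsType` and `published_at` to `publishedAt`; the `"Hockey"` value is just a placeholder filter.

```python
# Minimal sketch: run the new ArticleQuery without starting the Flask app.
from src.schema import schema

result = schema.execute(
    """
    {
      articles(sportsType: "Hockey") {
        title
        sportsType
        publishedAt
        url
      }
    }
    """
)
print(result.errors)  # None on success
print(result.data)    # {"articles": [...]} limited to the last 3 days by ArticleService
```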
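A similar sketch for the new mutation, assuming the `/graphql` endpoint mentioned in the README accepts the standard `{"query": ...}` JSON POST body and the app is running locally on port 8000 as in `app.py`. All field values below are illustrative placeholders; `publishedAt` must be ISO formatted so `datetime.fromisoformat` can parse it.

```python
# Minimal sketch: call the createArticle mutation over HTTP.
import requests

mutation = """
mutation {
  createArticle(
    title: "Example headline"
    sportsType: "Hockey"
    publishedAt: "2025-01-15T18:30:00"
    url: "https://cornellsun.com/article/example-slug"
    slug: "example-slug"
  ) {
    article {
      title
      sportsType
      publishedAt
    }
  }
}
"""

response = requests.post("http://localhost:8000/graphql", json={"query": mutation})
print(response.json())
```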
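For reference, this is the feed shape `fetch_news()` expects, inferred from the parsing code in `daily_sun_scrape.py`; every value is an illustrative placeholder rather than real Daily Sun data.

```python
# Sketch of the JSON structure fetch_news() consumes.
sample_payload = {
    "articles": [
        {
            "headline": "Example headline",
            "slug": "example-headline",
            "published_at": "2025-01-15 18:30:00",  # parsed with "%Y-%m-%d %H:%M:%S"
            "tags": [{"name": "Sports"}, {"name": "Hockey"}],  # first non-generic tag becomes sports_type
            "dominantMedia": {"title": "example-image.jpg"},   # stored as the article's image value
        }
    ]
}
```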