Skip to content

implement articles from daily sun #19

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .env_template
Original file line number Diff line number Diff line change
Expand Up @@ -2,3 +2,4 @@ YOUTUBE_API_KEY=
MONGO_URI=
MONGO_DB=
STAGE=
DAILY_SUN_URL=
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,4 +22,6 @@ To start the project, run the following command in the terminal

## Setting up the database

Add /graphql to the url to access the interactive GraphQL platform
Create a Mongo database named `score_db` and another named `daily_sun_db`. A partnership with the Daily Sun has given us access to their articles, which we copy and paginate for the frontend.

Add /graphql to the url to access the interactive GraphQL platform
24 changes: 23 additions & 1 deletion app.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@
from src.schema import Query, Mutation
from src.scrapers.games_scraper import fetch_game_schedule
from src.scrapers.youtube_stats import fetch_videos
from src.scrapers.daily_sun_scrape import fetch_news
from src.services.article_service import ArticleService
from src.utils.team_loader import TeamLoader

app = Flask(__name__)
Expand Down Expand Up @@ -42,6 +44,11 @@ def parse_args():
action="store_true",
help="Skips scraping tasks if set, useful for frontend development.",
)
parser.add_argument(
"--no-daily-sun",
action="store_true",
help="Skips using the Daily Sun page for alerts",
)
return parser.parse_args()

args = parse_args()
Expand All @@ -52,13 +59,28 @@ def scrape_schedules():
logging.info("Scraping game schedules...")
fetch_game_schedule()

@scheduler.task("interval", id="scrape_schedules", seconds=43200)
@scheduler.task("interval", id="scrape_schedules", seconds=43200) # 12 hours
def scrape_videos():
logging.info("Scraping YouTube videos...")
fetch_videos()

scrape_schedules()
scrape_videos()

# Daily Sun scraping is optional so frontend devs can run without the feed.
if not args.no_daily_sun:
    # Hourly refresh of Daily Sun sports articles.
    @scheduler.task("interval", id="scrape_daily_sun", seconds=3600)
    def scrape_daily_sun():
        logging.info("Getting Daily Sun Sports News...")
        fetch_news()

    # Weekly purge of old articles so the collection doesn't grow unbounded.
    @scheduler.task("interval", id="cleanse_daily_sun_db", seconds=604800)  # 1 week
    def cleanse_daily_sun_db():
        logging.info("Cleaning the Daily Sun database from old articles...")
        ArticleService.cleanse_old_articles()

    # Run both once at startup so data exists before the first interval fires.
    scrape_daily_sun()
    cleanse_daily_sun_db()


if __name__ == "__main__":
app.run(debug=True, host="0.0.0.0", port=8000)
3 changes: 2 additions & 1 deletion src/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

if os.getenv("STAGE") == "local":
file_name = "ca-certificate.crt"
use_tls = os.getenv("MONGO_URI") != "mongodb://localhost:27017/"
use_tls = "localhost" not in os.getenv("MONGO_URI")
else:
file_name = "/etc/ssl/ca-certificate.crt"
use_tls = True
Expand All @@ -17,3 +17,4 @@
client = MongoClient(os.getenv("MONGO_URI"))

db = client[os.getenv("MONGO_DB", "score_db")]
daily_sun_db = client[os.getenv("DAILY_SUN_DB", "daily_sun_db")]
3 changes: 2 additions & 1 deletion src/models/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .game import Game
from .team import Team
from .youtube_video import YoutubeVideo
from .youtube_video import YoutubeVideo
from .article import Article
56 changes: 56 additions & 0 deletions src/models/article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
from bson.objectid import ObjectId
from datetime import datetime

class Article:
    """
    A model representing a news article.

    Attributes:
    - title: The title of the article
    - image: The filename of the article's main image (may be None)
    - sports_type: The specific sport category
    - published_at: The publication date
    - url: The URL to the full article
    - slug: Unique identifier from the source
    - created_at: When the article was added to our DB
    """

    def __init__(self, title, sports_type, published_at, url, slug, image=None, id=None, created_at=None):
        # Generate an ObjectId-backed string id when none is supplied so the
        # document can be stored under "_id" without Mongo assigning its own.
        self.id = id if id else str(ObjectId())
        self.title = title
        self.image = image
        self.sports_type = sports_type
        self.published_at = published_at
        self.url = url
        self.slug = slug
        # NOTE(review): naive local time; confirm whether UTC is expected
        # before comparing against feed timestamps from another timezone.
        self.created_at = created_at if created_at else datetime.now()

    def __repr__(self):
        # Slug is the unique source identifier — enough to find the article.
        return f"{type(self).__name__}(slug={self.slug!r}, title={self.title!r})"

    def to_dict(self):
        """
        Converts the Article object to a dictionary format for MongoDB storage.
        """
        return {
            "_id": self.id,
            "title": self.title,
            "image": self.image,
            "sports_type": self.sports_type,
            "published_at": self.published_at,
            "url": self.url,
            "slug": self.slug,
            "created_at": self.created_at,
        }

    @staticmethod
    def from_dict(data):
        """
        Converts a MongoDB document to an Article object.

        Uses .get() so a partially-populated document maps missing fields
        to None instead of raising KeyError.
        """
        return Article(
            id=data.get("_id"),
            title=data.get("title"),
            image=data.get("image"),
            sports_type=data.get("sports_type"),
            published_at=data.get("published_at"),
            url=data.get("url"),
            slug=data.get("slug"),
            created_at=data.get("created_at"),
        )
3 changes: 2 additions & 1 deletion src/mutations/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .create_game import CreateGame
from .create_team import CreateTeam
from .create_youtube_video import CreateYoutubeVideo
from .create_youtube_video import CreateYoutubeVideo
from .create_article import CreateArticle
27 changes: 27 additions & 0 deletions src/mutations/create_article.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from graphene import Mutation, String, Field
from src.types import ArticleType
from src.services.article_service import ArticleService

class CreateArticle(Mutation):
    """GraphQL mutation that stores a single news article."""

    class Arguments:
        title = String(required=True)
        sports_type = String(required=True)
        published_at = String(required=True)
        url = String(required=True)
        slug = String(required=True)
        image = String(required=False)

    article = Field(lambda: ArticleType)

    def mutate(self, info, title, sports_type, published_at, url, slug, image=None):
        from datetime import datetime

        # published_at arrives as an ISO-8601 string; store it as a datetime.
        parsed_published_at = datetime.fromisoformat(published_at)
        created = ArticleService.create_article(
            {
                "title": title,
                "sports_type": sports_type,
                "published_at": parsed_published_at,
                "url": url,
                "slug": slug,
                "image": image,
            }
        )
        return CreateArticle(article=created)
1 change: 1 addition & 0 deletions src/queries/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .game_query import GameQuery
from .team_query import TeamQuery
from .youtube_video_query import YoutubeVideoQuery
from .article_query import ArticleQuery
12 changes: 12 additions & 0 deletions src/queries/article_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from graphene import ObjectType, List, String
from src.services.article_service import ArticleService
from src.types import ArticleType

class ArticleQuery(ObjectType):
    """GraphQL queries for Daily Sun news articles."""

    articles = List(ArticleType, sports_type=String())

    def resolve_articles(self, info, sports_type=None):
        """Return news articles, optionally filtered by sports_type."""
        matching_articles = ArticleService.get_articles(sports_type)
        return matching_articles
1 change: 1 addition & 0 deletions src/repositories/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .game_repository import GameRepository
from .team_repository import TeamRepository
from .youtube_video_repository import YoutubeVideoRepository
from .article_repository import ArticleRepository
69 changes: 69 additions & 0 deletions src/repositories/article_repository.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
from src.database import daily_sun_db
from src.models.article import Article
from pymongo import UpdateOne
from datetime import datetime, timedelta

class ArticleRepository:
    """Data-access layer for Daily Sun articles in MongoDB."""

    # Name of the collection holding scraped Daily Sun articles.
    COLLECTION_NAME = "news_articles"

    @staticmethod
    def _collection():
        """Return the articles collection from the Daily Sun database."""
        return daily_sun_db[ArticleRepository.COLLECTION_NAME]

    @staticmethod
    def _cutoff(limit_days):
        """Return the datetime N days before now (the recency threshold)."""
        return datetime.now() - timedelta(days=limit_days)

    @staticmethod
    def upsert(article):
        """
        Upsert an article into the 'news_articles' collection in MongoDB.

        Articles are keyed by their unique slug.
        """
        ArticleRepository._collection().update_one(
            {"slug": article.slug},
            {"$set": article.to_dict()},
            upsert=True,
        )

    @staticmethod
    def bulk_upsert(articles):
        """
        Bulk upsert articles into the 'news_articles' collection based on slug.
        """
        # Nothing to do for an empty batch; bulk_write rejects empty op lists.
        if not articles:
            return

        operations = [
            UpdateOne(
                {"slug": article.slug},
                {"$set": article.to_dict()},
                upsert=True,
            )
            for article in articles
        ]
        ArticleRepository._collection().bulk_write(operations)

    @staticmethod
    def find_recent(limit_days=3):
        """
        Retrieve articles from the last N days, sorted by published_at descending.
        """
        query = {"published_at": {"$gte": ArticleRepository._cutoff(limit_days)}}
        cursor = ArticleRepository._collection().find(query).sort("published_at", -1)
        return [Article.from_dict(doc) for doc in cursor]

    @staticmethod
    def find_by_sports_type(sports_type, limit_days=3):
        """
        Retrieve articles by sports_type from the last N days, sorted by
        published_at descending.
        """
        query = {
            "sports_type": sports_type,
            "published_at": {"$gte": ArticleRepository._cutoff(limit_days)},
        }
        cursor = ArticleRepository._collection().find(query).sort("published_at", -1)
        return [Article.from_dict(doc) for doc in cursor]

    @staticmethod
    def delete_not_recent(limit_days=3):
        """
        Delete articles whose published_at is older than N days.
        """
        query = {"published_at": {"$lt": ArticleRepository._cutoff(limit_days)}}
        ArticleRepository._collection().delete_many(query)
7 changes: 4 additions & 3 deletions src/schema.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,17 @@
from graphene import ObjectType, Schema, Mutation
from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo
from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery
from src.mutations import CreateGame, CreateTeam, CreateYoutubeVideo, CreateArticle
from src.queries import GameQuery, TeamQuery, YoutubeVideoQuery, ArticleQuery


class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ObjectType):
class Query(TeamQuery, GameQuery, YoutubeVideoQuery, ArticleQuery, ObjectType):
pass


class Mutation(ObjectType):
create_game = CreateGame.Field(description="Creates a new game.")
create_team = CreateTeam.Field(description="Creates a new team.")
create_youtube_video = CreateYoutubeVideo.Field(description="Creates a new youtube video.")
create_article = CreateArticle.Field(description="Creates a new article.")


schema = Schema(query=Query, mutation=Mutation)
59 changes: 59 additions & 0 deletions src/scrapers/daily_sun_scrape.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
import os
import requests
from datetime import datetime, timedelta
from dotenv import load_dotenv
from ..services import ArticleService
import logging

load_dotenv()


def fetch_news():
    """
    Fetch sports articles from the Daily Sun JSON feed and store the ones
    published within the last 3 days via ArticleService.

    Returns:
        True on success (including when no recent articles were found),
        False when the URL is missing or the feed could not be fetched,
        parsed, or stored.
    """
    url = os.getenv("DAILY_SUN_URL")
    if not url:
        # Fail loudly and early instead of letting requests.get(None) raise.
        logging.error("DAILY_SUN_URL is not set; skipping Daily Sun scrape")
        return False

    try:
        response = requests.get(
            url,
            headers={
                # Browser-like UA — the feed rejects default python-requests UAs.
                "user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36"
            }
        )
        response.raise_for_status()
        data = response.json()

        # Only keep articles published within the last 3 days.
        cutoff = datetime.now() - timedelta(days=3)

        articles_to_store = []
        for article in data.get("articles", []):
            article_doc = _build_article_doc(article, cutoff)
            if article_doc is not None:
                articles_to_store.append(article_doc)

        if articles_to_store:
            ArticleService.create_articles_bulk(articles_to_store)
            logging.info("Stored/Updated %d recent articles", len(articles_to_store))
        else:
            logging.info("No recent articles to store")
        return True

    except Exception:
        # Best-effort scraper: log the full traceback, never crash the app.
        logging.exception("Error fetching news")
        return False


def _build_article_doc(article, cutoff):
    """
    Map one feed entry to a storable dict.

    Returns None for entries older than *cutoff* or malformed entries, so a
    single bad article no longer aborts the whole batch.
    """
    try:
        published_at = datetime.strptime(article["published_at"], "%Y-%m-%d %H:%M:%S")
        if published_at < cutoff:
            return None

        # First tag that isn't a generic section name identifies the sport.
        sports_type = next(
            (tag["name"] for tag in article["tags"] if tag["name"] not in ["Sports", "Top Stories"]),
            "General"
        )
        # dominantMedia may be absent or null; either way image is None.
        dominant_media = article.get("dominantMedia")
        return {
            "title": article["headline"],
            "image": dominant_media["title"] if dominant_media else None,
            "sports_type": sports_type,
            "published_at": published_at,
            "url": f"https://cornellsun.com/article/{article['slug']}",
            "slug": article["slug"],
            "created_at": datetime.now()
        }
    except (KeyError, TypeError, ValueError):
        logging.warning("Skipping malformed article entry: %r", article.get("slug"))
        return None
3 changes: 2 additions & 1 deletion src/services/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .game_service import GameService
from .team_service import TeamService
from .youtube_video_service import YoutubeVideoService
from .youtube_video_service import YoutubeVideoService
from .article_service import ArticleService
Loading