
Commit cfdd180

Author: Sean McIlroy (committed)

basic news scraping and summary working for ap and reuters

1 parent f9e4e32 · commit cfdd180

File tree

3 files changed: 60 additions & 0 deletions


scrape_news/scrape_ap.py

Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
from bs4 import BeautifulSoup
import requests

from utils import get_summary

print('==========================================================')

site = 'https://www.apnews.com'

# Fetch the AP News front page and parse it with the lxml parser.
source = requests.get(site).text

soup = BeautifulSoup(source, 'lxml')

# Top story: the first anchor tag with the "headline" class.
first_story = soup.find('a', class_="headline")

print(first_story.text)
print(get_summary(site + first_story['href']))
print('==========================================================')

# Second story: pulled from the "RelatedStory" container.
second_story_container = soup.find('div', class_="RelatedStory")
second_story_link = second_story_container.a
second_story_title = second_story_container.find('div', class_="headline")

print(second_story_title.text)
print(get_summary(site + second_story_link['href']))
print('==========================================================')
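The class-based selectors here ("headline", "RelatedStory") break silently if AP changes its markup: soup.find() then returns None, and the .text access that follows raises AttributeError. A minimal guard, sketched below as a hypothetical safe_find helper that is not part of this commit, would fail with a clearer message:

# Sketch only: wrap soup.find() so a selector that matches nothing fails loudly.
def safe_find(root, *args, **kwargs):
    tag = root.find(*args, **kwargs)
    if tag is None:
        raise RuntimeError(f'selector matched nothing: {args} {kwargs}')
    return tag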

scrape_news/scrape_reuters.py

Lines changed: 26 additions & 0 deletions
@@ -0,0 +1,26 @@
from bs4 import BeautifulSoup
import requests

from utils import get_summary

print('==========================================================')

site = 'https://www.reuters.com'

# Fetch the Reuters front page and parse it with the lxml parser.
source = requests.get(site).text

soup = BeautifulSoup(source, 'lxml')

# Top story: the first <h2 class="story-title"> heading and its link.
first_story_container = soup.find('h2', class_="story-title")
first_story = first_story_container.a

print(first_story.text)
print(get_summary(site + first_story['href']))
print('==========================================================')

# Second story: first entry in the "news-headline-list" container.
second_story_container = soup.find('div', class_="news-headline-list")
second_story_link = second_story_container.find('a')
second_story_title = second_story_container.find('h3', class_="story-title")

print(second_story_title.text.strip())
print(get_summary(site + second_story_link['href']))
print('==========================================================')
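Both scrapers fetch the front pages with requests' default client, and some news sites reject the stock python-requests User-Agent. A hedged tweak, not part of this commit and with a placeholder header value, is to send a browser-like User-Agent:

headers = {'User-Agent': 'Mozilla/5.0'}  # example value only; the commit uses the default client
source = requests.get(site, headers=headers).text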

scrape_news/utils.py

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
from newspaper import Article


def get_summary(url):
    # Download and parse the article, then run newspaper's NLP pass
    # and return its extractive summary.
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    return article.summary
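Article.nlp() relies on NLTK's punkt tokenizer, so a fresh environment needs a one-time download before get_summary() can run. This setup step is assumed here and is not part of the commit:

import nltk
nltk.download('punkt')  # one-time setup required by newspaper's nlp()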
