
Commit 70013d2

Merge pull request #21 from La-Forge/ileDeFranceAppelProjet
feat: ✨ add scraping for IDF's appel à projet page
2 parents 0b3344d + 6a51de0 commit 70013d2

File tree

9 files changed: +239 −62 lines changed


generate_feeds.py

Lines changed: 17 additions & 7 deletions
@@ -1,20 +1,23 @@
 from scrappers.GniusScrapper import GniusScrapper, FEED_PATH as GNIUS_FEED_PATH
 from scrappers.BpifranceScrapper import BpifranceScrapper, FEED_PATH as BPI_FEED_PATH
+from scrappers.IleDeFranceScrapper import IleDeFranceScrapper, FEED_PATH as IDF_FEED_PATH

 import os
 import argparse


-def main(verbose, update_bpi, update_gnius):
+def main(verbose, update_bpi, update_gnius, update_idf):
     script_dir = os.path.dirname(os.path.abspath(__file__))
     feeds_dir = os.path.join(script_dir, "feeds")
     os.makedirs(feeds_dir, exist_ok=True)

     bpi_scrapper = BpifranceScrapper()
     gnius_scrapper = GniusScrapper()
+    idf_scrapper = IleDeFranceScrapper()

     bpi_feed_file = BPI_FEED_PATH  # os.path.join(feeds_dir, BPI_FEED_PATH)
     gnius_feed_file = GNIUS_FEED_PATH  # os.path.join(feeds_dir, 'gnius_feed.xml')
+    idf_feed_file = IDF_FEED_PATH  # os.path.join(feeds_dir, 'idf_feed.xml')

     if update_bpi:
         print(f"Updating {bpi_feed_file}...")

@@ -26,22 +29,29 @@ def main(verbose, update_bpi, update_gnius):
         gnius_scrapper.update_feed_file(gnius_feed_file, verbose=verbose)
         print(f"{gnius_feed_file} updated.")

+    # Update the XML feed file for IDF's appel à projets
+    if update_idf:
+        print(f"Updating {idf_feed_file}...")
+        idf_scrapper.update_feed_file(idf_feed_file, verbose=verbose)
+        print(f"{idf_feed_file} updated.")


 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Update feeds with optional verbosity."
     )
     parser.add_argument(
-        "--bpifrance", action="store_true", help="Update only bpifrance feed"
+        "--bpifrance", action="store_true", help="Update only Bpifrance feed"
     )
-    parser.add_argument("--gnius", action="store_true", help="Update only gnius feed")
+    parser.add_argument("--gnius", action="store_true", help="Update only Gnius feed")
+    parser.add_argument("--idf", action="store_true", help="Update only IDF feed")
     parser.add_argument(
         "-v", "--verbose", action="store_true", help="Enable verbose output"
     )
     args = parser.parse_args()

-    # If neither bpifrance nor gnius is specified, update both
-    update_bpi = args.bpifrance or (not args.bpifrance and not args.gnius)
-    update_gnius = args.gnius or (not args.bpifrance and not args.gnius)
+    # If no option is specified, update all feeds
+    update_bpi = args.bpifrance or (not args.bpifrance and not args.gnius and not args.idf)
+    update_gnius = args.gnius or (not args.bpifrance and not args.gnius and not args.idf)
+    update_idf = args.idf or (not args.bpifrance and not args.gnius and not args.idf)

-    main(args.verbose, update_bpi, update_gnius)
+    main(args.verbose, update_bpi, update_gnius, update_idf)
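The three `update_*` assignments each repeat the same "no flags given" test. A minimal sketch of an equivalent formulation, assuming only the argparse flags already defined in the diff above (`--bpifrance`, `--gnius`, `--idf`, `-v`):

```python
import argparse

parser = argparse.ArgumentParser(description="Update feeds with optional verbosity.")
parser.add_argument("--bpifrance", action="store_true", help="Update only Bpifrance feed")
parser.add_argument("--gnius", action="store_true", help="Update only Gnius feed")
parser.add_argument("--idf", action="store_true", help="Update only IDF feed")
parser.add_argument("-v", "--verbose", action="store_true", help="Enable verbose output")
args = parser.parse_args([])  # no flags passed, so every feed should be updated

# Compute the "no selection means update everything" default once, then reuse it.
update_all = not (args.bpifrance or args.gnius or args.idf)
update_bpi = args.bpifrance or update_all
update_gnius = args.gnius or update_all
update_idf = args.idf or update_all

print(update_bpi, update_gnius, update_idf)  # True True True
```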

scrappers/APIScrapper.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+from scrappers.BaseScrapper import BaseScrapper
+from html import unescape
+import requests
+from bs4 import BeautifulSoup
+import dateparser
+from feedgen.feed import FeedGenerator
+from sentry_sdk import capture_exception
+
+class APIScrapper(BaseScrapper):
+
+    def __init__(self, base_url, host, feed_title, feed_author, feed_link):
+        self.base_url = base_url
+        self.host = host
+        self.feed_title = feed_title
+        self.feed_author = feed_author
+        self.feed_link = feed_link
+
+    def generate_feed(self, verbose=True):
+        fg = FeedGenerator()
+        fg.title(self.feed_title)
+        fg.id(self.feed_link)
+        fg.author({"name": self.feed_author})
+        fg.link(href=self.feed_link, rel="alternate")
+        fg.subtitle("Powered by www.la-forge.ai")
+        fg.language("fr")
+
+        try:
+            articles = self.scrapPages(verbose=verbose)
+            for article in articles:
+                fe = fg.add_entry()
+                fe.id(article["link"])
+                fe.title(article["title"])
+                fe.link(href=article["link"])
+                fe.description(article["description"])
+                fe.pubDate(article["date"])
+        except Exception as e:
+            print(e)
+            capture_exception(e)
+
+        atomfeed = fg.atom_str(pretty=True)
+        return atomfeed
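`APIScrapper` only carries the feed-building half of the job: `generate_feed()` expects a `scrapPages()` override that returns dicts with `title`, `link`, `description` and `date` keys. A minimal sketch of how a subclass could plug into it; the class name, endpoint and data below are hypothetical, not part of the repository:

```python
from scrappers.APIScrapper import APIScrapper


class ExampleApiScrapper(APIScrapper):
    """Hypothetical subclass, for illustration only."""

    def __init__(self):
        super().__init__(
            base_url="https://example.org/api/records",  # placeholder endpoint
            host="example.org",
            feed_title="Example feed",
            feed_author="Example",
            feed_link="https://example.org/",
        )

    def scrapPages(self, verbose=False):
        # Each entry must expose the keys generate_feed() reads.
        return [{
            "title": "Exemple d'appel à projets",
            "link": "https://example.org/item/1",
            "description": "Résumé de l'appel à projets",
            # RFC 822 date string, the same shape IleDeFranceScrapper.parse_date produces
            "date": "Tue, 01 Oct 2024 00:00:00 +0000",
        }]


atom_xml = ExampleApiScrapper().generate_feed(verbose=False)
print(atom_xml.decode("utf-8"))  # atom_str() returns bytes, as write_feed_to_file expects
```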

scrappers/BaseScrapper.py

Lines changed: 2 additions & 50 deletions
@@ -21,17 +21,6 @@ def __init__(self, base_url, host, feed_title, feed_author, feed_link):
         self.feed_link = feed_link

     def scrapPages(self, verbose=False):
-        posts = []
-        page = 0
-        count_for_current_page = -1
-        while count_for_current_page != 0:
-            posts_on_current_page = self.scrapPage(pageNumber=page, verbose=verbose)
-            posts.extend(posts_on_current_page)
-            count_for_current_page = len(posts_on_current_page)
-            page = page + 1
-        return posts
-
-    def scrapPage(self, pageNumber, verbose=False):
         raise NotImplementedError("This method should be overridden by subclasses")

     def print_data(self, verbose=False):

@@ -48,47 +37,10 @@ def write_feed_to_file(self, feed, filename):
         with open(filename, "wb") as file:
             file.write(feed)

-    def get_full_article_content(self, article_url, content_class):
-        response = requests.get(article_url)
-        if response.status_code == 200:
-            soup = BeautifulSoup(response.content, "html.parser")
-            article_content = soup.find(class_=content_class).get_text()
-            return article_content
-        else:
-            print(f"Failed to fetch article content from URL: {article_url}")
-            return ""
-
     def generate_feed(self, verbose=True):
-        fg = FeedGenerator()
-        fg.title(self.feed_title)
-        fg.id(self.feed_link)
-        fg.author({"name": self.feed_author})
-        fg.link(href=self.feed_link, rel="alternate")
-        fg.subtitle("Powered by www.la-forge.ai")
-        fg.language("fr")
-
-        try:
-            articles = self.scrapPages(verbose=verbose)
-            for article in articles:
-                fe = fg.add_entry()
-                fe.id(article["link"])
-                fe.title(article["title"])
-                fe.link(href=article["link"])
-                fe.description(article["description"])
-                fe.pubDate(article["date"])
-                full_content = self.get_full_article_content(
-                    article["link"], article.get("content_class")
-                )
-                if full_content:
-                    fe.content(full_content, type="CDATA")
-        except Exception as e:
-            print(e)
-            capture_exception(e)
-
-        atomfeed = fg.atom_str(pretty=True)
-        return atomfeed
+        raise NotImplementedError("This method should be overridden by subclasses")

     def update_feed_file(self, filename="feed.xml", verbose=False):
         feed = self.generate_feed(verbose=verbose)
         self.write_feed_to_file(feed, filename)
-        return feed
+        return feed
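After this change, `BaseScrapper` is essentially an interface plus file-writing plumbing: both `scrapPages()` and `generate_feed()` raise `NotImplementedError`, and the concrete behaviour moves into `WebScrapper` and `APIScrapper`. A condensed sketch of the resulting template-method flow, drawn from the methods shown above (not the repository code verbatim):

```python
class BaseScrapper:
    # Subclasses (WebScrapper, APIScrapper) decide how the Atom feed is built.
    def generate_feed(self, verbose=True):
        raise NotImplementedError("This method should be overridden by subclasses")

    def write_feed_to_file(self, feed, filename):
        # feed is the bytes produced by FeedGenerator.atom_str()
        with open(filename, "wb") as file:
            file.write(feed)

    def update_feed_file(self, filename="feed.xml", verbose=False):
        feed = self.generate_feed(verbose=verbose)  # delegated to the subclass
        self.write_feed_to_file(feed, filename)
        return feed
```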

scrappers/BpifranceScrapper.py

Lines changed: 9 additions & 3 deletions
@@ -1,12 +1,18 @@
-from scrappers.BaseScrapper import BaseScrapper
+from scrappers.WebScrapper import WebScrapper
 from html import unescape
 import requests
 from bs4 import BeautifulSoup
 import dateparser
+from feedgen.feed import FeedGenerator
+from sentry_sdk import capture_exception
+

 FEED_PATH = 'feeds/bpi_feed.xml'

-class BpifranceScrapper(BaseScrapper):
+class BpifranceScrapper(WebScrapper):
+    """
+    Class for scraping BpiFrance data - appel à projets.
+    """
     def __init__(self):
         super().__init__(
             base_url = "https://www.bpifrance.fr/views/ajax?_wrapper_format=drupal_ajax&labels=All&view_name=events_before_end_date&view_display_id=events_finishing_more_week&view_args=496&view_path=%2Fnode%2F7620&view_base_path=&view_dom_id=de2b6579af442525efdb3720e2433d578ae6af46c8d2cb9812d17facde4592ff&pager_element=0&_drupal_ajax=1&ajax_page_state%5Btheme%5D=bpi_main&ajax_page_state%5Btheme_token%5D=vUo2YdcgaSQx1XGJHIa_CX496Ili2qa2-fmRJpfpgV8&ajax_page_state%5Blibraries%5D=eJxtztsOwjAIBuAXqusjNXTFDkcPFqrOp3fuYotxN-TnCxA8qmJz-KpFMLgr8dqKha7FSfeJ1PjzkYgZG7DxlRzDe3GYlXSxCShv-A02cvHAFxkbVZV_14UpR1OhQWxQJ7Gh9Qo8HDL0XLtnkgmDuXca53Vltns6M0d5_VwUlERp3K8eYmQRxWQ9CJoH4VPsVge4wesHUgmd8QPX0HW2",

@@ -109,4 +115,4 @@ def get_article_content(self, article: BeautifulSoup) -> str:
         content = ""
         if p and len(p):
             content = p[0].text.strip()
-        return content
+        return content

scrappers/GniusScrapper.py

Lines changed: 7 additions & 2 deletions
@@ -1,13 +1,18 @@
-from scrappers.BaseScrapper import BaseScrapper
+from scrappers.WebScrapper import WebScrapper
 from html import unescape
 import requests
 from bs4 import BeautifulSoup
 import dateparser
+from feedgen.feed import FeedGenerator
+from sentry_sdk import capture_exception


 FEED_PATH = 'feeds/gnius_feed.xml'

-class GniusScrapper(BaseScrapper):
+class GniusScrapper(WebScrapper):
+    """
+    Class for scraping GNius data - news articles.
+    """
     def __init__(self):
         super().__init__(
             base_url="https://gnius.esante.gouv.fr/fr/a-la-une/actualites?page=<page-number>",

scrappers/IleDeFranceScrapper.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
+import requests
+from scrappers.APIScrapper import APIScrapper
+import datetime
+
+FEED_PATH = 'feeds/idf_feed.xml'
+
+class IleDeFranceScrapper(APIScrapper):
+    """
+    Class for scraping Île-de-France data - appel à projets.
+    """
+    def __init__(self):
+        super().__init__(
+            base_url="https://data.iledefrance.fr/api/explore/v2.1/catalog/datasets/aides-appels-a-projets/records",
+            host="data.iledefrance.fr",
+            feed_title="Aides et Appels à Projets - Île-de-France",
+            feed_author="Île-de-France",
+            feed_link="https://data.iledefrance.fr/explore/dataset/aides-appels-a-projets/",
+        )
+        self.limit_per_request = 100
+
+    def scrapPages(self, verbose=False):
+        """
+        Fetch all records, paging through the API's per-request size limit.
+        """
+        all_records = []
+        offset = 0
+
+        while True:
+            params = {
+                "limit": self.limit_per_request,
+                "offset": offset,
+            }
+
+            response = requests.get(self.base_url, params=params)
+            response.raise_for_status()
+            data = response.json()
+
+            current_records = data.get("results", [])
+            all_records.extend(current_records)
+
+            if verbose:
+                print(f"Page avec offset {offset} : {len(current_records)} enregistrements récupérés.")
+
+            if len(current_records) < self.limit_per_request:
+                break
+
+            offset += self.limit_per_request
+
+        if verbose:
+            print(f"Nombre total d'enregistrements récupérés : {len(all_records)}")
+
+        return self.format_articles(all_records)
+
+    def format_articles(self, data):
+        """
+        Format the API records into the structure expected by BaseScrapper.
+        """
+        articles = []
+        for record in data:
+            fields = record
+            description_parts = [
+                f"Description : {fields.get('chapo_txt', 'Pas de description disponible')}",
+                f"Pour quel type de projet : {fields.get('objectif_txt', 'Non spécifié')}",
+                f"Qui peut en bénéficier : {', '.join(fields.get('qui_peut_en_beneficier', [])) or 'Non spécifié'}"
+            ]
+            description = "\n".join(description_parts)
+            articles.append({
+                "title": fields.get("nom_de_l_aide_de_la_demarche", "Titre inconnu"),
+                "link": fields.get("url_descriptif", ""),
+                "description": description,
+                "date": self.parse_date(fields.get("date")),
+                "content_class": None,
+            })
+        return articles
+
+    @staticmethod
+    def parse_date(date_str):
+        """
+        Convert an ISO 8601 date into the RSS (RFC 822) format.
+        """
+        try:
+            return datetime.datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%S%z").strftime("%a, %d %b %Y %H:%M:%S %z")
+        except Exception:
+            return None
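The scraper walks the data.iledefrance.fr Explore v2.1 endpoint with `limit`/`offset` until a page comes back smaller than `limit_per_request`, then maps each record into the article dict the feed builder expects. A small sketch of that mapping on a made-up record; the field names come from `format_articles` above, but the values, the URL and the exact record layout are illustrative only:

```python
from datetime import datetime

# Made-up record in the rough shape the aides-appels-a-projets dataset returns.
record = {
    "nom_de_l_aide_de_la_demarche": "Soutien aux tiers-lieux franciliens",
    "url_descriptif": "https://www.iledefrance.fr/aides-appels-a-projets/exemple",
    "chapo_txt": "Aide régionale pour la création de tiers-lieux.",
    "objectif_txt": "Créer ou développer un tiers-lieu.",
    "qui_peut_en_beneficier": ["Association", "Collectivité"],
    "date": "2024-10-01T00:00:00+00:00",
}

# Same ISO 8601 -> RFC 822 conversion that parse_date() performs.
pub_date = datetime.strptime(record["date"], "%Y-%m-%dT%H:%M:%S%z").strftime(
    "%a, %d %b %Y %H:%M:%S %z"
)

article = {
    "title": record.get("nom_de_l_aide_de_la_demarche", "Titre inconnu"),
    "link": record.get("url_descriptif", ""),
    "description": f"Description : {record.get('chapo_txt', 'Pas de description disponible')}",
    "date": pub_date,  # "Tue, 01 Oct 2024 00:00:00 +0000"
    "content_class": None,
}
print(article["title"], article["date"])
```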

scrappers/WebScrapper.py

Lines changed: 69 additions & 0 deletions
@@ -0,0 +1,69 @@
+from scrappers.BaseScrapper import BaseScrapper
+from html import unescape
+import requests
+from bs4 import BeautifulSoup
+import dateparser
+from feedgen.feed import FeedGenerator
+from sentry_sdk import capture_exception
+
+class WebScrapper(BaseScrapper):
+
+    def __init__(self, base_url, host, feed_title, feed_author, feed_link):
+        self.base_url = base_url
+        self.host = host
+        self.feed_title = feed_title
+        self.feed_author = feed_author
+        self.feed_link = feed_link
+
+    def scrapPages(self, verbose=False):
+        posts = []
+        page = 0
+        count_for_current_page = -1
+
+        while count_for_current_page != 0:
+            posts_on_current_page = self.scrapPage(pageNumber=page, verbose=verbose)
+            count_for_current_page = len(posts_on_current_page)
+            posts.extend(posts_on_current_page)
+            page += 1
+
+        return posts
+
+    def get_full_article_content(self, article_url, content_class):
+        response = requests.get(article_url)
+        if response.status_code == 200:
+            soup = BeautifulSoup(response.content, "html.parser")
+            article_content = soup.find(class_=content_class).get_text()
+            return article_content
+        else:
+            print(f"Failed to fetch article content from URL: {article_url}")
+            return ""
+
+    def generate_feed(self, verbose=True):
+        fg = FeedGenerator()
+        fg.title(self.feed_title)
+        fg.id(self.feed_link)
+        fg.author({"name": self.feed_author})
+        fg.link(href=self.feed_link, rel="alternate")
+        fg.subtitle("Powered by www.la-forge.ai")
+        fg.language("fr")
+
+        try:
+            articles = self.scrapPages(verbose=verbose)
+            for article in articles:
+                fe = fg.add_entry()
+                fe.id(article["link"])
+                fe.title(article["title"])
+                fe.link(href=article["link"])
+                fe.description(article["description"])
+                fe.pubDate(article["date"])
+                full_content = self.get_full_article_content(
+                    article["link"], article.get("content_class")
+                )
+                if full_content:
+                    fe.content(full_content, type="CDATA")
+        except Exception as e:
+            print(e)
+            capture_exception(e)
+
+        atomfeed = fg.atom_str(pretty=True)
+        return atomfeed
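`WebScrapper` keeps the original page-by-page behaviour: `scrapPages()` keeps calling `scrapPage()` with an increasing page number until a page yields no posts, and `generate_feed()` additionally pulls full article bodies via `get_full_article_content()`. A minimal sketch of a subclass wired into that loop; the class name, URLs and data are hypothetical, not part of the repository:

```python
from scrappers.WebScrapper import WebScrapper


class ExampleWebScrapper(WebScrapper):
    """Hypothetical subclass, for illustration only."""

    def __init__(self):
        super().__init__(
            base_url="https://example.org/news?page=<page-number>",  # placeholder
            host="example.org",
            feed_title="Example news",
            feed_author="Example",
            feed_link="https://example.org/news",
        )

    def scrapPage(self, pageNumber, verbose=False):
        # Returning an empty list past the last page ends the scrapPages() loop.
        if pageNumber > 0:
            return []
        return [{
            "title": "Premier article",
            "link": "https://example.org/news/1",
            "description": "Résumé de l'article",
            "date": "Tue, 01 Oct 2024 00:00:00 +0000",
            "content_class": "article-body",
        }]


posts = ExampleWebScrapper().scrapPages(verbose=True)
print(len(posts))  # 1
```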
