-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathanalyze_datasets.py
71 lines (58 loc) · 2.3 KB
/
analyze_datasets.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import csv
import requests
from prettytable import PrettyTable
# CSV file read by this script; each row must provide a "dataset_url" column.
FILENAME = "datasets.csv"
# Schema name a resource must declare (in its "schema" metadata) to be analysed.
TARGET_SCHEMA = "etalab/schema-lieux-covoiturage"
# Schema URL sent to the Validata API as the "schema" query parameter.
URL_SCHEMA = "https://schema.data.gouv.fr/schemas/etalab/schema-lieux-covoiturage/latest/schema.json"
def dataset_slug(dataset_url):
    """Return the slug of a data.gouv.fr dataset page URL.

    The ``https://www.data.gouv.fr/fr/datasets/`` prefix is removed. Leading
    and trailing whitespace and trailing slashes are dropped as well, so a
    URL copied with a final ``/`` (or a CSV cell with stray blanks) yields
    the same slug as the bare URL.

    Args:
        dataset_url: URL of the dataset page.

    Returns:
        The slug as a string. Input that does not contain the expected
        prefix is returned as-is, apart from the whitespace/slash trimming.
    """
    prefix = "https://www.data.gouv.fr/fr/datasets/"
    slug = dataset_url.strip().replace(prefix, "")
    # Without this, "…/datasets/foo/" would produce the slug "foo/".
    return slug.rstrip("/")
def dataset_api_url(dataset_url):
    """Build the data.gouv.fr API (v1) URL for a dataset page URL.

    Args:
        dataset_url: URL of the dataset page; its slug is obtained with
            ``dataset_slug``.

    Returns:
        The API URL of the dataset, as a string.
    """
    slug = dataset_slug(dataset_url)
    return f"https://www.data.gouv.fr/api/1/datasets/{slug}"
def get_validata_report(resource_url, timeout=120):
    """Validate a resource against ``URL_SCHEMA`` through the Validata API.

    Args:
        resource_url: URL of the file to validate.
        timeout: Timeout passed to ``requests.get``. Without one, a stalled
            connection would block the script indefinitely.

    Returns:
        The value stored under the "report" key of the JSON response, or
        ``None`` when the response has no such key (a message is printed in
        that case). Callers must be prepared for ``None``.

    Raises:
        requests.HTTPError: if the API answers with an error status.
        requests.Timeout: if the API does not answer within ``timeout``.
    """
    params = {"url": resource_url, "schema": URL_SCHEMA}
    validata_response = requests.get(
        "https://api.validata.etalab.studio/validate",
        params=params,
        timeout=timeout,
    )
    validata_response.raise_for_status()
    payload = validata_response.json()
    try:
        return payload["report"]
    except KeyError:
        print(f"No metadatas from validata for: {resource_url}")
        return None
# Load every CSV row up front so the file is closed before the (slow)
# network calls start. newline="" is what the csv module expects.
with open(FILENAME, newline="", encoding="utf-8") as f:
    rows = list(csv.DictReader(f))

table = PrettyTable()
table.field_names = ["LINE", "DATASET", "RESOURCE", "NB ROWS", "VALID", "ERROR"]
# Running total of rows over all successfully analysed resources.
nb_rows = 0

for row_number, row in enumerate(rows, 1):
    dataset_url = row["dataset_url"]
    slug = dataset_slug(dataset_url)
    # A timeout prevents a stalled connection from hanging the whole run.
    response = requests.get(dataset_api_url(dataset_url), timeout=60)
    response.raise_for_status()

    has_schema = False
    for resource in response.json()["resources"]:
        # "schema" may be absent or null for resources without a schema.
        schema = resource.get("schema") or {}
        if schema.get("name") != TARGET_SCHEMA:
            continue
        has_schema = True

        report = get_validata_report(resource["url"])
        try:
            nb_rows_in_file = report["tasks"][0]["resource"]["stats"]["rows"]
        except (TypeError, KeyError, IndexError):
            # The report is None (no "report" key in the Validata response)
            # or lacks row statistics: record the problem in the ERROR
            # column instead of aborting the whole analysis.
            table.add_row(
                [
                    row_number,
                    slug,
                    resource["title"],
                    "",
                    False,
                    "Rapport Validata inexploitable",
                ]
            )
            continue

        table.add_row(
            [
                row_number,
                slug,
                resource["title"],
                nb_rows_in_file,
                report.get("valid"),
                "",
            ]
        )
        nb_rows += nb_rows_in_file

    if not has_schema:
        # No resource of this dataset declares the target schema.
        table.add_row(
            [
                row_number,
                slug,
                "",
                "",
                False,
                "Aucune ressource avec le schéma",
            ]
        )

table.add_row(["-", "---", "TOTAL", nb_rows, "", ""])
print(table)