This repository has been archived by the owner on Nov 3, 2022. It is now read-only.
get_blitzer.py · executable file · 83 lines (70 loc) · 2.28 KB
#!/usr/bin/env python3
import urllib.request
import urllib.error
from io import BytesIO
import json
import os
import re
import time
from lxml import etree

urls = json.load(open("pressemitteilungen.json"))

result = {}
for url in urls:
    print(url["url"])

    # Fetch the page, retrying up to three times on network errors.
    data = None
    i = 3
    while data is None and i > 0:
        try:
            data = urllib.request.urlopen(url["url"]).read()
        except urllib.error.URLError as e:
            time.sleep(1)
            print(e)
            i -= 1
    if data is None and i == 0:
        print("Error: skipping")
        continue

    parser = etree.HTMLParser()
    tree = etree.parse(BytesIO(data), parser)

    # A date heading looks like "<weekday>, <day>.<month>" with an optional
    # ".<year>"; any other text node starting with a word character is
    # treated as a comma-separated list of street names.
    regex_date = re.compile(r"^\w+, (?P<day>\d+)\.(?P<month>\d+)(\.(?P<year>\d+))?")
    regex_street = re.compile(r"^\w+")

    cur_date = None
    cur_ym = None
    for elem in tree.xpath("//div[@id='col2_content']/div[2]"):
        for elem2 in elem.xpath("descendant-or-self::*/text()"):
            m = regex_date.match(elem2)
            if m:
                month = m.group("month")
                year = m.group("year")
                if year is None:
                    # The date string carries no year, so take it from the
                    # URL path; for January dates, use the year after the
                    # one found in the URL.
                    url_parts = url["url"].rsplit("/", 2)
                    if len(url_parts) > 2:
                        year = url_parts[1]
                        if int(month) == 1:
                            year = int(year) + 1
                s = "%s-%s-%s" % (year, month, m.group("day"))
                ym = "%s-%s" % (year, month)
                if ym not in result:
                    result[ym] = {
                        "url": url["url"],
                        "results": {},
                    }
                if s not in result[ym]["results"]:
                    result[ym]["results"][s] = []
                cur_date = s
                cur_ym = ym
                continue
            m = regex_street.match(elem2)
            if m:
                if cur_ym is None or cur_date is None:
                    print("error: %s" % str(elem2))
                    continue
                # Street entries are comma-separated; attach them to the
                # most recently seen date.
                for elem3 in elem2.split(","):
                    result[cur_ym]["results"][cur_date].append(elem3.strip())

# Write one JSON file per year-month into the results directory.
if not os.path.isdir("results"):
    os.mkdir("results")
for ym, data in result.items():
    with open("results/%s.json" % ym, "w") as f:
        json.dump(data, f, indent=" ")