
Commit b2972b8

Authored Jun 11, 2020
Merge pull request NodyHub#16 from codders/feat/crawl-and-filter-dates
Improve efficiency of IDMaintainer, extract 'from' dates
2 parents 72351a1 + 2b34eca commit b2972b8

17 files changed: +231 -103 lines
 

‎flathunter/abstract_crawler.py

+5 -1

@@ -16,4 +16,8 @@ def crawl(self, url, max_pages=None):
         return []
 
     def get_name(self):
-        return type(self).__name__
+        return type(self).__name__
+
+    def get_expose_details(self, expose):
+        # Implement in subclass - extract additional data by processing the expose URL
+        return expose
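The base class now ships a default `get_expose_details` hook that returns the expose unchanged, so crawlers only override it when a portal has a detail page worth fetching. A minimal sketch of such an override, assuming a hypothetical `MyPortalCrawler` (not part of this commit) that only fills in a default 'from' date:

import datetime

from flathunter.abstract_crawler import Crawler

class MyPortalCrawler(Crawler):
    """Hypothetical crawler illustrating the new hook."""

    def get_expose_details(self, expose):
        # A real crawler would fetch expose['url'] and parse extra fields;
        # here we only set a placeholder 'from' date if none is present.
        expose.setdefault('from', datetime.datetime.now().strftime('%d.%m.%Y'))
        return expose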

‎flathunter/crawl_ebaykleinanzeigen.py

+30 -4

@@ -1,13 +1,28 @@
 import logging
 import requests
 import re
+import datetime
 from bs4 import BeautifulSoup
 from flathunter.abstract_crawler import Crawler
 
 class CrawlEbayKleinanzeigen(Crawler):
     __log__ = logging.getLogger(__name__)
     USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:47.0) Gecko/20100101 Firefox/47.0'
     URL_PATTERN = re.compile(r'https://www\.ebay-kleinanzeigen\.de')
+    MONTHS = {
+        "Januar": "01",
+        "Februar": "02",
+        "März": "03",
+        "April": "04",
+        "Mai": "05",
+        "Juni": "06",
+        "Juli": "07",
+        "August": "08",
+        "September": "09",
+        "Oktober": "10",
+        "November": "11",
+        "Dezember": "12"
+    }
 
     def __init__(self):
         logging.getLogger("requests").setLevel(logging.WARNING)

@@ -29,6 +44,17 @@ def get_page(self, search_url):
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
         return BeautifulSoup(resp.content, 'html.parser')
 
+    def get_expose_details(self, expose):
+        soup = self.get_page(expose['url'])
+        for detail in soup.find_all('li', { "class": "addetailslist--detail" }):
+            if re.match(r'Verfügbar ab', detail.text):
+                date_string = re.match(r'(\w+) (\d{4})', detail.text)
+                if date_string is not None:
+                    expose['from'] = "01." + self.MONTHS[date_string[1]] + "." + date_string[2]
+        if 'from' not in expose:
+            expose['from'] = datetime.datetime.now().strftime('%02d.%02m.%Y')
+        return expose
+
     def extract_data(self, soup):
         entries = list()
         soup = soup.find(id="srchrslt-adtable")

@@ -57,14 +83,14 @@ def extract_data(self, soup):
             address = address.replace('\n', ' ').replace('\r', '')
             address = " ".join(address.split())
             try:
-                self.__log__.debug(tags[0].text)
-                rooms = tags[0].text
+                self.__log__.debug(tags[1].text)
+                rooms = re.match(r'(\d+)', tags[1].text)[1]
             except IndexError:
                 self.__log__.debug("Keine Zimmeranzahl gegeben")
                 rooms = "Nicht gegeben"
             try:
-                self.__log__.debug(tags[1].text)
-                size = tags[1].text
+                self.__log__.debug(tags[0].text)
+                size = tags[0].text
             except IndexError:
                 size = "Nicht gegeben"
                 self.__log__.debug("Quadratmeter nicht angegeben")

‎flathunter/crawl_immobilienscout.py

+17 -3

@@ -1,7 +1,9 @@
 import logging
 import requests
 import re
+import datetime
 from bs4 import BeautifulSoup
+
 from flathunter.abstract_crawler import Crawler
 
 class CrawlImmobilienscout(Crawler):

@@ -47,12 +49,24 @@ def get_results(self, search_url, max_pages=None):
             entries.extend(cur_entry)
         return entries
 
-    def get_page(self, search_url, page_no):
-        resp = requests.get(search_url.format(page_no))
+    def get_soup_from_url(self, url):
+        resp = requests.get(url)
         if resp.status_code != 200:
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
         return BeautifulSoup(resp.content, 'html.parser')
 
+    def get_page(self, search_url, page_no):
+        return self.get_soup_from_url(search_url.format(page_no))
+
+    def get_expose_details(self, expose):
+        soup = self.get_soup_from_url(expose['url'])
+        date = soup.find('dd', { "class": "is24qa-bezugsfrei-ab" })
+        expose['from'] = datetime.datetime.now().strftime("%2d.%2d.%Y")
+        if date is not None:
+            if not re.match(r'.*sofort.*', date.text):
+                expose['from'] = date.text.strip()
+        return expose
+
     def extract_data(self, soup):
         entries = list()
 

@@ -98,7 +112,7 @@ def extract_data(self, soup):
                 'title': title_el.text.strip().replace('NEU', ''),
                 'price': attr_els[0].text.strip().split(' ')[0].strip(),
                 'size': attr_els[1].text.strip().split(' ')[0].strip() + " qm",
-                'rooms': attr_els[2].text.strip().split(' ')[0].strip() + " Zi.",
+                'rooms': attr_els[2].text.strip().split(' ')[0].strip(),
                 'address': address,
                 'crawler': self.get_name()
             }
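For ImmobilienScout24 the availability comes from the is24qa-bezugsfrei-ab element, and today's date is the fallback when the listing is available "sofort" or the field is missing. For reference, a date in the DD.MM.YYYY form used by the other exposes can be produced like this (illustrative only, not the committed format string):

import datetime

# Today's date as DD.MM.YYYY, e.g. '11.06.2020'
today = datetime.datetime.now().strftime("%d.%m.%Y")
print(today)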

‎flathunter/crawl_immowelt.py

+24

@@ -1,7 +1,9 @@
 import logging
 import requests
 import re
+import datetime
 from bs4 import BeautifulSoup
+
 from flathunter.abstract_crawler import Crawler
 
 class CrawlImmowelt(Crawler):

@@ -28,6 +30,28 @@ def get_page(self, search_url):
             self.__log__.error("Got response (%i): %s" % (resp.status_code, resp.content))
         return BeautifulSoup(resp.content, 'html.parser')
 
+    def get_expose_details(self, expose):
+        soup = self.get_page(expose['url'])
+        immo_div = soup.find("div", { "id": "divImmobilie" })
+        if immo_div is not None:
+            details = immo_div.find_all("div", { "class": "clear" })
+            for detail in details:
+                if detail.find("div", { "class": "iw_left" }) is None:
+                    continue
+                if detail.find("div", { "class": "iw_left" }).text.strip() == 'Die Wohnung':
+                    description_element = detail.find("div", { "class": "iw_right" })
+                    if description_element is None or description_element.find("p") is None:
+                        continue
+                    description = description_element.find("p").text
+                    if re.match(r'.*sofort.*', description, re.MULTILINE|re.DOTALL|re.IGNORECASE):
+                        expose['from'] = datetime.datetime.now().strftime("%2d.%2d.%Y")
+                    date_string = re.match(r'.*(\d{2}.\d{2}.\d{4}).*', description, re.MULTILINE|re.DOTALL)
+                    if date_string is not None:
+                        expose['from'] = date_string[1]
+        if 'from' not in expose:
+            expose['from'] = datetime.datetime.now().strftime("%2d.%2d.%Y")
+        return expose
+
     def extract_data(self, soup):
         entries = list()
         soup = soup.find(id="listItemWrapperFixed")

‎flathunter/crawl_wggesucht.py

+9 -3

@@ -59,20 +59,26 @@ def extract_data(self, soup):
             numbers_row = row.find("div", { "class": "middle" })
             price = numbers_row.find("div", { "class": "col-xs-3" }).text.strip()
             rooms = re.findall(r'\d Zimmer', details_array[0])[0][:1]
-            date = re.findall(r'\d{2}.\d{2}.\d{4}', numbers_row.find("div", { "class": "text-center" }).text)[0]
+            dates = re.findall(r'\d{2}.\d{2}.\d{4}', numbers_row.find("div", { "class": "text-center" }).text)
             size = re.findall(r'\d{2,4}\sm²', numbers_row.find("div", { "class": "text-right" }).text)[0]
 
             details = {
                 'id': int(url.split('.')[-2]),
                 'image': image,
                 'url': url,
-                'title': "%s ab dem %s" % (title, date),
+                'title': "%s ab dem %s" % (title, dates[0]),
                 'price': price,
                 'size': size,
-                'rooms': rooms + " Zi.",
+                'rooms': rooms,
                 'address': url,
                 'crawler': self.get_name()
             }
+            if len(dates) == 2:
+                details['from'] = dates[0]
+                details['to'] = dates[1]
+            elif len(dates) == 1:
+                details['from'] = dates[0]
+
             entries.append(details)
 
         self.__log__.debug('extracted: ' + str(entries))
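WG-Gesucht listings show either a single move-in date or a from/to range, so the single date lookup becomes a dates list that fills 'from' and, where present, 'to'. How the findall behaves on both shapes (the sample strings below are assumptions, not scraped output):

import re

for text in ["frei ab 01.07.2020", "frei ab 01.07.2020 bis 30.09.2020"]:
    dates = re.findall(r'\d{2}.\d{2}.\d{4}', text)
    details = {'from': dates[0]}
    if len(dates) == 2:
        details['to'] = dates[1]
    print(details)
# {'from': '01.07.2020'}
# {'from': '01.07.2020', 'to': '30.09.2020'}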

‎flathunter/default_processors.py

+11

@@ -27,6 +27,17 @@ def process_expose(self, expose):
                 break
         return expose
 
+class CrawlExposeDetails(Processor):
+
+    def __init__(self, config):
+        self.config = config
+
+    def process_expose(self, expose):
+        for searcher in self.config.searchers():
+            if re.search(searcher.URL_PATTERN, expose['url']):
+                expose = searcher.get_expose_details(expose)
+        return expose
+
 class LambdaProcessor(Processor):
 
     def __init__(self, config, func):
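CrawlExposeDetails hands each expose back to the crawler whose URL_PATTERN matches its URL, so detail pages are only fetched by the crawler that produced the listing. A self-contained sketch of that dispatch; the FakeConfig and FakeCrawler stand-ins below are assumptions, not flathunter classes:

import re

class FakeCrawler:
    URL_PATTERN = re.compile(r'https://www\.example-portal\.de')

    def get_expose_details(self, expose):
        expose['from'] = '01.07.2020'
        return expose

class FakeConfig:
    def searchers(self):
        # Mirrors the Config.searchers() accessor the processor relies on
        return [FakeCrawler()]

config = FakeConfig()
expose = {'url': 'https://www.example-portal.de/expose/123'}
for searcher in config.searchers():
    if re.search(searcher.URL_PATTERN, expose['url']):
        expose = searcher.get_expose_details(expose)
print(expose)  # {'url': '...', 'from': '01.07.2020'}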

‎flathunter/googlecloud_idmaintainer.py

+7 -11

@@ -23,22 +23,27 @@ def mark_processed(self, expose_id):
         self.__log__.debug('mark_processed(' + str(expose_id) + ')')
         self.db.collection(u'processed').document(str(expose_id)).set({ u'id': expose_id })
 
+    def is_processed(self, expose_id):
+        self.__log__.debug('is_processed(' + str(expose_id) + ')')
+        doc = self.db.collection(u'processed').document(str(expose_id))
+        return doc.get().exists
+
     def save_expose(self, expose):
         record = expose.copy()
         record.update({ 'created_at': datetime.datetime.now(), 'created_sort': (0 - datetime.datetime.now().timestamp()) })
         self.db.collection(u'exposes').document(str(expose[u'id'])).set(record)
 
     def get_exposes_since(self, min_datetime):
         res = []
-        for doc in self.db.collection(u'exposes').order_by('created_sort').stream():
+        for doc in self.db.collection(u'exposes').order_by('created_sort').limit(100).stream():
             if doc.to_dict()[u'created_at'] < min_datetime:
                 break
             res.append(doc.to_dict())
         return res
 
     def get_recent_exposes(self, count, filter=None):
         res = []
-        for doc in self.db.collection(u'exposes').order_by('created_sort').stream():
+        for doc in self.db.collection(u'exposes').order_by('created_sort').limit(100).stream():
             expose = doc.to_dict()
             if filter is None or filter.is_interesting_expose(expose):
                 res.append(expose)

@@ -66,15 +71,6 @@ def get_user_filters(self):
             res.append((int(doc.id), settings['filters']))
         return res
 
-    def get(self):
-        res = []
-        for doc in self.db.collection(u'processed').stream():
-            res.append(doc.to_dict()[u'id'])
-
-        self.__log__.info('already processed: ' + str(len(res)))
-        self.__log__.debug(str(res))
-        return res
-
     def get_last_run_time(self):
         for doc in self.db.collection(u'executions').order_by(u'timestamp', direction=firestore.Query.DESCENDING).limit(1).stream():
             return doc.to_dict()[u'timestamp']
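The efficiency gain on the Google Cloud side is that is_processed issues a single Firestore document lookup (doc.get().exists) instead of streaming the whole processed collection, and the expose queries are capped with .limit(100). A sketch of the lookup pattern against the Firestore client library (collection and field names follow the diff; client credentials are assumed to be configured):

from google.cloud import firestore

db = firestore.Client()  # assumes application-default credentials

def is_processed(expose_id):
    # One document read per check instead of streaming every processed id
    doc = db.collection(u'processed').document(str(expose_id))
    return doc.get().exists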

‎flathunter/idmaintainer.py

+8 -17

@@ -26,12 +26,10 @@ class AlreadySeenFilter:
 
     def __init__(self, id_watch):
         self.id_watch = id_watch
-        self.processed = self.id_watch.get()
 
     def is_interesting(self, expose):
-        if expose['id'] not in self.processed:
+        if not self.id_watch.is_processed(expose['id']):
             self.id_watch.mark_processed(expose['id'])
-            self.processed.append(expose['id'])
             return True
         return False
 

@@ -59,6 +57,13 @@ def get_connection(self):
             raise e
         return connection
 
+    def is_processed(self, expose_id):
+        self.__log__.debug('is_processed(' + str(expose_id) + ')')
+        cur = self.get_connection().cursor()
+        cur.execute('SELECT id FROM processed WHERE id = ?', (expose_id,))
+        row = cur.fetchone()
+        return (row is not None)
+
     def mark_processed(self, expose_id):
         self.__log__.debug('mark_processed(' + str(expose_id) + ')')
         cur = self.get_connection().cursor()

@@ -111,20 +116,6 @@ def get_user_filters(self):
             res.append((row[0], json.loads(row[1])['filters']))
         return res
 
-    def get(self):
-        res = []
-        cur = self.get_connection().cursor()
-        cur.execute("SELECT * FROM processed ORDER BY 1")
-        while True:
-            row = cur.fetchone()
-            if row == None:
-                break
-            res.append(row[0])
-
-        self.__log__.info('already processed: ' + str(len(res)))
-        self.__log__.debug(str(res))
-        return res
-
     def get_last_run_time(self):
         cur = self.get_connection().cursor()
         cur.execute("SELECT * FROM executions ORDER BY timestamp DESC LIMIT 1")

‎flathunter/processor.py

+6 -1

@@ -5,6 +5,7 @@
 from flathunter.default_processors import AddressResolver
 from flathunter.default_processors import Filter
 from flathunter.default_processors import LambdaProcessor
+from flathunter.default_processors import CrawlExposeDetails
 from flathunter.sender_telegram import SenderTelegram
 from flathunter.gmaps_duration_processor import GMapsDurationProcessor
 from flathunter.idmaintainer import SaveAllExposesProcessor

@@ -30,6 +31,10 @@ def calculate_durations(self):
         self.processors.append(GMapsDurationProcessor(self.config))
         return self
 
+    def crawl_expose_details(self):
+        self.processors.append(CrawlExposeDetails(self.config))
+        return self
+
     def map(self, func):
         self.processors.append(LambdaProcessor(self.config, func))
         return self

@@ -55,4 +60,4 @@ def process(self, exposes):
 
     @staticmethod
     def builder(config):
-        return ProcessorChainBuilder(config)
+        return ProcessorChainBuilder(config)

‎flathunter/web/templates/exposes.html

+2 -2

@@ -1,7 +1,7 @@
 <div class="exposes">
 {% for expose in exposes %}
   <div class="expose">
-    <p>{{ expose['price'] }}, {{expose['rooms']}} rooms, {{expose['size']}}</p>
+    <p>{{ expose['price'] }}, {{expose['rooms']}} rooms, {{expose['size']}} from {{expose['from']}}</p>
     <a href="{{ expose['url'] }}" target="_blank">
     {% if expose['image'] %}
       <img src="{{ expose['image'] }}">

@@ -12,4 +12,4 @@
     <h3><a href="{{ expose['url'] }}" target="_blank">{{ expose['title'] }}</a></h3>
   </div>
 {% endfor %}
-</div>
+</div>

‎flathunter/web_hunter.py

+3 -2

@@ -14,8 +14,9 @@ def hunt_flats(self):
             .build()
 
         processor_chain = ProcessorChain.builder(self.config) \
-            .save_all_exposes(self.id_watch) \
             .apply_filter(filter) \
+            .crawl_expose_details() \
+            .save_all_exposes(self.id_watch) \
             .resolve_addresses() \
            .calculate_durations() \
            .build()

@@ -44,4 +45,4 @@ def set_filters_for_user(self, user_id, filters):
         self.id_watch.set_filters_for_user(user_id, filters)
 
     def get_filters_for_user(self, user_id):
-        return self.id_watch.get_filters_for_user(user_id)
+        return self.id_watch.get_filters_for_user(user_id)

‎test/test_crawl_ebaykleinanzeigen.py

+26 -16

@@ -1,21 +1,31 @@
-import unittest
+import pytest
 from flathunter.crawl_ebaykleinanzeigen import CrawlEbayKleinanzeigen
 
-class EbayKleinanzeigenCrawlerTest(unittest.TestCase):
+TEST_URL = 'https://www.ebay-kleinanzeigen.de/s-wohnung-mieten/berlin/preis:1000:1500/c203l3331+wohnung_mieten.qm_d:70,+wohnung_mieten.zimmer_d:2'
 
-    TEST_URL = 'https://www.ebay-kleinanzeigen.de/s-wohnung-mieten/berlin/preis:1000:1500/c203l3331+wohnung_mieten.qm_d:70,+wohnung_mieten.zimmer_d:2'
+@pytest.fixture
+def crawler():
+    return CrawlEbayKleinanzeigen()
 
-    def setUp(self):
-        self.crawler = CrawlEbayKleinanzeigen()
-
-    def test(self):
-        soup = self.crawler.get_page(self.TEST_URL)
-        self.assertIsNotNone(soup, "Should get a soup from the URL")
-        entries = self.crawler.extract_data(soup)
-        self.assertIsNotNone(entries, "Should parse entries from search URL")
-        self.assertTrue(len(entries) > 0, "Should have at least one entry")
-        self.assertTrue(entries[0]['id'] > 0, "Id should be parsed")
-        self.assertTrue(entries[0]['url'].startswith("https://www.ebay-kleinanzeigen.de/s-anzeige"), u"URL should be an anzeige link")
-        for attr in [ 'title', 'price', 'size', 'rooms', 'address' ]:
-            self.assertIsNotNone(entries[0][attr], attr + " should be set")
+def test_crawler(crawler):
+    soup = crawler.get_page(TEST_URL)
+    assert soup is not None
+    entries = crawler.extract_data(soup)
+    assert entries is not None
+    assert len(entries) > 0
+    assert entries[0]['id'] > 0
+    assert entries[0]['url'].startswith("https://www.ebay-kleinanzeigen.de/s-anzeige")
+    for attr in [ 'title', 'price', 'size', 'rooms', 'address' ]:
+        assert entries[0][attr] is not None
 
+def test_process_expose_fetches_details(crawler):
+    soup = crawler.get_page(TEST_URL)
+    assert soup is not None
+    entries = crawler.extract_data(soup)
+    assert entries is not None
+    assert len(entries) > 0
+    updated_entries = [ crawler.get_expose_details(expose) for expose in entries ]
+    for expose in updated_entries:
+        print(expose)
+        for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'from' ]:
+            assert expose[attr] is not None
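The crawler tests move from unittest.TestCase classes to plain pytest functions, with a crawler fixture replacing setUp. The fixture pattern in isolation (a generic sketch with a stand-in StubCrawler, not tied to a specific flathunter crawler):

import pytest

class StubCrawler:
    def get_name(self):
        return type(self).__name__

@pytest.fixture
def crawler():
    # Replaces unittest's setUp(); each test function gets a fresh instance
    return StubCrawler()

def test_crawler_has_name(crawler):
    assert crawler.get_name() == "StubCrawler"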

‎test/test_crawl_immobilienscout.py

+26 -16

@@ -1,21 +1,31 @@
-import unittest
-from flathunter.crawl_immobilienscout import CrawlImmobilienscout
+import pytest
 
-class ImmobilienscoutCrawlerTest(unittest.TestCase):
+from flathunter.crawl_immobilienscout import CrawlImmobilienscout
 
-    TEST_URL = 'https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-mieten?numberofrooms=2.0-&price=-1500.0&livingspace=70.0-&sorting=2&pagenumber=1'
+TEST_URL = 'https://www.immobilienscout24.de/Suche/de/berlin/berlin/wohnung-mieten?numberofrooms=2.0-&price=-1500.0&livingspace=70.0-&sorting=2&pagenumber=1'
 
-    def setUp(self):
-        self.crawler = CrawlImmobilienscout()
+@pytest.fixture
+def crawler():
+    return CrawlImmobilienscout()
 
-    def test(self):
-        soup = self.crawler.get_page(self.TEST_URL, 1)
-        self.assertIsNotNone(soup, "Should get a soup from the URL")
-        entries = self.crawler.extract_data(soup)
-        self.assertIsNotNone(entries, "Should parse entries from search URL")
-        self.assertTrue(len(entries) > 0, "Should have at least one entry")
-        self.assertTrue(entries[0]['id'] > 0, "Id should be parsed")
-        self.assertTrue(entries[0]['url'].startswith("https://www.immobilienscout24.de/expose"), u"URL should be an exposé link")
-        for attr in [ 'title', 'price', 'size', 'rooms', 'address' ]:
-            self.assertIsNotNone(entries[0][attr], attr + " should be set")
+def test_crawl_works(crawler):
+    soup = crawler.get_page(TEST_URL, 1)
+    assert soup is not None
+    entries = crawler.extract_data(soup)
+    assert entries is not None
+    assert len(entries) > 0
+    assert entries[0]['id'] > 0
+    assert entries[0]['url'].startswith("https://www.immobilienscout24.de/expose")
+    for attr in [ 'title', 'price', 'size', 'rooms', 'address' ]:
+        assert entries[0][attr] is not None
 
+def test_process_expose_fetches_details(crawler):
+    soup = crawler.get_page(TEST_URL, 1)
+    assert soup is not None
+    entries = crawler.extract_data(soup)
+    assert entries is not None
+    assert len(entries) > 0
+    updated_entries = [ crawler.get_expose_details(expose) for expose in entries ]
+    for expose in updated_entries:
+        for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'from' ]:
+            assert expose[attr] is not None

‎test/test_crawl_immowelt.py

+30 -17

@@ -1,25 +1,38 @@
-import unittest
+import pytest
+
 from flathunter.crawl_immowelt import CrawlImmowelt
 from test_util import count
 
-class ImmoweltCrawlerTest(unittest.TestCase):
+TEST_URL = 'https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc'
 
-    TEST_URL = 'https://www.immowelt.de/liste/berlin/wohnungen/mieten?roomi=2&prima=1500&wflmi=70&sort=createdate%2Bdesc'
+@pytest.fixture
+def crawler():
+    return CrawlImmowelt()
 
-    def setUp(self):
-        self.crawler = CrawlImmowelt()
 
-    def test(self):
-        soup = self.crawler.get_page(self.TEST_URL)
-        self.assertIsNotNone(soup, "Should get a soup from the URL")
-        entries = self.crawler.extract_data(soup)
-        self.assertIsNotNone(entries, "Should parse entries from search URL")
-        self.assertTrue(len(entries) > 0, "Should have at least one entry")
-        self.assertTrue(entries[0]['id'] > 0, "Id should be parsed")
-        self.assertTrue(entries[0]['url'].startswith("https://www.immowelt.de/expose"), u"URL should be an exposé link")
-        for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'image' ]:
-            self.assertIsNotNone(entries[0][attr], attr + " should be set")
+def test_crawler(crawler):
+    soup = crawler.get_page(TEST_URL)
+    assert soup is not None
+    entries = crawler.extract_data(soup)
+    assert entries is not None
+    assert len(entries) > 0
+    assert entries[0]['id'] > 0
+    assert entries[0]['url'].startswith("https://www.immowelt.de/expose")
+    for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'image' ]:
+        assert entries[0][attr] is not None
 
-def test_dont_crawl_other_urls():
-    exposes = CrawlImmowelt().crawl("https://www.example.com")
+def test_dont_crawl_other_urls(crawler):
+    exposes = crawler.crawl("https://www.example.com")
     assert count(exposes) == 0
+
+def test_process_expose_fetches_details(crawler):
+    soup = crawler.get_page(TEST_URL)
+    assert soup is not None
+    entries = crawler.extract_data(soup)
+    assert entries is not None
+    assert len(entries) > 0
+    updated_entries = [ crawler.get_expose_details(expose) for expose in entries ]
+    for expose in updated_entries:
+        print(expose)
+        for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'from' ]:
+            assert expose[attr] is not None

‎test/test_crawl_wggesucht.py

+6 -2

@@ -1,9 +1,10 @@
 import unittest
+from functools import reduce
 from flathunter.crawl_wggesucht import CrawlWgGesucht
 
 class WgGesuchtCrawlerTest(unittest.TestCase):
 
-    TEST_URL = 'https://www.wg-gesucht.de/wohnungen-in-Berlin.8.2.1.0.html?offer_filter=1&city_id=8&noDeact=1&categories%5B%5D=2&rent_types%5B%5D=2&sMin=70&rMax=1500&rmMin=2&fur=2&sin=2&exc=2&img_only=1'
+    TEST_URL = 'https://www.wg-gesucht.de/wohnungen-in-Berlin.8.2.1.0.html?offer_filter=1&city_id=8&noDeact=1&categories%5B%5D=2&rent_types%5B%5D=0&sMin=70&rMax=1500&rmMin=2&fur=2&sin=2&exc=2&img_only=1'
 
     def setUp(self):
         self.crawler = CrawlWgGesucht()

@@ -16,6 +17,9 @@ def test(self):
         self.assertTrue(len(entries) > 0, "Should have at least one entry")
         self.assertTrue(entries[0]['id'] > 0, "Id should be parsed")
         self.assertTrue(entries[0]['url'].startswith("https://www.wg-gesucht.de/wohnungen"), u"URL should be an apartment link")
-        for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'image' ]:
+        for attr in [ 'title', 'price', 'size', 'rooms', 'address', 'image', 'from' ]:
             self.assertIsNotNone(entries[0][attr], attr + " should be set")
+        for attr in [ 'to' ]:
+            found = reduce(lambda i, e: attr in e or i, entries, False)
+            self.assertTrue(found, "Expected " + attr + " to sometimes be set")
 
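The new assertion uses functools.reduce to check that at least one entry carries a 'to' date, since only listings with a limited rental period have one. The same check in isolation (the sample entries are assumptions); any('to' in e for e in entries) would be the more idiomatic spelling:

from functools import reduce

entries = [{'from': '01.07.2020'},
           {'from': '01.08.2020', 'to': '31.10.2020'}]
found = reduce(lambda i, e: 'to' in e or i, entries, False)
print(found)                            # True
print(any('to' in e for e in entries))  # equivalent, and simpler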

‎test/test_googlecloud_idmaintainer.py

+10 -4

@@ -27,12 +27,9 @@ def __init__(self):
 def id_watch():
     return MockGoogleCloudIdMaintainer()
 
-def test_read_from_empty_db(id_watch):
-    assert id_watch.get() == []
-
 def test_read_after_write(id_watch):
     id_watch.mark_processed(12345)
-    assert id_watch.get() == [12345]
+    assert id_watch.is_processed(12345)
 
 def test_get_last_run_time_none_by_default(id_watch):
     assert id_watch.get_last_run_time() == None

@@ -42,6 +39,15 @@ def test_get_list_run_time_is_updated(id_watch):
     assert time != None
     assert time == id_watch.get_last_run_time()
 
+def test_is_processed_works(id_watch):
+    config = Config(string=CONFIG_WITH_FILTERS)
+    config.set_searchers([DummyCrawler()])
+    hunter = Hunter(config, id_watch)
+    exposes = hunter.hunt_flats()
+    assert count(exposes) > 4
+    for expose in exposes:
+        assert id_watch.is_processed(expose['id'])
+
 def test_exposes_are_saved_to_maintainer(id_watch):
     config = Config(string=CONFIG_WITH_FILTERS)
     config.set_searchers([DummyCrawler()])

‎test/test_id_maintainer.py

+11 -4

@@ -29,12 +29,9 @@ class IdMaintainerTest(unittest.TestCase):
     def setUp(self):
         self.maintainer = IdMaintainer(":memory:")
 
-    def test_read_from_empty_db(self):
-        self.assertEqual(0, len(self.maintainer.get()), "Expected empty db to return empty array")
-
     def test_read_after_write(self):
         self.maintainer.mark_processed(12345)
-        self.assertEqual(12345, self.maintainer.get()[0], "Expected ID to be saved")
+        self.assertTrue(self.maintainer.is_processed(12345), "Expected ID to be saved")
 
     def test_get_last_run_time_none_by_default(self):
         self.assertIsNone(self.maintainer.get_last_run_time(), "Expected last run time to be none")

@@ -44,6 +41,16 @@ def test_get_list_run_time_is_updated(self):
         self.assertIsNotNone(time, "Expected time not to be none")
         self.assertEqual(time, self.maintainer.get_last_run_time(), "Expected last run time to be updated")
 
+def test_is_processed_works(mocker):
+    config = Config(string=IdMaintainerTest.DUMMY_CONFIG)
+    config.set_searchers([DummyCrawler()])
+    id_watch = IdMaintainer(":memory:")
+    hunter = Hunter(config, id_watch)
+    exposes = hunter.hunt_flats()
+    assert count(exposes) > 4
+    for expose in exposes:
+        assert id_watch.is_processed(expose['id'])
+
 def test_ids_are_added_to_maintainer(mocker):
     config = Config(string=IdMaintainerTest.DUMMY_CONFIG)
     config.set_searchers([DummyCrawler()])
