-
Notifications
You must be signed in to change notification settings - Fork 0
/
mydrivers.py
89 lines (74 loc) · 1.67 KB
/
mydrivers.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
#!/usr/bin/env python3
#coding: utf-8
from grab_stack import get_matched_link, get_page_content
from db import DB
# Sequence of (tag name, attribute filter) steps handed to get_matched_link()
# in grab() together with the site root URL. A filter of None means no
# attribute constraint is given for that step.
# NOTE(review): presumably this is the nesting path down to each headline
# <span class="titl"> on the front page -- confirm against grab_stack.
_www_mydrivers_com = [
    ("html", None),
    ("body", None),
    ("div", {"class": ["main"]}),
    ("div", {"class": ["main_left"]}),
    ("div", None),
    ("div", {"class": ["news_info1"]}),
    ("ul", {"class": ["newslist"]}),
    ("li", None),
    ("span", {"class": ["titl"]}),
]
def my_get_links(tag):
    """Collect the link targets of every anchor found under *tag*.

    Args:
        tag: A node exposing ``find_all("a")`` whose results support
            ``.get("href")`` (presumably a BeautifulSoup ``Tag`` supplied by
            ``get_matched_link`` -- confirm against ``grab_stack``).

    Returns:
        A list of ``href`` values in the order ``find_all`` yields the
        anchors. Anchors with a missing or empty ``href`` are skipped, so a
        bare ``<a name="...">`` no longer raises ``KeyError`` and aborts the
        whole collection.
    """
    hrefs = (anchor.get("href") for anchor in tag.find_all("a"))
    return [href for href in hrefs if href]
# First (tag name, attribute filter) sequence tried by grab() when fetching
# an article page; it ends on unfiltered <p> elements inside
# <div class="news_info">.
_content_pattern1 = [
    ("html", None),
    ("body", None),
    ("div", {"class": ["news_box"]}),
    ("div", {"class": ["news_left"]}),
    ("div", {"class": ["news_n"]}),
    ("div", {"class": ["news_info"]}),
    ("p", None),
]

# Fallback sequence, used by grab() only when the first one yields nothing;
# it ends on <div class="pc_info">.
_content_pattern2 = [
    ("html", None),
    ("body", None),
    ("div", {"class": ["pc_box"]}),
    ("div", {"class": ["pc_info"]}),
]
def my_get_content(tag):
    """Return the ``text`` attribute of *tag* unchanged.

    Passed to ``get_page_content`` in ``grab()`` as the extraction callback.
    """
    text = tag.text
    return text
def grab(logger):
    """Fetch front-page articles that are not stored yet and save their text.

    Link targets are collected from http://www.mydrivers.com through
    ``get_matched_link``. Every URL for which ``db.has`` is true is skipped.
    For the remaining URLs the content patterns are tried in order; the
    pieces returned for the first pattern that yields anything are
    concatenated and handed to ``db.save``. A URL for which no pattern
    yields content is logged as a warning and skipped.

    Args:
        logger: Logger-like object. ``warning`` and ``info`` are called on it
            here, and it is passed through to the ``grab_stack`` helpers.
    """
    db = DB()
    urls = get_matched_link(
        logger, _www_mydrivers_com, "http://www.mydrivers.com", my_get_links
    )
    for url in urls:
        if db.has(url):
            continue

        # Try the patterns in order; the second one is only consulted when
        # the first yields nothing.
        cs = None
        for pattern in (_content_pattern1, _content_pattern2):
            cs = get_page_content(logger, pattern, url, my_get_content, False)
            if cs:
                break
        if not cs:
            logger.warning("{} no content".format(url))
            continue

        # Join once instead of growing the string piece by piece.
        txt = "".join(cs)
        db.save(url, txt)
        # NOTE: len(txt) is a character count, not an encoded byte count.
        logger.info("{} saved, {} bytes".format(url, len(txt)))