-
Notifications
You must be signed in to change notification settings - Fork 0
/
zakup.py
128 lines (103 loc) · 5.15 KB
/
zakup.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
# -*- coding: utf8 -*-
"""
Многопоточный парсер госзакупок с сохранением в csv
https://github.com/tarbagan/zakup2
Автор: Иргит Валерий
Версия: 0.2
"""
import re
from datetime import date, timedelta
from multiprocessing.dummy import Pool as ThreadPool

import requests
from bs4 import BeautifulSoup as bs
from dateutil.rrule import rrule, DAILY
a = date(1991, 1, 1) # start of the search period (inclusive)
b = date(2020, 4, 26) # end of the search period (inclusive)
id_region = '5277386' # region id (NOTE(review): the same value is hard-coded inside gen_url's URL — keep in sync)
thr = 5 # number of worker threads in the thread pool
record_file = 'zakup2.csv' # output file; pipe-delimited rows are appended to it
def split(arr, thr):
    """Partition *arr* into *thr* round-robin slices, one per worker thread.

    Element i goes to slice i % thr, so the slices interleave the original
    order and their lengths differ by at most one.
    """
    chunks = []
    for offset in range(thr):
        chunks.append(arr[offset::thr])
    return chunks
def gen_url(start=None, end=None):
    """Generate one extended-search URL per day in [start, end], inclusive.

    Args:
        start: first date of the period; defaults to the module constant ``a``.
        end: last date of the period; defaults to the module constant ``b``.

    Returns:
        list[str]: one zakupki.gov.ru search-results URL per day, with both
        publishDateFrom and publishDateTo set to that day (DD.MM.YYYY).
    """
    # Resolve defaults lazily so the module constants are only needed
    # when the caller does not supply an explicit period.
    start = a if start is None else start
    end = b if end is None else end
    # NOTE: 'customerPlace=5277386' duplicates the module constant id_region.
    # Fixed here: the original contained '¤cyIdGeneral' — the literal
    # '&curren' had been mangled into the ¤ HTML entity; the real query
    # parameter is '&currencyIdGeneral=-1'.
    base = 'https://zakupki.gov.ru/epz/order/extendedsearch/results.html?searchString=&morphology=on&' \
        'search-filter=%D0%94%D0%B0%D1%82%D0%B5+%D1%80%D0%B0%D0%B7%D0%BC%D0%B5%D1%89%D0%B5%D0%BD%D0%B8%D1%8F&pageNumber=1&' \
        'sortDirection=false&recordsPerPage=_100&showLotsInfoHidden=false&savedSearchSettingsIdHidden=&' \
        'sortBy=UPDATE_DATE&fz44=on&fz223=on&ppRf615=on&fz94=on&af=on&ca=on&pc=on&pa=on&placingWayList=&' \
        'okpd2Ids=&okpd2IdsCodes=&selectedSubjectsIdHidden=&npaHidden=&restrictionsToPurchase44=&' \
        'publishDateFrom={}&publishDateTo={}&applSubmissionCloseDateFrom=&' \
        'applSubmissionCloseDateTo=&priceFromGeneral=&priceFromGWS=&priceFromUnitGWS=&priceToGeneral=&' \
        'priceToGWS=&priceToUnitGWS=&currencyIdGeneral=-1&customerIdOrg=&agencyIdOrg=&' \
        'customerPlace=5277386&customerPlaceCodes=17000000000&OrderPlacementSmallBusinessSubject=on&' \
        'OrderPlacementRnpData=on&OrderPlacementExecutionRequirement=on&orderPlacement94_0=0&' \
        'orderPlacement94_1=0&orderPlacement94_2=0'
    start_url = []
    # Stdlib timedelta loop replaces dateutil's rrule(DAILY, ...): same
    # inclusive day-by-day iteration, one fewer third-party dependency.
    one_day = timedelta(days=1)
    day = start
    while day <= end:
        stamp = day.strftime("%d.%m.%Y")
        start_url.append(base.format(stamp, stamp))
        day += one_day
    return start_url
def request_url(url, timeout=30):
    """Download *url* and return it parsed with BeautifulSoup.

    Args:
        url: page address to fetch.
        timeout: seconds to wait for the server (new keyword, default 30).
            The original call had no timeout, so a stalled connection
            could hang the worker thread forever.

    Returns:
        A BeautifulSoup of the page body, or None if the request failed
        (the error is printed, matching the original best-effort behavior).
    """
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
                             'AppleWebKit/537.36 (KHTML, like Gecko) '
                             'Chrome/71.0.3578.98 Safari/537.36'}
    try:
        r = requests.get(url, headers=headers, timeout=timeout).text
        return bs(r, 'html.parser')
    except requests.RequestException as e:
        # Narrowed from bare `except Exception`: only network/HTTP failures
        # are expected here; anything else should surface as a bug.
        print(e)
        return None
def clear(text):
    """Normalize a scraped text fragment.

    Collapses every whitespace run to a single space, removes the
    ruble-sign suffix (' ₽'), and trims leading/trailing whitespace.
    """
    collapsed = re.sub(r'\s+', ' ', text)
    without_currency = collapsed.replace(' ₽', '')
    return without_currency.strip()
def parser_start(soup):
    """Extract purchase records from one parsed search-results page.

    Args:
        soup: BeautifulSoup of a zakupki.gov.ru extended-search results page
            (as returned by request_url).

    Returns:
        list of 8-tuples (law type, auction number, link, status, price,
        date, description, customer). On any parsing error the exception is
        printed and whatever was collected so far is returned.
    """
    arr_item = []
    block = soup.findAll('div', {'class': 'search-registry-entry-block'})
    try:
        for item in block:
            text_fz = item.find('div', {'class': 'registry-entry__header-top__title text-truncate'}).text
            text_fz = clear(text_fz) # auction/law type
            text_nm = item.find('div', {'class': 'registry-entry__header-mid__number'}).text
            text_nm = clear(text_nm) # auction number
            text_ur = item.find('div', {'class': 'registry-entry__header-mid__number'}).findAll('a')[0].get(
                'href') # link to the purchase page
            text_st = item.find('div', {'class': 'registry-entry__header-mid__title'}).text
            text_st = clear(text_st) # status
            text_pr = item.find('div', {'class': 'price-block__value'}).text
            text_pr = clear(text_pr) # price (original comment said "status" — copy-paste error)
            text_dt = item.find('div', {'class': 'data-block__value'}).text # date — NOTE(review): not passed through clear(), so it may keep raw whitespace
            text_ds = item.find('div', {'class': 'registry-entry__body-value'}).text
            text_ds = clear(text_ds) # description
            text_zk = item.find('div', {'class': 'registry-entry__body-href'}).text
            text_zk = clear(text_zk) # customer
            data_item = (text_fz, text_nm, text_ur, text_st, text_pr, text_dt, text_ds, text_zk)
            arr_item.append(data_item)
    except Exception as e:
        # Best-effort: a single malformed entry aborts the page but keeps
        # the rows parsed before it.
        print(e)
    return arr_item
# --- main script: fan the per-day URLs out over a thread pool and stream
# --- pipe-delimited rows into record_file.
pool = ThreadPool(thr)
# Plain int counters. The original appended every parsed page (and a marker
# per record) to lists just to count them — keeping all BeautifulSoup trees
# alive for the whole run.
page_total = 0
item_total = 0
print('Начинаю сбор данных, подождите...')
with open(record_file, 'a', encoding='utf-8') as f:
    for arr_url in split(gen_url(), thr):
        for page in pool.map(request_url, arr_url):
            page_total += 1
            if page is None:
                # request_url failed for this day; the original would have
                # crashed inside parser_start with an AttributeError.
                continue
            data = parser_start(page)
            if data:
                for item in data:
                    f.write('|'.join(item) + '\n')
                    # progress log
                    item_total += 1
                    print(f'Страница {page_total}/закупок {item_total}: {item[1]}')
# Release the worker threads; the original never closed the pool.
pool.close()
pool.join()
print(f'Сбор данных завершен и записан в файл \nВсего {item_total}')