-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain_scraper.py
85 lines (62 loc) · 3.08 KB
/
main_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
from bs4 import BeautifulSoup
import requests
import datetime
import csv
import telegram
import time
import pyshorteners
# Insert the telegram bot token
TOKEN = 'TOKEN BOT TELEGRAM'
# Insert name channel with @
CHAT_ID = '@name-channel'
# Insert Name Amazon Affiliate
AMAZON_AFFILIATE_CODE = 'YOUR-AFFILIATE-NAME'
# Insert product to find
search_term = 'PRODUTC TO FIND'
URL = f'https://www.amazon.it/s?k={search_term}&tag={AMAZON_AFFILIATE_CODE}'
HEADERS = ({
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit / 537.36(KHTML, like Gecko) Chrome / 44.0.2403.157 Safari / 537.36',
'Accept-Language': 'en-US, en;q=0.5'})
webpage = requests.get(URL, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "lxml")
# Find the first 10 products in the search results
search_results = soup.find_all('div', {'data-component-type': 's-search-result'})[:10]
# Set to store the links of already found products
found_links = set()
# Initialize the Telegram bot object
bot = telegram.Bot(TOKEN)
# Initialize the shortener object
shortener = pyshorteners.Shortener()
for product in search_results:
# Get the product URL
product_link = 'https://www.amazon.it' + product.find('a', {'class': 'a-link-normal'}).get('href') + '&tag=pakyita0c-21'
# If the product has already been found, skip it
if product_link in found_links:
continue
else:
found_links.add(product_link)
# Get product data
webpage = requests.get(product_link, headers=HEADERS)
soup = BeautifulSoup(webpage.content, "lxml")
product_title_element = soup.find("span", attrs={"id": 'productTitle'})
product_title = product_title_element.string.strip().replace(',', '') if product_title_element else "NA"
print(product_title) # Print the product title to the console
product_price_element = soup.find("span", attrs={'class': 'a-offscreen'})
product_price = product_price_element.string.strip() if product_price_element else "NA"
print(product_price) # Print te product price on the console
time_now = datetime.datetime.now().strftime('%d %b, %Y %H:%M:%S')
# Check if product data is valid and not 'NA'
if 'NA' not in (product_title, product_price):
# Accorcia il link del prodotto
shortened_link = shortener.tinyurl.short(product_link)
# Add affiliate name with shortlink
affiliate_link = f'{shortened_link}?tag={AMAZON_AFFILIATE_CODE}'
# Send short lint to Telegram
message = f'*PRODOTTO*: {product_title}\n*PREZZO*: {product_price}\n*LINK DI ACQUISTO*: {affiliate_link}'
bot.send_message(chat_id=CHAT_ID, text=message, parse_mode=telegram.ParseMode.MARKDOWN)
# Add data to file CSV
with open('amazon_products.csv', mode='a', newline='') as file:
writer = csv.writer(file)
writer.writerow([datetime.datetime.now().strftime("%d-%m-%Y"), datetime.datetime.now().strftime("%H:%M:%S"), product_title, product_price, product_link])
# Delay the request to avoid IP blocking - it is recommended to set a delay of 30 seconds between each request to avoid getting blocked by Amazon
time.sleep(30)