Skip to content

Commit

Permalink
Merge pull request #181 from Harshitmishra001/main
Browse files Browse the repository at this point in the history
Update scrapping.py
  • Loading branch information
sanjay-kv authored Jun 8, 2024
2 parents 947c303 + 8f273fd commit 18f26c5
Showing 1 changed file with 30 additions and 43 deletions.
73 changes: 30 additions & 43 deletions amazon_scrapping/scrapping.py
Original file line number Diff line number Diff line change
@@ -1,59 +1,46 @@
# product name
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import json
import csv
import pandas as pd

## One way to load chrome webdirver
#from webdriver_manager.chrome import ChromeDriverManager
#driver = webdriver.Chrome(ChromeDriverManager().install())

## another way to load chrome webdriver
path = '/Users/mohammedrizwan/Downloads/chromedriver'
path = '/Users/hmharsh/Downloads/chromedriver'
driver = webdriver.Chrome(path)

def product_listing(txt):
    """Search amazon.in for *txt* and collect product names from every result page.

    Navigates to the Amazon India home page, submits *txt* in the search box,
    then walks the paginated results by clicking the "next" control until it
    is disabled or missing.

    Parameters
    ----------
    txt : str
        The search query to type into the Amazon search box.

    Returns
    -------
    list[str]
        The text of every product-title element found across all result pages.
        May be empty if no results load.
    """
    name_list = []
    driver.get("https://www.amazon.in/")

    # Wait for the search box to be present before interacting with it.
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, 'twotabsearchtextbox'))
    )
    search_box = driver.find_element(By.ID, 'twotabsearchtextbox')
    search_box.clear()
    search_box.send_keys(txt)
    driver.find_element(By.ID, 'nav-search-submit-button').click()

    # Product titles live in <span class="a-size-medium a-color-base a-text-normal">
    # on the search-results page (selector is brittle; Amazon changes markup often).
    title_xpath = '//span[@class="a-size-medium a-color-base a-text-normal"]'

    while True:
        WebDriverWait(driver, 10).until(
            EC.presence_of_all_elements_located((By.XPATH, title_xpath))
        )
        for item in driver.find_elements(By.XPATH, title_xpath):
            name_list.append(item.text)

        # Advance to the next results page; stop on the last page (the "next"
        # control gains the s-pagination-disabled class) or if the control is
        # absent entirely (single page of results).
        try:
            next_button = driver.find_element(By.CLASS_NAME, 's-pagination-next')
            if 's-pagination-disabled' in next_button.get_attribute('class'):
                break
            next_button.click()
        except Exception:
            # NoSuchElementException (or a stale/click error) means there is
            # no further page to visit — treat it as end of results.
            break

    return name_list

# Product categories to scrape, one search per entry.
names = ['Laptop', 'Phones', 'Printers', 'Desktops', 'Monitors', 'Mouse',
         'Pendrive', 'Earphones', 'Smart TV', 'Power banks']

# Accumulate the names returned by each search; product_listing returns its
# results, so collect them here rather than relying on shared mutable state.
all_product_listings = []
for name in names:
    all_product_listings.extend(product_listing(name))

# Convert the collected names to a DataFrame and save as CSV
# (index=False keeps the row numbers out of the file).
df = pd.DataFrame(all_product_listings, columns=['Product Name'])
df.to_csv('./prod_listings.csv', index=False)
print(df)

# Release the browser once — the duplicated quit() call was merge residue.
driver.quit()

0 comments on commit 18f26c5

Please sign in to comment.