Skip to content

Commit

Permalink
change category separator from "_" to "-"
Browse files: browse the repository at this point in the history
  • Loading branch information
drelsabrouty committed Apr 19, 2024
1 parent d7fed91 commit d28e255
Show file tree
Hide file tree
Showing 2 changed files with 5 additions and 5 deletions.
2 changes: 1 addition & 1 deletion src/main.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -10,7 +10,7 @@ async def main():
if 'DATABRICKS_RUNTIME_VERSION' in os.environ:
dbutils.library.restartPython()
# Execute EuroPagesScraper
- await EuroPagesProductsScraper().scrape_and_export("company_scraper", "italy", "agriculture_livestock", "agricultural_production")
+ await EuroPagesProductsScraper().scrape_and_export("company_scraper", "italy", "agriculture-livestock", "agricultural-production")

# Run scraper
if __name__ == "__main__":
Expand Down
8 changes: 4 additions & 4 deletions src/utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -112,9 +112,9 @@ async def extract_ep_subsectors(self, url, session):
sector = " ".join(soup.find("div", class_="v-breadcrumbs__item").text.strip().split())

cats_sectors_and_subsectors.append([self.generate_hash_id(subsector.text.strip()),
- re.sub('[^0-9a-zA-Z]+', "_", category.lower()),
- re.sub('[^0-9a-zA-Z]+', "_", sector.lower()),
- re.sub('[^0-9a-zA-Z]+', "_", subsector.text.strip().lower()),
+ re.sub('[^0-9a-zA-Z]+', "-", category.lower()),
+ re.sub('[^0-9a-zA-Z]+', "-", sector.lower()),
+ re.sub('[^0-9a-zA-Z]+', "-", subsector.text.strip().lower()),
"https://www.europages.co.uk/companies/{}.html".format(quote(sector.lower())),
"https://www.europages.co.uk{}".format(subsector["href"])])
return cats_sectors_and_subsectors
Expand Down Expand Up @@ -257,7 +257,7 @@ async def scrape_and_export(self, type="product_updater", country=None, group=No
categorization = pd.read_csv(input)
subsector_urls = categorization[categorization["sector"] == sector]["subsector_url"].tolist()
sector_url = categorization[categorization["sector"] == sector]["sector_url"].values[0]
- out = f"{country}-{group}-{sector}.csv"
+ out = f"{country}_{group}_{sector}.csv"

# store all company_information
all_company_info = []
Expand Down

0 comments on commit d28e255

Please sign in to comment.