Describe the bug
The current journal scraper no longer works.
To Reproduce
Expected behavior
Outputs
Environment (please complete the following information):
Additional context
Overwriting the class Nature in /exsclaim/exsclaim/journal.py could be a temporary solution:
```python
import json  # already imported at the top of journal.py


class Nature(JournalFamily):
    domain = "https://www.nature.com"
    relevant = "relevance"
    recent = "date_desc"
    path = "/search?q=\""
    join = "\"%20\""
    pre_sb = "\"&order="
    open_pre_sb = "\"&order="
    post_sb = "&page=1"
    article_path = ("/articles/", "")
    prepend = ""
    extra_key = " "

    def get_page_info(self, soup):
        # Finds total results, page number, and total pages in the article
        # HTML. The data exists as JSON inside a script tag with the
        # 'data-test'='dataLayer' attribute.
        display_results = (
            soup.find("div", class_="c-list-header")
            .find("div", class_="u-mb-0")
            .find("span", class_="u-display-flex")
            .find_all("span")
        )
        total_results = int(display_results[-1].text.split(" ")[0])
        start_page = 1
        try:
            results = soup.find("ul", class_="c-pagination").find_all(
                "li", class_="c-pagination__item"
            )
            page_ids = [
                int(r["data-page"])
                for r in results
                if r.get("data-page") is not None and r.get("data-page").isdigit()
            ]
            total_pages = max(page_ids)
        except (AttributeError, ValueError):
            # No pagination widget: the search returned a single page.
            total_pages = 1
        return start_page, total_pages, total_results

    def turn_page(self, url, pg_num, pg_size):
        return url.split("&page=")[0] + "&page=" + str(pg_num)

    def get_license(self, soup):
        data_layer = soup.find(attrs={"data-test": "dataLayer"})
        data_layer_string = str(data_layer.string)
        data_layer_json = (
            "{" + data_layer_string.split("[{", 1)[1].split("}];", 1)[0] + "}"
        )
        parsed = json.loads(data_layer_json)
        # Try to determine whether the article is open access.
        try:
            is_open = parsed["content"]["attributes"]["copyright"]["open"]
        except (KeyError, TypeError):
            is_open = False
        # Try to get the license type.
        try:
            license = parsed["content"]["attributes"]["copyright"]["legacy"][
                "webtrendsLicenceType"
            ]
        except (KeyError, TypeError):
            license = "unknown"
        return is_open, license

    def is_link_to_open_article(self, tag):
        return tag.find("span", attrs={"data-test": "open-access"}) is not None

    def get_article_extensions(self, articles_visited=set()) -> list:
        """
        Create a list of article url extensions from search_query

        Returns:
            A list of article url extensions from search.
        """
        search_query = self.search_query
        maximum_scraped = search_query["maximum_scraped"]
        article_delim, reader_delims = self.get_article_delimiters()
        search_query_urls = self.get_search_query_urls()
        article_paths = set()
        for page1 in search_query_urls:
            self.logger.info("GET request: {}".format(page1))
            soup = self.get_soup_from_request(page1, fast_load=True)
            start_page, stop_page, total_articles = self.get_page_info(soup)
            self.logger.info(
                "start_page={}, stop_page={}, total_articles={}".format(
                    start_page, stop_page, total_articles
                )
            )
            for page_number in range(start_page, stop_page + 1):
                request = self.turn_page(page1, page_number, total_articles)
                soup = self.get_soup_from_request(request, fast_load=False)
                for r in soup.find_all(
                    "article", class_="u-full-height c-card c-card--flush"
                ):
                    article = r.find("a", href=True).get("href")
                    # Skip already-visited articles and, when the search is
                    # restricted to open access, skip articles that are not
                    # marked open.
                    if (
                        article is not None
                        and article.split("/")[-1] not in articles_visited
                        and not (self.open and not self.is_link_to_open_article(r))
                    ):
                        article_paths.add(article)
                    if len(article_paths) >= maximum_scraped:
                        return list(article_paths)
        return list(article_paths)
```
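For anyone who would rather not edit the installed package, below is a minimal sketch of applying the override at runtime by monkey-patching. The names it relies on (exsclaim.journal as the module holding the class, exsclaim.pipeline.Pipeline as the runner, and the query file path) are assumptions based on the repo layout, so check them against your installed version:

```python
# Minimal sketch: swap in the patched Nature class at runtime instead of
# editing /exsclaim/exsclaim/journal.py in place. Assumes the Nature class
# above is defined in this script, and that exsclaim.journal and
# exsclaim.pipeline.Pipeline exist under these names (assumptions, not
# verified against every exsclaim release).
import exsclaim.journal

exsclaim.journal.Nature = Nature  # overwrite the broken class

from exsclaim.pipeline import Pipeline

query = "search_query.json"  # hypothetical path to your own query file
pipeline = Pipeline(query)
results = pipeline.run()
```

Whether the patch takes effect depends on how your version resolves journal classes: if the pipeline bound the original Nature by name before the patch ran, editing journal.py directly, as described above, is the surer route.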
Thank you! I knew there was a problem here but didn't know how to fix it. I wasted an entire afternoon and almost gave up. Fortunately, I saw this.