Nature HTML has updated to a new layout #28

Open
WeixinGithubJiang opened this issue Jun 3, 2022 · 1 comment
Labels
bug Something isn't working

Comments

@WeixinGithubJiang
Collaborator

Describe the bug
The current Nature journal scraper no longer works; nature.com has updated to a new HTML layout.

To Reproduce

Expected behavior

Outputs

Environment (please complete the following information):

Additional context
Overriding the Nature class in /exsclaim/exsclaim/journal.py could be a temporary solution:

class Nature(JournalFamily):
    domain =        "https://www.nature.com"
    relevant =      "relevance"
    recent =        "date_desc"
    path =          "/search?q=\""
    join =          "\"%20\""
    pre_sb =        "\"&order="
    open_pre_sb =   "\"&order="
    post_sb =       "&page=1"
    article_path =  ('/articles/','')
    prepend =       ""
    extra_key =     " "

    def get_page_info(self, soup):
        ## Finds the total number of results, the start page, and the total number
        ## of pages by parsing the search results HTML: the new layout shows the
        ## result count in the "c-list-header" block and the page numbers in the
        ## "c-pagination" list.
        display_results = (
            soup.find("div", class_="c-list-header")
            .find("div", class_="u-mb-0")
            .find("span", class_="u-display-flex")
            .find_all("span")
        )
        totalResults = int(display_results[-1].text.split(" ")[0])
        start_page = 1
        try:
            results = soup.find("ul", class_="c-pagination").find_all("li", class_="c-pagination__item")
            page_ids = [int(r["data-page"]) for r in results
                        if r.get("data-page") is not None and r.get("data-page").isdigit()]
            totalPages = max(page_ids)
        except (AttributeError, ValueError):
            ## No pagination list (or no numeric page items): single page of results.
            totalPages = 1
        return start_page, totalPages, totalResults


    def turn_page(self, url, pg_num, pg_size):
        return url.split('&page=')[0]+'&page='+str(pg_num)

    def get_license(self, soup):
        ## The article page embeds copyright metadata as JSON inside a script tag
        ## with the attribute data-test="dataLayer"; requires a module-level
        ## `import json` in journal.py.
        data_layer = soup.find(attrs={'data-test': 'dataLayer'})
        data_layer_string = str(data_layer.string)
        data_layer_json = "{" + data_layer_string.split("[{", 1)[1].split("}];", 1)[0] + "}"
        parsed = json.loads(data_layer_json)
        ## Whether the article is open access
        try:
            is_open = parsed["content"]["attributes"]["copyright"]["open"]
        except (KeyError, TypeError):
            is_open = False
        ## License name, if present
        try:
            license = parsed["content"]["attributes"]["copyright"]["legacy"]["webtrendsLicenceType"]
        except (KeyError, TypeError):
            license = "unknown"
        return is_open, license

    def is_link_to_open_article(self, tag):
        ## The new layout marks open-access results with a span tagged data-test="open-access".
        return tag.find("span", attrs={'data-test': 'open-access'}) is not None
    
    def get_article_extensions(self, articles_visited=set()) -> list:
        """
        Create a list of article url extensions from search_query

        Returns:
            A list of article url extensions from search.
        """
        search_query = self.search_query
        maximum_scraped = search_query["maximum_scraped"]
        article_delim, reader_delims = self.get_article_delimiters()
        search_query_urls = self.get_search_query_urls()
        article_paths = set()
        for page1 in search_query_urls:
            self.logger.info("GET request: {}".format(page1))
            print("GET request: {}".format(page1))
            soup = self.get_soup_from_request(page1, fast_load=True)
            start_page, stop_page, total_articles = self.get_page_info(soup)
            print("start_page={}, stop_page={}, total_articles={}".format(start_page, stop_page, total_articles))
            for page_number in range(start_page, stop_page + 1):
                request = self.turn_page(page1, page_number, total_articles)
                soup = self.get_soup_from_request(request, fast_load=False)
                for r in soup.find_all("article", class_="u-full-height c-card c-card--flush"):
                    link = r.find("a", href=True)
                    article = link.get("href") if link is not None else None
                    ## Keep the article if it has not been visited yet and, when the
                    ## query asks for open-access only, if the result card is marked open.
                    if (article is not None
                            and article.split("/")[-1] not in articles_visited
                            and (not self.open or self.is_link_to_open_article(r))):
                        article_paths.add(article)
                    if len(article_paths) >= maximum_scraped:
                        return list(article_paths)
                print("page_number={}, num_of_articles={}".format(page_number, len(article_paths)))

        return list(article_paths)
WeixinGithubJiang added the bug label on Jun 3, 2022
@zzy168

zzy168 commented Dec 9, 2024

Thank you. I knew there was a problem here but did not know how to fix it; I wasted an entire afternoon and almost gave up. Fortunately, I found this issue.
