Nature HTML has updated to a new layout #28

Open
WeixinGithubJiang opened this issue Jun 3, 2022 · 1 comment
Labels
bug Something isn't working

Comments

@WeixinGithubJiang
Collaborator

Describe the bug
The current Nature journal scraper no longer works; nature.com has updated to a new HTML layout.

To Reproduce

Expected behavior

Outputs

Environment (please complete the following information):

Additional context
Overriding the Nature class in /exsclaim/exsclaim/journal.py could be a temporary solution:

class Nature(JournalFamily):
    domain =        "https://www.nature.com"
    relevant =      "relevance"
    recent =        "date_desc"
    path =          "/search?q=\""
    join =          "\"%20\""
    pre_sb =        "\"&order="
    open_pre_sb =   "\"&order="
    post_sb =       "&page=1"
    article_path =  ('/articles/','')
    prepend =       ""
    extra_key =     " "

    def get_page_info(self, soup):
        ## Finds the total number of results, the start page, and the total number
        ## of pages by parsing the search results HTML: the new layout shows the
        ## result count in the "c-list-header" block and the page numbers in the
        ## "c-pagination" list.
        display_results = (
            soup.find("div", class_="c-list-header")
            .find("div", class_="u-mb-0")
            .find("span", class_="u-display-flex")
            .find_all("span")
        )
        totalResults = int(display_results[-1].text.split(" ")[0])
        start_page = 1
        try:
            results = soup.find("ul", class_="c-pagination").find_all("li", class_="c-pagination__item")
            page_ids = [int(r["data-page"]) for r in results
                        if r.get("data-page") is not None and r.get("data-page").isdigit()]
            totalPages = max(page_ids)
        except (AttributeError, ValueError):
            ## No pagination list (or no numeric page items): single page of results.
            totalPages = 1
        return start_page, totalPages, totalResults


    def turn_page(self, url, pg_num, pg_size):
        return url.split('&page=')[0]+'&page='+str(pg_num)

    def get_license(self, soup):
        ## The article page embeds copyright metadata as JSON inside a script tag
        ## with the attribute data-test="dataLayer"; requires a module-level
        ## `import json` in journal.py.
        data_layer = soup.find(attrs={'data-test': 'dataLayer'})
        data_layer_string = str(data_layer.string)
        data_layer_json = "{" + data_layer_string.split("[{", 1)[1].split("}];", 1)[0] + "}"
        parsed = json.loads(data_layer_json)
        ## Whether the article is open access
        try:
            is_open = parsed["content"]["attributes"]["copyright"]["open"]
        except (KeyError, TypeError):
            is_open = False
        ## License name, if present
        try:
            license = parsed["content"]["attributes"]["copyright"]["legacy"]["webtrendsLicenceType"]
        except (KeyError, TypeError):
            license = "unknown"
        return is_open, license

    def is_link_to_open_article(self, tag):
        ## The new layout marks open-access results with a span tagged data-test="open-access".
        return tag.find("span", attrs={'data-test': 'open-access'}) is not None
    
    def get_article_extensions(self, articles_visited=set()) -> list:
        """
        Create a list of article url extensions from search_query

        Returns:
            A list of article url extensions from search.
        """
        search_query = self.search_query
        maximum_scraped = search_query["maximum_scraped"]
        article_delim, reader_delims = self.get_article_delimiters()
        search_query_urls = self.get_search_query_urls()
        article_paths = set()
        for page1 in search_query_urls:
            self.logger.info("GET request: {}".format(page1))
            print("GET request: {}".format(page1))
            soup = self.get_soup_from_request(page1, fast_load=True)
            start_page, stop_page, total_articles = self.get_page_info(soup)
            print("start_page={}, stop_page={}, total_articles={}".format(start_page, stop_page, total_articles))
            for page_number in range(start_page, stop_page + 1):
                request = self.turn_page(page1, page_number, total_articles)
                soup = self.get_soup_from_request(request, fast_load=False)
                for r in soup.find_all("article", class_="u-full-height c-card c-card--flush"):
                    link = r.find("a", href=True)
                    article = link.get("href") if link is not None else None
                    ## Keep the article if it has not been visited yet and, when the
                    ## query asks for open-access only, if the result card is marked open.
                    if (article is not None
                            and article.split("/")[-1] not in articles_visited
                            and (not self.open or self.is_link_to_open_article(r))):
                        article_paths.add(article)
                    if len(article_paths) >= maximum_scraped:
                        return list(article_paths)
                print("page_number={}, num_of_articles={}".format(page_number, len(article_paths)))

        return list(article_paths)
WeixinGithubJiang added the bug label on Jun 3, 2022
@zzy168

zzy168 commented Dec 9, 2024

Thank you. I knew there was a problem here but did not know how to fix it; I wasted an entire afternoon and almost gave up. Fortunately, I found this issue.
