Skip to content

Citation metrics differ between repository-wide and dataset-level queries #88

@vchendrix

Description

@vchendrix

In trying to assess the total number of citations for the ESS-DIVE repository, it was discovered that the citation count is off when querying by repository vs querying citations by individual datasets.

Citations from 1/1/2016 to 11/30/2021

Repository-wide query
  235 Total Count Returned (totalCitations)
  216 Total Citations Returned (citations)
  71 Unique Citations (from citations)
Dataset-level query
  330 Total Citations
  121 Data packages with citations
  217 Unique Citations

Python code

The following Python code was used to generate the counts above. You need to install the pandas, requests, and ipywidgets libraries. This code was executed in a Jupyter notebook.

import requests
import json

# Import pandas library 
import pandas as pd 
pd.set_option('display.max_colwidth', None)

from ipywidgets import widgets, interact
from IPython.display import display

# Date-range inputs: two editable text boxes, each rendered as soon as it is
# built. Their .value attributes are read later when the queries are run.
from_date = widgets.Text(value="01/01/2016", description="From Date:")
display(from_date)

to_date = widgets.Text(value="09/30/2021", description="To Date:")
display(to_date)

def get_repo_citations(to_date, from_date="01/01/2016"):
    """
    Fetch citations for the whole ESS-DIVE repository with a single
    metrics-service request.

    IMPORTANT: These have been found to be incomplete when compared
    to the individual doi queries for citations.

    to_date   -- upper bound of the "month" range filter
    from_date -- lower bound of the "month" range filter

    returns tuple (
        dictionary mapping "doi:<target id>" to the set of citation keys
        that list that target,
        the 'citations' entry of the response's resultDetails,
        the totalCitations value
    )
    """
    # Restrict the query to the ESS-DIVE member node ...
    repository_filter = {
        "filterType": "repository",
        "values": ["urn:node:ESS_DIVE"],
        "interpretAs": "list",
    }
    # ... and to the requested date range.
    month_filter = {
        "filterType": "month",
        "values": [from_date, to_date],
        "interpretAs": "range",
    }
    payload = json.dumps({
        "metricsPage": {"total": 0, "start": 0, "count": 0},
        "metrics": ["citations", "downloads", "views"],
        "filterBy": [repository_filter, month_filter],
        "groupBy": ["month"],
    })

    # The serialized request travels as a query-string parameter.
    url = f"https://logproc-stage-ucsb-1.test.dataone.org/metrics?metricsRequest={payload}"
    details = requests.get(url).json()['resultDetails']

    # Group citation keys by the DOI they target. Each item yielded by
    # iterating 'citations' is used as a key back into it.
    # NOTE(review): this only works if 'citations' is a mapping -- confirm
    # against the service response.
    citing_by_doi = {}
    for key in details['citations']:
        entry = details['citations'][key]
        for target in entry['target_id']:
            citing_by_doi.setdefault(f"doi:{target}", set()).add(key)

    # NOTE(review): 'resultDetails' is looked up a second time here, on the
    # object already extracted from the response's 'resultDetails' -- confirm
    # that the service really nests it this way.
    return citing_by_doi, details['citations'], details['resultDetails']['totalCitations']


def get_citations(to_date, from_date="01/01/2016"):
    """
    Get the citations for the specified date range
    
    returns tuple (dataframe, dictionary of citations)
    """
    

    # Prepare the data frame
    df = pd.DataFrame(columns=['citations', 'doi', 'title'])


    # IMPORTANT: must use archived=* to get all archived and current data packages
    response = requests.get(f"https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true&fl=id,seriesId,title&wt=json&rows=0&archived=*")
    max_rows = response.json()['response']['numFound']
    print(f"{max_rows} datasets found.")


    # query ESS-DIVE and the metrics service to get the data package citations
    #   TODO: this should be updated to page over the results if it is over 400
    response = requests.get(f"https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true&fl=id,seriesId,title&wt=json&rows={max_rows}&archived=*")
    response_json = response.json()
    individual_citations = dict()

    # Iterate over datasets and query the metrics service for citations
    for d in response_json['response']['docs']:
        series_id = 'seriesId' in d and d['seriesId'] or d['id']
        title = d['title']

        metrics_request_json = {"metricsPage":{"total":0,"start":0,"count":0},
                     "metrics":["citations","downloads","views"],
                     "filterBy":[{"filterType":"dataset","values":[series_id],"interpretAs":"list"},{"filterType":"month","values":[from_date,to_date],"interpretAs":"range"}],
                     "groupBy":["month"]}
        metrics_request = json.dumps(metrics_request_json)

        metrics_response = requests.get(f"https://logproc-stage-ucsb-1.test.dataone.org/metrics?metricsRequest={metrics_request}")

        # Get the citations from the metrics response
        unique_citations = [c['source_id'] for c in metrics_response.json()['resultDetails']['citations']]
        print(f"{len(unique_citations)} ", end="")

        # append to data frame
        df = df.append({'citations': len(unique_citations), 
                        'doi': series_id, 
                        'title': d['title']}, ignore_index=True)
        individual_citations[series_id]=set(un

Count the citations

# Dataset-level citations: one metrics query per data package.
df, individual_citations = get_citations(to_date.value, from_date=from_date.value)
has_citations = df['citations'] > 0
df_has_citations = df[has_citations]
print(df_has_citations.shape[0])

# Show every cited data package, most-cited first. (A trailing
# .head(<row count>) would return all rows anyway, so it is omitted.)
df_has_citations = df_has_citations.sort_values(by=['citations'], ascending=False)
from IPython.display import display, HTML
display(HTML(df_has_citations.to_html(index=False)))

# Repository-wide citations: a single metrics query for the whole repository.
unique_repo_citations, repo_query_result, total_citations = get_repo_citations(to_date.value, from_date=from_date.value)

# unique_repo_citations is keyed by target DOI, so its length is the number
# of cited DOIs, not the number of distinct citations. Union the per-DOI sets
# to get a figure comparable with the dataset-level "Unique Citations".
unique_repo = set().union(*unique_repo_citations.values())

# Distinct citations seen across all of the per-dataset queries.
unique = set().union(*individual_citations.values())

print("Repository-wide query")
print(f"  {total_citations} Total Count Returned (totalCitations)")
print(f"  {len(repo_query_result)} Total Citations Returned (citations)")
print(f"  {len(unique_repo_citations)} Cited DOIs (from citations)")
print(f"  {len(unique_repo)} Unique Citations (from citations)")
print("Dataset-level query")
print(f"  {df_has_citations['citations'].sum()} Total Citations")
print(f"  {df_has_citations['citations'].count()} Data packages with citations")
print(f"  {len(unique)} Unique Citations")

Metadata

Metadata

Labels

bug: Something isn't working

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions