-
Couldn't load subscription status.
- Fork 1
Open
Labels
bug: Something isn't working
Description
In trying to assess the total number of citations for the ESS-DIVE repository, it was discovered that the citation count is off when querying by repository vs querying citations by individual datasets.
Citations from 1/1/2016 to 11/30/2021
Repository-wide query
235 Total Count Returned (totalCitations)
216 Total Citations Returned (citations)
71 Unique Citations (from citations)
Dataset-level query
330 Total Citations
121 Data packages with citations
217 Unique Citations
Python code
The following Python code was used to generate the counts above. You need to install the pandas and requests libraries (the date-input widgets also require ipywidgets). This code was executed in a Jupyter notebook.
import requests
import json
# Import pandas library
import pandas as pd
pd.set_option('display.max_colwidth', None)
from ipywidgets import widgets, interact
from IPython.display import display
# Date-range inputs: two editable text boxes whose ``.value`` is read later
# when the citation queries are issued.
from_date = widgets.Text("01/01/2016", description="From Date:")
to_date = widgets.Text("09/30/2021", description="To Date:")
# Render the boxes in the notebook, "from" first.
for date_widget in (from_date, to_date):
    display(date_widget)
def get_repo_citations(to_date, from_date="01/01/2016"):
    """
    Repository level citations from the metrics service.

    IMPORTANT: These have been found to be incomplete when compared
    to the individual doi queries for citations.

    to_date   -- end of the month range, sent to the service unchanged
    from_date -- start of the month range, sent to the service unchanged

    returns tuple (
        dictionary mapping "doi:<target id>" to the set of citing source ids,
        the raw ``citations`` value from the response's ``resultDetails``,
        the ``totalCitations`` value from the response's ``resultDetails``
    )

    Raises requests.HTTPError when the service answers with an error status.
    """
    metrics_request_json = {
        "metricsPage": {
            "total": 0,
            "start": 0,
            "count": 0,
        },
        "metrics": [
            "citations",
            "downloads",
            "views",
        ],
        "filterBy": [
            {
                "filterType": "repository",
                "values": [
                    "urn:node:ESS_DIVE",
                ],
                "interpretAs": "list",
            },
            {
                "filterType": "month",
                "values": [
                    from_date,
                    to_date,
                ],
                "interpretAs": "range",
            },
        ],
        "groupBy": [
            "month",
        ],
    }

    # Let requests percent-encode the JSON document instead of splicing it
    # raw into the URL, so quotes, braces and spaces are always escaped.
    metrics_response = requests.get(
        "https://logproc-stage-ucsb-1.test.dataone.org/metrics",
        params={"metricsRequest": json.dumps(metrics_request_json)},
    )
    # Fail with the HTTP error rather than a confusing JSON/KeyError later.
    metrics_response.raise_for_status()

    result_details = metrics_response.json()['resultDetails']
    citations = result_details['citations']

    # The original loop indexed ``citations`` by its own elements, which only
    # works for a mapping keyed by the citing source. The per-dataset query in
    # this file treats the same field as a list of records carrying
    # 'source_id', so accept both shapes.
    if isinstance(citations, dict):
        entries = citations.items()
    else:
        entries = ((record['source_id'], record) for record in citations)

    repo_citations = {}
    for source_id, citation in entries:
        target_ids = citation['target_id']
        # A bare string would otherwise be iterated character by character.
        if isinstance(target_ids, str):
            target_ids = [target_ids]
        for target_id in target_ids:
            repo_citations.setdefault(f"doi:{target_id}", set()).add(source_id)

    # NOTE(review): 'resultDetails' was previously indexed a second time here,
    # which cannot succeed on the already-unwrapped object. This assumes
    # 'totalCitations' sits next to 'citations' -- confirm against the
    # metrics service response.
    return repo_citations, citations, result_details['totalCitations']
def get_citations(to_date, from_date="01/01/2016"):
    """
    Get the citations for the specified date range

    Queries the ESS-DIVE Solr index for every public, non-obsoleted EML
    data package, then asks the metrics service for the citations of each
    package individually.

    to_date   -- end of the month range, sent to the metrics service unchanged
    from_date -- start of the month range, sent to the metrics service unchanged

    returns tuple (dataframe, dictionary of citations)
        dataframe  -- one row per data package with columns
                      'citations' (number of citation records returned),
                      'doi' (series id, or id when there is no series id)
                      and 'title'
        dictionary -- maps the same series id / id to the set of citing
                      source ids

    Raises requests.HTTPError when either service answers with an error status.
    """
    # IMPORTANT must use archived=* to get all archived and current data packages
    solr_query_url = (
        "https://data.ess-dive.lbl.gov/catalog/d1/mn/v2/query/solr"
        "?q=formatId:*eml*+AND+NOT+obsoletedBy:*+AND+isPublic:true"
        "&fl=id,seriesId,title&wt=json"
    )
    metrics_url = "https://logproc-stage-ucsb-1.test.dataone.org/metrics"

    # Rows are collected in a plain list and turned into a DataFrame once;
    # DataFrame.append was removed in pandas 2.0 and copied the frame on
    # every call.
    rows = []
    individual_citations = {}

    # One session so the per-dataset requests reuse the same connection.
    with requests.Session() as session:
        # rows=0: only the hit count is needed from this first query.
        response = session.get(f"{solr_query_url}&rows=0&archived=*")
        response.raise_for_status()
        max_rows = response.json()['response']['numFound']
        print(f"{max_rows} datasets found.")

        # query ESS-DIVE and the metrics service to get the data package citations
        # TODO: this should be updated to page over the results if it is over 400
        response = session.get(f"{solr_query_url}&rows={max_rows}&archived=*")
        response.raise_for_status()
        response_json = response.json()

        # Iterate over datasets and query the metrics service for citations
        for d in response_json['response']['docs']:
            # Prefer the series id; fall back to the id when it is missing or empty.
            series_id = d.get('seriesId') or d['id']
            title = d['title']
            metrics_request_json = {
                "metricsPage": {"total": 0, "start": 0, "count": 0},
                "metrics": ["citations", "downloads", "views"],
                "filterBy": [
                    {"filterType": "dataset", "values": [series_id], "interpretAs": "list"},
                    {"filterType": "month", "values": [from_date, to_date], "interpretAs": "range"},
                ],
                "groupBy": ["month"],
            }
            # params= percent-encodes the JSON, so identifiers containing
            # characters such as '&' or '#' cannot corrupt the query string.
            metrics_response = session.get(
                metrics_url,
                params={"metricsRequest": json.dumps(metrics_request_json)},
            )
            metrics_response.raise_for_status()

            # Get the citations from the metrics response
            source_ids = [
                c['source_id']
                for c in metrics_response.json()['resultDetails']['citations']
            ]
            # Progress indicator: one count per dataset on a single line.
            print(f"{len(source_ids)} ", end="")

            rows.append({'citations': len(source_ids),
                         'doi': series_id,
                         'title': title})
            individual_citations[series_id] = set(source_ids)

    df = pd.DataFrame(rows, columns=['citations', 'doi', 'title'])
    return df, individual_citations


# Counts the citations
# Dataset level citations
from IPython.display import display, HTML

df, individual_citations = get_citations(to_date.value, from_date=from_date.value)
has_citations = df['citations'] > 0
df_has_citations = df[has_citations]
print(df_has_citations.shape[0])
# Most-cited data packages first.
df_has_citations = df_has_citations.sort_values(by=['citations'], ascending=False)
display(HTML(df_has_citations.to_html(index=False)))

# Repository-wide citations
unique_repo_citations, repo_query_result, total_citations = get_repo_citations(to_date.value, from_date=from_date.value)

# unique_repo_citations is keyed by cited DOI, so its length is the number of
# cited data packages. The number of unique citations is the number of
# distinct citing sources across all of those DOIs, which is how the
# dataset-level figure below is computed, so the two are comparable.
unique_repo_sources = set().union(*unique_repo_citations.values())

# Distinct citing sources across all individually queried data packages.
unique = set().union(*individual_citations.values())

print("Repository-wide query")
print(f" {total_citations} Total Count Returned (totalCitations)")
print(f" {len(repo_query_result)} Total Citations Returned (citations)")
print(f" {len(unique_repo_citations)} Data packages with citations")
print(f" {len(unique_repo_sources)} Unique Citations (from citations)")
print("Dataset-level query")
print(f" {df_has_citations['citations'].sum()} Total Citations")
print(f" {df_has_citations['citations'].count()} Data packages with citations")
print(f" {len(unique)} Unique Citations")
Metadata
Assignees
Labels
bug: Something isn't working