Skip to content

Commit

Permalink
Merge pull request #162 from MaRDI4NFDI/zenodo
Browse files Browse the repository at this point in the history
generalized zenodo importer
  • Loading branch information
rimmoussa authored Feb 20, 2025
2 parents 4f06000 + b9f2f00 commit fd39e6f
Show file tree
Hide file tree
Showing 4 changed files with 49,452 additions and 35 deletions.
48 changes: 27 additions & 21 deletions mardi_importer/mardi_importer/publications/ZenodoResource.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ def publication_date(self):

@property
def license(self):
    """Return the license entry of this record, loading it on first access.

    The value is copied from ``self.metadata['license']`` the first time it
    is available and cached in ``self._license``. When the metadata has no
    ``'license'`` key, the cached value is returned unchanged, so callers
    must be prepared for a falsy result.
    """
    # NOTE(review): callers index the result with ['id'], so the entry is
    # presumably a mapping with an 'id' key -- confirm against the Zenodo API.
    if not self._license and "license" in self.metadata:
        self._license = self.metadata["license"]
    return self._license

Expand Down Expand Up @@ -131,12 +131,13 @@ def resource_type(self):

@property
def communities(self):
    """Return the communities of this record that are imported.

    On first access the ``"communities"`` entries of ``self.metadata`` are
    scanned and a ``Community`` object is appended to ``self._communities``
    for every entry whose ``id`` is ``"mathplus"``; all other ids are
    ignored. Records without a ``"communities"`` entry are tolerated and
    leave the cached list untouched.
    """
    if not self._communities:
        # .get() instead of indexing: not every Zenodo record belongs to a
        # community, and a missing key must not raise KeyError.
        for entry in self.metadata.get("communities") or []:
            community_id = entry.get("id")
            if community_id == "mathplus":
                self._communities.append(
                    Community(api=self.api, community_id=community_id)
                )
    return self._communities

@property
Expand Down Expand Up @@ -166,7 +167,7 @@ def update(self):

zenodo_id = zenodo_item.is_instance_of_with_property("wd:Q1172284", "wdt:P4901", self.zenodo_id)
new_item = self.api.item.get(entity_id=zenodo_id)

if self.license['id'] == "cc-by-4.0":
new_item.add_claim("wdt:P275", "wd:Q20007257")
elif self.license['id'] == "cc-by-sa-4.0":
Expand All @@ -186,21 +187,22 @@ def create(self, update = False):
return self.QID

item = self.api.item.new()

else:
item = self.api.item.get(entity_id=self.QID)
# Add title


if self.title:
item.labels.set(language="en", value=self.title)


if self.resource_type and self.resource_type != "wd:Q37866906":
desc = f"{self.metadata['resource_type']['title']} published at Zenodo repository. "
item.add_claim('wdt:P31',self.resource_type)
desc = f"{self.metadata['resource_type']['title']} published at Zenodo repository. "
item.add_claim('wdt:P31',self.resource_type)
else:
desc = "Resource published at Zenodo repository. "

item.descriptions.set(language="en", value=desc)


if self.description:
prop_nr = self.api.get_local_id_by_label("description", "property")
item.add_claim(prop_nr, self.description)
Expand All @@ -224,14 +226,15 @@ def create(self, update = False):
item.add_claim('wdt:P356', doi)

# License
if self.license['id'] == "cc-by-4.0":
item.add_claim("wdt:P275", "wd:Q20007257")
elif self.license['id'] == "cc-by-sa-4.0":
item.add_claim("wdt:P275", "wd:Q18199165")
elif self.license['id'] == "cc-by-nc-sa-4.0":
item.add_claim("wdt:P275", "wd:Q42553662")
elif self.license['id'] == "mit-license":
item.add_claim("wdt:P275", "wd:Q334661")
if self.license:
if self.license['id'] == "cc-by-4.0":
item.add_claim("wdt:P275", "wd:Q20007257")
elif self.license['id'] == "cc-by-sa-4.0":
item.add_claim("wdt:P275", "wd:Q18199165")
elif self.license['id'] == "cc-by-nc-sa-4.0":
item.add_claim("wdt:P275", "wd:Q42553662")
elif self.license['id'] == "mit-license":
item.add_claim("wdt:P275", "wd:Q334661")

# Communities
if self.communities:
Expand All @@ -248,6 +251,9 @@ def create(self, update = False):

if self._mardi_type:
item.add_claim('MaRDI profile type', self._mardi_type)


#print(item.claims.get_json())

self.QID = item.write().id

Expand Down
6 changes: 3 additions & 3 deletions mardi_importer/mardi_importer/scripts/import.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def get_parser():
parser.add_argument("--wikidata_id_file_path", required=False)
return parser

def main(**args):
def main(**args):
# logging.config.fileConfig("logging_config.ini", disable_existing_loggers=False)
# Parse command-line arguments

Expand All @@ -32,7 +32,7 @@ def main(**args):
conf_parser = ZBMathConfigParser(args["conf_path"])
conf = conf_parser.parse_config()

data_source = ZBMathSource(
data_source = ZBMathSource(
out_dir=conf["out_dir"],
tags=conf["tags"],
from_date=conf["from_date"],
Expand Down Expand Up @@ -66,7 +66,7 @@ def main(**args):
importer.import_all()

elif args["mode"] == "zenodo":
data_source = ZenodoSource()
data_source = ZenodoSource(resourceTypes = ["dataset"], orcid_id_file = "/mardi_importer/mardi_importer/zenodo/orcids-all.csv")
importer = Importer(data_source)
importer.import_all()

Expand Down
121 changes: 110 additions & 11 deletions mardi_importer/mardi_importer/zenodo/ZenodoSource.py
Original file line number Diff line number Diff line change
@@ -1,28 +1,63 @@
import requests
import json
import os
import sys
import pandas as pd

from mardi_importer.importer import ADataSource
from mardi_importer.integrator import MardiIntegrator
from mardi_importer.publications import ZenodoResource
from typing import List

class ZenodoSource(ADataSource):
"""Reads data from Zenodo API."""

def __init__(
    self,
    communities: List[str] = None,
    resourceTypes: List[str] = None,
    orcid_id_file: str = None,
    customQ: str = None,
):
    """Configure which Zenodo records this source selects.

    Args:
        communities: Community identifiers; ``pull`` joins them into a
            ``communities:(...)`` query term.
        resourceTypes: Resource type names; ``pull`` joins them with
            ``OR`` into a ``resource_type.type:(...)`` query term.
        orcid_id_file: Path to a CSV file with an ``orcid`` column. When
            given, it is parsed immediately with ``parse_orcids``.
        customQ: Additional query fragment that ``pull`` appends
            verbatim to the search query.
    """
    import warnings

    self.integrator = MardiIntegrator()
    self.zenodo_ids = []
    self.filepath = os.path.realpath(os.path.dirname(__file__))

    self.communities = communities
    self.resourceTypes = resourceTypes
    self.orcid_id_file = orcid_id_file
    self.customQ = customQ
    self.orcid_ids = None

    if self.orcid_id_file:
        self.orcid_ids = self.parse_orcids(orcid_id_file)

    # Without any filter the search query is empty and is not restricted
    # to a subset of Zenodo, which is almost certainly not intended.
    if not any((communities, resourceTypes, orcid_id_file, customQ)):
        warnings.warn(
            "ZenodoSource was created without communities, resourceTypes, "
            "orcid_id_file or customQ; the Zenodo query will be unrestricted.",
            stacklevel=2,
        )


def setup(self):
"""Create all necessary properties and entities for zenodo"""

filename = self.filepath + "/wikidata_entities.txt"
self.integrator.import_entities(filename=filename)
self.create_local_entities()

@staticmethod
def parse_orcids(file):
    """Read the ORCID iDs listed in a CSV file.

    Args:
        file: Path to a CSV file that contains a column named ``orcid``.

    Returns:
        The values of the ``orcid`` column as a list, with empty cells
        and duplicates removed (first occurrence kept, order preserved).

    Raises:
        SystemExit: If ``file`` does not exist or has no ``orcid`` column.
    """
    # Check that the path exists and points to a file.
    if not os.path.isfile(file):
        sys.exit(f"File {file} not found")

    orcid_df = pd.read_csv(file)
    if "orcid" not in orcid_df.columns:
        sys.exit("The file containing ORCID IDs must contain a column 'orcid'.")

    # drop_duplicates() returns a new object, so its result must be used.
    # Deduplicate on the column itself: rows that differ only in other
    # columns would otherwise keep the same ORCID several times. Empty
    # cells are dropped because they are read as NaN, not as strings.
    return orcid_df["orcid"].dropna().drop_duplicates().tolist()

def create_local_entities(self):
filename = self.filepath + "/new_entities.json"
f = open(filename)
Expand Down Expand Up @@ -51,19 +86,83 @@ def pull(self):
This method queries the Zenodo API to get a data dump of all records.
"""

response = requests.get('https://zenodo.org/api/records',
params={'size' : 1,
'communities' : 'mathplus'})
response_json = response.json()
total_hits = response_json.get("hits").get("total")
total_hits = 0
q_list = []

for page in range(1, total_hits+1):
url = 'https://zenodo.org/api/records?communities=mathplus&page=' + str(page) + "&size=1&sort=newest"
response = requests.get(url)
if self.communities:
community_str = "communities:(" + ' '.join(self.communities) + ")"
q_list.append(community_str)
if self.resourceTypes:
resources_str = "resource_type.type:(" + ' OR '.join(self.resourceTypes) + ")"
q_list.append(resources_str)
if self.customQ:
q_list.append(self.customQ)

q_str = ' AND '.join(q_list)

if self.orcid_ids:
i=0
while i <= len(self.orcid_ids): # if there are too many orcids the initial request needs to be sent out in batches

orcid_str ='metadata.creators.\*:("' + '" "'.join(self.orcid_ids[i:i+50]) + '")'
print("retrieving zenodo entries for the following ORCID IDs: " + orcid_str)

response = requests.get('https://zenodo.org/api/records',
params={'q' : q_str + ' AND ' + orcid_str,
'sort':'-mostrecent'})
response_json = response.json()
total_hits = response_json.get("hits").get("total")

page_cur = 1
while total_hits > 0:
response = requests.get('https://zenodo.org/api/records',
params={'q' : q_str + ' AND ' + orcid_str,
'sort':'-mostrecent',
'size' : 50,
'page' : page_cur})
response_json = response.json()
total_hits = total_hits - len(response_json.get("hits").get("hits"))
page_cur = page_cur + 1

for entry in response_json.get("hits").get("hits"):
self.zenodo_ids.append(str(entry.get("id")))

i = i+50
else:
response = requests.get('https://zenodo.org/api/records',
params={'q' : q_str})
response_json = response.json()
total_hits = response_json.get("hits").get("total")

page_cur = 1
while total_hits > 0:
response = requests.get('https://zenodo.org/api/records',
params={'q' : q_str,
'sort':'-mostrecent',
'size' : 50,
'page' : page_cur})
response_json = response.json()
total_hits = total_hits - len(response_json.get("hits").get("hits"))
page_cur = page_cur + 1

for entry in response_json.get("hits").get("hits"):
self.zenodo_ids.append(str(entry.get("id")))



# response = requests.get('https://zenodo.org/api/records',
# params={'size' : 1,
# 'communities' : 'mathplus'})
# response_json = response.json()
# total_hits = response_json.get("hits").get("total")

# for page in range(1, total_hits+1):
# url = 'https://zenodo.org/api/records?communities=mathplus&page=' + str(page) + "&size=1&sort=newest"
# response = requests.get(url)
# response_json = response.json()

zenodo_id = response_json.get("hits").get("hits")[0].get("id")
self.zenodo_ids.append(str(zenodo_id))
# zenodo_id = response_json.get("hits").get("hits")[0].get("id")
# self.zenodo_ids.append(str(zenodo_id))

def push(self):
for id in self.zenodo_ids:
Expand Down
Loading

0 comments on commit fd39e6f

Please sign in to comment.