Merge pull request #162 from MaRDI4NFDI/zenodo

generalized zenodo importer
MaRDI4NFDI · Feb 20, 2025 · fd39e6f · fd39e6f
2 parents 4f06000 + b9f2f00
commit fd39e6f
Show file tree

Hide file tree

Showing 4 changed files with 49,452 additions and 35 deletions.
diff --git a/mardi_importer/mardi_importer/publications/ZenodoResource.py b/mardi_importer/mardi_importer/publications/ZenodoResource.py
@@ -83,7 +83,7 @@ def publication_date(self):
 
     @property
     def license(self):
-        if not self._license:
-            self._license = self.metadata['license']
-        return self._license            
+        """License metadata of the record; left unset when the metadata has no 'license' key."""
+        self._license = self._license or self.metadata.get('license', self._license)
+        return self._license
 
@@ -131,12 +131,13 @@ def resource_type(self):
 
     @property
     def communities(self):
-        if not self._communities:
-            for communityCur in self.metadata["communities"]:
-                community_id = communityCur.get("id")
-                if community_id == "mathplus":
-                    community = Community(api = self.api, community_id = community_id)
-                    self._communities.append(community)
+        """Community items for this record; only the "mathplus" community is mapped."""
+        if self._communities or "communities" not in self.metadata:
+            return self._communities
+        for entry in self.metadata["communities"]:
+            if entry.get("id") == "mathplus":
+                mathplus = Community(api = self.api, community_id = "mathplus")
+                self._communities.append(mathplus)
         return self._communities
 
     @property
@@ -166,7 +167,7 @@ def update(self):
 
         zenodo_id = zenodo_item.is_instance_of_with_property("wd:Q1172284", "wdt:P4901", self.zenodo_id)
         new_item = self.api.item.get(entity_id=zenodo_id)
-
+        
         if self.license['id'] == "cc-by-4.0":
             new_item.add_claim("wdt:P275", "wd:Q20007257")
         elif self.license['id'] == "cc-by-sa-4.0":
@@ -186,21 +187,22 @@ def create(self, update = False):
                 return self.QID
 
             item = self.api.item.new()
+
         else:
             item = self.api.item.get(entity_id=self.QID)
-        # Add title
+
+
         if self.title:
             item.labels.set(language="en", value=self.title)
 
-
         if self.resource_type and self.resource_type != "wd:Q37866906":
-            desc = f"{self.metadata['resource_type']['title']} published at Zenodo repository. "
-            item.add_claim('wdt:P31',self.resource_type)
+                desc = f"{self.metadata['resource_type']['title']} published at Zenodo repository. "
+                item.add_claim('wdt:P31',self.resource_type)
         else:
             desc = "Resource published at Zenodo repository. "
-
         item.descriptions.set(language="en", value=desc)
 
+
         if self.description:
             prop_nr = self.api.get_local_id_by_label("description", "property")
             item.add_claim(prop_nr, self.description)
@@ -224,14 +226,15 @@ def create(self, update = False):
             item.add_claim('wdt:P356', doi)
 
         # License
-        if self.license['id'] == "cc-by-4.0":
-            item.add_claim("wdt:P275", "wd:Q20007257")
-        elif self.license['id'] == "cc-by-sa-4.0":
-            item.add_claim("wdt:P275", "wd:Q18199165")
-        elif self.license['id'] == "cc-by-nc-sa-4.0":
-            item.add_claim("wdt:P275", "wd:Q42553662")
-        elif self.license['id'] == "mit-license":
-            item.add_claim("wdt:P275", "wd:Q334661")
+        if self.license:
+            if self.license['id'] == "cc-by-4.0":
+                item.add_claim("wdt:P275", "wd:Q20007257")
+            elif self.license['id'] == "cc-by-sa-4.0":
+                item.add_claim("wdt:P275", "wd:Q18199165")
+            elif self.license['id'] == "cc-by-nc-sa-4.0":
+                item.add_claim("wdt:P275", "wd:Q42553662")
+            elif self.license['id'] == "mit-license":
+                item.add_claim("wdt:P275", "wd:Q334661")
 
         # Communities
         if self.communities:
@@ -248,6 +251,9 @@ def create(self, update = False):
 
         if self._mardi_type:
             item.add_claim('MaRDI profile type', self._mardi_type)
+
+
+        #print(item.claims.get_json())
 
         self.QID = item.write().id
 

diff --git a/mardi_importer/mardi_importer/scripts/import.py b/mardi_importer/mardi_importer/scripts/import.py
@@ -21,7 +21,7 @@ def get_parser():
     parser.add_argument("--wikidata_id_file_path", required=False)
     return parser
 
-def main(**args):
+def main(**args): 
     # logging.config.fileConfig("logging_config.ini", disable_existing_loggers=False)
     # Parse command-line arguments
 
@@ -32,7 +32,7 @@ def main(**args):
         conf_parser = ZBMathConfigParser(args["conf_path"])
         conf = conf_parser.parse_config()
 
-        data_source = ZBMathSource(
+        data_source = ZBMathSource( 
             out_dir=conf["out_dir"],
             tags=conf["tags"],
             from_date=conf["from_date"],
@@ -66,7 +66,7 @@ def main(**args):
         importer.import_all()
 
     elif args["mode"] == "zenodo":
-        data_source = ZenodoSource()
+        data_source = ZenodoSource(resourceTypes = ["dataset"], orcid_id_file = "/mardi_importer/mardi_importer/zenodo/orcids-all.csv")
         importer = Importer(data_source)
         importer.import_all()
 

diff --git a/mardi_importer/mardi_importer/zenodo/ZenodoSource.py b/mardi_importer/mardi_importer/zenodo/ZenodoSource.py
@@ -1,28 +1,63 @@
 import requests
 import json
 import os
+import sys
+import pandas as pd
 
 from mardi_importer.importer import ADataSource
 from mardi_importer.integrator import MardiIntegrator
 from mardi_importer.publications import ZenodoResource
+from typing import List
 
 class ZenodoSource(ADataSource):
     """Reads data from Zenodo API."""
 
     def __init__(
-        self
+        self,
+        communities: List[str] = None,
+        resourceTypes: List[str] = None,
+        orcid_id_file: str = None,
+        customQ: str = None,
     ):    
+        """Store the search filters used by pull().
+
+        If orcid_id_file is given, its ORCID iDs are loaded via parse_orcids().
+        """
         self.integrator = MardiIntegrator()
         self.zenodo_ids = []    
         self.filepath = os.path.realpath(os.path.dirname(__file__))
 
+        self.communities = communities
+        self.resourceTypes = resourceTypes
+        self.orcid_id_file = orcid_id_file
+        self.customQ = customQ
+        self.orcid_ids = self.parse_orcids(orcid_id_file) if orcid_id_file else None
+        if not (communities or resourceTypes or orcid_id_file or customQ):
+            import warnings
+            warnings.warn("ZenodoSource created without any filter; pull() will send an unfiltered query.")
+
     def setup(self):
-        """Create all necessary properties and entities for zenodo"""
-
-        filename = self.filepath + "/wikidata_entities.txt"
-        self.integrator.import_entities(filename=filename)
-        self.create_local_entities() 
+        """Import the entities listed in wikidata_entities.txt, then create the local ones."""
+        entity_list = self.filepath + "/wikidata_entities.txt"
+
+        self.integrator.import_entities(filename=entity_list)
+        self.create_local_entities()
 
+    @staticmethod
+    def parse_orcids(file):
+        """Return the unique, non-empty 'orcid' column values of the CSV at *file*.
+
+        Terminates via sys.exit() when the file or the column is missing.
+        """
+        if not os.path.isfile(file):
+            sys.exit("File " + file + " not found")
+
+        orcid_df = pd.read_csv(file)
+        if "orcid" not in orcid_df.columns:
+            sys.exit("The file containing ORCID IDs must contain a column 'orcid'.")
+        # drop_duplicates() returns a new object, so chain it instead of discarding it.
+        return orcid_df["orcid"].dropna().drop_duplicates().tolist()
+
     def create_local_entities(self):
         filename = self.filepath + "/new_entities.json"
         f = open(filename)
@@ -51,19 +86,64 @@ def pull(self):
         This method queries the Zenodo API to get a data dump of all records.
         """
 
-        response = requests.get('https://zenodo.org/api/records',
-                                params={'size' : 1,
-                                'communities' : 'mathplus'})
-        response_json = response.json()
-        total_hits = response_json.get("hits").get("total")
-
-        for page in range(1, total_hits+1):
-            url = 'https://zenodo.org/api/records?communities=mathplus&page=' + str(page) + "&size=1&sort=newest"
-            response = requests.get(url)    
-            response_json = response.json()
-
-            zenodo_id = response_json.get("hits").get("hits")[0].get("id")
-            self.zenodo_ids.append(str(zenodo_id))
+        # Filters shared by every request; they are AND-combined below.
+        filters = []
+        if self.communities:
+            filters.append("communities:(" + ' '.join(self.communities) + ")")
+        if self.resourceTypes:
+            filters.append("resource_type.type:(" + ' OR '.join(self.resourceTypes) + ")")
+        if self.customQ:
+            filters.append(self.customQ)
+
+        # Long ORCID lists are queried in batches to keep each query string short.
+        # range() stops before len(), so no empty trailing batch is ever sent.
+        batch_size = 50
+        if self.orcid_ids:
+            batches = [self.orcid_ids[start:start + batch_size]
+                       for start in range(0, len(self.orcid_ids), batch_size)]
+        else:
+            batches = [None]
+
+        seen = set(self.zenodo_ids)
+        for batch in batches:
+            clauses = list(filters)
+            if batch is not None:
+                # Raw string keeps the literal backslash without an invalid-escape warning.
+                orcid_str = r'metadata.creators.\*:("' + '" "'.join(batch) + '")'
+                print("retrieving zenodo entries for the following ORCID IDs: " + orcid_str)
+                clauses.append(orcid_str)
+            # Joining the clause list avoids a dangling "AND" when no other filter is set.
+            query = ' AND '.join(clauses)
+
+            page_cur = 1
+            fetched = 0
+            while True:
+                response = requests.get('https://zenodo.org/api/records',
+                                        params={'q': query,
+                                                'sort': '-mostrecent',
+                                                'size': 50,
+                                                'page': page_cur},
+                                        timeout=60)
+                # Fail loudly on HTTP errors instead of crashing on a missing "hits" key.
+                response.raise_for_status()
+                hits = response.json().get("hits") or {}
+                entries = hits.get("hits") or []
+                if not entries:
+                    # An empty page means the result set is exhausted; stop even if
+                    # the reported total has not been reached, to avoid looping forever.
+                    break
+
+                for entry in entries:
+                    zenodo_id = str(entry.get("id"))
+                    # A record can match several ORCID batches; import it only once.
+                    if zenodo_id not in seen:
+                        seen.add(zenodo_id)
+                        self.zenodo_ids.append(zenodo_id)
+
+                fetched += len(entries)
+                if fetched >= hits.get("total", 0):
+                    break
+                page_cur += 1
 
     def push(self):
         for id in self.zenodo_ids: