Commit
Merge pull request #193 from pangaea-data-publisher/robbranch2
v1.3.5
huberrob authored Jul 21, 2021
2 parents 22462d6 + 5b25983 commit 7248d41
Showing 6 changed files with 78 additions and 28 deletions.
29 changes: 21 additions & 8 deletions fuji_server/controllers/fair_check.py
@@ -106,7 +106,7 @@ class FAIRCheck:
IDENTIFIERS_ORG_DATA = {}
GOOGLE_DATA_DOI_CACHE =[]
GOOGLE_DATA_URL_CACHE = []
FUJI_VERSION = 'v1.3.4'
FUJI_VERSION = 'v1.3.5'

def __init__(self, uid, test_debug=False, metadata_service_url=None, metadata_service_type =None,use_datacite=True, oaipmh_endpoint = None):
uid_bytes = uid.encode('utf-8')
@@ -223,14 +223,23 @@ def uri_validator(u): # TODO integrate into request_helper.py

def set_remote_logging_target(self, host, path):
if host and path:
isHostUp = False
try:
weblogger = logging.handlers.HTTPHandler(host,
path + '?testid=' + str(self.test_id), method='POST')
webformatter = logging.Formatter('%(levelname)s - %(message)s \r\n')
weblogger.setFormatter(webformatter)
self.logger.addHandler(weblogger)
if urllib.urlopen('http://'+host+''+path).getcode() == 200:
isHostUp = True
except Exception as e:
print('Remote logging not possible, please check config.ini, host not reachable: http://'+str(host)+''+str(path))
print(e)
if isHostUp:
try:
weblogger = logging.handlers.HTTPHandler(host,
path + '?testid=' + str(self.test_id), method='POST')
webformatter = logging.Formatter('%(levelname)s - %(message)s \r\n')
weblogger.setFormatter(webformatter)
self.logger.addHandler(weblogger)
except Exception as e:
print(e)



def validate_service_url(self):
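
The new guard probes the remote logging endpoint first and only attaches the HTTP log handler when the probe succeeds. The committed probe uses urllib.urlopen, the Python 2 spelling; under Python 3 the equivalent is urllib.request.urlopen, which the following minimal sketch of the same pattern uses (the helper name is illustrative, not part of the codebase):

    import logging
    import logging.handlers
    import urllib.request

    def attach_remote_logger(logger, host, path, test_id, timeout=5):
        # probe the endpoint before wiring it up as a log target
        probe_url = 'http://' + host + path
        try:
            host_is_up = urllib.request.urlopen(probe_url, timeout=timeout).getcode() == 200
        except Exception as exc:
            print('Remote logging not possible, host not reachable: ' + probe_url)
            print(exc)
            return False
        if host_is_up:
            handler = logging.handlers.HTTPHandler(host, path + '?testid=' + str(test_id), method='POST')
            handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s \r\n'))
            logger.addHandler(handler)
        return host_is_up
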
@@ -309,7 +318,9 @@ def retrieve_apis_standards(self):

if self.metadata_service_url not in [None,'']:
self.logger.info('FsF-R1.3-01M : Metadata service endpoint ('+str(self.metadata_service_type)+') provided as part of the request -: '+str(self.metadata_service_url))
else:
#else:
#check re3data always instead...
if self.use_datacite:
self.logger.info('FsF-R1.3-01M : Trying to retrieve metadata info from re3data/datacite services using client id -: '+str(client_id))
#find endpoint via datacite/re3data if pid is provided
#print(client_id ,self.pid_scheme)
@@ -322,7 +333,9 @@
self.sparql_endpoint = repoHelper.getRe3MetadataAPIs().get('SPARQL')
self.community_standards.extend(repoHelper.getRe3MetadataStandards())
self.logger.info('{} : Metadata standards listed in re3data record -: {}'.format('FsF-R1.3-01M', self.community_standards ))
# verify the service url by domain matching
else:
self.logger.info('FsF-R1.3-01M : Skipped re3data metadata standards query since Datacite support is disabled by user')
# verify the service url by domain matching
self.validate_service_url()
# retrieve metadata standards info from oai-pmh
if self.oaipmh_endpoint:
2 changes: 1 addition & 1 deletion fuji_server/data/metadata_standards.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion fuji_server/data/metadata_standards_uris.json

Large diffs are not rendered by default.

32 changes: 21 additions & 11 deletions fuji_server/evaluators/fair_evaluator_data_content_metadata.py
@@ -51,7 +51,7 @@ def evaluate(self):
if resource_type:
resource_type = str(resource_type).lower()
if str(resource_type).startswith('http'):
resource_type = resource_type.split('/')[-1]
resource_type = '/'.join(str(resource_type).split('/')[-2:])
if resource_type in self.fuji.VALID_RESOURCE_TYPES or resource_type in self.fuji.SCHEMA_ORG_CONTEXT:
self.logger.log(self.fuji.LOG_SUCCESS,'FsF-R1-01MD : Resource type specified -: {}'.format(resource_type))
self.output.object_type = resource_type
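
Keeping the last two path segments instead of only the last one normalizes a type given as a URI to a namespaced value such as 'schema.org/dataset', apparently so it can be checked against the VALID_RESOURCE_TYPES and SCHEMA_ORG_CONTEXT lists in that form. A quick illustration (the example value is illustrative):

    resource_type = 'http://schema.org/dataset'
    old_form = resource_type.split('/')[-1]              # 'dataset'
    new_form = '/'.join(resource_type.split('/')[-2:])   # 'schema.org/dataset'
    print(old_form, new_form)
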
@@ -81,13 +81,14 @@ def evaluate(self):
'FsF-R1-01MD : Selected content file to be analyzed -: {}'.format(test_data_content_url))
try:
# Use Tika to parse the file
response_content=None
response_body=[]
timeout = 10
tika_content_size = 0
max_download_size =1000000

start = time.time()
r = requests.get(test_data_content_url, verify=False, stream=True)
#r = requests.get(test_data_content_url, verify=False, stream=True)
try:
response = urllib.request.urlopen(test_data_content_url)
while True:
@@ -103,30 +104,39 @@ def evaluate(self):
timeout) + ' sec or receiving > ' + str(max_download_size) + '- {}'.format(
test_data_content_url))
tika_content_size = 0
tika_content_size = str(r.headers.get('content-length')).split(';')[0]
tika_content_size = str(response.headers.get('content-length')).split(';')[0]
break

except urllib.error.HTTPError as e:
self.logger.warning('FsF-F3-01M : Content identifier inaccessible -: {0}, HTTPError code {1} '.format(
test_data_content_url, e.code))
self.logger.warning(
'FsF-R1-01MD : Content identifier inaccessible -: {0}, HTTPError code {1} '.format(
test_data_content_url, e.code))
except urllib.error.URLError as e:
self.logger.exception(e.reason)
except Exception as e:
self.logger.warning('FsF-F3-01M : Could not access the resource -:'+str(e))
if response_body:
response_content = b''.join(response_body)

response_content = b''.join(response_body)
status = 'tika error'
parsed_content=''
try:
parsedFile = parser.from_buffer(response_content)
status = parsedFile.get("status")
tika_content_types = parsedFile.get("metadata").get('Content-Type')
parsed_content = parsedFile.get("content")
self.logger.info('{0} : Successfully parsed data object file using TIKA'.format(self.metric_identifier))
if response_content:
parsedFile = parser.from_buffer(response_content)
status = parsedFile.get("status")
tika_content_types = parsedFile.get("metadata").get('Content-Type')
parsed_content = parsedFile.get("content")
self.logger.info('{0} : Successfully parsed data object file using TIKA'.format(self.metric_identifier))
else:
self.logger.warning(
'{0} : Could not parse data object file using TIKA'.format(self.metric_identifier))

except Exception as e:
self.logger.warning('{0} : File parsing using TIKA failed -: {1}'.format(self.metric_identifier, e))
# in case TIKA request fails use response header info
tika_content_types = str(r.headers.get('content-type')).split(';')[0]
tika_content_types = str(response.headers.get('content-type')).split(';')[0]

if isinstance(tika_content_types, list):
self.fuji.tika_content_types_list = list(set(i.split(';')[0] for i in tika_content_types))
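
The rewritten block streams the file with urllib instead of requests, stops after a time or size budget, and only hands the accumulated bytes to Tika when something was actually downloaded; if nothing arrived, the content type from the response headers is used instead. A condensed, self-contained sketch of that pattern (chunk size and the helper name are illustrative; the limits match the values shown in the diff):

    import time
    import urllib.request
    from tika import parser  # tika-python bindings, as used by the evaluator

    def probe_data_file(url, timeout=10, max_download_size=1000000, chunk_size=4096):
        response_body = []
        start = time.time()
        response = urllib.request.urlopen(url)
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            response_body.append(chunk)
            if time.time() - start > timeout or sum(map(len, response_body)) > max_download_size:
                # stop downloading; rely on the declared content-length instead
                break
        content = b''.join(response_body)
        if content:
            parsed = parser.from_buffer(content)
            return parsed.get('metadata', {}).get('Content-Type'), parsed.get('content')
        # nothing downloaded: fall back to the HTTP header
        return str(response.headers.get('content-type')).split(';')[0], None
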
@@ -209,7 +219,7 @@ def evaluate(self):
data_content_filetype_inner.matches_content = matches_content
data_content_descriptors.append(data_content_filetype_inner)
else:
self.logger.warning('{0} : NO info available about {1} -: '.format(self.metric_identifier, type))
self.logger.warning('{0} : NO info about {1} available in given metadata -: '.format(self.metric_identifier, type))
### scoring for file descriptors match
if matches_type and matches_size:
score += 1
6 changes: 5 additions & 1 deletion fuji_server/evaluators/fair_evaluator_license.py
@@ -49,11 +49,15 @@ def lookup_license_by_url(self, u, metric_id):
self.logger.info('{0} : Verify URL through SPDX registry -: {1}'.format(metric_id, u))
html_url = None
isOsiApproved = False
ul = None
if 'spdx.org/licenses' in u:
ul = u.split('/')[-1]
for item in self.fuji.SPDX_LICENSES:
# u = u.lower()
# if any(u in v.lower() for v in item.values()):
licenseId = item.get('licenseId')
seeAlso = item.get('seeAlso')
if any(u in v for v in seeAlso):
if any(u in v for v in seeAlso) or licenseId == ul:
self.logger.info('{0} : Found SPDX license representation -: {1}'.format(metric_id, item['detailsUrl']))
# html_url = '.html'.join(item['detailsUrl'].rsplit('.json', 1))
html_url = item['detailsUrl'].replace(".json", ".html")
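
The added comparison lets a plain spdx.org license URL match by its trailing path segment (the SPDX licenseId) even when that exact URL is not listed under the entry's seeAlso links. A hedged, self-contained sketch of the lookup, assuming SPDX_LICENSES entries follow the public SPDX license-list JSON layout (licenseId, seeAlso, detailsUrl, isOsiApproved):

    def lookup_license_by_url(spdx_licenses, url):
        # derive a candidate licenseId from spdx.org URLs, e.g. '.../licenses/CC-BY-4.0' -> 'CC-BY-4.0'
        id_from_url = url.split('/')[-1] if 'spdx.org/licenses' in url else None
        for item in spdx_licenses:
            matches_see_also = any(url in v for v in item.get('seeAlso', []))
            matches_id = item.get('licenseId') == id_from_url
            if matches_see_also or matches_id:
                # prefer the HTML rendering of the registry entry
                return item['detailsUrl'].replace('.json', '.html'), item.get('isOsiApproved', False)
        return None, False
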
35 changes: 29 additions & 6 deletions fuji_server/helper/metadata_collector_rdf.py
@@ -20,9 +20,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import urllib

import idutils
import rdflib
import requests
from rdflib import Namespace
from rdflib.namespace import RDF
from rdflib.namespace import DCTERMS
@@ -128,14 +130,14 @@ def get_metadata(self,g, item, type='Dataset'):
DCAT = Namespace("http://www.w3.org/ns/dcat#")
meta = dict()
#default sparql
met = self.get_default_metadata(item)
meta = self.get_default_metadata(g)
meta['object_identifier'] = (g.value(item, DC.identifier) or g.value(item, DCTERMS.identifier))
'''
if self.source_name != self.getEnumSourceNames().RDFA.value:
meta['object_identifier'] = str(item)
meta['object_content_identifier'] = [{'url': str(item), 'type': 'application/rdf+xml'}]
'''
meta['object_type'] = type

meta['title'] = (g.value(item, DC.title) or g.value(item, DCTERMS.title))
meta['summary'] = (g.value(item, DC.description) or g.value(item, DCTERMS.description))
meta['publication_date'] = (g.value(item, DC.date) or g.value(item, DCTERMS.date) or g.value(item, DCTERMS.issued))
@@ -158,6 +160,9 @@ def get_metadata(self,g, item, type='Dataset'):
for v in [meta['title'],meta['summary'], meta['publisher']]:
if v:
v = v.toPython()
if meta:
meta['object_type'] = type

return meta

def get_ontology_metadata(self, graph):
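
Two small corrections in get_metadata: the default (SPARQL-derived) metadata is now assigned to meta itself and computed from the graph g rather than the single item, and object_type is only recorded once some metadata was actually harvested, so an otherwise empty record no longer looks populated. The guard boils down to the following minimal illustration (not the method itself):

    def tag_object_type(meta, resource_type='Dataset'):
        # only attach a type when some other metadata field was found
        if meta:
            meta['object_type'] = resource_type
        return meta

    tag_object_type({})                       # -> {} stays empty
    tag_object_type({'title': 'Example'})     # -> {'title': 'Example', 'object_type': 'Dataset'}
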
@@ -206,12 +211,28 @@ def get_dcat_metadata(self, graph):

# distribution
distribution = graph.objects(datasets[0], DCAT.distribution)

dcat_metadata['object_content_identifier']=[]
for dist in distribution:
dtype,durl ,dsize = None,None,None
if not (graph.value(dist, DCAT.accessURL) or graph.value(dist, DCAT.downloadURL)):
durl = str(dist)
self.logger.info('FsF-F2-01M : Trying to retrieve DCAT distributions from remote location -:'+str(dist))
try:
distgraph = rdflib.Graph()
disturl = str(dist)
distresponse = requests.get(disturl,headers={'Accept':'application/rdf+xml'})
if distresponse.text:
distgraph.parse(data=distresponse.text,format="application/rdf+xml")
extdist = list(distgraph[: RDF.type: DCAT.Distribution])
durl = (distgraph.value(extdist[0], DCAT.accessURL) or distgraph.value(extdist[0], DCAT.downloadURL))
dsize = distgraph.value(extdist[0], DCAT.byteSize)
dtype = distgraph.value(extdist[0], DCAT.mediaType)
self.logger.info(
'FsF-F2-01M : Found DCAT distribution URL info from remote location -:' + str(durl))
except Exception as e:
self.logger.info(
'FsF-F2-01M : Failed to retrieve DCAT distributions from remote location -:' + str(dist))
#print(e)
durl = str(dist)
else:
durl= (graph.value(dist, DCAT.accessURL) or graph.value(dist, DCAT.downloadURL))
#taking only one just to check if licence is available
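
When a dcat:Distribution node has neither dcat:accessURL nor dcat:downloadURL, the collector now tries to dereference the distribution URI itself, asking for RDF/XML and reading URL, byte size and media type from the returned graph, and falls back to the bare URI if that fails. A self-contained sketch of that dereferencing step (error handling simplified; the Accept header and DCAT terms mirror the committed code, the function name is illustrative):

    import rdflib
    import requests
    from rdflib import Namespace
    from rdflib.namespace import RDF

    DCAT = Namespace('http://www.w3.org/ns/dcat#')

    def resolve_distribution(dist_uri):
        # default to the distribution URI itself, as the except branch in the diff does
        durl, dsize, dtype = str(dist_uri), None, None
        try:
            response = requests.get(dist_uri, headers={'Accept': 'application/rdf+xml'})
            if response.text:
                g = rdflib.Graph()
                g.parse(data=response.text, format='application/rdf+xml')
                dists = list(g[: RDF.type: DCAT.Distribution])
                if dists:
                    durl = str(g.value(dists[0], DCAT.accessURL) or g.value(dists[0], DCAT.downloadURL))
                    dsize = g.value(dists[0], DCAT.byteSize)
                    dtype = g.value(dists[0], DCAT.mediaType)
        except Exception:
            pass
        return durl, dsize, dtype
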
@@ -221,7 +242,9 @@
dtype=graph.value(dist, DCAT.mediaType)
dsize=graph.value(dist, DCAT.bytesSize)
if durl or dtype or dsize:
dcat_metadata['object_content_identifier'].append({'url':str(durl),'type':dtype, 'size':dsize})
if idutils.is_url(str(durl)):
dtype= '/'.join(str(dtype).split('/')[-2:])
dcat_metadata['object_content_identifier'].append({'url':str(durl),'type':dtype, 'size':str(dsize)})


if dcat_metadata['object_content_identifier']:
@@ -230,7 +253,7 @@
else:
self.logger.info('FsF-F2-01M : Found DCAT content but could not correctly parse metadata')
#in order to keep DCAT in the found metadata list, we need to pass at least one metadata value..
dcat_metadata['object_type'] = 'Dataset'
#dcat_metadata['object_type'] = 'Dataset'
return dcat_metadata
#rdf_meta.query(self.metadata_mapping.value)
#print(rdf_meta)
