Commit
Merge pull request #193 from pangaea-data-publisher/robbranch2
v1.3.5
huberrob authored Jul 21, 2021
2 parents 22462d6 + 5b25983 commit 7248d41
Showing 6 changed files with 78 additions and 28 deletions.
29 changes: 21 additions & 8 deletions fuji_server/controllers/fair_check.py
@@ -106,7 +106,7 @@ class FAIRCheck:
IDENTIFIERS_ORG_DATA = {}
GOOGLE_DATA_DOI_CACHE =[]
GOOGLE_DATA_URL_CACHE = []
FUJI_VERSION = 'v1.3.4'
FUJI_VERSION = 'v1.3.5'

def __init__(self, uid, test_debug=False, metadata_service_url=None, metadata_service_type =None,use_datacite=True, oaipmh_endpoint = None):
uid_bytes = uid.encode('utf-8')
@@ -223,14 +223,23 @@ def uri_validator(u): # TODO integrate into request_helper.py

def set_remote_logging_target(self, host, path):
if host and path:
isHostUp = False
try:
weblogger = logging.handlers.HTTPHandler(host,
path + '?testid=' + str(self.test_id), method='POST')
webformatter = logging.Formatter('%(levelname)s - %(message)s \r\n')
weblogger.setFormatter(webformatter)
self.logger.addHandler(weblogger)
if urllib.urlopen('http://'+host+''+path).getcode() == 200:
isHostUp = True
except Exception as e:
print('Remote logging not possible, please check config.ini, host not reachable: http://'+str(host)+''+str(path))
print(e)
if isHostUp:
try:
weblogger = logging.handlers.HTTPHandler(host,
path + '?testid=' + str(self.test_id), method='POST')
webformatter = logging.Formatter('%(levelname)s - %(message)s \r\n')
weblogger.setFormatter(webformatter)
self.logger.addHandler(weblogger)
except Exception as e:
print(e)



def validate_service_url(self):
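
The new guard probes the remote logging endpoint first and only attaches the HTTP log handler when the probe succeeds. The committed probe uses urllib.urlopen, the Python 2 spelling; under Python 3 the equivalent is urllib.request.urlopen, which the following minimal sketch of the same pattern uses (the helper name is illustrative, not part of the codebase):

    import logging
    import logging.handlers
    import urllib.request

    def attach_remote_logger(logger, host, path, test_id, timeout=5):
        # probe the endpoint before wiring it up as a log target
        probe_url = 'http://' + host + path
        try:
            host_is_up = urllib.request.urlopen(probe_url, timeout=timeout).getcode() == 200
        except Exception as exc:
            print('Remote logging not possible, host not reachable: ' + probe_url)
            print(exc)
            return False
        if host_is_up:
            handler = logging.handlers.HTTPHandler(host, path + '?testid=' + str(test_id), method='POST')
            handler.setFormatter(logging.Formatter('%(levelname)s - %(message)s \r\n'))
            logger.addHandler(handler)
        return host_is_up
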
@@ -309,7 +318,9 @@ def retrieve_apis_standards(self):

if self.metadata_service_url not in [None,'']:
self.logger.info('FsF-R1.3-01M : Metadata service endpoint ('+str(self.metadata_service_type)+') provided as part of the request -: '+str(self.metadata_service_url))
else:
#else:
#check re3data always instead...
if self.use_datacite:
self.logger.info('FsF-R1.3-01M : Trying to retrieve metadata info from re3data/datacite services using client id -: '+str(client_id))
#find endpoint via datacite/re3data if pid is provided
#print(client_id ,self.pid_scheme)
@@ -322,7 +333,9 @@
self.sparql_endpoint = repoHelper.getRe3MetadataAPIs().get('SPARQL')
self.community_standards.extend(repoHelper.getRe3MetadataStandards())
self.logger.info('{} : Metadata standards listed in re3data record -: {}'.format('FsF-R1.3-01M', self.community_standards ))
# verify the service url by domain matching
else:
self.logger.info('FsF-R1.3-01M : Skipped re3data metadata standards query since Datacite support is disabled by user')
# verify the service url by domain matching
self.validate_service_url()
# retrieve metadata standards info from oai-pmh
if self.oaipmh_endpoint:
2 changes: 1 addition & 1 deletion fuji_server/data/metadata_standards.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion fuji_server/data/metadata_standards_uris.json

Large diffs are not rendered by default.

32 changes: 21 additions & 11 deletions fuji_server/evaluators/fair_evaluator_data_content_metadata.py
@@ -51,7 +51,7 @@ def evaluate(self):
if resource_type:
resource_type = str(resource_type).lower()
if str(resource_type).startswith('http'):
resource_type = resource_type.split('/')[-1]
resource_type = '/'.join(str(resource_type).split('/')[-2:])
if resource_type in self.fuji.VALID_RESOURCE_TYPES or resource_type in self.fuji.SCHEMA_ORG_CONTEXT:
self.logger.log(self.fuji.LOG_SUCCESS,'FsF-R1-01MD : Resource type specified -: {}'.format(resource_type))
self.output.object_type = resource_type
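
Keeping the last two path segments instead of only the last one normalizes a type given as a URI to a namespaced value such as 'schema.org/dataset', apparently so it can be checked against the VALID_RESOURCE_TYPES and SCHEMA_ORG_CONTEXT lists in that form. A quick illustration (the example value is illustrative):

    resource_type = 'http://schema.org/dataset'
    old_form = resource_type.split('/')[-1]              # 'dataset'
    new_form = '/'.join(resource_type.split('/')[-2:])   # 'schema.org/dataset'
    print(old_form, new_form)
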
@@ -81,13 +81,14 @@ def evaluate(self):
'FsF-R1-01MD : Selected content file to be analyzed -: {}'.format(test_data_content_url))
try:
# Use Tika to parse the file
response_content=None
response_body=[]
timeout = 10
tika_content_size = 0
max_download_size =1000000

start = time.time()
r = requests.get(test_data_content_url, verify=False, stream=True)
#r = requests.get(test_data_content_url, verify=False, stream=True)
try:
response = urllib.request.urlopen(test_data_content_url)
while True:
@@ -103,30 +104,39 @@ def evaluate(self):
timeout) + ' sec or receiving > ' + str(max_download_size) + '- {}'.format(
test_data_content_url))
tika_content_size = 0
tika_content_size = str(r.headers.get('content-length')).split(';')[0]
tika_content_size = str(response.headers.get('content-length')).split(';')[0]
break

except urllib.error.HTTPError as e:
self.logger.warning('FsF-F3-01M : Content identifier inaccessible -: {0}, HTTPError code {1} '.format(
test_data_content_url, e.code))
self.logger.warning(
'FsF-R1-01MD : Content identifier inaccessible -: {0}, HTTPError code {1} '.format(
test_data_content_url, e.code))
except urllib.error.URLError as e:
self.logger.exception(e.reason)
except Exception as e:
self.logger.warning('FsF-F3-01M : Could not access the resource -:'+str(e))
if response_body:
response_content = b''.join(response_body)

response_content = b''.join(response_body)
status = 'tika error'
parsed_content=''
try:
parsedFile = parser.from_buffer(response_content)
status = parsedFile.get("status")
tika_content_types = parsedFile.get("metadata").get('Content-Type')
parsed_content = parsedFile.get("content")
self.logger.info('{0} : Successfully parsed data object file using TIKA'.format(self.metric_identifier))
if response_content:
parsedFile = parser.from_buffer(response_content)
status = parsedFile.get("status")
tika_content_types = parsedFile.get("metadata").get('Content-Type')
parsed_content = parsedFile.get("content")
self.logger.info('{0} : Successfully parsed data object file using TIKA'.format(self.metric_identifier))
else:
self.logger.warning(
'{0} : Could not parse data object file using TIKA'.format(self.metric_identifier))

except Exception as e:
self.logger.warning('{0} : File parsing using TIKA failed -: {1}'.format(self.metric_identifier, e))
# in case TIKA request fails use response header info
tika_content_types = str(r.headers.get('content-type')).split(';')[0]
tika_content_types = str(response.headers.get('content-type')).split(';')[0]

if isinstance(tika_content_types, list):
self.fuji.tika_content_types_list = list(set(i.split(';')[0] for i in tika_content_types))
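
The rewritten block streams the file with urllib instead of requests, stops after a time or size budget, and only hands the accumulated bytes to Tika when something was actually downloaded; if nothing arrived, the content type from the response headers is used instead. A condensed, self-contained sketch of that pattern (chunk size and the helper name are illustrative; the limits match the values shown in the diff):

    import time
    import urllib.request
    from tika import parser  # tika-python bindings, as used by the evaluator

    def probe_data_file(url, timeout=10, max_download_size=1000000, chunk_size=4096):
        response_body = []
        start = time.time()
        response = urllib.request.urlopen(url)
        while True:
            chunk = response.read(chunk_size)
            if not chunk:
                break
            response_body.append(chunk)
            if time.time() - start > timeout or sum(map(len, response_body)) > max_download_size:
                # stop downloading; rely on the declared content-length instead
                break
        content = b''.join(response_body)
        if content:
            parsed = parser.from_buffer(content)
            return parsed.get('metadata', {}).get('Content-Type'), parsed.get('content')
        # nothing downloaded: fall back to the HTTP header
        return str(response.headers.get('content-type')).split(';')[0], None
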
@@ -209,7 +219,7 @@ def evaluate(self):
data_content_filetype_inner.matches_content = matches_content
data_content_descriptors.append(data_content_filetype_inner)
else:
self.logger.warning('{0} : NO info available about {1} -: '.format(self.metric_identifier, type))
self.logger.warning('{0} : NO info about {1} available in given metadata -: '.format(self.metric_identifier, type))
### scoring for file descriptors match
if matches_type and matches_size:
score += 1
6 changes: 5 additions & 1 deletion fuji_server/evaluators/fair_evaluator_license.py
@@ -49,11 +49,15 @@ def lookup_license_by_url(self, u, metric_id):
self.logger.info('{0} : Verify URL through SPDX registry -: {1}'.format(metric_id, u))
html_url = None
isOsiApproved = False
ul = None
if 'spdx.org/licenses' in u:
ul = u.split('/')[-1]
for item in self.fuji.SPDX_LICENSES:
# u = u.lower()
# if any(u in v.lower() for v in item.values()):
licenseId = item.get('licenseId')
seeAlso = item.get('seeAlso')
if any(u in v for v in seeAlso):
if any(u in v for v in seeAlso) or licenseId == ul:
self.logger.info('{0} : Found SPDX license representation -: {1}'.format(metric_id, item['detailsUrl']))
# html_url = '.html'.join(item['detailsUrl'].rsplit('.json', 1))
html_url = item['detailsUrl'].replace(".json", ".html")
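
The added comparison lets a plain spdx.org license URL match by its trailing path segment (the SPDX licenseId) even when that exact URL is not listed under the entry's seeAlso links. A hedged, self-contained sketch of the lookup, assuming SPDX_LICENSES entries follow the public SPDX license-list JSON layout (licenseId, seeAlso, detailsUrl, isOsiApproved):

    def lookup_license_by_url(spdx_licenses, url):
        # derive a candidate licenseId from spdx.org URLs, e.g. '.../licenses/CC-BY-4.0' -> 'CC-BY-4.0'
        id_from_url = url.split('/')[-1] if 'spdx.org/licenses' in url else None
        for item in spdx_licenses:
            matches_see_also = any(url in v for v in item.get('seeAlso', []))
            matches_id = item.get('licenseId') == id_from_url
            if matches_see_also or matches_id:
                # prefer the HTML rendering of the registry entry
                return item['detailsUrl'].replace('.json', '.html'), item.get('isOsiApproved', False)
        return None, False
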
35 changes: 29 additions & 6 deletions fuji_server/helper/metadata_collector_rdf.py
@@ -20,9 +20,11 @@
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
import json
import urllib

import idutils
import rdflib
import requests
from rdflib import Namespace
from rdflib.namespace import RDF
from rdflib.namespace import DCTERMS
@@ -128,14 +130,14 @@ def get_metadata(self,g, item, type='Dataset'):
DCAT = Namespace("http://www.w3.org/ns/dcat#")
meta = dict()
#default sparql
met = self.get_default_metadata(item)
meta = self.get_default_metadata(g)
meta['object_identifier'] = (g.value(item, DC.identifier) or g.value(item, DCTERMS.identifier))
'''
if self.source_name != self.getEnumSourceNames().RDFA.value:
meta['object_identifier'] = str(item)
meta['object_content_identifier'] = [{'url': str(item), 'type': 'application/rdf+xml'}]
'''
meta['object_type'] = type

meta['title'] = (g.value(item, DC.title) or g.value(item, DCTERMS.title))
meta['summary'] = (g.value(item, DC.description) or g.value(item, DCTERMS.description))
meta['publication_date'] = (g.value(item, DC.date) or g.value(item, DCTERMS.date) or g.value(item, DCTERMS.issued))
@@ -158,6 +160,9 @@ def get_metadata(self,g, item, type='Dataset'):
for v in [meta['title'],meta['summary'], meta['publisher']]:
if v:
v = v.toPython()
if meta:
meta['object_type'] = type

return meta

def get_ontology_metadata(self, graph):
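
Two small corrections in get_metadata: the default (SPARQL-derived) metadata is now assigned to meta itself and computed from the graph g rather than the single item, and object_type is only recorded once some metadata was actually harvested, so an otherwise empty record no longer looks populated. The guard boils down to the following minimal illustration (not the method itself):

    def tag_object_type(meta, resource_type='Dataset'):
        # only attach a type when some other metadata field was found
        if meta:
            meta['object_type'] = resource_type
        return meta

    tag_object_type({})                       # -> {} stays empty
    tag_object_type({'title': 'Example'})     # -> {'title': 'Example', 'object_type': 'Dataset'}
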
@@ -206,12 +211,28 @@ def get_dcat_metadata(self, graph):

# distribution
distribution = graph.objects(datasets[0], DCAT.distribution)

dcat_metadata['object_content_identifier']=[]
for dist in distribution:
dtype,durl ,dsize = None,None,None
if not (graph.value(dist, DCAT.accessURL) or graph.value(dist, DCAT.downloadURL)):
durl = str(dist)
self.logger.info('FsF-F2-01M : Trying to retrieve DCAT distributions from remote location -:'+str(dist))
try:
distgraph = rdflib.Graph()
disturl = str(dist)
distresponse = requests.get(disturl,headers={'Accept':'application/rdf+xml'})
if distresponse.text:
distgraph.parse(data=distresponse.text,format="application/rdf+xml")
extdist = list(distgraph[: RDF.type: DCAT.Distribution])
durl = (distgraph.value(extdist[0], DCAT.accessURL) or distgraph.value(extdist[0], DCAT.downloadURL))
dsize = distgraph.value(extdist[0], DCAT.byteSize)
dtype = distgraph.value(extdist[0], DCAT.mediaType)
self.logger.info(
'FsF-F2-01M : Found DCAT distribution URL info from remote location -:' + str(durl))
except Exception as e:
self.logger.info(
'FsF-F2-01M : Failed to retrieve DCAT distributions from remote location -:' + str(dist))
#print(e)
durl = str(dist)
else:
durl= (graph.value(dist, DCAT.accessURL) or graph.value(dist, DCAT.downloadURL))
#taking only one just to check if licence is available
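
When a dcat:Distribution node has neither dcat:accessURL nor dcat:downloadURL, the collector now tries to dereference the distribution URI itself, asking for RDF/XML and reading URL, byte size and media type from the returned graph, and falls back to the bare URI if that fails. A self-contained sketch of that dereferencing step (error handling simplified; the Accept header and DCAT terms mirror the committed code, the function name is illustrative):

    import rdflib
    import requests
    from rdflib import Namespace
    from rdflib.namespace import RDF

    DCAT = Namespace('http://www.w3.org/ns/dcat#')

    def resolve_distribution(dist_uri):
        # default to the distribution URI itself, as the except branch in the diff does
        durl, dsize, dtype = str(dist_uri), None, None
        try:
            response = requests.get(dist_uri, headers={'Accept': 'application/rdf+xml'})
            if response.text:
                g = rdflib.Graph()
                g.parse(data=response.text, format='application/rdf+xml')
                dists = list(g[: RDF.type: DCAT.Distribution])
                if dists:
                    durl = str(g.value(dists[0], DCAT.accessURL) or g.value(dists[0], DCAT.downloadURL))
                    dsize = g.value(dists[0], DCAT.byteSize)
                    dtype = g.value(dists[0], DCAT.mediaType)
        except Exception:
            pass
        return durl, dsize, dtype
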
@@ -221,7 +242,9 @@
dtype=graph.value(dist, DCAT.mediaType)
dsize=graph.value(dist, DCAT.bytesSize)
if durl or dtype or dsize:
dcat_metadata['object_content_identifier'].append({'url':str(durl),'type':dtype, 'size':dsize})
if idutils.is_url(str(durl)):
dtype= '/'.join(str(dtype).split('/')[-2:])
dcat_metadata['object_content_identifier'].append({'url':str(durl),'type':dtype, 'size':str(dsize)})


if dcat_metadata['object_content_identifier']:
@@ -230,7 +253,7 @@
else:
self.logger.info('FsF-F2-01M : Found DCAT content but could not correctly parse metadata')
#in order to keep DCAT in the found metadata list, we need to pass at least one metadata value..
dcat_metadata['object_type'] = 'Dataset'
#dcat_metadata['object_type'] = 'Dataset'
return dcat_metadata
#rdf_meta.query(self.metadata_mapping.value)
#print(rdf_meta)
