Skip to content

Commit

Permalink
fix for #264; improved JSON-LD handling; fixed unnecessary test repetition
Browse files Browse the repository at this point in the history
  • Loading branch information
huberrob committed Mar 31, 2022
1 parent 7368533 commit cd059b4
Show file tree
Hide file tree
Showing 3 changed files with 36 additions and 22 deletions.
6 changes: 3 additions & 3 deletions fuji_server/controllers/fair_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -561,7 +561,7 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
data_sign_links = self.get_signposting_links('item')
if data_sign_links:
self.logger.info('FsF-F3-01M : Found data links in response header (signposting) -: ' +
str(len(data_sign_links)))
str(data_sign_links))
if self.metadata_merged.get('object_content_identifier') is None:
self.metadata_merged['object_content_identifier'] = data_sign_links

Expand Down Expand Up @@ -589,7 +589,7 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
data_meta_links = self.get_html_typed_links(rel='item')
if data_meta_links:
self.logger.info('FsF-F3-01M : Found data links in HTML head (link rel=item) -: ' +
str(len(data_meta_links)))
str(data_meta_links))
if self.metadata_merged.get('object_content_identifier') is None:
self.metadata_merged['object_content_identifier'] = data_meta_links
# self.metadata_sources.append((MetaDataCollector.Sources.TYPED_LINK.value,'linked'))
Expand All @@ -604,6 +604,7 @@ def retrieve_metadata_embedded(self, extruct_metadata={}):
'FsF-F2-01M : Skipped EMBEDDED metadata identification, no landing page URL could be determined')

def check_pidtest_repeat(self):
self.repeat_pid_check = False
if self.related_resources:
for relation in self.related_resources:
if relation.get('relation_type') == 'isPartOf':
Expand All @@ -630,7 +631,6 @@ def check_pidtest_repeat(self):
self.LOG_SUCCESS,
'FsF-F1-02D : Found object identifier in metadata during FsF-F2-01M, PID check was repeated')
self.repeat_pid_check = True

if 'doi' in found_pids:
self.id = found_pids['doi']
self.pid_scheme = 'doi'
Expand Down
11 changes: 6 additions & 5 deletions fuji_server/helper/metadata_collector_dublincore.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def parse_metadata(self):
dc_t = None
if len(dc_name_parts) == 3:
dc_t = dc_name_parts[2]
meta_dc_matches.append([dc_name_parts[1].lower(), dc_t, meta_tag.get('content')])
meta_dc_matches.append([dc_name_parts[1], dc_t, meta_tag.get('content')])
#meta_dc_matches = re.findall(exp, self.source_metadata)
except Exception as e:
self.logger.exception('Parsing error, failed to extract DublinCore -: {}'.format(e))
Expand All @@ -111,12 +111,13 @@ def parse_metadata(self):
for dc_meta in meta_dc_matches:
# dc_meta --> ('', 'DC', 'creator', ' ', 'Hillenbrand, Claus-Dieter')
#key
k = dc_meta[0] #2
k = str(dc_meta[0]) #2
#type
t = dc_meta[1] #3
#value
v = dc_meta[2] #5
if k == 'date':

if k.lower() == 'date':
if t == 'dateAccepted':
dc_core_metadata['accepted_date'] = v
elif t == 'dateSubmitted':
Expand All @@ -127,9 +128,10 @@ def parse_metadata(self):
if k.lower() in dcterms:
#self.logger.info('FsF-F2-01M: DublinCore metadata element, %s = %s , ' % (k, v))
try:
elem = [key for (key, value) in Mapper.DC_MAPPING.value.items() if k in str(value).lower()
elem = [key for (key, value) in Mapper.DC_MAPPING.value.items() if k.lower() in str(value).lower()
][0] # fuji ref fields
except Exception as e:
#nothing found so just continue
pass
if elem == 'related_resources':
#dc_core_metadata['related_resources'] = []
Expand All @@ -138,7 +140,6 @@ def parse_metadata(self):
#qualifiers, subproperties (t):
#https://www.dublincore.org/specifications/dublin-core/dcmes-qualifiers/
#https://www.dublincore.org/specifications/dublin-core/dcq-html/

if k in ['source', 'references']:
t = 'wasDerivedFrom'
elif k == 'relation':
Expand Down
41 changes: 27 additions & 14 deletions fuji_server/helper/metadata_collector_rdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import re
import sys

import extruct
import idutils
import rdflib
import requests
Expand Down Expand Up @@ -198,21 +199,33 @@ def parse_metadata(self):
DCAT = Namespace('http://www.w3.org/ns/dcat#')
if self.content_type == 'application/ld+json':
self.logger.info('FsF-F2-01M : Try to parse RDF (JSON-LD) from -: %s' % (self.target_url))
try:
#this is a workaround for an rdflib JSON-LD parsing issue proposed here: https://github.com/RDFLib/rdflib/issues/1423
if isinstance(rdf_response, dict):
try:
if rdf_response['@context'].startswith('http://schema.org'):
rdf_response['@context'] = 'https://schema.org/docs/jsonldcontext.json'
schemaorg_collector = MetaDataCollectorSchemaOrg(loggerinst=self.logger,
sourcemetadata=[rdf_response],
mapping=Mapper.SCHEMAORG_MAPPING,
pidurl=None)
source_schemaorg, rdf_metadata = schemaorg_collector.parse_metadata()
except Exception as e:
print(e)
pass
rdf_response = jsonld.expand( rdf_response)
rdf_response = json.dumps(rdf_response)
jsonldgraph = rdflib.ConjunctiveGraph()
rdf_response_graph = jsonldgraph.parse(data=rdf_response, format='json-ld')
rdf_response_graph = jsonldgraph
except Exception as e:
print('JSON-LD parsing error',e)
self.logger.info('FsF-F2-01M : Parsing error, failed to extract JSON-LD -: {}'.format(e))
#graph
else:
try:
#this is a workaround for an rdflib JSON-LD parsing issue proposed here: https://github.com/RDFLib/rdflib/issues/1423
try:
if rdf_response['@context'].startswith('http://schema.org'):
rdf_response['@context'] = 'https://schema.org/docs/jsonldcontext.json'
except Exception as e:
pass
rdf_response = jsonld.expand( rdf_response)
rdf_response = json.dumps(rdf_response)
jsonldgraph = rdflib.ConjunctiveGraph()
rdf_response_graph = jsonldgraph.parse(data=rdf_response, format='json-ld')
rdf_response_graph = jsonldgraph
except Exception as e:
print('JSON-LD parsing error', e)
self.logger.info('FsF-F2-01M : Parsing error, failed to extract JSON-LD -: {}'.format(e))
else:
# parse RDF
parseformat = re.search(r'[\/+]([a-z0-9]+)$', str(requestHelper.content_type))
Expand Down Expand Up @@ -251,8 +264,8 @@ def parse_metadata(self):

#else:
# neg_source, rdf_response = 'html', self.rdf_graph

rdf_metadata = self.get_metadata_from_graph(rdf_response_graph)
if not rdf_metadata:
rdf_metadata = self.get_metadata_from_graph(rdf_response_graph)

return self.source_name, rdf_metadata

Expand Down

0 comments on commit cd059b4

Please sign in to comment.