Merge pull request #47 from workflowhub-eu/refactor-ro-crate
Refactor RO Crate
Showing 20 changed files with 5,055 additions and 238,871 deletions.
106 changes: 106 additions & 0 deletions
ro-crate-metadata/#12c6426a-fe66-48e6-9863-bde836ce0b16/absolutize.py
@@ -0,0 +1,106 @@
import argparse
import copy
import json
from urllib.parse import urlparse

import arcp
import rdflib


# TODO: following https://github.com/workflowhub-eu/workflowhub-graph/issues/12,
# building upon is_all_absolute:
# - add extended RO-Crate profile validation
# - get information like the schema.org domain and check if the graph is
#   compliant with the schema (normative schema.org dev docs:
#   https://schema.org/docs/developers.html)
# - make a note for validation of the graph


def is_all_absolute(G: rdflib.Graph) -> bool:
    for triple in G:
        for item in triple:
            if isinstance(item, rdflib.URIRef):
                # TODO: is this enough?
                parsed = urlparse(item)

                # we accept file:// with a netloc, even if netloc is not a FQDN,
                # see https://github.com/workflowhub-eu/workflowhub-graph/issues/1#issuecomment-2127351752
                if parsed.netloc == "" and parsed.scheme != "mailto":
                    print(
                        f"found non-absolute path <{item}> {parsed.netloc}, {urlparse(item)}"
                    )
                    return False
    return True


def make_paths_absolute(
    json_data: dict, workflowhub_url: str, workflow_id: int, workflow_version: int
) -> dict:
    """
    Makes all paths in the JSON content absolute by adding an '@base' key to the JSON-LD context.
    :param json_data: The JSON content as a dictionary.
    :param workflowhub_url: The base URL for WorkflowHub.
    :param workflow_id: The workflow ID used to construct the absolute paths.
    :param workflow_version: The workflow version.
    :return: The modified JSON content with absolute paths.
    :raises ValueError: If the '@context' key is missing or an '@base' entry already exists in the JSON content.
    """
    json_data = copy.deepcopy(json_data)

    workflow_url = (
        f"{workflowhub_url}/workflows/{workflow_id}/ro_crate?version={workflow_version}"
    )

    if "@context" not in json_data:
        raise ValueError(
            "The JSON content does not contain a '@context' key, refusing to add it, cannot absolutize paths"
        )

    if not isinstance(json_data["@context"], list):
        json_data["@context"] = [json_data["@context"]]

    if any(
        isinstance(item, dict) and "@base" in item for item in json_data["@context"]
    ):
        raise ValueError(
            "The JSON content already contains an '@base' key, it was probably already processed."
        )

    json_data["@context"].append({"@base": arcp.arcp_location(workflow_url)})

    return json_data


def main():
    parser = argparse.ArgumentParser(
        description="Make all paths in a JSON file absolute."
    )
    parser.add_argument("json_file", help="The JSON file to process.")
    parser.add_argument("output_file", help="The output file, or '-' for stdout.")
    parser.add_argument("workflow_id", help="The workflow ID.")
    parser.add_argument("workflow_version", help="The workflow version.")
    parser.add_argument(
        "-u",
        "--workflowhub-url",
        help="The WorkflowHub URL.",
        default="https://workflowhub.eu",
    )

    args = parser.parse_args()

    with open(args.json_file, "r") as f:
        json_data = json.load(f)

    processed_json_data = make_paths_absolute(
        json_data, args.workflowhub_url, args.workflow_id, args.workflow_version
    )

    if args.output_file == "-":
        print(json.dumps(processed_json_data, indent=2))
    else:
        with open(args.output_file, "w") as f:
            json.dump(processed_json_data, f, indent=2)


if __name__ == "__main__":
    main()
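For illustration, a minimal sketch of how make_paths_absolute is meant to be used; the crate dict, workflow ID 1189, and version 5 are hypothetical, and the script is assumed to be importable as absolutize:

# Hypothetical input: a minimal crate whose root entity uses a relative '@id'.
from absolutize import make_paths_absolute

crate = {
    "@context": "https://w3id.org/ro/crate/1.1/context",
    "@graph": [{"@id": "./", "@type": "Dataset"}],
}

absolute = make_paths_absolute(crate, "https://workflowhub.eu", 1189, 5)

# The context is now a list whose last entry pins '@base' to an arcp URI
# derived from the crate's download URL, so relative IDs like './' resolve
# to absolute ones when the JSON-LD is parsed.
print(absolute["@context"][-1]["@base"])

# A second call on the already-processed dict would raise ValueError,
# because an '@base' entry is now present in the context.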
81 changes: 81 additions & 0 deletions
ro-crate-metadata/#12c6426a-fe66-48e6-9863-bde836ce0b16/cached_url_open.py
@@ -0,0 +1,81 @@
import io
import json
import os
import re
from contextlib import contextmanager
from unittest.mock import patch
from urllib.parse import urlparse
from urllib.request import urlopen


def url_to_filename(url):
    """
    Converts a URL to a filename by replacing runs of non-alphanumeric characters with dashes.
    :param url: The URL to convert.
    :return: The filename.
    """
    parsed = urlparse(url)
    if parsed.scheme not in ["http", "https"]:
        raise ValueError(f"Unsupported scheme {parsed.scheme}")

    return re.sub("[^0-9a-z]+", "-", (parsed.netloc + parsed.path).lower().strip("_"))


@contextmanager
def patch_rdflib_urlopen(
    cache_base_dir=None,
    write_cache=True,
    allowed_urls_pattern=r"https://w3id.org/ro/crate/1\.[01]/context",
):
    """
    Context manager to patch rdflib.parser.urlopen to cache and return the content of a URL.
    :param cache_base_dir: The base directory to store the cached files.
    :param write_cache: Whether to write the cache if the file is not found.
    :param allowed_urls_pattern: A regex pattern to match the allowed URLs to cache.
    """
    allowed_urls_re = re.compile(allowed_urls_pattern)
    if cache_base_dir is None:
        cache_base_dir = "cached_urlopen"
    os.makedirs(cache_base_dir, exist_ok=True)

    def cached_urlopen(request):
        url = request.get_full_url()

        # Minimal stand-in for the urlopen response object rdflib expects.
        class Response(io.StringIO):
            content_type = "text/html"
            headers = {"Content-Type": "text/html"}

            def info(self):
                return self.headers

            def geturl(self):
                return url

        if not allowed_urls_re.match(url):
            # Disallowed URLs resolve to an empty JSON-LD context
            # instead of hitting the network.
            return Response(json.dumps({"@context": {}}))
            # raise ValueError(
            #     f"URL {url} not allowed to cache, allowed: {allowed_urls_pattern}"
            # )

        cached_filename = os.path.join(cache_base_dir, url_to_filename(url))

        if not os.path.exists(cached_filename):
            if write_cache:
                response = urlopen(request)
                content = response.read().decode("utf-8")

                with open(cached_filename, "wt") as f:
                    f.write(content)
            else:
                raise ValueError(
                    f"Cache file {cached_filename} not found, not allowed to download and update cache"
                )

        with open(cached_filename, "rt") as f:
            content = f.read()

        return Response(content)

    with patch("rdflib.parser.urlopen", cached_urlopen):
        yield
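A hedged usage sketch tying the two helpers together: parsing RO-Crate JSON-LD with rdflib while the remote context document is served from the on-disk cache ('cached_urlopen/' by default). The crate JSON is hypothetical, the modules are assumed importable as absolutize and cached_url_open, and the first run still needs network access once to populate the cache:

import json

import rdflib

from absolutize import is_all_absolute
from cached_url_open import patch_rdflib_urlopen

crate_json = json.dumps(
    {
        "@context": "https://w3id.org/ro/crate/1.1/context",
        "@graph": [{"@id": "https://example.org/dataset/", "@type": "Dataset"}],
    }
)

with patch_rdflib_urlopen():
    # rdflib resolves the @context URL through the patched urlopen,
    # so repeated parses hit the cache instead of w3id.org.
    g = rdflib.Graph().parse(data=crate_json, format="json-ld")

print(len(g), is_all_absolute(g))  # expected: 1 True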
110 changes: 110 additions & 0 deletions
ro-crate-metadata/#12c6426a-fe66-48e6-9863-bde836ce0b16/check_outputs.py
@@ -0,0 +1,110 @@
import argparse
import json
import os
import re


def parse_args() -> argparse.Namespace:
    """
    Parse command-line arguments.
    :return: Parsed command-line arguments.
    """
    parser = argparse.ArgumentParser(
        description="Generate list of created files based on workflow IDs and versions."
    )
    parser.add_argument(
        "--workflow-ids",
        type=str,
        help="Range of workflow IDs to process (e.g., '1-10').",
    )
    parser.add_argument(
        "--versions",
        type=str,
        required=True,
        help="Comma-separated list of versions to process (e.g., '1,2,3').",
    )
    parser.add_argument(
        "--output-dir",
        type=str,
        default="data",
        help="Directory where the output files are stored (default: 'data').",
    )
    return parser.parse_args()


def get_max_id_from_files(output_dir: str) -> int:
    """
    If no workflow ID parameter is provided, get the maximum workflow ID from the files in the output directory.
    :param output_dir: The directory where output files are stored.
    :return: The maximum workflow ID.
    """
    max_id = 0
    pattern = re.compile(r"^(\d+)_\d+_ro-crate-metadata\.json$")
    for filename in os.listdir(output_dir):
        match = pattern.match(filename)
        if match:
            wf_id = int(match.group(1))
            if wf_id > max_id:
                max_id = wf_id
    return max_id


def generate_expected_files(
    output_dir: str, workflow_ids: range, versions: list[str]
) -> list[str]:
    """
    Generate a list of expected file paths based on the workflow IDs and versions.
    :param output_dir: The directory where output files are stored.
    :param workflow_ids: The range of workflow IDs to process.
    :param versions: The list of versions to process.
    :return: A list of expected file paths.
    """
    expected_files = []
    for wf_id in workflow_ids:
        for ver in versions:
            expected_files.append(f"{output_dir}/{wf_id}_{ver}_ro-crate-metadata.json")
    return expected_files


def verify_created_files(expected_files: list[str]) -> list[str]:
    """
    Verify which files from the list of expected files actually exist.
    :param expected_files: The list of expected file paths.
    :return: A list of file paths that actually exist.
    """
    return [f for f in expected_files if os.path.exists(f)]


def main():
    args = parse_args()

    if args.workflow_ids:
        min_id, max_id = map(int, args.workflow_ids.split("-"))
        workflow_ids = range(min_id, max_id + 1)
    else:
        max_id = get_max_id_from_files(args.output_dir)
        workflow_ids = range(1, max_id + 1)

    versions = args.versions.split(",")

    # Generate expected file paths
    expected_files = generate_expected_files(args.output_dir, workflow_ids, versions)

    # Check which files were actually created
    created_files = verify_created_files(expected_files)

    # Output the list of created files to a JSON file
    with open("created_files.json", "w") as f:
        json.dump(created_files, f)

    print("\nFile names written to created_files.json")


if __name__ == "__main__":
    main()
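A small hypothetical example of these helpers, checking workflow IDs 1-3 with versions 1 and 2 under the default 'data' directory (the module is assumed importable as check_outputs):

from check_outputs import generate_expected_files, verify_created_files

expected = generate_expected_files("data", range(1, 4), ["1", "2"])
# e.g. ['data/1_1_ro-crate-metadata.json', 'data/1_2_ro-crate-metadata.json', ...]

# Only the paths that actually exist on disk are returned.
print(verify_created_files(expected))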