load_subdatasets.py
from argparse import ArgumentParser
import json
from pathlib import Path
from pprint import pprint
from datalad.api import catalog_add
from datalad_catalog.schema_utils import get_metadata_item
from datalad_next.datasets import Dataset
from datalad_tabby.io import load_tabby
from pyld import jsonld
from utils import mint_dataset_id


def get_tabby_subdataset_path(tabby_file_path, ds_root_path):
    """Get the path of the subdataset described by a tabby file

    Note: this is currently tuned to a single-directory layout, and reports
    the tabby file's parent directory as the location of the described
    subdataset. If the tabby collection is located in .datalad/tabby, the
    path is reported relative to that directory instead.
    """
    relpath = tabby_file_path.parent.relative_to(ds_root_path)
    if relpath.match(".datalad/tabby/*"):
        return relpath.relative_to(".datalad/tabby/")
    return relpath
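# Illustration (hypothetical layout): for a superdataset at /data/super, a tabby
# file at /data/super/.datalad/tabby/rawdata/dataset@tby-ds1.tsv yields the
# subdataset path "rawdata", and so does /data/super/rawdata/dataset@tby-ds1.tsv,
# since in both cases the file's parent directory names the subdataset location.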


def list_tabby_ds_files(ds, anywhere=False):
    """List dataset*.tsv tabby files

    By default, uses a glob to report .datalad/tabby contents. If searching
    anywhere is requested, uses `git ls-files` instead.
    """
    if not anywhere:
        return list(ds.pathobj.glob(".datalad/tabby/**/*dataset*.tsv"))
    else:
        return [
            ds.pathobj.joinpath(p)
            for p in ds.repo.call_git_items_(["ls-files", "*dataset@tby*.tsv"])
        ]
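# Illustration (hypothetical file names): the default glob finds e.g.
# .datalad/tabby/ds1/dataset@tby-ds1.tsv, while anywhere=True asks git for any
# tracked file matching *dataset@tby*.tsv, e.g. rawdata/dataset@tby-ds2.tsv.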


def subdataset_item(ds, tabby_path):
    """Report subdataset path, id, version"""
    # path is derived from tabby location
    ds_path = get_tabby_subdataset_path(tabby_path, ds.pathobj)
    # id & version are derived from tabby content
    record = load_tabby(
        tabby_path,
        cpaths=[Path(__file__).parent / "conventions"],
    )
    # use context to standardize keys
    compacted = jsonld.compact(
        input_=record,
        ctx={
            "schema": "https://schema.org/",
            "name": "schema:name",
            "version": "schema:version",
            "sfbProject": "schema:ResearchProject",
        },
    )
    ds_id = mint_dataset_id(
        ds_name=compacted["name"],
        project=compacted["sfbProject"],
    )
    ds_version = compacted["version"]
    return {
        "dataset_path": str(ds_path),
        "dataset_id": ds_id,
        "dataset_version": ds_version,
    }


parser = ArgumentParser()
parser.add_argument("ds_path", type=Path, help="Dataset to which tabby files belong")
parser.add_argument("-c", "--catalog", type=Path, help="Catalog to add to")
parser.add_argument("--tabby-anywhere", action="store_true", help="Search outside .datalad/tabby")
args = parser.parse_args()

# Search the dataset and create subdataset metadata dicts
ds = Dataset(args.ds_path)
subdatasets = []
for tabby_path in list_tabby_ds_files(ds, anywhere=args.tabby_anywhere):
    if tabby_path.parent.parts[-3:] == (".datalad", "tabby", "self"):
        # skip self-description
        continue
    subdatasets.append(subdataset_item(ds, tabby_path))

# Early exit if nothing to do
if len(subdatasets) == 0:
    print("No subdatasets found")
    exit()

# Create a catalog metadata item and print it
dataset_item = get_metadata_item(
    item_type="dataset",
    dataset_id=ds.id,
    dataset_version=ds.repo.get_hexsha(),
    source_name="tabby",
    source_version="0.1.0",
)
dataset_item["subdatasets"] = subdatasets
pprint(dataset_item)

# Add to catalog if requested
if args.catalog is not None:
    catalog_add(
        catalog=args.catalog,
        metadata=json.dumps(dataset_item),
        config_file=Path(__file__).with_name("superds-config.json"),
    )
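
# Example invocation (hypothetical paths):
#   python load_subdatasets.py /data/super --catalog /data/super-catalog --tabby-anywhere
# Without --catalog, the assembled metadata item is only pretty-printed.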