Skip to content

Commit 497c2ac

Browse files
authored
Merge pull request #1833 from depositar/provider-ckan
[MRG] Add a repo provider for CKAN datasets
2 parents 0d5dccd + 2945d83 commit 497c2ac

File tree

8 files changed

+149
-3
lines changed

8 files changed

+149
-3
lines changed

binderhub/app.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@
5656
from .ratelimit import RateLimiter
5757
from .registry import DockerRegistry
5858
from .repoproviders import (
59+
CKANProvider,
5960
DataverseProvider,
6061
FigshareProvider,
6162
GistRepoProvider,
@@ -586,6 +587,7 @@ def _default_build_namespace(self):
586587
"figshare": FigshareProvider,
587588
"hydroshare": HydroshareProvider,
588589
"dataverse": DataverseProvider,
590+
"ckan": CKANProvider,
589591
},
590592
config=True,
591593
help="""

binderhub/event-schemas/launch.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,8 @@
1414
"Zenodo",
1515
"Figshare",
1616
"Hydroshare",
17-
"Dataverse"
17+
"Dataverse",
18+
"CKAN"
1819
],
1920
"description": "Provider for the repository being launched"
2021
},

binderhub/main.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
"figshare": "Figshare",
2323
"hydroshare": "Hydroshare",
2424
"dataverse": "Dataverse",
25+
"ckan": "CKAN",
2526
}
2627

2728

binderhub/repoproviders.py

Lines changed: 87 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
import time
1616
import urllib.parse
1717
from datetime import datetime, timedelta, timezone
18-
from urllib.parse import urlparse
18+
from urllib.parse import parse_qs, urlencode, urlparse
1919

2020
import escapism
2121
from prometheus_client import Gauge
@@ -475,6 +475,92 @@ def get_build_slug(self):
475475
return f"hydroshare-{self.record_id}"
476476

477477

478+
class CKANProvider(RepoProvider):
    """Provide contents of a CKAN dataset.

    Users must provide a spec consisting of the CKAN dataset URL.
    The URL may point at the current state of the dataset or at a specific
    activity, either via ``/history/<activity-id>`` in the path or an
    ``activity_id`` query parameter.
    """

    name = Unicode("CKAN")

    display_name = "CKAN dataset"

    labels = {
        "text": "CKAN dataset URL (https://demo.ckan.org/dataset/sample-dataset-1)",
        "tag_text": "Git ref (branch, tag, or commit)",
        "ref_prop_disabled": True,
        "label_prop_disabled": True,
    }

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # The spec is the URL-escaped dataset URL; keep the decoded form.
        self.repo = urllib.parse.unquote(self.spec)

    async def get_resolved_ref(self):
        """Resolve the dataset URL to ``<dataset-id>.v<epoch-seconds>``.

        Queries the CKAN action API (``activity_data_show`` when an activity
        id is present in the URL, ``package_show`` otherwise) and derives the
        version from the ``metadata_modified`` field of the result.

        Sets ``self.dataset_id`` and ``self.record_id`` as side effects.

        Returns:
            The record id string, or ``None`` if the URL is not a dataset URL
            or the API request fails with an ``HTTPError``.
        """
        parsed_repo = urlparse(self.repo)

        # CKAN may be under a URL prefix, and we should accommodate that.
        # Partition on the first "/dataset/" only, so that a later occurrence
        # in the path cannot break the unpacking.
        url_prefix, separator, dataset_url = parsed_repo.path.partition("/dataset/")
        if not separator:
            # Not actually a dataset
            return None

        dataset_url_parts = dataset_url.split("/")
        if not dataset_url_parts[0]:
            # "/dataset/" with nothing after it: there is no dataset to look up
            return None
        self.dataset_id = dataset_url_parts[0]

        # Drop params, query and fragment so the action name and its query
        # string can be appended directly to the API base URL.
        api = parsed_repo._replace(
            path=f"{url_prefix}/api/3/action/", params="", query="", fragment=""
        ).geturl()

        # Activity ID may be present either as a query parameter, activity_id
        # or as part of the URL, under `/history/<activity-id>`. If `/history/`
        # is present, that takes precedence over `activity_id`
        activity_id = None
        # Skip the first segment: it is the dataset id, which may itself be
        # the literal string "history".
        trailing_parts = dataset_url_parts[1:]
        if "history" in trailing_parts:
            history_index = trailing_parts.index("history")
            # Guard against a URL that ends in ".../history"
            if history_index + 1 < len(trailing_parts):
                activity_id = trailing_parts[history_index + 1]
        else:
            activity_ids = parse_qs(parsed_repo.query).get("activity_id")
            if activity_ids:
                activity_id = activity_ids[0]

        if activity_id:
            fetch_url = f"{api}activity_data_show?" + urlencode(
                {"id": activity_id, "object_type": "package"}
            )
        else:
            fetch_url = f"{api}package_show?" + urlencode({"id": self.dataset_id})

        client = AsyncHTTPClient()
        try:
            r = await client.fetch(fetch_url, user_agent="BinderHub")
        except HTTPError:
            return None

        json_response = json.loads(r.body)
        date = json_response["result"]["metadata_modified"]
        # ISO timestamps produced by Python omit the fractional part when the
        # microseconds are zero, so accept both forms.
        date_format = "%Y-%m-%dT%H:%M:%S.%f" if "." in date else "%Y-%m-%dT%H:%M:%S"
        parsed_date = datetime.strptime(date, date_format)
        # The parsed value is naive; interpret it as UTC.
        epoch = parsed_date.replace(tzinfo=timezone.utc).timestamp()
        # truncate the timestamp
        dataset_version = str(int(epoch))

        self.record_id = f"{self.dataset_id}.v{dataset_version}"

        return self.record_id

    async def get_resolved_spec(self):
        """Return the decoded dataset URL, resolving the ref first if needed."""
        if not hasattr(self, "record_id"):
            await self.get_resolved_ref()
        return self.repo

    def get_repo_url(self):
        """Return the decoded dataset URL."""
        return self.repo

    async def get_resolved_ref_url(self):
        """Return the URL for the resolved ref (same as the resolved spec)."""
        resolved_spec = await self.get_resolved_spec()
        return resolved_spec

    def get_build_slug(self):
        """Return ``ckan-<dataset-id>``.

        Reads ``self.dataset_id``, which is only set by ``get_resolved_ref``.
        """
        return f"ckan-{self.dataset_id}"
563+
478564
class GitRepoProvider(RepoProvider):
479565
"""Bare bones git repo provider.
480566

binderhub/static/js/src/form.js

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,8 @@ export function getBuildFormValues() {
3131
providerPrefix === "zenodo" ||
3232
providerPrefix === "figshare" ||
3333
providerPrefix === "dataverse" ||
34-
providerPrefix === "hydroshare"
34+
providerPrefix === "hydroshare" ||
35+
providerPrefix === "ckan"
3536
) {
3637
ref = "";
3738
}

binderhub/tests/test_repoproviders.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from tornado.ioloop import IOLoop
77

88
from binderhub.repoproviders import (
9+
CKANProvider,
910
DataverseProvider,
1011
FigshareProvider,
1112
GistRepoProvider,
@@ -209,6 +210,53 @@ async def test_dataverse(
209210
assert spec == resolved_spec
210211

211212

213+
@pytest.mark.parametrize(
    "spec,resolved_spec,resolved_ref,resolved_ref_url,build_slug",
    [
        [
            "https://demo.ckan.org/dataset/sample-dataset-1",
            "https://demo.ckan.org/dataset/sample-dataset-1",
            "sample-dataset-1.v",
            "https://demo.ckan.org/dataset/sample-dataset-1",
            "ckan-sample-dataset-1",
        ],
        [
            "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f",
            "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f",
            "chart-test.v1717501747",
            "https://demo.datashades.com/dataset/chart-test?activity_id=061888e9-e3c2-4769-b097-9c195a841e2f",
            "ckan-chart-test",
        ],
        [
            "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f",
            "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f",
            "chart-test.v1717501747",
            "https://demo.datashades.com/dataset/chart-test/history/061888e9-e3c2-4769-b097-9c195a841e2f",
            "ckan-chart-test",
        ],
        ["https://demo.ckan.org/group/roger", None, None, None, None],
        ["https://demo.ckan.org/dataset/nosuchdataset", None, None, None, None],
    ],
)
async def test_ckan(spec, resolved_spec, resolved_ref, resolved_ref_url, build_slug):
    """Check CKANProvider resolution for dataset, activity and invalid URLs.

    Cases whose expected ref is ``None`` must not resolve; all other cases
    are checked for ref, build slug, repo URL, ref URL and resolved spec.
    """
    provider = CKANProvider(spec=spec)

    ref = await provider.get_resolved_ref()
    if not resolved_ref:
        # We don't expect to resolve: make sure the provider really gave up
        # instead of silently passing whatever it returned.
        assert ref is None
        return
    # Substring match: the first case only pins the "<dataset-id>.v" prefix,
    # not the timestamp that follows it.
    assert resolved_ref in ref

    slug = provider.get_build_slug()
    assert slug == build_slug
    repo_url = provider.get_repo_url()
    assert repo_url == spec
    ref_url = await provider.get_resolved_ref_url()
    assert ref_url == resolved_ref_url
    # Use a separate name so the `spec` parameter is not rebound.
    actual_resolved_spec = await provider.get_resolved_spec()
    assert actual_resolved_spec == resolved_spec
259+
212260
@pytest.mark.github_api
213261
@pytest.mark.parametrize(
214262
"repo,unresolved_ref,resolved_ref",

docs/source/developer/repoproviders.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,8 @@ Currently supported providers, their prefixes and specs are:
3636
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
3737
| Dataverse | ``dataverse`` | ``<dataverse-DOI>`` | `Dataverse <https://dataverse.org/>`_ is open source research data repository software installed all over the world. |
3838
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
39+
| CKAN | ``ckan`` | ``<url-escaped-url>/<dataset-id>`` | `CKAN <https://ckan.org/>`_ is an open source data management system. |
40+
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
3941
| Git | ``git`` | ``<url-escaped-url>/<commit-sha>`` | A generic repository provider for URLs that point directly to a git repository. |
4042
+------------+--------------------+-------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------+
4143

docs/source/reference/repoproviders.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,11 @@ Module: :mod:`binderhub.repoproviders`
6565
.. autoconfigurable:: DataverseProvider
6666
:members:
6767

68+
:class:`CKANProvider`
69+
---------------------------
70+
71+
.. autoconfigurable:: CKANProvider
72+
:members:
6873

6974
:class:`GitRepoProvider`
7075
---------------------------

0 commit comments

Comments
 (0)