-
Notifications
You must be signed in to change notification settings - Fork 366
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[MRG] Add Figshare content provider (#788)
[MRG] Add Figshare content provider
- Loading branch information
Showing
10 changed files
with
439 additions
and
156 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
from .git import Git | ||
from .base import Local | ||
from .zenodo import Zenodo | ||
from .figshare import Figshare |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import os | ||
import json | ||
import shutil | ||
import logging | ||
|
||
from os import makedirs | ||
from os import path | ||
from urllib import request # urlopen, Request | ||
from urllib.error import HTTPError | ||
from zipfile import ZipFile, is_zipfile | ||
|
||
from .base import ContentProvider | ||
from ..utils import copytree, deep_get | ||
from ..utils import normalize_doi, is_doi | ||
from .. import __version__ | ||
|
||
|
||
class DoiProvider(ContentProvider):
    """Provide contents of a repository identified by a DOI and some helper functions."""

    def urlopen(self, req, headers=None):
        """Open `req` with the repo2docker User-Agent header set.

        `req` may be a `urllib.request.Request` or a plain URL string; a
        string is wrapped in a `Request` first. Entries of the optional
        `headers` mapping are added to the request after the User-Agent.

        Returns whatever `urllib.request.urlopen` returns for the request.
        """
        # someone passed a string, not a request
        if not isinstance(req, request.Request):
            req = request.Request(req)

        req.add_header("User-Agent", "repo2docker {}".format(__version__))
        if headers is not None:
            for key, value in headers.items():
                req.add_header(key, value)

        return request.urlopen(req)

    def doi2url(self, doi):
        """Transform a DOI into the URL it resolves to.

        If `doi` is not a DOI it is assumed to already be a URL and is
        returned unchanged. If the DOI does not resolve (doi.org answers
        with an HTTP error) the normalized DOI is returned instead.
        """
        if not is_doi(doi):
            # Just return what is actually just a URL
            return doi

        doi = normalize_doi(doi)
        try:
            resp = self.urlopen("https://doi.org/{}".format(doi))
        except HTTPError:
            # If the DOI doesn't resolve, just return the (normalized) DOI
            return doi

        # Only the final URL is needed; release the connection right away.
        # try/finally (rather than `with`) keeps this working with stubbed
        # urlopen() results that only provide `.url` and `.close()`.
        try:
            return resp.url
        finally:
            resp.close()

    def fetch_file(self, file_ref, host, output_dir, unzip=False):
        """Download one file of a record into `output_dir`.

        This is a generator yielding human-readable progress messages.

        `file_ref` is the metadata entry of the file and `host` the host
        description; the download URL and the file name are looked up in
        `file_ref` via `deep_get` using `host["download"]` and
        `host["filename"]`.

        With `unzip=True` a downloaded zip archive is extracted into
        `output_dir` and the archive itself is deleted. The assumption is
        that `unzip=True` means that this is the only file related to a
        record. If the extraction leaves exactly one directory in
        `output_dir`, its contents are moved up to the top level.
        """
        file_url = deep_get(file_ref, host["download"])
        fname = deep_get(file_ref, host["filename"])
        # NOTE(review): fname comes from remote metadata and is joined onto
        # output_dir unchecked -- confirm that hosts cannot return names
        # containing ".." components.
        logging.debug("Downloading file {} as {}\n".format(file_url, fname))
        with self.urlopen(file_url) as src:
            if path.dirname(fname):
                sub_dir = path.join(output_dir, path.dirname(fname))
                if not path.exists(sub_dir):
                    yield "Creating {}\n".format(sub_dir)
                    makedirs(sub_dir, exist_ok=True)

            dst_fname = path.join(output_dir, fname)
            with open(dst_fname, "wb") as dst:
                yield "Fetching {}\n".format(fname)
                shutil.copyfileobj(src, dst)
            # first close the newly written file, then continue
            # processing it
            if unzip and is_zipfile(dst_fname):
                yield "Extracting {}\n".format(fname)
                # context manager closes the archive even if extraction fails
                with ZipFile(dst_fname) as zfile:
                    zfile.extractall(path=output_dir)

                # delete downloaded file ...
                os.remove(dst_fname)
                # ... and any directories we might have created,
                # in which case sub_dir will be defined
                if path.dirname(fname):
                    shutil.rmtree(sub_dir)

                new_subdirs = os.listdir(output_dir)
                # if there is only one new subdirectory move its contents
                # to the top level directory; a lone regular file is left
                # where it is (copytree/rmtree only make sense on a directory)
                if len(new_subdirs) == 1:
                    only_entry = path.join(output_dir, new_subdirs[0])
                    if path.isdir(only_entry):
                        copytree(only_entry, output_dir)
                        shutil.rmtree(only_entry)

            yield "Fetched files: {}\n".format(os.listdir(output_dir))
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,97 @@ | ||
import os | ||
import re | ||
import json | ||
import shutil | ||
|
||
from os import makedirs | ||
from os import path | ||
from urllib.request import Request | ||
from urllib.error import HTTPError | ||
from zipfile import is_zipfile | ||
|
||
from .doi import DoiProvider | ||
from ..utils import copytree, deep_get | ||
|
||
|
||
class Figshare(DoiProvider):
    """Provide contents of a Figshare article.

    See https://docs.figshare.com/#public_article for API docs.

    Examples:
      - https://doi.org/10.6084/m9.figshare.9782777
      - https://doi.org/10.6084/m9.figshare.9782777.v2
      - https://figshare.com/articles/binder-examples_requirements/9784088 (only one zipfile, no DOI)
    """

    # Capture groups: (1) everything before "/articles/", (2) the path
    # segment following it, (3) the numeric article id, (4) an optional
    # slash and (5) an optional numeric version.
    url_regex = re.compile(r"(.*)/articles/([^/]+)/(\d+)(/)?(\d+)?")

    def __init__(self):
        super().__init__()
        # For each host we need the hostname (url prefixes where records
        # are), api url (for metadata), filepath (path to files in
        # metadata), filename (path to filename in metadata) and download
        # (path to file download URL in metadata).
        self.hosts = [
            {
                "hostname": [
                    "https://figshare.com/articles/",
                    "http://figshare.com/articles/",
                    "https://figshare.com/account/articles/",
                ],
                "api": "https://api.figshare.com/v2/articles/",
                "filepath": "files",
                "filename": "name",
                "download": "download_url",
            }
        ]

    def detect(self, doi, ref=None, extra_args=None):
        """Trigger this provider for things that resolve to a Figshare article.

        `doi` may be a DOI or a URL. Returns a spec dict with the keys
        "article", "host" and "version" when the resolved URL belongs to a
        known host and matches `url_regex`, otherwise None. A missing
        version defaults to "1". The article id and version are also stored
        on the instance for `content_id`.
        """
        url = self.doi2url(doi)

        for host in self.hosts:
            # str.startswith accepts a tuple of prefixes
            if not url.startswith(tuple(host["hostname"])):
                continue

            match = self.url_regex.match(url)
            if not match:
                return None

            self.article_id = match.group(3)
            self.article_version = match.group(5) or "1"
            return {
                "article": self.article_id,
                "host": host,
                "version": self.article_version,
            }

        return None

    def fetch(self, spec, output_dir, yield_output=False):
        """Fetch and unpack a Figshare article.

        `spec` is the dict returned by `detect`. This is a generator
        yielding progress messages. The article metadata is requested from
        the host's API and every file not flagged `is_link_only` is
        downloaded into `output_dir`. A file is unzipped only if it is the
        single file of the article and its name ends in ".zip".
        """
        article_id = spec["article"]
        article_version = spec["version"]
        host = spec["host"]

        yield "Fetching Figshare article {} in version {}.\n".format(
            article_id, article_version
        )
        req = Request(
            "{}{}/versions/{}".format(host["api"], article_id, article_version),
            headers={"accept": "application/json"},
        )
        resp = self.urlopen(req)
        # make sure the metadata connection is released once it has been read
        try:
            article = json.loads(resp.read().decode("utf-8"))
        finally:
            resp.close()

        files = deep_get(article, host["filepath"])
        # only fetch files where is_link_only: False
        files = [entry for entry in files if not entry["is_link_only"]]
        only_one_file = len(files) == 1
        for file_ref in files:
            unzip = file_ref["name"].endswith(".zip") and only_one_file
            for line in self.fetch_file(file_ref, host, output_dir, unzip):
                yield line

    @property
    def content_id(self):
        """The Figshare article ID, in the form "<article id>.v<version>"."""
        return "{}.v{}".format(self.article_id, self.article_version)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
import json | ||
import os | ||
import re | ||
import urllib | ||
import pytest | ||
import tempfile | ||
import logging | ||
|
||
from unittest.mock import patch, MagicMock, mock_open | ||
from zipfile import ZipFile | ||
|
||
from repo2docker.contentproviders.doi import DoiProvider | ||
from repo2docker.contentproviders.base import ContentProviderException | ||
|
||
|
||
def test_content_id():
    """A freshly constructed DoiProvider reports no content id."""
    provider = DoiProvider()
    reported_id = provider.content_id
    assert reported_id is None
|
||
|
||
def fake_urlopen(req):
    """Stand-in for ``urllib.request.urlopen`` used by the tests.

    Prints the request and hands back its headers instead of performing
    any network access.
    """
    sent_headers = req.headers
    print(req)
    return sent_headers
|
||
|
||
@patch("urllib.request.urlopen", fake_urlopen)
def test_url_headers():
    """DoiProvider.urlopen sends the caller's headers plus a User-Agent."""
    doi = DoiProvider()

    headers = {"test1": "value1", "Test2": "value2"}
    # fake_urlopen returns the headers of the request it was given
    result = doi.urlopen("https://mybinder.org", headers=headers)
    # the asserted keys are capitalized, unlike the lower-case "test1" passed in
    assert "Test1" in result
    assert "Test2" in result
    # compare by value: `is` on an int literal tests object identity
    assert len(result) == 3  # User-agent is also set
|
||
|
||
def test_unresolving_doi():
    """A DOI that does not resolve is handed back by doi2url."""
    doi = DoiProvider()

    fakedoi = "10.1/1234"
    # compare by value; identity would only hold if the very same string
    # object happened to be returned
    assert doi.doi2url(fakedoi) == fakedoi
Oops, something went wrong.