diff --git a/CHANGELOG.md b/CHANGELOG.md index 1719555..7acd2a5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are: The versions coincide with releases on pip. ## [0.2.x](https://github.com/rse/rse/tree/master) (0.0.x) + - allowing Zenodo parser to hand off to GitLab or GitHub (0.0.16) - adding import of static issue (markdown) files for annotation (0.0.15) - adding generation of data.json to static site export (0.0.14) - web interface needs software (or other custom) prefix for export (0.0.13) diff --git a/docs/_docs/getting-started/parsers.md b/docs/_docs/getting-started/parsers.md index ace0401..c77e0b0 100644 --- a/docs/_docs/getting-started/parsers.md +++ b/docs/_docs/getting-started/parsers.md @@ -21,16 +21,14 @@ this means version control systems where software is stored. - [Base Parser](#base) - [GitHub Parser](#github) - [GitLab parser](#gitlab) + - [Zenodo Parser](#zenodo) -**Secondary Parsers** -A secondary parser is available as a tool to extract data from a resource, but -isn't exposed via the `rse` command line client. You might want to use these -parsers for your own analysis, although they aren't supported for the research -software encyclopedia core database, which uses version control systems as the -source of truth. +For the parsers above, those with version controlled code are considered sources of +truth. For parsers like Zenodo, we look for a GitHub or GitLab URL, and add an entry +to the database only if one is found. The user is free to use the Zenodo Parser +outside of the rse to bypass this requirement. - - [Zenodo Parser](#zenodo) ## The Parser Base @@ -327,14 +325,33 @@ data = parser.get_metadata() The "zenodo" parser is intended to parse a Zenodo DOI or url into a software repository, and we use the [Zenodo API](https://developers.zenodo.org/) to handle this. -Only entries that are classified as "software" are allowed. 
A `RSE_ZENODO_TOKEN` is required -to be exported to the environment, and you can generate one under your [account application settings](https://zenodo.org/account/settings/applications/). + A `RSE_ZENODO_TOKEN` is required to be exported to the environment, and you can generate one under your [account application settings](https://zenodo.org/account/settings/applications/). ```bash export RSE_ZENODO_TOKEN=123456....... ``` #### Example Usage + +To use the Zenodo parser with the Research Software Encyclopedia, you can try +adding the DOI. If there is a GitHub or GitLab record associated, it will +be added, and the Zenodo DOI will be included with it. + +```bash +$ rse add 10.5281/zenodo.3819202 +INFO:rse.main:Database: filesystem +INFO:rse.main.database.filesystem:github/CLARIAH/grlc was added to the the database. +``` + +On the other hand, if you try to add a record that doesn't have a GitHub or GitLab identifier, +you'll see this response: + +```bash +$ rse add 10.5281/zenodo.1012531 +INFO:rse.main:Database: filesystem +WARNING:rse.main.parsers.zenodo:Repository url not found with Zenodo record, skipping add. +``` + Example usage of the parser outside of the Encyclopedia might look like the following. If you want to instantiate an empty parser (not associated with a software repository) you can do that as follows: @@ -348,7 +365,6 @@ However, it's more likely that you want to parse a specific repository. Let's sa that we want to parse the [Singularity Registry](https://zenodo.org/record/1012531#.Xu5OOZZME5k) record on Zenodo. We need to provide the DOI to do this: - ```python from rse.main.parsers import ZenodoParser @@ -383,10 +399,12 @@ parser.uid Once the identifier is loaded, you can parse updated metadata for it. Note that you can define an `RSE_ZENODO_TOKEN` to be set in the environment if you want to potentially increase your API limits. -You can then get the metadata about the archive: +You can then get the metadata about the archive. 
Note that if the record +doesn't have a GitHub association (and you want to return the Zenodo response) you +need to set `require_repo` to False: ```python -data = parser.get_metadata() +data = parser.get_metadata(require_repo=False) {'conceptdoi': '10.5281/zenodo.1012530', 'conceptrecid': '1012530', @@ -462,5 +480,7 @@ data = parser.get_metadata() 'updated': '2020-01-25T07:25:02.258480+00:00'} ``` +If you set it to true, None will be returned if there is no GitHub association. +If there is, you'll get back a GitHub parser with metadata and the added DOI. You might next want to learn about the interactive [dashboard]({{ site.baseurl }}/getting-started/dashboard/). diff --git a/rse/client/__init__.py b/rse/client/__init__.py index 4ecaf44..2adfe49 100644 --- a/rse/client/__init__.py +++ b/rse/client/__init__.py @@ -364,7 +364,6 @@ def help(return_code=0): from .start import main # Pass on to the correct parser - return_code = 0 main(args=args, extra=extra) diff --git a/rse/main/database/filesystem.py b/rse/main/database/filesystem.py index 0685109..fccb246 100644 --- a/rse/main/database/filesystem.py +++ b/rse/main/database/filesystem.py @@ -25,6 +25,7 @@ ) from rse.main.database.base import Database from rse.main.parsers import get_parser +from rse.main.parsers.base import ParserBase from glob import glob import logging import shutil @@ -86,8 +87,14 @@ def add(self, uid): if uid: parser = get_parser(uid, config=self.config) data = parser.get_metadata() + + # If it's a parser handoff + if isinstance(data, ParserBase): + parser = data + data = parser.data + if data: - bot.info(f"{uid} was added to the the database.") + bot.info(f"{parser.uid} was added to the the database.") return SoftwareRepository(parser, data_base=self.data_base) else: bot.error("Please define a unique identifier to add.") diff --git a/rse/main/database/relational.py b/rse/main/database/relational.py index 186d7e8..7162e28 100644 --- a/rse/main/database/relational.py +++ 
b/rse/main/database/relational.py @@ -16,6 +16,7 @@ ) from rse.main.database.base import Database from rse.main.parsers import get_parser +from rse.main.parsers.base import ParserBase from sqlalchemy import create_engine, desc from sqlalchemy.orm import scoped_session, sessionmaker @@ -107,6 +108,12 @@ def add(self, uid): parser = get_parser(uid, config=self.config) if not self.exists(parser.uid): data = parser.get_metadata() + + # If it's a parser handoff + if isinstance(data, ParserBase): + parser = data + data = parser.data + if data: repo = SoftwareRepository( uid=parser.uid, parser=parser.name, data=json.dumps(parser.export()) diff --git a/rse/main/parsers/__init__.py b/rse/main/parsers/__init__.py index c924d9a..867905d 100644 --- a/rse/main/parsers/__init__.py +++ b/rse/main/parsers/__init__.py @@ -32,6 +32,8 @@ def get_parser(uri, config=None): parser = GitHubParser(uri) if matches(GitLabParser, uri): parser = GitLabParser(uri) + if matches(ZenodoParser, uri): + parser = ZenodoParser(uri) if not parser: raise NotImplementedError(f"There is no matching parser for {uri}") diff --git a/rse/main/parsers/zenodo.py b/rse/main/parsers/zenodo.py index 9a2066d..e319f18 100644 --- a/rse/main/parsers/zenodo.py +++ b/rse/main/parsers/zenodo.py @@ -58,14 +58,20 @@ def get_description(self, data=None): data = data or self.data return data.get("metadata", {}).get("description") - def get_metadata(self, uri=None): + def get_metadata(self, uri=None, require_repo=True): """Retrieve repository metadata. The common metadata (timestamp) is added by the software repository parser, and here we need to ensure that the url field is populated with a correct url. Arguments: uri (str) : a repository uri string to override one currently set + require_repo (bool) : require a repository to parse. 
""" + from rse.main.parsers import get_parser + from rse.utils.urls import repository_regex + + repository_regex = repository_regex.rstrip("$") + if uri: self.set_uri(uri) self.load_secrets() @@ -85,6 +91,29 @@ def get_metadata(self, uri=None): # Successful query! if response.status_code == 200: self.data = response.json() + + # For Zenodo, we require a GitHub or GitLab related identifier to add + repo_url = None + for identifier in self.data["metadata"].get("related_identifiers", []): + match = re.search(repository_regex, identifier["identifier"]) + if match: + repo_url = "https://%s" % match.group() + break + + # If we return None, the entry is not added + if repo_url is None and require_repo is True: + bot.warning( + "Repository url not found with Zenodo record, skipping add." + ) + return repo_url + + # Convert the class into another parser type + elif repo_url is not None: + uid = self.uid + self = get_parser(repo_url) + self.get_metadata() + self.data["doi"] = uid + return self return self.data elif response.status_code == 404: diff --git a/rse/version.py b/rse/version.py index c2e4b44..953fa5b 100644 --- a/rse/version.py +++ b/rse/version.py @@ -8,7 +8,7 @@ """ -__version__ = "0.0.15" +__version__ = "0.0.16" AUTHOR = "Vanessa Sochat" AUTHOR_EMAIL = "vsochat@stanford.edu" NAME = "rse" diff --git a/tests/test_parser_zenodo.py b/tests/test_parser_zenodo.py index a6ca88e..fcd389e 100644 --- a/tests/test_parser_zenodo.py +++ b/tests/test_parser_zenodo.py @@ -27,7 +27,8 @@ def test_parser_zenodo(tmp_path): assert parser.summary() # Only test one get of data - assert parser.get_metadata() + assert not parser.get_metadata() + assert parser.get_metadata(require_repo=False) data = parser.export() for key in ["timestamp", "doi", "links", "metadata"]: assert key in data