diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 507f94a..302681b 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -21,7 +21,7 @@ jobs: run: | export PATH="/usr/share/miniconda/bin:$PATH" source activate black - pip install black==19.10b0 + pip install black==20.8b1 black --check rse - name: Check imports with pyflakes diff --git a/docs/_docs/getting-started/commands.md b/docs/_docs/getting-started/commands.md index 7eb5651..db6cda4 100644 --- a/docs/_docs/getting-started/commands.md +++ b/docs/_docs/getting-started/commands.md @@ -358,6 +358,64 @@ INFO:rse.main:Database: filesystem 1 github/singularityhub/sregistry ``` +For a filesystem database, you can also search across taxonomy and/or criteria items: + +```bash +$ rse search --taxonomy package +RSE-taxonomy-package-management +1 github/easybuilders/easybuild +2 github/spack/spack + +$ rse search --criteria research +RSE-research-intention +1 github/AA-ALERT/AstroData +2 github/fair-software/howfairis +3 github/BrianAronson/birankr +4 github/3D-e-Chem/knime-sstea +5 github/davidebolo1993/TRiCoLOR +6 github/AA-ALERT/AMBER +7 gitlab/davidtourigny/dynamic-fba +8 github/Sulstice/cocktail-shaker +9 github/spack/spack +10 github/snakemake/snakemake +11 github/potree/PotreeConverter +12 github/Effective-Quadratures/Effective-Quadratures +13 github/3D-e-Chem/knime-pharmacophore +14 github/sunpy/sunpy +15 github/AA-ALERT/frbcatdb +16 github/AA-ALERT/frbcat-web +17 github/Parsl/parsl +18 github/JuliaOpt/JuMP.jl +19 github/AA-ALERT/Dedispersion +20 github/scikit-image/scikit-image +21 github/3D-e-Chem/sygma +22 github/nextflow-io/nextflow +23 gitlab/LouisLab/PiVR +24 github/3D-e-Chem/knime-gpcrdb +25 gitlab/cosmograil/PyCS3 +26 github/sjvrijn/mf2 +27 github/KVSlab/turtleFSI +28 github/ropensci/chirps +29 gitlab/ampere2/metalwalls +``` + +The searches are independent, meaning that you might see the same repository in two +results listings if it has more than one match 
for a given taxonomy or criteria item. +The same is true for adding a search term at the outset: + +```bash +$ rse search singularity --taxonomy package +singularity +1 github/hpcng/singularity +2 github/singularityhub/singularity-compose +3 github/singularityhub/sregistry +4 github/eWaterCycle/setup-singularity + +RSE-taxonomy-package-management +1 github/spack/spack +2 github/easybuilders/easybuild +``` + ## Summary @@ -695,7 +753,7 @@ web-scraping webscraping ``` -Finally, you can provide one or more topics, and find repositories that are labeled +Finally, you can search for one or more topics, and find repositories that are labeled as such: ```bash diff --git a/rse/app/views/repositories.py b/rse/app/views/repositories.py index 0ca10bd..87db54e 100644 --- a/rse/app/views/repositories.py +++ b/rse/app/views/repositories.py @@ -78,7 +78,12 @@ def annotate_criteria(): if last is not None and last.uid != repo.uid: break last = repo - annotation_sets.append((repo, criteria,)) + annotation_sets.append( + ( + repo, + criteria, + ) + ) return render_template( "annotate/criteria.html", diff --git a/rse/client/__init__.py b/rse/client/__init__.py index abaf23e..e8c33f8 100644 --- a/rse/client/__init__.py +++ b/rse/client/__init__.py @@ -16,6 +16,7 @@ import argparse import sys import logging +import os def get_parser(): @@ -49,7 +50,10 @@ def get_parser(): description = "actions for rse" subparsers = parser.add_subparsers( - help="rse actions", title="actions", description=description, dest="command", + help="rse actions", + title="actions", + description=description, + dest="command", ) # print version and exit @@ -106,7 +110,10 @@ def get_parser(): "init", help="Add an rse.ini to the present working directory." 
) init.add_argument( - "path", help="Path to generate rse.ini file", nargs="?", default=".", + "path", + help="Path to generate rse.ini file", + nargs="?", + default=".", ) # Config @@ -173,7 +180,9 @@ def get_parser(): "analyze", help="View metrics for a specific repository." ) analyze.add_argument( - "repo", help="Software repository to show", default=None, + "repo", + help="Software repository to show", + default=None, ) analyze.add_argument( "--ct", @@ -244,13 +253,17 @@ def get_parser(): # Search for software search = subparsers.add_parser( - "search", help="Search for a piece of research software", + "search", + help="Search for a piece of research software", ) search.add_argument("query", nargs="*") + search.add_argument("--taxonomy", nargs="*") + search.add_argument("--criteria", nargs="*") # Scrape for new repos scrape = subparsers.add_parser( - "scrape", help="Add new software repositories from a resource.", + "scrape", + help="Add new software repositories from a resource.", ) scrape.add_argument("scraper_name", nargs=1) scrape.add_argument("query", nargs="?") @@ -423,6 +436,8 @@ def help(return_code=0): args, extra = parser.parse_known_args() # Set the logging level + os.putenv("RSE_LOG_LEVEL", args.log_level) + RSE_LOG_LEVEL = args.log_level logging.basicConfig(level=getattr(logging, args.log_level)) bot = logging.getLogger("rse.client") bot.setLevel(getattr(logging, args.log_level)) diff --git a/rse/client/search.py b/rse/client/search.py index 6f38b64..79ca4d4 100644 --- a/rse/client/search.py +++ b/rse/client/search.py @@ -17,8 +17,15 @@ def main(args, extra): enc = Encyclopedia(config_file=args.config_file, database=args.database) query = " ".join(args.query).strip() - if not query: + + # We can search taxonomy, criteria, or both + taxonomy = args.taxonomy or [] + criteria = args.criteria or [] + if not query and not taxonomy and not criteria: sys.exit("Please provide a query to search for.") - results = enc.search(query) + results = 
enc.search(query, taxonomy=taxonomy, criteria=criteria) if results: - bot.table(results) + for key, listing in results.items(): + bot.info(key) + bot.table(listing) + bot.newline() diff --git a/rse/logger/message.py b/rse/logger/message.py index 52da233..d685ac5 100644 --- a/rse/logger/message.py +++ b/rse/logger/message.py @@ -295,13 +295,14 @@ def is_quiet(self): # Terminal ------------------------------------------ - def table(self, rows, col_width=2): + def table(self, rows, col_width=2, labels=None): """table will print a table of entries. If the rows is a dictionary, the keys are interpreted as column names. if not, a numbered list is used. """ + if not labels: + labels = [str(x) for x in range(1, len(rows) + 1)] - labels = [str(x) for x in range(1, len(rows) + 1)] if isinstance(rows, dict): labels = list(rows.keys()) rows = list(rows.values()) diff --git a/rse/main/__init__.py b/rse/main/__init__.py index 64d4627..94393f5 100644 --- a/rse/main/__init__.py +++ b/rse/main/__init__.py @@ -62,7 +62,7 @@ def initdb(self, database): or "filesystem" ) database_string = self.config.get("DEFAULT", "databaseconnect") - bot.info("Database: %s" % self.database) + bot.debug("Database: %s" % self.database) # Supported database options valid = ("sqlite", "postgresql", "mysql+pymysql", "filesystem") @@ -196,12 +196,12 @@ def label(self, uid, key, value, force=False): except RepoNotFoundError: bot.error(f"{uid} does not exist.") - def search(self, query): + def search(self, query, taxonomy=None, criteria=None): """Search across commands and general metadata for a string of interest. We use regular expressions (re.search) so they are supported. Search is only available for non-filesystem databases. 
""" - results = self.db.search(query) + results = self.db.search(query, taxonomy=taxonomy, criteria=criteria) if results: return results bot.info(f"No results matching {query}") diff --git a/rse/main/database/filesystem.py b/rse/main/database/filesystem.py index a39abcf..0421caa 100644 --- a/rse/main/database/filesystem.py +++ b/rse/main/database/filesystem.py @@ -36,21 +36,19 @@ class FileSystemDatabase(Database): - """A FileSystemDatabase writes raw json to files to a database. - """ + """A FileSystemDatabase writes raw json to files to a database.""" database = "filesystem" def __init__(self, config_dir, config=None, **kwargs): - """init for the filesystem ensures that the base folder (named - according to the studyid) exists. + """init for the filesystem ensures that the base folder (named + according to the studyid) exists. """ self.config = config self.create_database(config_dir) def create_database(self, config_dir): - """Create the database. The parent folder must exist. - """ + """Create the database. The parent folder must exist.""" self.data_base = os.path.abspath(os.path.join(config_dir, "database")) if not os.path.exists(config_dir): raise DirectoryNotFoundError( @@ -62,8 +60,7 @@ def create_database(self, config_dir): # Global def clear(self): - """clear (delete) all software repositories. - """ + """clear (delete) all software repositories.""" for parser_dir in self.iter_parsers(fullpath=True): if os.path.exists(parser_dir): bot.info(f"Removing {parser_dir}") @@ -73,8 +70,7 @@ def clear(self): # Get, delete, etc. only require uid def exists(self, uid): - """Determine if a repo exists. - """ + """Determine if a repo exists.""" try: self.get(uid, exact=True) return True @@ -82,8 +78,7 @@ def exists(self, uid): return False def add(self, uid): - """Add a new software repository to the database. 
- """ + """Add a new software repository to the database.""" if uid: parser = get_parser(uid, config=self.config) data = parser.get_metadata() @@ -100,8 +95,7 @@ def add(self, uid): bot.error("Please define a unique identifier to add.") def get_or_create(self, uid): - """Determine if a repo exists. - """ + """Determine if a repo exists.""" try: repo = self.get(uid, exact=True) except: @@ -110,8 +104,8 @@ def get_or_create(self, uid): def get(self, uid=None, exact=False): """Get a software repo based on a uid. If exact is not needed, we can - search for a match based on the partial uid. If exact is False, - and a uid is not provided, get the last repository created. + search for a match based on the partial uid. If exact is False, + and a uid is not provided, get the last repository created. """ if not uid and not exact: repos = get_latest_modified(self.data_base, pattern="metadata*.json") @@ -128,8 +122,8 @@ def get(self, uid=None, exact=False): return SoftwareRepository(parser, exists=True, data_base=self.data_base) def update(self, repo, rewrite=False): - """Update a repository by retrieving metadata, and then calling update - on the software repository to save it. + """Update a repository by retrieving metadata, and then calling update + on the software repository to save it. """ data = repo.parser.get_metadata() if data: @@ -139,8 +133,7 @@ def update(self, repo, rewrite=False): repo.update(updates=data) def label(self, repo, key, value, force=False): - """Update a repository with a specific key/value pair. - """ + """Update a repository with a specific key/value pair.""" if key in repo.data and not force: raise RuntimeError( f"{key} is already defined for {repo.uid}. Use --force to overwrite." 
def search(self, query, taxonomy=None, criteria=None):
    """Search repositories by uid, taxonomy tags, and criteria identifiers.

    A filesystem search can only match the query against the first column
    of each row returned by list_repos (used here as the repository uid).
    For taxonomy and criteria items, the annotations of every repository
    are loaded and searched. All matching uses re.search with
    re.IGNORECASE, so regular expressions are supported.

    The results are organized by the query, taxonomy tag, and criteria
    identifier that matched. The listings are independent (e.g., a single
    repo can appear in more than one list).

    Arguments:
      query (str)     : pattern to match against repository uids. If
                        empty, no uid search is done.
      taxonomy (list) : patterns to match against taxonomy tags. A tag
                        matches if ANY of the patterns is found.
      criteria (list) : patterns to match against criteria identifiers.
                        A criteria only counts for a repository if at
                        least one response for it is "yes".

    Returns a dictionary. The key is the query string, taxonomy tag, or
    criteria identifier, and the value is a list of single item rows
    ([uid]) sorted by uid, ready to be printed as a table. Keys are only
    present when there is at least one match.
    """
    results = {}

    # Compile once, outside of the loop. Note that "(%s)" % "|".join(terms)
    # is required to build the alternation "(a|b|c)". Writing the two
    # literals next to one another ("(%s)" "|") concatenates them into the
    # *separator* for join, and only the last term would ever match.
    query_regex = re.compile(query, re.IGNORECASE) if query else None
    taxonomy_regex = None
    if taxonomy:
        taxonomy_regex = re.compile("(%s)" % "|".join(taxonomy), re.IGNORECASE)
    criteria_regex = None
    if criteria:
        criteria_regex = re.compile("(%s)" % "|".join(criteria), re.IGNORECASE)

    for row in self.list_repos():
        uid = row[0]

        if query_regex is not None and query_regex.search(uid):
            results.setdefault(query, set()).add(uid)

        # Only load the repository when annotations need to be inspected
        if taxonomy_regex is None and criteria_regex is None:
            continue
        repo = self.get(uid)

        # Add taxonomy items (the lookup keys are not needed, only tags)
        if taxonomy_regex is not None:
            for tags in repo.load_taxonomy().values():
                for tag in tags:
                    if taxonomy_regex.search(tag):
                        results.setdefault(tag, set()).add(repo.uid)

        # Add criteria items that have at least one "yes" response
        if criteria_regex is not None:
            for term, annotations in repo.load_criteria().items():
                if criteria_regex.search(term) and "yes" in annotations.values():
                    results.setdefault(term, set()).add(repo.uid)

    # Sets were used to de-duplicate, sort them so output is deterministic
    return {key: [[uid] for uid in sorted(uids)] for key, uids in results.items()}
- """ + """delete a repo based on a specific identifier.""" if self.exists(uid): repo = self.get(uid) os.remove(repo.filename) @@ -176,8 +206,7 @@ def delete_repo(self, uid): return False def delete_parser(self, name): - """delete all repos for a parser, based on executor's name (str). - """ + """delete all repos for a parser, based on executor's name (str).""" parser_dir = os.path.join(self.data_base, name) if not os.path.exists(parser_dir): bot.info(f"Executor {parser_dir} directory does not exist.") @@ -186,8 +215,7 @@ def delete_parser(self, name): return True def iter_parsers(self, fullpath=False): - """list executors based on the subfolders in the base database folder. - """ + """list executors based on the subfolders in the base database folder.""" for contender in os.listdir(self.data_base): contender = os.path.join(self.data_base, contender) if os.path.isdir(contender): @@ -198,8 +226,8 @@ def iter_parsers(self, fullpath=False): def list_repos(self, name=None): """list software repositories, either under a particular parser name - or just under all parsers. This returns repos in rows to be printed - (or otherwise parsed). + or just under all parsers. This returns repos in rows to be printed + (or otherwise parsed). """ listpath = self.data_base if name: @@ -217,21 +245,21 @@ def list_repos(self, name=None): class SoftwareRepository: - """A software repository is a filesystem representation of a repo. It can - take a uid, determine if the repo exists, and then interact with the - metadata for it. If the repo is instantiated without a unique id - it is assumed to not exist yet, otherwise it must already - exist. + """A software repository is a filesystem representation of a repo. It can + take a uid, determine if the repo exists, and then interact with the + metadata for it. If the repo is instantiated without a unique id + it is assumed to not exist yet, otherwise it must already + exist. 
""" def __init__(self, parser, data_base, exists=False): """A SoftwareRepository holds some uid for a parser, and controls - interaction with the filesystem. + interaction with the filesystem. - Arguments: - parser (str) : the parser - data_base (str) : the path where the database exists. - exists (bool) : if True, must already exists (default is False) + Arguments: + parser (str) : the parser + data_base (str) : the path where the database exists. + exists (bool) : if True, must already exists (default is False) """ self.uid = parser.uid self.parser = parser @@ -262,16 +290,15 @@ def parser_dir(self): return os.path.join(self.data_base, self.parser.uid) def update(self, updates=None): - """Update a data file. This means reading, updating, and writing. - """ + """Update a data file. This means reading, updating, and writing.""" updates = updates or {} self.data.update(updates) self.save() def update_criteria(self, uid, username, response): - """Update a criteria, meaning adding a True/False answer to the - unique id for the user. We are currently assuming that criteria - have yes/no responses, and True == yes, False == no. + """Update a criteria, meaning adding a True/False answer to the + unique id for the user. We are currently assuming that criteria + have yes/no responses, and True == yes, False == no. """ if uid not in self.criteria: self.criteria[uid] = {} @@ -280,7 +307,7 @@ def update_criteria(self, uid, username, response): def create(self, should_exist=False): """create the filename if it doesn't exist, otherwise if it should (and - does not) exit on error. + does not) exit on error. 
""" if should_exist: if not os.path.exists(self.filename): @@ -289,7 +316,9 @@ def create(self, should_exist=False): contenders = glob("%s*" % os.path.join(self.data_base, self.parser.uid)) if len(contenders) == 1: self.parser.uid = re.sub( - "(%s/|[.]json)" % self.data_base, "", contenders[0], + "(%s/|[.]json)" % self.data_base, + "", + contenders[0], ) elif len(contenders) > 1: @@ -314,21 +343,18 @@ def create(self, should_exist=False): self.save() def export(self): - """wrapper to expose the executor.export function - """ + """wrapper to expose the executor.export function""" return self.parser.export() def save(self): - """Save a json object (metadata.json) for the software repository. - """ + """Save a json object (metadata.json) for the software repository.""" write_json(self.data, self.filename) def summary(self): return self.parser.summary() def load(self): - """Given a software uid, load data from filename. - """ + """Given a software uid, load data from filename.""" if os.path.exists(self.filename): return read_json(self.filename) @@ -341,8 +367,7 @@ def get_taxonomy(self): return self.taxonomy def load_criteria(self): - """Given a repository directory, load criteria files if they exist - """ + """Given a repository directory, load criteria files if they exist""" criteria = {} for filename in glob(f"{self.parser_dir}/criteria*.tsv"): uid = ( @@ -361,7 +386,7 @@ def load_criteria(self): def save_criteria(self): """Save criteria to file. Each file is named based on the criteria id, - and is a tab separated file that includes the username and response. + and is a tab separated file that includes the username and response. 
""" for uid, responses in self.criteria.items(): filename = os.path.join(self.parser_dir, "criteria-%s.tsv" % uid) @@ -372,11 +397,11 @@ def save_criteria(self): def load_taxonomy(self): """Given a repository directory, load taxonomy annotations if they exist - The taxonomy.tsv file should be a tab separated file with: - username category-unique-id. This means that we keep a record of - who has categorized what, and load this information into the - taxonomy dictionary (organized by the category-unique-id which - then has a total count and list of users). + The taxonomy.tsv file should be a tab separated file with: + username category-unique-id. This means that we keep a record of + who has categorized what, and load this information into the + taxonomy dictionary (organized by the category-unique-id which + then has a total count and list of users). """ taxonomy = {} taxonomy_file = os.path.join(self.parser_dir, "taxonomy.tsv") @@ -392,7 +417,7 @@ def load_taxonomy(self): def save_taxonomy(self): """Save taxonomy to file. Each file is named taxonomy.tsv, - and is a tab separated file that includes the username and response. + and is a tab separated file that includes the username and response. """ filename = os.path.join(self.parser_dir, "taxonomy.tsv") rows = ["%s\t%s" % (k, ",".join(v)) for k, v in sorted(self.taxonomy.items())] @@ -402,8 +427,7 @@ def save_taxonomy(self): # Annotation def has_criteria_annotation(self, uid, username): - """Determine if a repository has been annotated by a user. - """ + """Determine if a repository has been annotated by a user.""" if uid not in self.criteria: return False if username not in self.criteria[uid]: @@ -411,8 +435,7 @@ def has_criteria_annotation(self, uid, username): return True def has_taxonomy_annotation(self, username): - """Determine if a repository has been annotated by a user. 
def search(self, query, taxonomy=None, criteria=None):
    """Search across the database for a particular query.

    The query is wrapped in SQL wildcards and matched, case insensitive
    (ilike), against the data and uid columns of SoftwareRepository.

    Arguments:
      query (str)     : the string to search for. Required, an empty
                        query returns an empty result.
      taxonomy (list) : accepted so the signature matches the other
                        database backends, not used by this search.
      criteria (list) : accepted so the signature matches the other
                        database backends, not used by this search.

    Returns a dictionary (lookup) of results. It is empty when there is no
    query or nothing matched, otherwise it has the query as the single
    key and a list of rows as the value.
    """
    from rse.main.database.models import SoftwareRepository

    # Required to have a query
    if not query:
        return {}

    # Ensure that query can be part of a larger string
    expression = "%" + query + "%"

    statement = self.session.query(SoftwareRepository).filter(
        or_(
            SoftwareRepository.data.ilike(expression),
            SoftwareRepository.uid.ilike(expression),
        )
    )
    rows = self.session.execute(statement).fetchall()

    # Return an empty lookup (and not {query: []}, which is truthy) so that
    # a caller checking "if results" correctly sees that nothing matched.
    if not rows:
        return {}

    # Each row is reported as [first, third, second] column of the result.
    # NOTE(review): the original comment describes rows as
    # (uid, datetime, executor) - confirm against the model.
    return {query: [[r[0], str(r[2]), str(r[1])] for r in rows]}