From f1b1a857a6f7b0efa59a7d912a673517afb84845 Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Mon, 12 Jun 2023 12:40:55 -0500 Subject: [PATCH 1/7] Deduplicate test names --- tests/test_baseRoutines.py | 4 ++-- tests/test_data.py | 2 +- tests/test_manager.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_baseRoutines.py b/tests/test_baseRoutines.py index c8b5ee7..bf70128 100644 --- a/tests/test_baseRoutines.py +++ b/tests/test_baseRoutines.py @@ -103,8 +103,8 @@ def test_ExternalCommandLineToolRoutine_canSetConfigurationParameters(mocker): assert(genericRoutine.getConfigurationParameters() == configurationParameters) -def test_OnlineRepositoryRoutine_isConstructibleWithMockImplementation(mocker): - mocker.patch.multiple(routines.OnlineRepositoryRoutine, __abstractmethods__=set()) +def test_OfflineRepositoryRoutine_isConstructibleWithMockImplementation(mocker): + mocker.patch.multiple(routines.OfflineRepositoryRoutine, __abstractmethods__=set()) genericRoutine = routines.OfflineRepositoryRoutine() diff --git a/tests/test_data.py b/tests/test_data.py index 82c7dd1..d44da23 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -159,7 +159,7 @@ def test_YAMLData_isDirectlyConstructible(): dataEntity = data.YAMLData("test.yaml") -def test_AnnotatedCSVData_isConstructibleByFactory(): +def test_AnnotatedYAMLData_isConstructibleByFactory(): factory = data.DataEntityFactory() factory.createYAMLData("test.yaml") diff --git a/tests/test_manager.py b/tests/test_manager.py index f56b1f5..e4fa9e6 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -29,7 +29,7 @@ def test_ManagerAnalysisTask_isDirectlyConstructible(): task = management.ManagerAnalysisTask(request=requests.AnalysisRequestModel()) -def test_ManagerRepositoryRoutineTask_isConstructibleByFactory(): +def test_ManagerAnalysisTask_isConstructibleByFactory(): factory = management.TaskFactory() task = factory.createManagerAnalysisTask(request=requests.AnalysisRequestModel()) From da96bf153ed34b19bbe415f1bf610496da33c3c8 Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Mon, 12 Jun 2023 13:18:58 -0500 Subject: [PATCH 2/7] Reduce code --- src/reposcanner/contrib.py | 79 ++++++-------------------------------- src/reposcanner/dummy.py | 19 +-------- src/reposcanner/git.py | 15 ++------ src/util.py | 5 +++ 4 files changed, 22 insertions(+), 96 deletions(-) create mode 100644 src/util.py diff --git a/src/reposcanner/contrib.py b/src/reposcanner/contrib.py index 5417ad1..7bc9748 100644 --- a/src/reposcanner/contrib.py +++ b/src/reposcanner/contrib.py @@ -4,6 +4,7 @@ from reposcanner.response import ResponseFactory from reposcanner.provenance import ReposcannerRunInformant from reposcanner.data import DataEntityFactory +from reposcanner.util import replaceNoneWithEmptyString as _replaceNoneWithEmptyString import pygit2 from pathlib import Path @@ -21,8 +22,7 @@ class CommitInfoMiningRoutineRequest(OfflineRoutineRequest): - def __init__(self, repositoryURL, outputDirectory, workspaceDirectory): - super().__init__(repositoryURL, outputDirectory, workspaceDirectory) + pass class CommitInfoMiningRoutine(OfflineRepositoryRoutine): @@ -126,16 +126,8 @@ def _getStats(commit): changes['files'] += diff.stats.files_changed return changes - def _replaceNoneWithEmptyString(value): - if value is None: - return "" - else: - return value - for commit in session.walk(session.head.target, pygit2.GIT_SORT_TIME | pygit2.GIT_SORT_TOPOLOGICAL): - extractedCommitData = {} - # The person who originally made the change and when they made it, a # pygit2.Signature. author = commit.author @@ -200,21 +192,7 @@ def _replaceNoneWithEmptyString(value): class OnlineCommitAuthorshipRoutineRequest(OnlineRoutineRequest): - def __init__( - self, - repositoryURL, - outputDirectory, - username=None, - password=None, - token=None, - keychain=None): - super().__init__( - repositoryURL, - outputDirectory, - username=username, - password=password, - token=token, - keychain=keychain) + pass class OnlineCommitAuthorshipRoutine(OnlineRepositoryRoutine): @@ -226,13 +204,6 @@ class OnlineCommitAuthorshipRoutine(OnlineRepositoryRoutine): def getRequestType(self): return OnlineCommitAuthorshipRoutineRequest - def githubImplementation(self, request, session): - def _replaceNoneWithEmptyString(value): - if value is None: - return "" - else: - return value - factory = DataEntityFactory() output = factory.createAnnotatedCSVData( "{outputDirectory}/{repoName}_OnlineCommitAuthorship.csv".format( @@ -270,12 +241,6 @@ def _replaceNoneWithEmptyString(value): message="OnlineCommitAuthorshipRoutine completed!", attachments=output) def gitlabImplementation(self, request, session): - def _replaceNoneWithEmptyString(value): - if value is None: - return "" - else: - return value - factory = DataEntityFactory() output = factory.createAnnotatedCSVData( "{outputDirectory}/{repoName}_OnlineCommitAuthorship.csv".format( @@ -479,8 +444,7 @@ def execute(self, request): class OfflineCommitCountsRoutineRequest(OfflineRoutineRequest): - def __init__(self, repositoryURL, outputDirectory, workspaceDirectory): - super().__init__(repositoryURL, outputDirectory, workspaceDirectory) + pass class OfflineCommitCountsRoutine(OfflineRepositoryRoutine): @@ -527,21 +491,7 @@ def offlineImplementation(self, request, session): class ContributorAccountListRoutineRequest(OnlineRoutineRequest): - def __init__( - self, - repositoryURL, - outputDirectory, - username=None, - password=None, - token=None, - keychain=None): - super().__init__( - repositoryURL, - outputDirectory, - username=username, - password=password, - token=token, - keychain=keychain) + pass class ContributorAccountListRoutine(OnlineRepositoryRoutine): @@ -554,12 +504,6 @@ class ContributorAccountListRoutine(OnlineRepositoryRoutine): def getRequestType(self): return ContributorAccountListRoutineRequest - def _replaceNoneWithEmptyString(self, value): - if value is None: - return "" - else: - return value - def githubImplementation(self, request, session): factory = DataEntityFactory() output = factory.createAnnotatedCSVData( @@ -578,9 +522,9 @@ def githubImplementation(self, request, session): contributors = [contributor for contributor in session.get_contributors()] for contributor in contributors: output.addRecord([ - self._replaceNoneWithEmptyString(contributor.login), - self._replaceNoneWithEmptyString(contributor.name), - ';'.join([self._replaceNoneWithEmptyString(contributor.email)]) + _replaceNoneWithEmptyString(contributor.login), + _replaceNoneWithEmptyString(contributor.name), + ';'.join([_replaceNoneWithEmptyString(contributor.email)]) ]) @@ -607,8 +551,8 @@ def gitlabImplementation(self, request, session): contributors = [contributor for contributor in session.get_contributors()] for contributor in contributors: output.addRecord([ - self._replaceNoneWithEmptyString(contributor.username), - self._replaceNoneWithEmptyString(contributor.name), + _replaceNoneWithEmptyString(contributor.username), + _replaceNoneWithEmptyString(contributor.name), ';'.join(contributor.emails.list()) ]) @@ -619,8 +563,7 @@ def gitlabImplementation(self, request, session): class FileInteractionRoutineRequest(OfflineRoutineRequest): - def __init__(self, repositoryURL, outputDirectory, workspaceDirectory): - super().__init__(repositoryURL, outputDirectory, workspaceDirectory) + pass class FileInteractionRoutine(OfflineRepositoryRoutine): diff --git a/src/reposcanner/dummy.py b/src/reposcanner/dummy.py index 7a5b5b5..6a77272 100644 --- a/src/reposcanner/dummy.py +++ b/src/reposcanner/dummy.py @@ -11,8 +11,7 @@ class DummyOfflineRoutineRequest(OfflineRoutineRequest): - def __init__(self, repositoryURL, outputDirectory, workspaceDirectory): - super().__init__(repositoryURL, outputDirectory, workspaceDirectory) + pass class DummyOfflineRoutine(OfflineRepositoryRoutine): @@ -42,21 +41,7 @@ def offlineImplementation(self, request, session): class DummyOnlineRoutineRequest(OnlineRoutineRequest): - def __init__( - self, - repositoryURL, - outputDirectory, - username=None, - password=None, - token=None, - keychain=None): - super().__init__( - repositoryURL, - outputDirectory, - username=username, - password=password, - token=token, - keychain=keychain) + pass class DummyOnlineRoutine(OnlineRepositoryRoutine): diff --git a/src/reposcanner/git.py b/src/reposcanner/git.py index 6dce062..c8301d4 100644 --- a/src/reposcanner/git.py +++ b/src/reposcanner/git.py @@ -399,19 +399,12 @@ def __init__(self, credentialsDictionary): a dictionary object, but got a {wrongType} instead!".format( wrongType=type(credentialsDictionary))) - def safeAccess(dictionary, key): - """A convenience function for error-free access to a dictionary""" - if key in dictionary: - return dictionary[key] - else: - return None - for entryName in credentialsDictionary: entry = credentialsDictionary[entryName] - url = safeAccess(entry, "url") - username = safeAccess(entry, "username") - password = safeAccess(entry, "password") - token = safeAccess(entry, "token") + url = entry.get("url", None) + username = entry.get("username", None) + password = entry.get("password", None) + token = entry.get("token", None) if url is None: print("Reposcanner: Warning, the entry {entryName} in \ the credentials file is missing a URL. Skipping.".format( diff --git a/src/util.py b/src/util.py new file mode 100644 index 0000000..b38d554 --- /dev/null +++ b/src/util.py @@ -0,0 +1,5 @@ +def replaceNoneWithEmptyString(value): + if value is None: + return "" + else: + return value From 5e0249b241179b9ad4d59422e2a5659cdb5ad8d4 Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Mon, 12 Jun 2023 13:31:00 -0500 Subject: [PATCH 3/7] Fix GitHub CI --- .github/workflows/python-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml index 8473dc1..662bc31 100644 --- a/.github/workflows/python-package.yml +++ b/.github/workflows/python-package.yml @@ -30,7 +30,7 @@ jobs: python -m pip install --upgrade pip python -m pip install flake8 pytest pytest-cov python -m pip install --upgrade setuptools setuptools_scm wheel - python setup.py install + python -m pip install . #- name: Lint with flake8 # run: | # # stop the build if there are Python syntax errors or undefined names From fcea45316b26227b3a348de972881d90098d6048 Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Tue, 13 Jun 2023 16:58:19 -0500 Subject: [PATCH 4/7] Use reflection to allow new routines externally --- src/reposcanner/manager.py | 135 +++++++++++++++++++++------------- src/{ => reposcanner}/util.py | 0 tests/test_manager.py | 16 +++- 3 files changed, 97 insertions(+), 54 deletions(-) rename src/{ => reposcanner}/util.py (100%) diff --git a/src/reposcanner/manager.py b/src/reposcanner/manager.py index ea108b4..c01758a 100644 --- a/src/reposcanner/manager.py +++ b/src/reposcanner/manager.py @@ -1,10 +1,10 @@ -from reposcanner.contrib import ContributorAccountListRoutine, OfflineCommitCountsRoutine, GambitCommitAuthorshipInferenceAnalysis -from reposcanner.contrib import CommitInfoMiningRoutine, OnlineCommitAuthorshipRoutine -from reposcanner.dummy import DummyOfflineRoutine, DummyOnlineRoutine, DummyAnalysis from reposcanner.git import CredentialKeychain from reposcanner.data import DataEntityStore from reposcanner.response import ResponseFactory from reposcanner.routines import RepositoryRoutine, ExternalCommandLineToolRoutine +from reposcanner.analyses import DataAnalysis +import warnings +import importlib import datetime import logging import curses @@ -209,57 +209,90 @@ def __init__( self._guiModeEnabled = gui self._store = DataEntityStore() + @staticmethod + def dynamicallyImportFrom(name): + if ":" not in name: + warnings.warn( + "Unqualified routine names ({}) are deprecated. " + "Use . or .:." + .format(name), + DeprecationWarning, + ) + import reposcanner.contrib, reposcanner.dummy + if hasattr(reposcanner.contrib, name): + return getattr(reposcanner.contrib, name) + elif hasattr(reposcanner.dummy, name): + return getattr(reposcanner.dummy, name) + elif name in globals(): + return globals()[name] + else: + raise ValueError( + "{} not found in the default search locations." + .format(name) + ) + else: + importName, _, objectName = name.partition(":") + module = importlib.import_module(importName) + return getattr(module, objectName) + + def initializeRoutinesAndAnalyses(self, configData): """Constructs RepositoryRoutine and DataAnalysis objects that belong to the manager.""" - if 'routines' in configData: - for routineEntry in configData['routines']: - if isinstance(routineEntry, dict): - # The routineEntry is a dictionary, implying it - # has parameters we need to pass to the - # constructor. Otherwise it'll just be a plain string. - routineName = list(routineEntry.keys())[0] - configParameters = routineEntry[routineName] - else: - routineName = routineEntry - configParameters = None - try: - routineClazz = getattr(sys.modules[__name__], routineName) - routineInstance = routineClazz() - routineInstance.setConfigurationParameters(configParameters) - - if isinstance(routineInstance, RepositoryRoutine): - self._repositoryRoutines.append(routineInstance) - elif isinstance(routineInstance, ExternalCommandLineToolRoutine): - self._externalCommandLineToolRoutines.append(routineInstance) - else: - raise TypeError("ReposcannerManager does not know how to \ - handle this routine type: {routineType}".format(type(routineInstance))) - except BaseException: - raise ValueError( - "Failed to instantiate routine matching name {name}".format( - name=routineName)) - - if 'analyses' in configData: - for analysisEntry in configData['analyses']: - if isinstance(routineEntry, dict): - # The analysisEntry is a dictionary, implying it - # has parameters we need to pass to the - # constructor. Otherwise it'll just be a plain string. - analysisName = list(analysisEntry.keys())[0] - configParameters = analysisEntry[analysisName] - else: - analysisName = analysisEntry - configParameters = None - try: - analysisClazz = getattr(sys.modules[__name__], analysisName) - analysisInstance = analysisClazz() - analysisInstance.setConfigurationParameters(configParameters) - self._analyses.append(analysisInstance) - except BaseException: - raise ValueError( - "Failed to instantiate analysis matching name {name}".format( - name=analysisName)) + for routineEntry in configData.get('routines', []): + if isinstance(routineEntry, dict): + # The routineEntry is a dictionary, implying it + # has parameters we need to pass to the + # constructor. Otherwise it'll just be a plain string. + routineName = list(routineEntry.keys())[0] + configParameters = routineEntry[routineName] + elif isinstance(routineEntry, str): + routineName = routineEntry + configParameters = None + else: + raise TypeError("Invalid routine: {} ({})" + .format(routineEntry, type(routineEntry))) + + routineClazz = self.dynamicallyImportFrom(routineName) + routineInstance = routineClazz() + routineInstance.setConfigurationParameters(configParameters) + + if isinstance(routineInstance, RepositoryRoutine): + self._repositoryRoutines.append(routineInstance) + elif isinstance(routineInstance, ExternalCommandLineToolRoutine): + self._externalCommandLineToolRoutines.append(routineInstance) + else: + raise TypeError( + "ReposcannerManager does not know how to handle this " + "routine type: {}" + .format(type(routineInstance)) + ) + + for analysisEntry in configData.get('analyses', []): + if isinstance(analysisEntry, dict): + # The analysisEntry is a dictionary, implying it + # has parameters we need to pass to the + # constructor. Otherwise it'll just be a plain string. + analysisName = list(analysisEntry.keys())[0] + configParameters = analysisEntry[analysisName] + elif isinstance(analysisEntry, str): + analysisName = analysisEntry + configParameters = None + else: + raise TypeError("Invalid analysis: {} ({})" + .format(analysisName, type(analysisName))) + analysisClazz = self.dynamicallyImportFrom(analysisName) + analysisInstance = analysisClazz() + analysisInstance.setConfigurationParameters(configParameters) + + if isinstance(analysisInstance, DataAnalysis): + self._analyses.append(analysisInstance) + else: + raise TypeError( + "ReposcannerManager does not know how to handle this " + "analysis type: {}" + .format(type(analysisInstance)) + ) for routine in self._repositoryRoutines: if self._notebook is not None: diff --git a/src/util.py b/src/reposcanner/util.py similarity index 100% rename from src/util.py rename to src/reposcanner/util.py diff --git a/tests/test_manager.py b/tests/test_manager.py index e4fa9e6..52a7e13 100644 --- a/tests/test_manager.py +++ b/tests/test_manager.py @@ -84,6 +84,10 @@ def test_ReposcannerManager_CanParseConfigYAMLFileAndConstructRoutines(tmpdir): contents = """ routines: - ContributorAccountListRoutine + - reposcanner.contrib:ContributorAccountListRoutine + analyses: + - GambitCommitAuthorshipInferenceAnalysis + - reposcanner.contrib:GambitCommitAuthorshipInferenceAnalysis """ outfile.write(contents) @@ -91,10 +95,16 @@ def test_ReposcannerManager_CanParseConfigYAMLFileAndConstructRoutines(tmpdir): configEntity.readFromFile() configDict = configEntity.getData() - manager.initializeRoutinesAndAnalyses(configDict) + with pytest.deprecated_call(): + manager.initializeRoutinesAndAnalyses(configDict) routines = manager.getRoutines() - assert(len(routines) == 1) + assert(len(routines) == 2) assert(routines[0].__class__.__name__ == "ContributorAccountListRoutine") + assert(routines[1].__class__.__name__ == "ContributorAccountListRoutine") + analyses = manager.getAnalyses() + assert(len(analyses) == 2) + assert(analyses[0].__class__.__name__ == "GambitCommitAuthorshipInferenceAnalysis") + assert(analyses[1].__class__.__name__ == "GambitCommitAuthorshipInferenceAnalysis") def test_ReposcannerManager_missingRoutinesInConfigCausesValueError(tmpdir): @@ -119,5 +129,5 @@ def test_ReposcannerManager_missingRoutinesInConfigCausesValueError(tmpdir): configDict = configEntity.getData() # Attempting to find and initialize NonexistentRoutine will trigger a ValueError. - with pytest.raises(ValueError): + with pytest.raises(ValueError), pytest.deprecated_call(): manager.initializeRoutinesAndAnalyses(configDict) From c0255b5f7a1473f07232e9c7be2404cf04c11ca4 Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Tue, 13 Jun 2023 16:58:30 -0500 Subject: [PATCH 5/7] Add documentation --- README.md | 40 ++++++++++++++++++++++++++++++++++------ 1 file changed, 34 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index a8538fc..0e8450f 100644 --- a/README.md +++ b/README.md @@ -48,14 +48,42 @@ reposcanner --credentials tutorial/inputs/credentials.yml --config tutorial/inpu 3. examine the output files written to `tutorial/outputs` - # How to extend functionality -1. Create a new source file, `src/reposcanner/`, including a class - based on the `ContributorAccountListRoutine`. See `stars.py` as an - example of the kind of modifications required. +In the early days, the only way to extend Reposcanner was to modify its source, but now Reposcanner can be extended externally as well. We recommend the external method for future projects, so we don't create tons of forks of Reposcanner for each new analysis. + +1. Create a new source file, `my_module.py` or `my_package/my_module.py`. + +2. Import `reposcanner.requests` and one of {`reposcanner.routine` or `reposcanner.analysis`}, depending on if you want to write a routine or an analysis. + +3. Locate the most relevant subclass of `reposcanner.requests.BaseRequestModel` and one of {`reposcanner.routines.DataMiningRoutine` or `reposcanner.analyses.DataAnalysis`}. E.g., for a routine that requires GitHub API access, one would subclass `OnlineRoutineRequest` and `OnlineRepositoryRoutine`. Reference the minimal blank example in `reposcanner.dummy` or real-world examples in `reposcanner.contrib`. + +4. Write a config file that refers to your routines and analyses. See the next section on configuration files. + +5. Check that `my_module` or `my_package.my_module` is importable. E.g., `python -c 'from my_module import MyRoutineOrAnalysis'`. + - The current working directory is implicitly in the `PYTHONPATH`, so your module or package will be importable if you run Python and Reposcanner from the directory which contains your module or package + - If your module or package does not reside in the current working directory, you need to add it to your `$PYTHONPATH` for it to be importable: `export PYTHONPATH=/path/to/proj:$PYTHONPATH`. This only has to be done once for your entire shell session. Note that the `$PYTHONPATH` should have the path to the directory containing your module or package, not the path to your module or package itself. E.g., In the previous example, if you have `/path/to/proj/my_module.py` or `/path/to/proj/my_package/my_module.py`, set the `PYTHONPATH` to `/path/to/proj`. -2. Add the new class name (for example `- StarGazersRoutine`) to the end of `config.yml`. +6. Run Reposcanner. -3. Run the test scan and inspect output to ensure your scan worked as intended. +# Input files + +## config.yaml + +The config file contains a list of routines and a list of analyses. Each routine or analysis is identified as `my_module:ClassName` or `my_module.my_package:ClassName`. + +Within each routine, one can put a dictionary of keyword parameters that will get passed to that routine. + +``` +routines: + - my_module:NameOfOneRoutine + - routine: my_module:NameOfAnotherOneRoutine + arg0: "foo" + arg1: [1, 2, 3] +analysis: + - my_module:NameOfOneRoutine + - my_module:NameOfAnotherOneRoutine + arg0: "foo" + arg1: [1, 2, 3] +``` From b2b8f440f851bb0f1daaed6897f6028c76b792f2 Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Mon, 12 Jun 2023 13:18:58 -0500 Subject: [PATCH 6/7] Reduce code --- src/util.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 src/util.py diff --git a/src/util.py b/src/util.py new file mode 100644 index 0000000..b38d554 --- /dev/null +++ b/src/util.py @@ -0,0 +1,5 @@ +def replaceNoneWithEmptyString(value): + if value is None: + return "" + else: + return value From 8b61d380a0eac7c20edb962872da7f0a346ad92d Mon Sep 17 00:00:00 2001 From: Samuel Grayson Date: Tue, 13 Jun 2023 16:58:19 -0500 Subject: [PATCH 7/7] Use reflection to allow new routines externally --- src/util.py | 5 ----- 1 file changed, 5 deletions(-) delete mode 100644 src/util.py diff --git a/src/util.py b/src/util.py deleted file mode 100644 index b38d554..0000000 --- a/src/util.py +++ /dev/null @@ -1,5 +0,0 @@ -def replaceNoneWithEmptyString(value): - if value is None: - return "" - else: - return value