From 6d754bc091b81bdd9a5fe0a0ea73b34d23de8649 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:20:46 -0700 Subject: [PATCH 1/6] Refactor '_check_string' method to use inspect library to grab calling method name instead of an explicit argument from calling method --- src/hashstore/filehashstore.py | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e96ef690..e0345d19 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -8,6 +8,7 @@ import hashlib import os import logging +import inspect from pathlib import Path from contextlib import closing from tempfile import NamedTemporaryFile @@ -223,13 +224,7 @@ def _write_properties(self, properties): checked_properties = self._validate_properties(properties) # Collect configuration properties from validated & supplied dictionary - ( - _, - store_depth, - store_width, - store_algorithm, - store_metadata_namespace, - ) = [ + (_, store_depth, store_width, store_algorithm, store_metadata_namespace,) = [ checked_properties[property_name] for property_name in self.property_required_keys ] @@ -479,7 +474,7 @@ def store_object( "FileHashStore - store_object: Request to store object for pid: %s", pid ) # Validate input parameters - self._check_string(pid, "pid", "store_object") + self._check_string(pid, "pid") self._check_arg_data(data) self._check_integer(expected_object_size) ( @@ -2537,18 +2532,19 @@ def _check_integer(file_size): raise ValueError(exception_string) @staticmethod - def _check_string(string, arg, method): + def _check_string(string, arg): """Check whether a string is None or empty; throw an exception if so. :param str string: Value to check. :param str arg: Name of the argument to check. - :param str method: Calling method for logging purposes. """ if string is None or string.strip() == "": + method = inspect.stack()[1].function exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." ) + print(exception_string) logging.error(exception_string) raise ValueError(exception_string) From d81ebc76f4ae8f603d2e297b72558ad1668ce308 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:23:31 -0700 Subject: [PATCH 2/6] Fix functions calling '_check_string' --- src/hashstore/filehashstore.py | 37 +++++++++++++--------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index e0345d19..508fe7f1 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -560,8 +560,8 @@ def store_object( def verify_object( self, object_metadata, checksum, checksum_algorithm, expected_file_size ): - self._check_string(checksum, "checksum", "verify_object") - self._check_string(checksum_algorithm, "checksum_algorithm", "verify_object") + self._check_string(checksum, "checksum") + self._check_string(checksum_algorithm, "checksum_algorithm") self._check_integer(expected_file_size) if object_metadata is None or not isinstance(object_metadata, ObjectMetadata): exception_string = ( @@ -601,8 +601,8 @@ def tag_object(self, pid, cid): cid, pid, ) - self._check_string(pid, "pid", "tag_object") - self._check_string(cid, "cid", "tag_object") + self._check_string(pid, "pid") + self._check_string(cid, "cid") sync_begin_debug_msg = ( f"FileHashStore - tag_object: Adding cid ({pid}) to locked list." @@ -758,7 +758,7 @@ def find_object(self, pid): logging.debug( "FileHashStore - find_object: Request to find object for for pid: %s", pid ) - self._check_string(pid, "pid", "find_object") + self._check_string(pid, "pid") pid_ref_abs_path = self._resolve_path("pid", pid) if os.path.exists(pid_ref_abs_path): @@ -832,7 +832,7 @@ def store_metadata(self, pid, metadata, format_id=None): "FileHashStore - store_metadata: Request to store metadata for pid: %s", pid ) # Validate input parameters - self._check_string(pid, "pid", "store_metadata") + self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "store_metadata") self._check_arg_data(metadata) pid_doc = self._computehash(pid + checked_format_id) @@ -892,7 +892,7 @@ def retrieve_object(self, pid): "FileHashStore - retrieve_object: Request to retrieve object for pid: %s", pid, ) - self._check_string(pid, "pid", "retrieve_object") + self._check_string(pid, "pid") object_info_dict = self.find_object(pid) object_cid = object_info_dict.get("cid") @@ -921,7 +921,7 @@ def retrieve_metadata(self, pid, format_id=None): "FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s", pid, ) - self._check_string(pid, "pid", "retrieve_metadata") + self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata") entity = "metadata" @@ -951,7 +951,7 @@ def delete_object(self, ab_id, id_type=None): logging.debug( "FileHashStore - delete_object: Request to delete object for id: %s", ab_id ) - self._check_string(ab_id, "ab_id", "delete_object") + self._check_string(ab_id, "ab_id") if id_type == "cid": cid_refs_abs_path = self._resolve_path("cid", ab_id) @@ -1200,7 +1200,7 @@ def delete_metadata(self, pid, format_id=None): "FileHashStore - delete_metadata: Request to delete metadata for pid: %s", pid, ) - self._check_string(pid, "pid", "delete_metadata") + self._check_string(pid, "pid") checked_format_id = self._check_arg_format_id(format_id, "delete_metadata") metadata_directory = self._computehash(pid) rel_path = "/".join(self._shard(metadata_directory)) @@ -1330,8 +1330,8 @@ def get_hex_digest(self, pid, algorithm): "FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s", pid, ) - self._check_string(pid, "pid", "get_hex_digest") - self._check_string(algorithm, "algorithm", "get_hex_digest") + self._check_string(pid, "pid") + self._check_string(algorithm, "algorithm") entity = "objects" algorithm = self._clean_algorithm(algorithm) @@ -2146,17 +2146,9 @@ def _check_arg_algorithms_and_checksum( additional_algorithm_checked = self._clean_algorithm(additional_algorithm) checksum_algorithm_checked = None if checksum is not None: - self._check_string( - checksum_algorithm, - "checksum_algorithm", - "_check_arg_algorithms_and_checksum (store_object)", - ) + self._check_string(checksum_algorithm, "checksum_algorithm") if checksum_algorithm is not None: - self._check_string( - checksum, - "checksum", - "_check_arg_algorithms_and_checksum (store_object)", - ) + self._check_string(checksum, "checksum") # Set checksum_algorithm checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm) return additional_algorithm_checked, checksum_algorithm_checked @@ -2544,7 +2536,6 @@ def _check_string(string, arg): f"FileHashStore - {method}: {arg} cannot be None" + f" or empty, {arg}: {string}." ) - print(exception_string) logging.error(exception_string) raise ValueError(exception_string) From 10b5de58ce186adc2f4ec9434eed9449237f73a5 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:31:20 -0700 Subject: [PATCH 3/6] Add new pytest for '_check_string' and also revise '_check_string' to check if any character is illegal --- src/hashstore/filehashstore.py | 2 +- tests/test_filehashstore.py | 34 +++++++++++++++++++++++++++------- 2 files changed, 28 insertions(+), 8 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 508fe7f1..57112baf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2530,7 +2530,7 @@ def _check_string(string, arg): :param str string: Value to check. :param str arg: Name of the argument to check. """ - if string is None or string.strip() == "": + if string is None or string.strip() == "" or any(ch.isspace() for ch in string): method = inspect.stack()[1].function exception_string = ( f"FileHashStore - {method}: {arg} cannot be None" diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 6c8ff99c..12374449 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -11,6 +11,7 @@ UnsupportedAlgorithm, ) + # pylint: disable=W0212 @@ -355,7 +356,7 @@ def test_store_data_only_file_size(pids, store): def test_store_data_only_hex_digests(pids, store): - """Check _store_data_only generates hex digests dictionary.""" + """Check _store_data_only generates a hex digests dictionary.""" test_dir = "tests/testdata/" for pid in pids.keys(): path = test_dir + pid.replace("/", "_") @@ -467,12 +468,7 @@ def test_move_and_get_checksums_incorrect_file_size(pids, store): input_stream = io.open(path, "rb") incorrect_file_size = 1000 # pylint: disable=W0212 - ( - _, - _, - _, - _, - ) = store._move_and_get_checksums( + (_, _, _, _,) = store._move_and_get_checksums( pid, input_stream, file_size_to_validate=incorrect_file_size ) input_stream.close() @@ -1115,3 +1111,27 @@ def test_resolve_path_refs_cid(pids, store): calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid)) assert resolved_cid_ref_abs_path == calculated_cid_ref_path + + +def test_check_string(store): + """Confirm that an exception is raised when a string is None, empty or an illegal character + (ex. tabs or new lines)""" + empty_pid_with_spaces = " " + with pytest.raises(ValueError): + store._check_string(empty_pid_with_spaces, "empty_pid_with_spaces") + + none_value = None + with pytest.raises(ValueError): + store._check_string(none_value, "none_value") + + new_line = "\n" + with pytest.raises(ValueError): + store._check_string(new_line, "new_line") + + new_line_with_other_chars = "hello \n" + with pytest.raises(ValueError): + store._check_string(new_line_with_other_chars, "new_line_with_other_chars") + + tab_line = "\t" + with pytest.raises(ValueError): + store._check_string(tab_line, "tab_line") From e345834a9d48d36e9cc1e2e74da1ec9f8294cd14 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:32:08 -0700 Subject: [PATCH 4/6] Revise doc strings --- src/hashstore/filehashstore.py | 3 ++- tests/test_filehashstore.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 57112baf..6717afd4 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -2525,7 +2525,8 @@ def _check_integer(file_size): @staticmethod def _check_string(string, arg): - """Check whether a string is None or empty; throw an exception if so. + """Check whether a string is None or empty - or if it contains an illegal character; + throws an exception if so. :param str string: Value to check. :param str arg: Name of the argument to check. diff --git a/tests/test_filehashstore.py b/tests/test_filehashstore.py index 12374449..26881976 100644 --- a/tests/test_filehashstore.py +++ b/tests/test_filehashstore.py @@ -1114,8 +1114,8 @@ def test_resolve_path_refs_cid(pids, store): def test_check_string(store): - """Confirm that an exception is raised when a string is None, empty or an illegal character - (ex. tabs or new lines)""" + """Confirm that an exception is raised when a string is None, empty or contains an illegal + character (ex. tabs or new lines)""" empty_pid_with_spaces = " " with pytest.raises(ValueError): store._check_string(empty_pid_with_spaces, "empty_pid_with_spaces") From 37a38d8f3a6c9bb9ace22e155f17af5b5cdb3423 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:39:56 -0700 Subject: [PATCH 5/6] Fix inaccurate docstrings in '_store_and_validate_data' and related methods RE: 'file_size_to_validate' arg --- src/hashstore/filehashstore.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 6717afd4..7faa3ec8 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -1375,7 +1375,7 @@ def _store_and_validate_data( :param str checksum: Optional checksum to validate object against hex digest before moving to permanent location. :param str checksum_algorithm: Algorithm value of the given checksum. - :param bytes file_size_to_validate: Expected size of the object. + :param int file_size_to_validate: Expected size of the object. :return: ObjectMetadata - object that contains the object id, object file size, and hex digest dictionary. @@ -1484,7 +1484,7 @@ def _move_and_get_checksums( :param str checksum: Optional checksum to validate the object against hex digest before moving to the permanent location. :param str checksum_algorithm: Algorithm value of the given checksum. - :param bytes file_size_to_validate: Expected size of the object. + :param int file_size_to_validate: Expected size of the object. :return: tuple - Object ID, object file size, and hex digest dictionary. """ From 812905083dbaeba17b5dbf220142a296851250c3 Mon Sep 17 00:00:00 2001 From: Dou Mok Date: Sun, 8 Sep 2024 14:53:42 -0700 Subject: [PATCH 6/6] Revise docstrings to resolve linting warnings --- src/hashstore/filehashstore.py | 26 +++++++++++++------------- src/hashstore/hashstore.py | 2 +- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/hashstore/filehashstore.py b/src/hashstore/filehashstore.py index 7faa3ec8..b9f7addf 100644 --- a/src/hashstore/filehashstore.py +++ b/src/hashstore/filehashstore.py @@ -630,8 +630,8 @@ def tag_object(self, pid, cid): pid_refs_path = self._resolve_path("pid", pid) cid_refs_path = self._resolve_path("cid", cid) # Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid' - self._create_path(os.path.dirname(pid_refs_path)) - self._create_path(os.path.dirname(cid_refs_path)) + self._create_path(Path(os.path.dirname(pid_refs_path))) + self._create_path(Path(os.path.dirname(cid_refs_path))) if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path): self._verify_hashstore_references( @@ -1476,7 +1476,7 @@ def _move_and_get_checksums( not match what is provided). :param str pid: Authority-based identifier. - :param io.BufferedReader stream: Object stream. + :param Stream stream: Object stream. :param str extension: Optional extension to append to the file when saving. :param str additional_algorithm: Optional algorithm value to include @@ -1524,7 +1524,7 @@ def _move_and_get_checksums( tmp_file_size, file_size_to_validate, ) - self._create_path(os.path.dirname(abs_file_path)) + self._create_path(Path(os.path.dirname(abs_file_path))) try: debug_msg = ( "FileHashStore - _move_and_get_checksums: Moving temp file to permanent" @@ -1625,7 +1625,7 @@ def _write_to_tmp_file_and_get_hex_digests( algorithm is provided, it will add the respective hex digest to the dictionary if it is supported. - :param io.BufferedReader stream: Object stream. + :param Stream stream: Object stream. :param str additional_algorithm: Algorithm of additional hex digest to generate. :param str checksum_algorithm: Algorithm of additional checksum algo to generate. @@ -1711,7 +1711,7 @@ def _write_to_tmp_file_and_get_hex_digests( def _mktmpfile(self, path): """Create a temporary file at the given path ready to be written. - :param str path: Path to the file location. + :param Path path: Path to the file location. :return: file object - object with a file-like interface. """ @@ -1743,7 +1743,7 @@ def _write_refs_file(self, path, ref_id, ref_type): difference being that a cid reference file can potentially contain multiple lines of `pid`s that reference the `cid`. - :param str path: Directory to write a temporary file into + :param path path: Directory to write a temporary file into :param str ref_id: Authority-based, persistent or content identifier :param str ref_type: 'cid' or 'pid' @@ -1907,7 +1907,7 @@ def _put_metadata(self, metadata, pid, metadata_doc_name): def _mktmpmetadata(self, stream): """Create a named temporary file with `stream` (metadata). - :param io.BufferedReader stream: Metadata stream. + :param Stream stream: Metadata stream. :return: Path/name of temporary file created and written into. :rtype: str @@ -1947,12 +1947,12 @@ def _verify_object_information( """Evaluates an object's integrity - if there is a mismatch, deletes the object in question and raises an exception. - :param str pid: For logging purposes. + :param Optional[str] pid: For logging purposes. :param str checksum: Value of the checksum to check. :param str checksum_algorithm: Algorithm of the checksum. :param str entity: Type of object ('objects' or 'metadata'). :param dict hex_digests: Dictionary of hex digests to parse. - :param str tmp_file_name: Name of the temporary file. + :param Optional[str] tmp_file_name: Name of the temporary file. :param int tmp_file_size: Size of the temporary file. :param int file_size_to_validate: Expected size of the object. """ @@ -2340,7 +2340,7 @@ def _delete(self, entity, file): def _rename_path_for_deletion(path): """Rename a given path by appending '_delete' and move it to the renamed path. - :param Path path: Path to file to rename + :param string path: Path to file to rename :return: Path to the renamed file :rtype: str @@ -2354,7 +2354,7 @@ def _rename_path_for_deletion(path): def _create_path(self, path): """Physically create the folder path (and all intermediate ones) on disk. - :param str path: The path to create. + :param Path path: The path to create. :raises AssertionError: If the path already exists but is not a directory. """ try: @@ -2544,7 +2544,7 @@ def _check_string(string, arg): def _cast_to_bytes(text): """Convert text to a sequence of bytes using utf-8 encoding. - :param str text: String to convert. + :param Any text: String to convert. :return: Bytes with utf-8 encoding. :rtype: bytes """ diff --git a/src/hashstore/hashstore.py b/src/hashstore/hashstore.py index cf874864..a40c7735 100644 --- a/src/hashstore/hashstore.py +++ b/src/hashstore/hashstore.py @@ -253,7 +253,7 @@ class ObjectMetadata( :param str pid: An authority-based or persistent identifier :param str cid: A unique identifier for the object (Hash ID, hex digest). - :param bytes obj_size: The size of the object in bytes. + :param int obj_size: The size of the object in bytes. :param list hex_digests: A list of hex digests to validate objects (md5, sha1, sha256, sha384, sha512) (optional). """