Skip to content

Commit

Permalink
Merge pull request #128 from DataONEorg/feature-121-safeguard-pid
Browse files Browse the repository at this point in the history
Feature-121: Safeguard Pid Value
  • Loading branch information
doulikecookiedough authored Sep 8, 2024
2 parents 94f8d13 + 8129050 commit 5443b39
Show file tree
Hide file tree
Showing 3 changed files with 65 additions and 57 deletions.
86 changes: 37 additions & 49 deletions src/hashstore/filehashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import hashlib
import os
import logging
import inspect
from pathlib import Path
from contextlib import closing
from tempfile import NamedTemporaryFile
Expand Down Expand Up @@ -223,13 +224,7 @@ def _write_properties(self, properties):
checked_properties = self._validate_properties(properties)

# Collect configuration properties from validated & supplied dictionary
(
_,
store_depth,
store_width,
store_algorithm,
store_metadata_namespace,
) = [
(_, store_depth, store_width, store_algorithm, store_metadata_namespace,) = [
checked_properties[property_name]
for property_name in self.property_required_keys
]
Expand Down Expand Up @@ -479,7 +474,7 @@ def store_object(
"FileHashStore - store_object: Request to store object for pid: %s", pid
)
# Validate input parameters
self._check_string(pid, "pid", "store_object")
self._check_string(pid, "pid")
self._check_arg_data(data)
self._check_integer(expected_object_size)
(
Expand Down Expand Up @@ -565,8 +560,8 @@ def store_object(
def verify_object(
self, object_metadata, checksum, checksum_algorithm, expected_file_size
):
self._check_string(checksum, "checksum", "verify_object")
self._check_string(checksum_algorithm, "checksum_algorithm", "verify_object")
self._check_string(checksum, "checksum")
self._check_string(checksum_algorithm, "checksum_algorithm")
self._check_integer(expected_file_size)
if object_metadata is None or not isinstance(object_metadata, ObjectMetadata):
exception_string = (
Expand Down Expand Up @@ -606,8 +601,8 @@ def tag_object(self, pid, cid):
cid,
pid,
)
self._check_string(pid, "pid", "tag_object")
self._check_string(cid, "cid", "tag_object")
self._check_string(pid, "pid")
self._check_string(cid, "cid")

sync_begin_debug_msg = (
f"FileHashStore - tag_object: Adding cid ({pid}) to locked list."
Expand Down Expand Up @@ -635,8 +630,8 @@ def tag_object(self, pid, cid):
pid_refs_path = self._resolve_path("pid", pid)
cid_refs_path = self._resolve_path("cid", cid)
# Create paths for pid ref file in '.../refs/pid' and cid ref file in '.../refs/cid'
self._create_path(os.path.dirname(pid_refs_path))
self._create_path(os.path.dirname(cid_refs_path))
self._create_path(Path(os.path.dirname(pid_refs_path)))
self._create_path(Path(os.path.dirname(cid_refs_path)))

if os.path.exists(pid_refs_path) and os.path.exists(cid_refs_path):
self._verify_hashstore_references(
Expand Down Expand Up @@ -763,7 +758,7 @@ def find_object(self, pid):
logging.debug(
"FileHashStore - find_object: Request to find object for for pid: %s", pid
)
self._check_string(pid, "pid", "find_object")
self._check_string(pid, "pid")

pid_ref_abs_path = self._resolve_path("pid", pid)
if os.path.exists(pid_ref_abs_path):
Expand Down Expand Up @@ -837,7 +832,7 @@ def store_metadata(self, pid, metadata, format_id=None):
"FileHashStore - store_metadata: Request to store metadata for pid: %s", pid
)
# Validate input parameters
self._check_string(pid, "pid", "store_metadata")
self._check_string(pid, "pid")
checked_format_id = self._check_arg_format_id(format_id, "store_metadata")
self._check_arg_data(metadata)
pid_doc = self._computehash(pid + checked_format_id)
Expand Down Expand Up @@ -897,7 +892,7 @@ def retrieve_object(self, pid):
"FileHashStore - retrieve_object: Request to retrieve object for pid: %s",
pid,
)
self._check_string(pid, "pid", "retrieve_object")
self._check_string(pid, "pid")

object_info_dict = self.find_object(pid)
object_cid = object_info_dict.get("cid")
Expand Down Expand Up @@ -926,7 +921,7 @@ def retrieve_metadata(self, pid, format_id=None):
"FileHashStore - retrieve_metadata: Request to retrieve metadata for pid: %s",
pid,
)
self._check_string(pid, "pid", "retrieve_metadata")
self._check_string(pid, "pid")
checked_format_id = self._check_arg_format_id(format_id, "retrieve_metadata")

entity = "metadata"
Expand Down Expand Up @@ -956,7 +951,7 @@ def delete_object(self, ab_id, id_type=None):
logging.debug(
"FileHashStore - delete_object: Request to delete object for id: %s", ab_id
)
self._check_string(ab_id, "ab_id", "delete_object")
self._check_string(ab_id, "ab_id")

if id_type == "cid":
cid_refs_abs_path = self._resolve_path("cid", ab_id)
Expand Down Expand Up @@ -1205,7 +1200,7 @@ def delete_metadata(self, pid, format_id=None):
"FileHashStore - delete_metadata: Request to delete metadata for pid: %s",
pid,
)
self._check_string(pid, "pid", "delete_metadata")
self._check_string(pid, "pid")
checked_format_id = self._check_arg_format_id(format_id, "delete_metadata")
metadata_directory = self._computehash(pid)
rel_path = "/".join(self._shard(metadata_directory))
Expand Down Expand Up @@ -1335,8 +1330,8 @@ def get_hex_digest(self, pid, algorithm):
"FileHashStore - get_hex_digest: Request to get hex digest for object with pid: %s",
pid,
)
self._check_string(pid, "pid", "get_hex_digest")
self._check_string(algorithm, "algorithm", "get_hex_digest")
self._check_string(pid, "pid")
self._check_string(algorithm, "algorithm")

entity = "objects"
algorithm = self._clean_algorithm(algorithm)
Expand Down Expand Up @@ -1380,7 +1375,7 @@ def _store_and_validate_data(
:param str checksum: Optional checksum to validate object against hex digest before moving
to permanent location.
:param str checksum_algorithm: Algorithm value of the given checksum.
:param bytes file_size_to_validate: Expected size of the object.
:param int file_size_to_validate: Expected size of the object.
:return: ObjectMetadata - object that contains the object id, object file size,
and hex digest dictionary.
Expand Down Expand Up @@ -1481,15 +1476,15 @@ def _move_and_get_checksums(
not match what is provided).
:param str pid: Authority-based identifier.
:param io.BufferedReader stream: Object stream.
:param Stream stream: Object stream.
:param str extension: Optional extension to append to the file
when saving.
:param str additional_algorithm: Optional algorithm value to include
when returning hex digests.
:param str checksum: Optional checksum to validate the object
against hex digest before moving to the permanent location.
:param str checksum_algorithm: Algorithm value of the given checksum.
:param bytes file_size_to_validate: Expected size of the object.
:param int file_size_to_validate: Expected size of the object.
:return: tuple - Object ID, object file size, and hex digest dictionary.
"""
Expand Down Expand Up @@ -1529,7 +1524,7 @@ def _move_and_get_checksums(
tmp_file_size,
file_size_to_validate,
)
self._create_path(os.path.dirname(abs_file_path))
self._create_path(Path(os.path.dirname(abs_file_path)))
try:
debug_msg = (
"FileHashStore - _move_and_get_checksums: Moving temp file to permanent"
Expand Down Expand Up @@ -1630,7 +1625,7 @@ def _write_to_tmp_file_and_get_hex_digests(
algorithm is provided, it will add the respective hex digest to the dictionary if
it is supported.
:param io.BufferedReader stream: Object stream.
:param Stream stream: Object stream.
:param str additional_algorithm: Algorithm of additional hex digest to generate.
:param str checksum_algorithm: Algorithm of additional checksum algo to generate.
Expand Down Expand Up @@ -1716,7 +1711,7 @@ def _write_to_tmp_file_and_get_hex_digests(
def _mktmpfile(self, path):
"""Create a temporary file at the given path ready to be written.
:param str path: Path to the file location.
:param Path path: Path to the file location.
:return: file object - object with a file-like interface.
"""
Expand Down Expand Up @@ -1748,7 +1743,7 @@ def _write_refs_file(self, path, ref_id, ref_type):
difference being that a cid reference file can potentially contain multiple
lines of `pid`s that reference the `cid`.
:param str path: Directory to write a temporary file into
:param path path: Directory to write a temporary file into
:param str ref_id: Authority-based, persistent or content identifier
:param str ref_type: 'cid' or 'pid'
Expand Down Expand Up @@ -1912,7 +1907,7 @@ def _put_metadata(self, metadata, pid, metadata_doc_name):
def _mktmpmetadata(self, stream):
"""Create a named temporary file with `stream` (metadata).
:param io.BufferedReader stream: Metadata stream.
:param Stream stream: Metadata stream.
:return: Path/name of temporary file created and written into.
:rtype: str
Expand Down Expand Up @@ -1952,12 +1947,12 @@ def _verify_object_information(
"""Evaluates an object's integrity - if there is a mismatch, deletes the object
in question and raises an exception.
:param str pid: For logging purposes.
:param Optional[str] pid: For logging purposes.
:param str checksum: Value of the checksum to check.
:param str checksum_algorithm: Algorithm of the checksum.
:param str entity: Type of object ('objects' or 'metadata').
:param dict hex_digests: Dictionary of hex digests to parse.
:param str tmp_file_name: Name of the temporary file.
:param Optional[str] tmp_file_name: Name of the temporary file.
:param int tmp_file_size: Size of the temporary file.
:param int file_size_to_validate: Expected size of the object.
"""
Expand Down Expand Up @@ -2151,17 +2146,9 @@ def _check_arg_algorithms_and_checksum(
additional_algorithm_checked = self._clean_algorithm(additional_algorithm)
checksum_algorithm_checked = None
if checksum is not None:
self._check_string(
checksum_algorithm,
"checksum_algorithm",
"_check_arg_algorithms_and_checksum (store_object)",
)
self._check_string(checksum_algorithm, "checksum_algorithm")
if checksum_algorithm is not None:
self._check_string(
checksum,
"checksum",
"_check_arg_algorithms_and_checksum (store_object)",
)
self._check_string(checksum, "checksum")
# Set checksum_algorithm
checksum_algorithm_checked = self._clean_algorithm(checksum_algorithm)
return additional_algorithm_checked, checksum_algorithm_checked
Expand Down Expand Up @@ -2353,7 +2340,7 @@ def _delete(self, entity, file):
def _rename_path_for_deletion(path):
"""Rename a given path by appending '_delete' and move it to the renamed path.
:param Path path: Path to file to rename
:param string path: Path to file to rename
:return: Path to the renamed file
:rtype: str
Expand All @@ -2367,7 +2354,7 @@ def _rename_path_for_deletion(path):
def _create_path(self, path):
"""Physically create the folder path (and all intermediate ones) on disk.
:param str path: The path to create.
:param Path path: The path to create.
:raises AssertionError: If the path already exists but is not a directory.
"""
try:
Expand Down Expand Up @@ -2537,14 +2524,15 @@ def _check_integer(file_size):
raise ValueError(exception_string)

@staticmethod
def _check_string(string, arg, method):
"""Check whether a string is None or empty; throw an exception if so.
def _check_string(string, arg):
"""Check whether a string is None or empty - or if it contains an illegal character;
throws an exception if so.
:param str string: Value to check.
:param str arg: Name of the argument to check.
:param str method: Calling method for logging purposes.
"""
if string is None or string.strip() == "":
if string is None or string.strip() == "" or any(ch.isspace() for ch in string):
method = inspect.stack()[1].function
exception_string = (
f"FileHashStore - {method}: {arg} cannot be None"
+ f" or empty, {arg}: {string}."
Expand All @@ -2556,7 +2544,7 @@ def _check_string(string, arg, method):
def _cast_to_bytes(text):
"""Convert text to a sequence of bytes using utf-8 encoding.
:param str text: String to convert.
:param Any text: String to convert.
:return: Bytes with utf-8 encoding.
:rtype: bytes
"""
Expand Down
2 changes: 1 addition & 1 deletion src/hashstore/hashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -253,7 +253,7 @@ class ObjectMetadata(
:param str pid: An authority-based or persistent identifier
:param str cid: A unique identifier for the object (Hash ID, hex digest).
:param bytes obj_size: The size of the object in bytes.
:param int obj_size: The size of the object in bytes.
:param list hex_digests: A list of hex digests to validate objects
(md5, sha1, sha256, sha384, sha512) (optional).
"""
Expand Down
34 changes: 27 additions & 7 deletions tests/test_filehashstore.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
UnsupportedAlgorithm,
)


# pylint: disable=W0212


Expand Down Expand Up @@ -355,7 +356,7 @@ def test_store_data_only_file_size(pids, store):


def test_store_data_only_hex_digests(pids, store):
"""Check _store_data_only generates hex digests dictionary."""
"""Check _store_data_only generates a hex digests dictionary."""
test_dir = "tests/testdata/"
for pid in pids.keys():
path = test_dir + pid.replace("/", "_")
Expand Down Expand Up @@ -467,12 +468,7 @@ def test_move_and_get_checksums_incorrect_file_size(pids, store):
input_stream = io.open(path, "rb")
incorrect_file_size = 1000
# pylint: disable=W0212
(
_,
_,
_,
_,
) = store._move_and_get_checksums(
(_, _, _, _,) = store._move_and_get_checksums(
pid, input_stream, file_size_to_validate=incorrect_file_size
)
input_stream.close()
Expand Down Expand Up @@ -1115,3 +1111,27 @@ def test_resolve_path_refs_cid(pids, store):
calculated_cid_ref_path = store.cids + "/" + "/".join(store._shard(cid))

assert resolved_cid_ref_abs_path == calculated_cid_ref_path


def test_check_string(store):
"""Confirm that an exception is raised when a string is None, empty or contains an illegal
character (ex. tabs or new lines)"""
empty_pid_with_spaces = " "
with pytest.raises(ValueError):
store._check_string(empty_pid_with_spaces, "empty_pid_with_spaces")

none_value = None
with pytest.raises(ValueError):
store._check_string(none_value, "none_value")

new_line = "\n"
with pytest.raises(ValueError):
store._check_string(new_line, "new_line")

new_line_with_other_chars = "hello \n"
with pytest.raises(ValueError):
store._check_string(new_line_with_other_chars, "new_line_with_other_chars")

tab_line = "\t"
with pytest.raises(ValueError):
store._check_string(tab_line, "tab_line")

0 comments on commit 5443b39

Please sign in to comment.