Fix incorrect assignment of deeply-nested tags to top-level tags #277

Merged
merged 7 commits on Apr 15, 2025
Changes from all commits

CHANGELOG.md (1 change: 1 addition & 0 deletions)

@@ -14,6 +14,7 @@ and **Merged pull requests**. Critical items to know are:
Referenced versions in headers are tagged on Github, in parentheses are for pypi.

## [vxx](https://github.com/pydicom/deid/tree/master) (master)
- Fix incorrect assignment of deeply-nested tags to top-level tags [#277](https://github.com/pydicom/deid/pull/277) (0.4.2)
- `deid` client saves cleaned dicoms when requested [#276](https://github.com/pydicom/deid/pull/276) (0.4.1)
- Update to use pydicom 3 [#267](https://github.com/pydicom/deid/pull/267) (0.4.0)
- Refactor INCLUDE_REQUIRES and provide max pydicom version [#267](https://github.com/pydicom/deid/pull/267) (0.3.25)

deid/dicom/parser.py (50 changes: 48 additions & 2 deletions)

@@ -23,6 +23,8 @@
from deid.utils import parse_value, read_json

here = os.path.dirname(os.path.abspath(__file__))
parentheses_hex_tag_format = re.compile(r"\(([0-9A-Fa-f]{4}),([0-9A-Fa-f]{4})\)")
bare_hex_tag_format = re.compile(r"[0-9A-Fa-f]{8}")


class DicomParser:
@@ -451,12 +453,56 @@ def add_field(self, field, value):
        # The addition will be different depending on if we have filemeta
        is_filemeta = False

        # Helper function to update dicom
        def parse_tag_string(tag_string):
            """
            There are four possible varieties of `tag_string`:
            1. A string of the form "(0002,0010)"
            2. A string of the form "00020010"
            3. A string of the form "2" (decimal), indicating the index within
               a sequence
            4. A tag name (e.g., "PatientID"), rather than a number.
            """
            if tag_string.startswith("("):
                result = parentheses_hex_tag_format.match(tag_string)
                return int(f"0x{result.group(1)}{result.group(2)}", base=16)
            try:
                if bare_hex_tag_format.match(tag_string):
                    return int(f"0x{tag_string}", base=16)
                return int(tag_string)
            except ValueError:
                # If all numerical parsing failed, this is likely a tag name
                return tag_string

        # A string representation of the full tag path, including parent sequences.
        # If `field` is a string, this will represent either the keyword ("PatientID")
        # or tag ("(0010,0020)") of the element. If it is an object containing a `uid`
        # property, the `uid` property will contain the full path of tags to the element
        # with double-underscore delimiters between tags (e.g.
        # "(0008,1115)__0__(0020,000E)")
        full_tag_path = field
        if hasattr(field, "uid"):
            full_tag_path = field.uid

        def update_dicom(element, is_filemeta):
            """
            Update the dataset in `self.dicom` with the given element.

            If `is_filemeta` is True, add the element to the file meta information.

            Otherwise, traverse the tag path and update the element in the dataset.
            """
            if is_filemeta:
                self.dicom.file_meta.add(element)
            else:
                self.dicom.add(element)
                dataset_cursor = self.dicom
                # Navigate down the nested tag hierarchy by splitting the path
                # on the "__" delimiter. We split the last tag in the path out
                # in order to use it for directly assigning the element to a
                # nested field in the `self.dicom` dataset.
                *parent_tags, last_tag = full_tag_path.split("__")

Member:
Can you add a comment here about how this data is unwrapped, and why the last_tag is special/different?

Contributor Author:
Comment added! Let me know if you'd like me to expand on it more.

                for tag in parent_tags:
                    dataset_cursor = dataset_cursor[parse_tag_string(tag)]
                dataset_cursor[parse_tag_string(last_tag)] = element

        # Assume we don't want to add an empty value
        if value is not None:
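
For reference, here is a standalone sketch of how the new helpers behave, reproducing `parse_tag_string` and the path-splitting step outside the class. The example tag path is the one from the code comment above; the asserts are illustrative, not part of the PR:

```python
import re

# Same patterns as the module-level regexes added in this PR
parentheses_hex_tag_format = re.compile(r"\(([0-9A-Fa-f]{4}),([0-9A-Fa-f]{4})\)")
bare_hex_tag_format = re.compile(r"[0-9A-Fa-f]{8}")


def parse_tag_string(tag_string):
    # Variety 1: "(0002,0010)" -> join the group/element hex pairs into one tag number
    if tag_string.startswith("("):
        result = parentheses_hex_tag_format.match(tag_string)
        return int(f"0x{result.group(1)}{result.group(2)}", base=16)
    try:
        # Variety 2: "00020010" -> parse the eight hex digits as one tag number
        if bare_hex_tag_format.match(tag_string):
            return int(f"0x{tag_string}", base=16)
        # Variety 3: "2" -> a decimal index into a sequence
        return int(tag_string)
    except ValueError:
        # Variety 4: a keyword such as "PatientID" passes through unchanged
        return tag_string


assert parse_tag_string("(0002,0010)") == 0x00020010
assert parse_tag_string("00020010") == 0x00020010
assert parse_tag_string("2") == 2
assert parse_tag_string("PatientID") == "PatientID"

# The "__"-delimited tag path unpacks into parent tags plus a final tag; the
# parents are only traversed, and the element is assigned at the final tag.
*parent_tags, last_tag = "(0008,1115)__0__(0020,000E)".split("__")
assert parent_tags == ["(0008,1115)", "0"]
assert last_tag == "(0020,000E)"
```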

deid/tests/test_nested_dicom_fields.py (104 changes: 104 additions & 0 deletions)
@@ -0,0 +1,104 @@
import contextlib

Member:
Nice job to add a full test! 👏

Contributor Author:
Thanks! :)

import os
import tempfile
import unittest

from pydicom import dcmread
from pydicom.dataset import Dataset
from pydicom.sequence import Sequence
from pydicom.uid import generate_uid

from deid.config import DeidRecipe
from deid.dicom.header import get_identifiers, replace_identifiers


@contextlib.contextmanager
def temporary_recipe(recipe_text: str):
    """
    Create a temporary recipe file for testing.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as recipe_file:
        recipe_file.write(recipe_text.encode())
        recipe_file.flush()
        recipe = DeidRecipe(deid=recipe_file.name, base=False)
        yield recipe


def hashuid(item, value, field, dicom, element_name=None):
    """
    Generate a new UID based on the previous UID
    """
    if hasattr(field, "element"):
        hash_src = str(field.element.value)
    else:
        hash_src = field
    new_uid = generate_uid(entropy_srcs=[hash_src])
    return new_uid


class TestNestedDicomFields(unittest.TestCase):
    def setUp(self):
        print("\n######################START######################")

    def tearDown(self):
        print("\n######################END########################")

    def test_nested_dicom_fields(self):
        """
        Tests that header deidentification does not overwrite existing top-level tags
        when iterating over deeply nested tags.
        """
        # Create a mock DICOM dataset with a top-level and nested SeriesInstanceUID
        original_dicom = Dataset()
        original_dicom.SeriesInstanceUID = generate_uid()

        referenced_series = Dataset()
        referenced_series.SeriesInstanceUID = generate_uid()

        original_dicom.ReferencedSeriesSequence = Sequence([referenced_series])

        # Enforce precondition that the two SeriesInstanceUID attributes are different
        self.assertNotEqual(
            original_dicom.ReferencedSeriesSequence[0].SeriesInstanceUID,
            original_dicom.SeriesInstanceUID,
        )

        recipe_text = """
FORMAT dicom
%header
REPLACE SeriesInstanceUID func:hashuid
"""

        with (
            tempfile.TemporaryDirectory() as temp_dir,
            temporary_recipe(recipe_text) as recipe,
        ):
            temp_file_name = os.path.join(temp_dir, "input.dcm")
            original_dicom.save_as(temp_file_name, implicit_vr=True, little_endian=True)
            dicom_paths = [temp_file_name]

            # Add hash function to deid context
            ids = get_identifiers(dicom_paths)
            for dicom_id in ids:
                ids[dicom_id]["hashuid"] = hashuid

            os.makedirs(os.path.join(temp_dir, "out"))
            output_paths = replace_identifiers(
                dicom_paths,
                ids=ids,
                deid=recipe,
                save=True,
                overwrite=True,
                output_folder=os.path.join(temp_dir, "out"),
            )

            output_dataset = dcmread(output_paths[0], force=True)
            # Assert that the SeriesInstanceUID has been replaced
            self.assertNotEqual(
                output_dataset.SeriesInstanceUID, original_dicom.SeriesInstanceUID
            )
            # Assert that the two unique UIDs were deidentified to different values
            self.assertNotEqual(
                output_dataset.ReferencedSeriesSequence[0].SeriesInstanceUID,
                output_dataset.SeriesInstanceUID,
            )
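
For context on why the test asserts that the two UIDs stay distinct, here is a minimal sketch of the failure mode this PR fixes. It is a reconstruction based on the PR title and the replaced `self.dicom.add(element)` call, assuming standard pydicom behavior: `Dataset.add()` keys elements by tag alone, so adding an element taken from inside a sequence overwrites the top-level element with the same tag.

```python
from pydicom.dataset import Dataset
from pydicom.sequence import Sequence

ds = Dataset()
ds.SeriesInstanceUID = "1.2.3"

inner = Dataset()
inner.SeriesInstanceUID = "4.5.6"
ds.ReferencedSeriesSequence = Sequence([inner])

# Pull the SeriesInstanceUID element out of the nested sequence item and
# re-add it the old way: Dataset.add() looks only at the element's tag,
# so the element lands at the top level of `ds`.
nested_element = ds.ReferencedSeriesSequence[0]["SeriesInstanceUID"]
ds.add(nested_element)

assert ds.SeriesInstanceUID == "4.5.6"  # top-level UID was clobbered
```

The fix avoids this by walking the full `__`-delimited tag path with `parse_tag_string` and assigning the element only at the final tag in the path.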

deid/version.py (2 changes: 1 addition & 1 deletion)

@@ -2,7 +2,7 @@
__copyright__ = "Copyright 2016-2025, Vanessa Sochat"
__license__ = "MIT"

__version__ = "0.4.1"
__version__ = "0.4.2"
AUTHOR = "Vanessa Sochat"
AUTHOR_EMAIL = "[email protected]"
NAME = "deid"