Handle multi-byte single-value encodings in conversion

mrbean-bremen · mrbean-bremen · commit a8044bbaa7d3 · 2025-12-18T21:03:08.000+01:00
- fixes the problem described in bug #684
diff --git a/dcmdata/libsrc/dcspchrs.cc b/dcmdata/libsrc/dcspchrs.cc
@@ -587,10 +587,20 @@ OFCondition DcmSpecificCharacterSet::convertStringWithoutCodeExtensions(const ch
         size_t pos = 0;
         const char *firstChar = fromString;
         const char *currentChar = fromString;
+        const bool isMultiByte = isNonAsciiConformMultiByteSingleValueCharacterSet(SourceCharacterSet);
+
         // iterate over all characters of the string (as long as there is no error)
         while ((pos < fromLength) && status.good())
         {
             const char c0 = *currentChar++;
+            if (isMultiByte && (c0 & 0x80) != 0)
+            {
+                // c0 is the lead byte of a 2-byte character, or of the first or second byte pair
+                // of a 4-byte character - skip the trail byte so it is never tested as a delimiter
+                currentChar++;
+                pos += 2;
+                continue;
+            }
             // check for characters HT, LF, FF, CR or any other specified delimiter
             const OFBool isDelimiter =  ((c0 == '\011') || (c0 == '\012') || (c0 == '\014') || (c0 == '\015') ||
                 (delimiters.find(c0) != OFString_npos));
diff --git a/dcmdata/tests/tspchrs.cc b/dcmdata/tests/tspchrs.cc
@@ -155,6 +155,10 @@ OFTEST(dcmdata_specificCharacterSet_3)
         OFCHECK(converter.selectCharacterSet("GB18030").good());
         OFCHECK(converter.convertString("Wang^XiaoDong=\315\365^\320\241\266\253=", resultStr, delimiters).good());
         OFCHECK_EQUAL(resultStr, "Wang^XiaoDong=\347\216\213^\345\260\217\344\270\234=");
+        // check that a byte that looks like a delimiter is not treated as one inside a multi-byte character:
+        // 0x5c is the backslash in single-byte encodings, but here it is the second byte of two 2-byte characters
+        OFCHECK(converter.convertString("Noriwa=\x81\x5c\x82\x5c", resultStr, delimiters).good());
+        OFCHECK_EQUAL(resultStr, "Noriwa=\xe4\xb9\x97\xe4\xbf\x93");
         // check whether string conversion from Chinese language to UTF-8 works
         // example taken from DICOM PS 3.5 Annex K.2
         OFCHECK(converter.selectCharacterSet("\\ISO 2022 IR 58").good());