Skip to content

Commit 08b7929

Browse files
committed
DcmCharString: add some support for multi-byte characters
- add DcmCharString::getVM(), getOFString() and putOFStringAtPos(), which handle multi-byte charsets - DcmByteString::containsExtendedCharacters(): add check for ESCAPE characters (only allowed in code extensions) - removed obsolete DcmCharString::containsExtendedCharacters()
1 parent d360b01 commit 08b7929

File tree

18 files changed

+612
-103
lines changed

18 files changed

+612
-103
lines changed

dcmdata/include/dcmtk/dcmdata/dcbytstr.h

Lines changed: 34 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -274,20 +274,19 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
274274
*/
275275
virtual OFCondition verify(const OFBool autocorrect = OFFalse);
276276

277-
/** check if this element contains non-ASCII characters. Please note that this check
278-
* is pretty simple and only works for single-byte character sets that do include
279-
* the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character
280-
* codes below 128 are considered to be ASCII codes and all others are considered to
281-
* be non-ASCII.
277+
/** check if this element contains non-ASCII characters.
278+
* This works by checking for any byte values above 127, which works for any
279+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
280+
* which will mean that a code extension is used.
282281
* @param checkAllStrings if true, also check elements with string values not affected
283282
* by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
284-
* UC and UT, i.e. none of the derived VR classes.
283+
* UC and UT.
285284
* @return true if element contains non-ASCII characters, false otherwise
286285
*/
287286
virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse);
288287

289288
/** check if this element is affected by SpecificCharacterSet
290-
* @return always returns false since none of the derived VR classes is affected by
289+
* @return false by default; overridden by derived VR classes that are affected by
291290
* the SpecificCharacterSet (0008,0005) element
292291
*/
293292
virtual OFBool isAffectedBySpecificCharacterSet() const;
@@ -379,6 +378,20 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
379378
*/
380379
virtual OFCondition makeMachineByteString(const Uint32 length = 0);
381380

381+
/** check if the VR supports more than one value.
382+
* @return OFTrue
383+
*/
384+
virtual OFBool supportsMultiValue() const
{
    // by default, more than one value is supported
    return OFTrue;
}
385+
386+
/** find the start index of the next component.
387+
* @param str pointer to the string value to be searched
388+
* @param len the length of @a str
389+
* @param start the start character index for the search
390+
* @param charSet the value of Specific Character Set; not used
391+
* @return the index of the next value, or OFString_npos if none exists.
392+
*/
393+
virtual size_t findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const;
394+
382395
/** convert currently stored string value to DICOM representation.
383396
* It removes trailing spaces apart from a possibly required single padding
384397
* character (in case of odd string length).
@@ -418,13 +431,23 @@ class DCMTK_DCMDATA_EXPORT DcmByteString: public DcmElement
418431
*/
419432
void setNonSignificantChars(const OFString &characters) { nonSignificantChars = characters; }
420433

434+
/** set element value at a specific value position in the given character string,
435+
* considering the specific character set for finding the position, if given.
436+
* @param stringVal input character string (possibly multi-valued)
437+
* @param pos position (0..vm) where the value should be inserted
438+
* @param charSet the value of the Specific Character Set
439+
* @return status, EC_Normal if successful, an error code otherwise
440+
*/
441+
OFCondition putOFStringAtPosWithCharset(const OFString& stringVal,
442+
const unsigned long pos,
443+
const OFString& charSet);
444+
421445
/* --- static helper functions --- */
422446

423447
/** check if a given character string contains non-ASCII characters.
424-
* Please note that this check is pretty simple and only works for single-byte character
425-
* sets that do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other
426-
* words: All character codes below 128 are considered to be ASCII codes and all others
427-
* are considered to be non-ASCII.
448+
* This works by checking for any byte values above 127, which works for any
449+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
450+
* which will mean that a code extension is used.
428451
* @param stringVal character string to be checked
429452
* @param stringLen length of the string (number of characters without the trailing
430453
* NULL byte)

dcmdata/include/dcmtk/dcmdata/dcchrstr.h

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -107,17 +107,31 @@ class DCMTK_DCMDATA_EXPORT DcmCharString
107107
*/
108108
virtual OFCondition verify(const OFBool autocorrect = OFFalse);
109109

110-
/** check if this element contains non-ASCII characters. Please note that this check
111-
* is pretty simple and only works for single-byte character sets that do include
112-
* the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All character
113-
* codes below 128 are considered to be ASCII codes and all others are considered to
114-
* be non-ASCII.
115-
* @param checkAllStrings not used in this class
116-
* @return true if element contains non-ASCII characters, false otherwise
110+
/** get value multiplicity
111+
* @return number of string components (separated by a backslash)
117112
*/
118-
virtual OFBool containsExtendedCharacters(const OFBool checkAllStrings = OFFalse);
113+
virtual unsigned long getVM();
119114

120-
/** check if this element is affected by SpecificCharacterSet
115+
/** get a copy of a particular string component
116+
* @param stringVal variable in which the result value is stored
117+
* @param pos index of the value in case of multi-valued elements (0..vm-1)
118+
* @param normalize not used since string normalization depends on value representation
119+
* @return status, EC_Normal if successful, an error code otherwise
120+
*/
121+
virtual OFCondition getOFString(OFString &stringVal,
122+
const unsigned long pos,
123+
OFBool normalize = OFTrue);
124+
125+
126+
/** set element value at specific VM position in the given character string.
127+
* @param stringVal input character string (possibly multi-valued)
128+
* @param pos position (0..vm) where the value should be inserted
129+
* @return status, EC_Normal if successful, an error code otherwise
130+
*/
131+
virtual OFCondition putOFStringAtPos(const OFString& stringVal,
132+
const unsigned long pos = 0);
133+
134+
/** check if this element is affected by SpecificCharacterSet
121135
* @return always returns true since all derived VR classes are affected by the
122136
* SpecificCharacterSet (0008,0005) element
123137
*/
@@ -169,6 +183,15 @@ class DCMTK_DCMDATA_EXPORT DcmCharString
169183
*/
170184
virtual const OFString& getDelimiterChars() const;
171185

186+
/** find the start index of the next value in a multi-valued attribute.
187+
* @param str pointer to the string value to be searched
188+
* @param len the length of @a str
189+
* @param start the start character index for the search
190+
* @param charSet the value of Specific Character Set; if not set, single-byte encoding is assumed
191+
* @return the index of the next value, or OFString_npos if none exists.
192+
*/
193+
virtual size_t findNextValuePosition(const char* str, size_t len, size_t start, const OFString& charSet) const;
194+
172195
};
173196

174197

dcmdata/include/dcmtk/dcmdata/dcitem.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -317,11 +317,10 @@ class DCMTK_DCMDATA_EXPORT DcmItem
317317
*/
318318
virtual OFBool containsUnknownVR() const;
319319

320-
/** check if this object contains non-ASCII characters at any nesting level. Please note
321-
* that this check is pretty simple and only works for single-byte character sets that
322-
* do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All
323-
* character codes below 128 are considered to be ASCII codes and all others are
324-
* considered to be non-ASCII.
320+
/** check if this object contains non-ASCII characters.
321+
* This works by checking for any byte values above 127, which works for any
322+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
323+
* which will mean that a code extension is used.
325324
* @param checkAllStrings if true, also check elements with string values not affected
326325
* by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
327326
* UC and UT.

dcmdata/include/dcmtk/dcmdata/dcsequen.h

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -313,11 +313,10 @@ class DCMTK_DCMDATA_EXPORT DcmSequenceOfItems : public DcmElement
313313
*/
314314
virtual OFBool containsUnknownVR() const;
315315

316-
/** check if this object contains non-ASCII characters at any nesting level. Please note
317-
* that this check is pretty simple and only works for single-byte character sets that
318-
* do include the 7-bit ASCII codes, e.g. for the ISO 8859 family. In other words: All
319-
* character codes below 128 are considered to be ASCII codes and all others are
320-
* considered to be non-ASCII.
316+
/** check if this object contains non-ASCII characters.
317+
* This works by checking for any byte values above 127, which works for any
318+
* single-byte code and for single-value multi-byte codes, and for ESC characters,
319+
* which will mean that a code extension is used.
321320
* @param checkAllStrings if true, also check elements with string values not affected
322321
* by SpecificCharacterSet (0008,0005). By default, only check PN, LO, LT, SH, ST,
323322
* UC and UT.

dcmdata/include/dcmtk/dcmdata/dcspchrs.h

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -232,6 +232,15 @@ class DCMTK_DCMDATA_EXPORT DcmSpecificCharacterSet
232232
*/
233233
static size_t countCharactersInUTF8String(const OFString &utf8String);
234234

235+
/** check whether the given Specific Character Set value belongs to a multi-byte
236+
* character set that is only allowed as a single value
237+
* in SpecificCharacterSet and that may contain bytes that look like ASCII (i.e.
238+
* with the highest bit cleared), but are part of a multi-byte non-ASCII character.
239+
* This currently includes only Chinese encodings, as in UTF-8 (ISO_IR 192) any byte
240+
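* belonging to a non-ASCII character has the highest bit set.
* @param charset the value of Specific Character Set to be checked
* @return OFTrue if @a charset is such a character set, OFFalse otherwise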
241+
*/
242+
static OFBool isNonAsciiConformMultiByteSingleValueCharacterSet(const OFString& charset);
243+
235244

236245
protected:
237246

dcmdata/include/dcmtk/dcmdata/dcvrlt.h

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -114,13 +114,6 @@ class DCMTK_DCMDATA_EXPORT DcmLongText
114114
virtual OFCondition checkValue(const OFString &vm = "",
115115
const OFBool oldFormat = OFFalse);
116116

117-
/** get the value multiplicity.
118-
* Since the backslash "\" is not regarded as a separator the value
119-
* multiplicity is always 1.
120-
* @return value multiplicity of the currently stored value
121-
*/
122-
virtual unsigned long getVM();
123-
124117
/** get a copy of a particular string component
125118
* @param stringVal variable in which the result value is stored
126119
* @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -151,6 +144,14 @@ class DCMTK_DCMDATA_EXPORT DcmLongText
151144
*/
152145
static OFCondition checkStringValue(const OFString &value,
153146
const OFString &charset = "");
147+
148+
protected:
149+
/** check if the VR supports more than one value.
150+
* Since the backslash "\" is not regarded as a separator,
151+
* multiple values cannot be encoded.
152+
* @return OFFalse
153+
*/
154+
virtual OFBool supportsMultiValue() const
{
    // the backslash is not a value separator for this VR, so only one value is possible
    return OFFalse;
}
154155
};
155156

156157

dcmdata/include/dcmtk/dcmdata/dcvrst.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -115,13 +115,6 @@ class DCMTK_DCMDATA_EXPORT DcmShortText
115115
virtual OFCondition checkValue(const OFString &vm = "",
116116
const OFBool oldFormat = OFFalse);
117117

118-
/** get the value multiplicity.
119-
* Since the backslash "\" is not regarded as a separator the value
120-
* multiplicity is always 1.
121-
* @return value multiplicity of the currently stored value
122-
*/
123-
virtual unsigned long getVM();
124-
125118
/** get a copy of a particular string component
126119
* @param stringVal variable in which the result value is stored
127120
* @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -152,6 +145,13 @@ class DCMTK_DCMDATA_EXPORT DcmShortText
152145
*/
153146
static OFCondition checkStringValue(const OFString &value,
154147
const OFString &charset = "");
148+
protected:
149+
/** check if the VR supports more than one value.
150+
* Since the backslash "\" is not regarded as a separator,
151+
* multiple values cannot be encoded.
152+
* @return OFFalse
153+
*/
154+
virtual OFBool supportsMultiValue() const
{
    // the backslash is not a value separator for this VR, so only one value is possible
    return OFFalse;
}
155155
};
156156

157157

dcmdata/include/dcmtk/dcmdata/dcvrut.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -117,13 +117,6 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText
117117
virtual OFCondition checkValue(const OFString &vm = "",
118118
const OFBool oldFormat = OFFalse);
119119

120-
/** get the value multiplicity.
121-
* Since the backslash "\" is not regarded as a separator the value
122-
* multiplicity is always 1.
123-
* @return value multiplicity of the currently stored value
124-
*/
125-
virtual unsigned long getVM();
126-
127120
/** get a copy of a particular string component
128121
* @param stringVal variable in which the result value is stored
129122
* @param pos index of the value in case of multi-valued elements (0..vm-1)
@@ -154,6 +147,13 @@ class DCMTK_DCMDATA_EXPORT DcmUnlimitedText
154147
*/
155148
static OFCondition checkStringValue(const OFString &value,
156149
const OFString &charset = "");
150+
protected:
151+
/** check if the VR supports more than one value.
152+
* Since the backslash "\" is not regarded as a separator,
153+
* multiple values cannot be encoded.
154+
* @return OFFalse
155+
*/
156+
virtual OFBool supportsMultiValue() const
{
    // the backslash is not a value separator for this VR, so only one value is possible
    return OFFalse;
}
157157
};
158158

159159

dcmdata/libsrc/dcbytstr.cc

Lines changed: 36 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -467,9 +467,8 @@ OFCondition DcmByteString::putString(const char *stringVal,
467467
return errorFlag;
468468
}
469469

470-
471-
OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
472-
const unsigned long pos)
470+
OFCondition DcmByteString::putOFStringAtPosWithCharset(const OFString& stringVal, const unsigned long pos,
471+
const OFString& charSet)
473472
{
474473
OFCondition result;
475474
// Get old value
@@ -505,46 +504,42 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
505504
// First value is set: Replace old value with new value
506505
else
507506
{
508-
rightPos = str.find_first_of('\\', 0);
509-
str = str.replace(0, rightPos, stringVal);
507+
rightPos = findNextValuePosition(str.c_str(), str.length(), 0, charSet);
508+
str = str.replace(0, rightPos - 1, stringVal);
510509
}
511510
return putOFStringArray(str);
512511
}
513512

514513
// 3rd case: New value should be inserted somewhere in the middle
515514
size_t leftPos = 0;
516515
size_t vmPos = 0;
516+
size_t strLen = str.length();
517517
// First, find the correct position, and then insert / replace new value
518518
do
519519
{
520520
// Step from value to value by looking for delimiters.
521-
// Special handling first search (start looking at position 0 instead of 1)
522-
if (vmPos == 0) leftPos = str.find('\\', 0);
523-
else leftPos = str.find('\\', leftPos + 1 );
524-
// leftPos = str.find('\\', leftPos == 0 ? 0 : leftPos +1);
521+
leftPos = findNextValuePosition(str.c_str(), strLen, leftPos, charSet);
525522
if (leftPos != OFString_npos)
526-
{
527523
vmPos++;
528-
}
529524
}
530525
while ( (leftPos != OFString_npos) && (vmPos != pos) );
531-
rightPos = str.find_first_of('\\', leftPos+1);
532-
if (rightPos == OFString_npos) rightPos = str.length();
526+
rightPos = findNextValuePosition(str.c_str(), strLen, leftPos, charSet);
527+
if (rightPos == OFString_npos) rightPos = strLen + 1;
533528

534529
// If we do not have an old value of size 1 or we have an empty value
535530
if (rightPos - leftPos == 1)
536531
{
537532
// Empty value
538533
if (str.at(leftPos) == '\\')
539-
str = str.insert(rightPos, stringVal);
534+
str = str.insert(leftPos, stringVal);
540535
// Old value (length 1)
541536
else
542537
str = str.replace(leftPos, 1, stringVal);
543538
}
544539
// Otherwise replace existing old value (length > 1)
545540
else
546541
{
547-
str = str.replace(leftPos+1, rightPos - leftPos - 1, stringVal);
542+
str = str.replace(leftPos, rightPos - leftPos - 1, stringVal);
548543
}
549544
// Finally re-insert all values including the new value
550545
result = putOFStringArray( str );
@@ -553,6 +548,28 @@ OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
553548
}
554549

555550

551+
552+
// Set the element value at the given value position. This variant passes an
// empty Specific Character Set value on to putOFStringAtPosWithCharset().
OFCondition DcmByteString::putOFStringAtPos(const OFString& stringVal,
                                            const unsigned long pos)
{
    const OFString noCharSet("");
    return putOFStringAtPosWithCharset(stringVal, pos, noCharSet);
}
557+
558+
559+
// ********************************
560+
561+
562+
size_t DcmByteString::findNextValuePosition(const char* str, size_t len, size_t start, const OFString& /*charSet*/) const
563+
{
564+
const char *p = str + start;
565+
for (size_t i = start; i < len; ++i)
566+
{
567+
if (*p++ == '\\')
568+
return i + 1;
569+
}
570+
return OFString_npos;
571+
}
572+
556573
// ********************************
557574

558575

@@ -770,7 +787,7 @@ OFBool DcmByteString::containsExtendedCharacters(const OFBool checkAllStrings)
770787
OFBool result = OFFalse;
771788
/* only check if parameter is true or if this VR is affected
772789
by the attribute SpecificCharacterSet (0008,0005) */
773-
if (checkAllStrings)
790+
if (checkAllStrings || isAffectedBySpecificCharacterSet())
774791
{
775792
char *str = NULL;
776793
Uint32 len = 0;
@@ -876,10 +893,10 @@ OFBool DcmByteString::containsExtendedCharacters(const char *stringVal,
876893
{
877894
if (stringVal != NULL)
878895
{
879-
for (size_t i = stringLen; i != 0; --i)
896+
for (size_t i = stringLen; i != 0; --i, ++stringVal)
880897
{
881-
/* check for 8 bit characters */
882-
if (OFstatic_cast(unsigned char, *stringVal++) > 127)
898+
/* check for 8 bit and Escape characters */
899+
if ((*stringVal & 0x80) != 0 || (*stringVal == 0x1b))
883900
return OFTrue;
884901
}
885902
}

0 commit comments

Comments
 (0)