Fix content encoding and decoding

- Include element_data.text on content encoding
- Make wildcard decoding and encoding uniform when the element is missing
- Postpone lxml nsmap processing in case of xsi:type (still not usable for encoding)
sissaschool · Dec 21, 2019 · 2281aef · 2281aef
1 parent a6b9781
commit 2281aef
Show file tree

Hide file tree

Showing 11 changed files with 68 additions and 98 deletions.
diff --git a/CHANGELOG.rst b/CHANGELOG.rst
@@ -2,7 +2,7 @@
 CHANGELOG
 *********
 
-`v1.0.17`_ (2019-11-xx)
+`v1.0.17`_ (2019-11-21)
 =======================
 * Enhancement of validation-only speed (~15%)
 * Added *is_valid()* and *iter_errors()* to module API

diff --git a/publiccode.yml b/publiccode.yml
@@ -6,7 +6,7 @@ publiccodeYmlVersion: '0.2'
 name: xmlschema
 url: 'https://github.com/sissaschool/xmlschema'
 landingURL: 'https://github.com/sissaschool/xmlschema'
-releaseDate: '2019-11-xx'
+releaseDate: '2019-11-21'
 softwareVersion: v1.0.17
 developmentStatus: stable
 platforms:

diff --git a/xmlschema/converters.py b/xmlschema/converters.py
@@ -326,6 +326,8 @@ def element_encode(self, obj, xsd_element, level=0):
         if not isinstance(obj, (self.dict, dict)):
             if xsd_element.type.is_simple() or xsd_element.type.has_simple_content():
                 return ElementData(tag, obj, None, {})
+            elif xsd_element.type.mixed and not isinstance(obj, list):
+                return ElementData(tag, obj, None, {})
             else:
                 return ElementData(tag, None, obj, {})
 

diff --git a/xmlschema/tests/test_factory/validation_tests.py b/xmlschema/tests/test_factory/validation_tests.py
@@ -125,6 +125,7 @@ def check_etree_encode(self, root, converter=None, **kwargs):
                 if converter not in (ParkerConverter, AbderaConverter, JsonMLConverter) and not skip_strict:
                     if debug_mode:
                         pdb.set_trace()
+                    breakpoint()
                     raise AssertionError(str(err) + msg_tmpl % "encoded tree differs from original")
                 elif converter is ParkerConverter and any(XSI_TYPE in e.attrib for e in root.iter()):
                     return  # can't check encode equivalence if xsi:type is provided

diff --git a/xmlschema/tests/test_w3c_suite.py b/xmlschema/tests/test_w3c_suite.py
@@ -97,8 +97,10 @@
 
     # Invalid XML tests
     '../sunData/combined/xsd005/xsd005.n05.xml',  # 3984: Invalid if lxml is used (xsi:type and duplicate prefix)
-    '../msData/additional/test93490_4.xml',     # 4795: https://www.w3.org/Bugs/Public/show_bug.cgi?id=4078
-    '../msData/additional/test93490_8.xml',     # 4799: Idem
+    '../msData/additional/test93490_4.xml',  # 4795: https://www.w3.org/Bugs/Public/show_bug.cgi?id=4078
+    '../msData/additional/test93490_8.xml',  # 4799: Idem
+    '../msData/datatypes/gMonth002.xml',  # 8017: gMonth bogus: conflicts with other invalid schema tests
+    '../msData/datatypes/gMonth004.xml',  # 8019: (http://www.w3.org/Bugs/Public/show_bug.cgi?id=6901)
 
     # Valid XML tests
     '../ibmData/instance_invalid/S3_4_2_4/s3_4_2_4ii03.xml',  # defaultAttributeApply is true (false in comment)

diff --git a/xmlschema/tests/validation/test_decoding.py b/xmlschema/tests/validation/test_decoding.py
@@ -480,9 +480,9 @@ def test_dict_granularity(self):
     def test_any_type(self):
         any_type = xmlschema.XMLSchema.meta_schema.types['anyType']
         xml_data_1 = ElementTree.Element('dummy')
-        self.assertEqual(any_type.decode(xml_data_1), (None, [], []))
+        self.assertIsNone(any_type.decode(xml_data_1))
         xml_data_2 = ElementTree.fromstring('<root>\n    <child_1/>\n    <child_2/>\n</root>')
-        self.assertEqual(any_type.decode(xml_data_2), (None, [], []))  # Currently no decoding yet
+        self.assertIsNone(any_type.decode(xml_data_2))  # Currently no decoding yet
 
     def test_choice_model_decoding(self):
         schema = xmlschema.XMLSchema(self.casepath('issues/issue_041/issue_041.xsd'))

diff --git a/xmlschema/tests/validation/test_encoding.py b/xmlschema/tests/validation/test_encoding.py
@@ -306,7 +306,7 @@ def test_max_occurs_sequence(self):
     def test_encode_unordered_content(self):
         schema = self.get_schema("""
         <xs:element name="A" type="A_type" />
-        <xs:complexType name="A_type">
+        <xs:complexType name="A_type" mixed="true">
             <xs:sequence>
                 <xs:element name="B1" type="xs:string"/>
                 <xs:element name="B2" type="xs:integer"/>

diff --git a/xmlschema/validators/complex_types.py b/xmlschema/validators/complex_types.py
@@ -17,6 +17,7 @@
     XSD_SIMPLE_CONTENT, XSD_ANY_SIMPLE_TYPE, XSD_OPEN_CONTENT, XSD_ASSERT, \
     get_qname, local_name
 from ..helpers import get_xsd_derivation_attribute
+from ..converters import ElementData
 
 from .exceptions import XMLSchemaValidationError, XMLSchemaDecodeError
 from .xsdbase import XsdComponent, XsdType, ValidationMixin
@@ -575,89 +576,44 @@ def decode(self, data, *args, **kwargs):
 
     def iter_decode(self, elem, validation='lax', **kwargs):
         """
-        Decode an Element instance.
+        Decode an Element instance. A dummy element is created for the type and is
+        used to decode the data. Typically used for decoding with xs:anyType when an
+        XSD element is not available.
 
         :param elem: the Element that has to be decoded.
         :param validation: the validation mode. Can be 'lax', 'strict' or 'skip'.
         :param kwargs: keyword arguments for the decoding process.
-        :return: yields a 3-tuple (simple content, complex content, attributes) containing \
-        the decoded parts, eventually preceded by a sequence of validation or decoding errors.
+        :return: yields a decoded object, eventually preceded by a sequence of \
+        validation or decoding errors.
         """
-        if self.is_empty() and elem.text:
-            reason = "character data between child elements not allowed because the type's content is empty"
-            yield self.validation_error(validation, reason, elem, **kwargs)
+        xsd_element = self.schema.create_element(name=elem.tag)
+        xsd_element.type = self
+        for result in xsd_element.iter_decode(elem, validation, **kwargs):
+            yield result
 
-        # XSD 1.1 assertions
-        for assertion in self.assertions:
-            for error in assertion(elem, **kwargs):
-                yield self.validation_error(validation, error, **kwargs)
-
-        for result in self.attributes.iter_decode(elem.attrib, validation, **kwargs):
-            if isinstance(result, XMLSchemaValidationError):
-                yield result
-            else:
-                attributes = result
-                break
-        else:
-            attributes = None
-
-        if self.has_simple_content():
-            if len(elem) and validation != 'skip':
-                reason = "a simple content element can't has child elements."
-                yield self.validation_error(validation, reason, elem, **kwargs)
-
-            if elem.text is not None:
-                text = elem.text or kwargs.pop('default', '')
-                for result in self.content_type.iter_decode(text, validation, **kwargs):
-                    if isinstance(result, XMLSchemaValidationError):
-                        yield result
-                    else:
-                        yield result, None, attributes
-            else:
-                yield None, None, attributes
-        else:
-            for result in self.content_type.iter_decode(elem, validation, **kwargs):
-                if isinstance(result, XMLSchemaValidationError):
-                    yield result
-                else:
-                    yield None, result, attributes
-
-    def iter_encode(self, element_data, validation='lax', **kwargs):
+    def iter_encode(self, obj, validation='lax', **kwargs):
         """
-        Encode an element data instance.
+        Encode XML data. A dummy element is created for the type and is used to
+        encode the data. Typically used for encoding with xs:anyType when an XSD
+        element is not available.
 
-        :param element_data: an ElementData instance with unencoded data.
+        :param obj: decoded XML data.
         :param validation: the validation mode: can be 'lax', 'strict' or 'skip'.
         :param kwargs: keyword arguments for the encoding process.
-        :return: yields a 3-tuple (text, content, attributes) containing the encoded parts, \
-        eventually preceded by a sequence of validation or decoding errors.
+        :return: yields an Element, eventually preceded by a sequence of \
+        validation or encoding errors.
         """
-        for result in self.attributes.iter_encode(element_data.attributes, validation, **kwargs):
-            if isinstance(result, XMLSchemaValidationError):
-                yield result
-            else:
-                attributes = result
-                break
-        else:
-            attributes = ()
+        name, value = obj
+        xsd_element = self.schema.create_element(name=name)
+        xsd_element.type = self
 
-        if self.has_simple_content():
-            if element_data.text is None:
-                yield None, element_data.content, attributes
-            else:
-                for result in self.content_type.iter_encode(element_data.text, validation, **kwargs):
-                    if isinstance(result, XMLSchemaValidationError):
-                        yield result
-                    else:
-                        yield result, element_data.content, attributes
-        else:
-            for result in self.content_type.iter_encode(element_data, validation, **kwargs):
-                if isinstance(result, XMLSchemaValidationError):
+        if isinstance(value, list):
+            for item in value:
+                for result in xsd_element.iter_encode(item, validation, **kwargs):
                     yield result
-                elif result:
-                    yield result[0], result[1], attributes
-                else:
-                    yield None, None, attributes
+        else:
+            for result in xsd_element.iter_encode(value, validation, **kwargs):
+                yield result
 
 
 class Xsd11ComplexType(XsdComplexType):

diff --git a/xmlschema/validators/elements.py b/xmlschema/validators/elements.py
@@ -530,8 +530,7 @@ def iter_decode(self, elem, validation='lax', **kwargs):
         if XSI_TYPE in elem.attrib:
             type_name = elem.attrib[XSI_TYPE].strip()
             try:
-                nsmap = getattr(elem, 'nsmap', namespaces)
-                xsd_type = self.maps.get_instance_type(type_name, xsd_type, namespaces=nsmap)
+                xsd_type = self.maps.get_instance_type(type_name, xsd_type, namespaces)
             except (KeyError, TypeError) as err:
                 yield self.validation_error(validation, err, elem, **kwargs)
 

diff --git a/xmlschema/validators/groups.py b/xmlschema/validators/groups.py
@@ -38,6 +38,10 @@
     })
 
 
+def not_whitespace(s):
+    return s and s.strip()
+
+
 class XsdGroup(XsdComponent, ModelGroup, ValidationMixin):
     """
     Class for XSD 1.0 *model group* definitions.
@@ -567,9 +571,6 @@ def iter_decode(self, elem, validation='lax', **kwargs):
         :return: yields a list of 3-tuples (key, decoded data, decoder), \
         eventually preceded by a sequence of validation or decoding errors.
         """
-        def not_whitespace(s):
-            return s is not None and s.strip()
-
         result_list = []
         cdata_index = 1  # keys for CDATA sections are positive integers
 
@@ -698,13 +699,9 @@ def iter_encode(self, element_data, validation='lax', **kwargs):
         (key, decoded data, decoder), eventually preceded by a sequence of validation \
         or encoding errors.
         """
-        if not element_data.content:  # <tag/> or <tag></tag>
-            yield element_data.content
-            return
-
         level = kwargs['level'] = kwargs.get('level', 0) + 1
         errors = []
-        text = None
+        text = element_data.text
         children = []
         try:
             indent = kwargs['indent']
@@ -720,11 +717,14 @@ def iter_encode(self, element_data, validation='lax', **kwargs):
 
         default_namespace = converter.get('')
         model = ModelVisitor(self)
-        cdata_index = 0
+        index = cdata_index = 0
+        wrong_content_type = False
 
         if isinstance(element_data.content, dict) or kwargs.get('unordered'):
             content = model.iter_unordered_content(element_data.content)
         elif not isinstance(element_data.content, list):
+            if element_data.content is not None:
+                wrong_content_type = True
             content = []
         elif converter.losslessly:
             content = element_data.content
@@ -785,30 +785,38 @@ def iter_encode(self, element_data, validation='lax', **kwargs):
                     children.append(result)
 
         if model.element is not None:
-            index = len(element_data.content) - cdata_index
             for particle, occurs, expected in model.stop():
-                errors.append((index, particle, occurs, expected))
+                errors.append((index - cdata_index, particle, occurs, expected))
 
         if children:
             if children[-1].tail is None:
                 children[-1].tail = padding[:-indent] or '\n'
             else:
                 children[-1].tail = children[-1].tail.strip() + (padding[:-indent] or '\n')
 
-        if validation != 'skip' and (errors or not content):
+        cdata_not_allowed = not self.mixed and not_whitespace(text) and self and \
+            (len(self) > 1 or not isinstance(self[0], XsdAnyElement))
+
+        if validation != 'skip' and (errors or cdata_not_allowed or wrong_content_type):
             attrib = {k: unicode_type(v) for k, v in element_data.attributes.items()}
             if validation == 'lax' and converter.etree_element_class is not etree_element:
                 child_tags = [converter.etree_element(e.tag, attrib=e.attrib) for e in children]
                 elem = converter.etree_element(element_data.tag, text, child_tags, attrib)
             else:
                 elem = converter.etree_element(element_data.tag, text, children, attrib)
 
-            if not content:
+            if wrong_content_type:
                 reason = "wrong content type {!r}".format(type(element_data.content))
                 yield self.validation_error(validation, reason, elem, **kwargs)
 
+            if cdata_not_allowed:
+                reason = "character data between child elements not allowed"
+                yield self.validation_error(validation, reason, elem, **kwargs)
+
             for index, particle, occurs, expected in errors:
-                yield self.children_validation_error(validation, elem, index, particle, occurs, expected, **kwargs)
+                yield self.children_validation_error(
+                    validation, elem, index, particle, occurs, expected, **kwargs
+                )
 
         yield text, children
 

diff --git a/xmlschema/validators/wildcards.py b/xmlschema/validators/wildcards.py
@@ -440,9 +440,10 @@ def iter_decode(self, elem, validation='lax', **kwargs):
                     xsd_element = self.schema.create_element(name=elem.tag)
                     for result in xsd_element.iter_decode(elem, validation, **kwargs):
                         yield result
-                elif validation == 'skip':
-                    yield self.any_type.decode(elem) if len(elem) > 0 else elem.text
-                elif self.process_contents == 'strict':
+                elif validation == 'skip' or self.process_contents == 'lax':
+                    for result in self.any_type.iter_decode(elem, validation, **kwargs):
+                        yield result
+                else:
                     reason = "element %r not found." % elem.tag
                     yield self.validation_error(validation, reason, elem, **kwargs)
             else:
@@ -472,8 +473,9 @@ def iter_encode(self, obj, validation='lax', **kwargs):
             try:
                 xsd_element = self.maps.lookup_element(name)
             except LookupError:
-                if validation == 'skip':
-                    yield self.any_type.encode(value)
+                if validation == 'skip' or self.process_contents == 'lax':
+                    for result in self.any_type.iter_encode(obj, validation, **kwargs):
+                        yield result
                 elif self.process_contents == 'strict':
                     reason = "element %r not found." % name
                     yield self.validation_error(validation, reason, **kwargs)