Skip to content

Commit

Permalink
Fix content encoding and decoding
Browse files Browse the repository at this point in the history
  - Include element_data.text on content encoding
  - Make wildcard decoding and encoding uniform when the element is missing
  - Postpone lxml nsmap processing in case of xsi:type (still not
    usable for encoding)
  • Loading branch information
brunato committed Dec 21, 2019
1 parent a6b9781 commit 2281aef
Show file tree
Hide file tree
Showing 11 changed files with 68 additions and 98 deletions.
2 changes: 1 addition & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
CHANGELOG
*********

`v1.0.17`_ (2019-11-xx)
`v1.0.17`_ (2019-11-21)
=======================
* Enhancement of validation-only speed (~15%)
* Added *is_valid()* and *iter_errors()* to module API
Expand Down
2 changes: 1 addition & 1 deletion publiccode.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ publiccodeYmlVersion: '0.2'
name: xmlschema
url: 'https://github.com/sissaschool/xmlschema'
landingURL: 'https://github.com/sissaschool/xmlschema'
releaseDate: '2019-11-xx'
releaseDate: '2019-11-21'
softwareVersion: v1.0.17
developmentStatus: stable
platforms:
Expand Down
2 changes: 2 additions & 0 deletions xmlschema/converters.py
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,8 @@ def element_encode(self, obj, xsd_element, level=0):
if not isinstance(obj, (self.dict, dict)):
if xsd_element.type.is_simple() or xsd_element.type.has_simple_content():
return ElementData(tag, obj, None, {})
elif xsd_element.type.mixed and not isinstance(obj, list):
return ElementData(tag, obj, None, {})
else:
return ElementData(tag, None, obj, {})

Expand Down
1 change: 1 addition & 0 deletions xmlschema/tests/test_factory/validation_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,7 @@ def check_etree_encode(self, root, converter=None, **kwargs):
if converter not in (ParkerConverter, AbderaConverter, JsonMLConverter) and not skip_strict:
if debug_mode:
pdb.set_trace()
breakpoint()
raise AssertionError(str(err) + msg_tmpl % "encoded tree differs from original")
elif converter is ParkerConverter and any(XSI_TYPE in e.attrib for e in root.iter()):
return # can't check encode equivalence if xsi:type is provided
Expand Down
6 changes: 4 additions & 2 deletions xmlschema/tests/test_w3c_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,8 +97,10 @@

# Invalid XML tests
'../sunData/combined/xsd005/xsd005.n05.xml', # 3984: Invalid if lxml is used (xsi:type and duplicate prefix)
'../msData/additional/test93490_4.xml', # 4795: https://www.w3.org/Bugs/Public/show_bug.cgi?id=4078
'../msData/additional/test93490_8.xml', # 4799: Idem
'../msData/additional/test93490_4.xml', # 4795: https://www.w3.org/Bugs/Public/show_bug.cgi?id=4078
'../msData/additional/test93490_8.xml', # 4799: Idem
'../msData/datatypes/gMonth002.xml', # 8017: gMonth bogus: conflicts with other invalid schema tests
'../msData/datatypes/gMonth004.xml', # 8019: (http://www.w3.org/Bugs/Public/show_bug.cgi?id=6901)

# Valid XML tests
'../ibmData/instance_invalid/S3_4_2_4/s3_4_2_4ii03.xml', # defaultAttributeApply is true (false in comment)
Expand Down
4 changes: 2 additions & 2 deletions xmlschema/tests/validation/test_decoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -480,9 +480,9 @@ def test_dict_granularity(self):
def test_any_type(self):
any_type = xmlschema.XMLSchema.meta_schema.types['anyType']
xml_data_1 = ElementTree.Element('dummy')
self.assertEqual(any_type.decode(xml_data_1), (None, [], []))
self.assertIsNone(any_type.decode(xml_data_1))
xml_data_2 = ElementTree.fromstring('<root>\n <child_1/>\n <child_2/>\n</root>')
self.assertEqual(any_type.decode(xml_data_2), (None, [], [])) # Currently no decoding yet
self.assertIsNone(any_type.decode(xml_data_2)) # Currently no decoding yet

def test_choice_model_decoding(self):
schema = xmlschema.XMLSchema(self.casepath('issues/issue_041/issue_041.xsd'))
Expand Down
2 changes: 1 addition & 1 deletion xmlschema/tests/validation/test_encoding.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,7 +306,7 @@ def test_max_occurs_sequence(self):
def test_encode_unordered_content(self):
schema = self.get_schema("""
<xs:element name="A" type="A_type" />
<xs:complexType name="A_type">
<xs:complexType name="A_type" mixed="true">
<xs:sequence>
<xs:element name="B1" type="xs:string"/>
<xs:element name="B2" type="xs:integer"/>
Expand Down
96 changes: 26 additions & 70 deletions xmlschema/validators/complex_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
XSD_SIMPLE_CONTENT, XSD_ANY_SIMPLE_TYPE, XSD_OPEN_CONTENT, XSD_ASSERT, \
get_qname, local_name
from ..helpers import get_xsd_derivation_attribute
from ..converters import ElementData

from .exceptions import XMLSchemaValidationError, XMLSchemaDecodeError
from .xsdbase import XsdComponent, XsdType, ValidationMixin
Expand Down Expand Up @@ -575,89 +576,44 @@ def decode(self, data, *args, **kwargs):

def iter_decode(self, elem, validation='lax', **kwargs):
"""
Decode an Element instance.
Decode an Element instance. A dummy element is created for the type and is
used to decode the data. Typically used for decoding with xs:anyType when an
XSD element is not available.
:param elem: the Element that has to be decoded.
:param validation: the validation mode. Can be 'lax', 'strict' or 'skip'.
:param kwargs: keyword arguments for the decoding process.
:return: yields a 3-tuple (simple content, complex content, attributes) containing \
the decoded parts, eventually preceded by a sequence of validation or decoding errors.
:return: yields a decoded object, possibly preceded by a sequence of \
validation or decoding errors.
"""
if self.is_empty() and elem.text:
reason = "character data between child elements not allowed because the type's content is empty"
yield self.validation_error(validation, reason, elem, **kwargs)
xsd_element = self.schema.create_element(name=elem.tag)
xsd_element.type = self
for result in xsd_element.iter_decode(elem, validation, **kwargs):
yield result

# XSD 1.1 assertions
for assertion in self.assertions:
for error in assertion(elem, **kwargs):
yield self.validation_error(validation, error, **kwargs)

for result in self.attributes.iter_decode(elem.attrib, validation, **kwargs):
if isinstance(result, XMLSchemaValidationError):
yield result
else:
attributes = result
break
else:
attributes = None

if self.has_simple_content():
if len(elem) and validation != 'skip':
reason = "a simple content element can't has child elements."
yield self.validation_error(validation, reason, elem, **kwargs)

if elem.text is not None:
text = elem.text or kwargs.pop('default', '')
for result in self.content_type.iter_decode(text, validation, **kwargs):
if isinstance(result, XMLSchemaValidationError):
yield result
else:
yield result, None, attributes
else:
yield None, None, attributes
else:
for result in self.content_type.iter_decode(elem, validation, **kwargs):
if isinstance(result, XMLSchemaValidationError):
yield result
else:
yield None, result, attributes

def iter_encode(self, element_data, validation='lax', **kwargs):
def iter_encode(self, obj, validation='lax', **kwargs):
"""
Encode an element data instance.
Encode XML data. A dummy element is created for the type and is used to
encode the data. Typically used for encoding with xs:anyType when an XSD
element is not available.
:param element_data: an ElementData instance with unencoded data.
:param obj: decoded XML data.
:param validation: the validation mode: can be 'lax', 'strict' or 'skip'.
:param kwargs: keyword arguments for the encoding process.
:return: yields a 3-tuple (text, content, attributes) containing the encoded parts, \
eventually preceded by a sequence of validation or decoding errors.
:return: yields an Element, possibly preceded by a sequence of \
validation or encoding errors.
"""
for result in self.attributes.iter_encode(element_data.attributes, validation, **kwargs):
if isinstance(result, XMLSchemaValidationError):
yield result
else:
attributes = result
break
else:
attributes = ()
name, value = obj
xsd_element = self.schema.create_element(name=name)
xsd_element.type = self

if self.has_simple_content():
if element_data.text is None:
yield None, element_data.content, attributes
else:
for result in self.content_type.iter_encode(element_data.text, validation, **kwargs):
if isinstance(result, XMLSchemaValidationError):
yield result
else:
yield result, element_data.content, attributes
else:
for result in self.content_type.iter_encode(element_data, validation, **kwargs):
if isinstance(result, XMLSchemaValidationError):
if isinstance(value, list):
for item in value:
for result in xsd_element.iter_encode(item, validation, **kwargs):
yield result
elif result:
yield result[0], result[1], attributes
else:
yield None, None, attributes
else:
for result in xsd_element.iter_encode(value, validation, **kwargs):
yield result


class Xsd11ComplexType(XsdComplexType):
Expand Down
3 changes: 1 addition & 2 deletions xmlschema/validators/elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -530,8 +530,7 @@ def iter_decode(self, elem, validation='lax', **kwargs):
if XSI_TYPE in elem.attrib:
type_name = elem.attrib[XSI_TYPE].strip()
try:
nsmap = getattr(elem, 'nsmap', namespaces)
xsd_type = self.maps.get_instance_type(type_name, xsd_type, namespaces=nsmap)
xsd_type = self.maps.get_instance_type(type_name, xsd_type, namespaces)
except (KeyError, TypeError) as err:
yield self.validation_error(validation, err, elem, **kwargs)

Expand Down
36 changes: 22 additions & 14 deletions xmlschema/validators/groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,10 @@
})


def not_whitespace(s):
    """
    Tell whether a text value carries non-whitespace characters.

    :param s: a string, or `None` when the text is absent.
    :return: `True` if *s* has at least one non-whitespace character, \
    `False` for `None`, an empty string or a whitespace-only string.
    """
    # Wrap in bool() so the predicate never leaks None, '' or the stripped text
    # to callers that store or compare the result.
    return bool(s and s.strip())


class XsdGroup(XsdComponent, ModelGroup, ValidationMixin):
"""
Class for XSD 1.0 *model group* definitions.
Expand Down Expand Up @@ -567,9 +571,6 @@ def iter_decode(self, elem, validation='lax', **kwargs):
:return: yields a list of 3-tuples (key, decoded data, decoder), \
possibly preceded by a sequence of validation or decoding errors.
"""
def not_whitespace(s):
return s is not None and s.strip()

result_list = []
cdata_index = 1 # keys for CDATA sections are positive integers

Expand Down Expand Up @@ -698,13 +699,9 @@ def iter_encode(self, element_data, validation='lax', **kwargs):
(key, decoded data, decoder), possibly preceded by a sequence of validation \
or encoding errors.
"""
if not element_data.content: # <tag/> or <tag></tag>
yield element_data.content
return

level = kwargs['level'] = kwargs.get('level', 0) + 1
errors = []
text = None
text = element_data.text
children = []
try:
indent = kwargs['indent']
Expand All @@ -720,11 +717,14 @@ def iter_encode(self, element_data, validation='lax', **kwargs):

default_namespace = converter.get('')
model = ModelVisitor(self)
cdata_index = 0
index = cdata_index = 0
wrong_content_type = False

if isinstance(element_data.content, dict) or kwargs.get('unordered'):
content = model.iter_unordered_content(element_data.content)
elif not isinstance(element_data.content, list):
if element_data.content is not None:
wrong_content_type = True
content = []
elif converter.losslessly:
content = element_data.content
Expand Down Expand Up @@ -785,30 +785,38 @@ def iter_encode(self, element_data, validation='lax', **kwargs):
children.append(result)

if model.element is not None:
index = len(element_data.content) - cdata_index
for particle, occurs, expected in model.stop():
errors.append((index, particle, occurs, expected))
errors.append((index - cdata_index, particle, occurs, expected))

if children:
if children[-1].tail is None:
children[-1].tail = padding[:-indent] or '\n'
else:
children[-1].tail = children[-1].tail.strip() + (padding[:-indent] or '\n')

if validation != 'skip' and (errors or not content):
cdata_not_allowed = not self.mixed and not_whitespace(text) and self and \
(len(self) > 1 or not isinstance(self[0], XsdAnyElement))

if validation != 'skip' and (errors or cdata_not_allowed or wrong_content_type):
attrib = {k: unicode_type(v) for k, v in element_data.attributes.items()}
if validation == 'lax' and converter.etree_element_class is not etree_element:
child_tags = [converter.etree_element(e.tag, attrib=e.attrib) for e in children]
elem = converter.etree_element(element_data.tag, text, child_tags, attrib)
else:
elem = converter.etree_element(element_data.tag, text, children, attrib)

if not content:
if wrong_content_type:
reason = "wrong content type {!r}".format(type(element_data.content))
yield self.validation_error(validation, reason, elem, **kwargs)

if cdata_not_allowed:
reason = "character data between child elements not allowed"
yield self.validation_error(validation, reason, elem, **kwargs)

for index, particle, occurs, expected in errors:
yield self.children_validation_error(validation, elem, index, particle, occurs, expected, **kwargs)
yield self.children_validation_error(
validation, elem, index, particle, occurs, expected, **kwargs
)

yield text, children

Expand Down
12 changes: 7 additions & 5 deletions xmlschema/validators/wildcards.py
Original file line number Diff line number Diff line change
Expand Up @@ -440,9 +440,10 @@ def iter_decode(self, elem, validation='lax', **kwargs):
xsd_element = self.schema.create_element(name=elem.tag)
for result in xsd_element.iter_decode(elem, validation, **kwargs):
yield result
elif validation == 'skip':
yield self.any_type.decode(elem) if len(elem) > 0 else elem.text
elif self.process_contents == 'strict':
elif validation == 'skip' or self.process_contents == 'lax':
for result in self.any_type.iter_decode(elem, validation, **kwargs):
yield result
else:
reason = "element %r not found." % elem.tag
yield self.validation_error(validation, reason, elem, **kwargs)
else:
Expand Down Expand Up @@ -472,8 +473,9 @@ def iter_encode(self, obj, validation='lax', **kwargs):
try:
xsd_element = self.maps.lookup_element(name)
except LookupError:
if validation == 'skip':
yield self.any_type.encode(value)
if validation == 'skip' or self.process_contents == 'lax':
for result in self.any_type.iter_encode(obj, validation, **kwargs):
yield result
elif self.process_contents == 'strict':
reason = "element %r not found." % name
yield self.validation_error(validation, reason, **kwargs)
Expand Down

0 comments on commit 2281aef

Please sign in to comment.