Skip to content

Commit 82c48e0

Browse files
committed
Add a function to work around the truncated XML element text property:
- The lxml element text property only returns the first text child node and is split by embedded comments. - Use xpath('text()') and a string join to get the full list of text parts returned for the entire element. - Misc whitespace and spelling corrections.
1 parent 508f6db commit 82c48e0

File tree

2 files changed

+38
-22
lines changed

2 files changed

+38
-22
lines changed

compare_wef.py

Lines changed: 38 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@
6868
}
6969
m_include = [m_choices_map[i] for i in args.metadata]
7070
if args.metadata_full_description and 'description' not in args.metadata:
71-
logger.warning(f'\'description\' was not given with \'--metadata\', but \'--metadata-full-description\' was given, so description metadata will be inlcuded in full.')
71+
logger.warning(f'\'description\' was not given with \'--metadata\', but \'--metadata-full-description\' was given, so description metadata will be included in full.')
7272
m_include.append('Description')
7373

7474
## panda's options
@@ -109,7 +109,7 @@ def get_provider_metadata(log_path, provider=None):
109109
- Log path required
110110
- Provider optional
111111
"""
112-
112+
113113
metadata_columns = ['Id', 'Keywords.Name', 'Levels.Name', 'Tasks.Name', 'Opcodes.Name', 'ProviderName']
114114
metadata_lookup = None
115115
if provider:
@@ -232,6 +232,21 @@ def get_event_id_list(xpath):
232232
return event_ids
233233

234234

235+
def xml_element_get_all_text(element: lxml.etree.Element):
236+
"""
237+
Function to get all text, not just first child:
238+
239+
- The .text property of elements from lxml only includes the first text part and is split by embedded comments, causing <Element>.text to produce truncated/partial text.
240+
- Instead of <Element>.text, using <Element>.xpath('text()') is safer and will get a list of all text.
241+
- Extending lxml etree BaseElement class was not used due to:
242+
- Complexity where one cannot simply extend a class without interacting with the element tree. See: https://lxml.de/element_classes.html.
243+
- Extending the class injects a parent node element and shifts the current element, with its properties, to be a child element of the extended element.
244+
"""
245+
246+
#return ''.join([t for t in element.itertext()])
247+
return ''.join(element.xpath('text()'))
248+
249+
235250
def get_queries(xml_query_list):
236251
"""
237252
Extract elements and metadata from a windows event XML query list
@@ -243,6 +258,7 @@ def get_queries(xml_query_list):
243258
for x_query in x_query_list:
244259
# NOTE:
245260
# - The Path (log name) might be specified in either the query attributes (along with the query ID) or in the XPath definition.
261+
# - Element text is read with xml_element_get_all_text() rather than the standard text property, because the text property truncates text after any embedded comments.
246262
# - TODO: It's undetermined if both the XPath and the query can simultaneously be set and are allowed to be different or must be consistent.
247263
# - TODO: Collected comments can get disassociated from the nearby select or suppress statement they annotate, so it's less useful for large/complex query IDs.
248264
queries.append(
@@ -251,19 +267,19 @@ def get_queries(xml_query_list):
251267
'Attributes': dict(x_query.attrib),
252268
'Selections': [
253269
{ 'Path': str(s.xpath('@Path')[0]),
254-
'XPath': s.text,
255-
'Providers': re_xpath_provider.findall(s.text),
256-
'Levels': re_xpath_level.findall(s.text),
257-
'EventIDs': get_event_id_list(s.text)
270+
'XPath': xml_element_get_all_text(s),
271+
'Providers': re_xpath_provider.findall(xml_element_get_all_text(s)),
272+
'Levels': re_xpath_level.findall(xml_element_get_all_text(s)),
273+
'EventIDs': get_event_id_list(xml_element_get_all_text(s))
258274
}
259275
for s in x_query.xpath('./Select')
260276
],
261277
'Suppressions': [
262278
{ 'Path': str(s.xpath('@Path')[0]),
263-
'XPath': s.text,
264-
'Providers': re_xpath_provider.findall(s.text),
265-
'Levels': re_xpath_level.findall(s.text),
266-
'EventIDs': get_event_id_list(s.text)
279+
'XPath': xml_element_get_all_text(s),
280+
'Providers': re_xpath_provider.findall(xml_element_get_all_text(s)),
281+
'Levels': re_xpath_level.findall(xml_element_get_all_text(s)),
282+
'EventIDs': get_event_id_list(xml_element_get_all_text(s))
267283
} for s in x_query.xpath('./Suppress')
268284
]
269285
}
@@ -281,48 +297,48 @@ def get_subscription_query_list_xml(xml_file_path):
281297
# XPath selection requires the namespace to be used!
282298
x_subscription_query = x_subscription.xpath('/ns:Subscription/ns:Query', namespaces={'ns': 'http://schemas.microsoft.com/2006/03/windows/events/subscription'})
283299
assert len(x_subscription_query) == 1, f"Unexpected number of elements matched by XPath query '/ns:Subscription/ns:Query' for file {xml_file_path}."
284-
s_x_query = x_subscription_query[0].text
300+
s_x_query = xml_element_get_all_text(x_subscription_query[0])
285301
return s_x_query
286302

287303

288304
def enum_query_combinations(enum, s_file, q_id, q_parent_path, q_type, q):
289305
"""
290306
Enumerate combinations of select or suppress sub-query elements and propagate references to the deepest level of event specificity.
291307
During enumeration, event and provider metadata lookups are done and used to increase the specificity.
292-
308+
293309
enum: dict object passed as a reference to add enumerated events and references to.
294310
s_file: subscription file name to reference
295311
q_id: Query ID of the XML element
296312
q_parent_path: Path attribute in the Query element
297313
q_type: Query type element, either Select or Suppress
298314
q_xpath: XPath data/text within Select or Suppress element
299-
315+
300316
Note, a pseudo-hierarchy of event query specificity, and related reference level, is as follows:
301-
317+
302318
Paths:
303319
Path: required
304320
Providers
305321
Provider: optional
306322
Events:
307323
Event ID: optional
308324
(Reference at this level)
309-
325+
310326
When query parsing and metadata lookups fail to resolve a specific provider(s) or event ID(s), a single null node is created.
311-
327+
312328
NOTE: Query specificity:
313329
- A query could just select an Event directly from a Path without specifying the Provider.
314330
- When an Event ID or Level is selected without a Provider, ambiguity results and multiple Providers and Events are in scope.
315331
- With the Security log Path/Channel, the Microsoft-Windows-Security-Auditing is the most common provider producing into this channel.
316332
- The Path attribute can be stipulated at query level and the sub-query Select or Suppress level. The deeper Select/Suppress level is assumed to take precedence.
317-
333+
318334
NOTE: Query metadata lookup failure reasons:
319335
- FIXME: Regex to extract event provider, event ID, or level failed to match properly.
320336
- Query did not select a valid/defined event or provider.
321337
- Source system metadata extract lacked provider or event manifests.
322338
323339
- FIXME: This function has become too complex and bloated. Perhaps it should be simplified by refactoring it to use objects that compartmentalise and abstract the complexity.
324340
"""
325-
341+
326342
# Assume the sub-level <Select Path=...> or <Suppress Path=...> attribute will take precedence over the parent <Query Path=...> attribute when choosing a channel
327343
q_path = q['Path']
328344
if not q_path:
@@ -333,10 +349,10 @@ def enum_query_combinations(enum, s_file, q_id, q_parent_path, q_type, q):
333349
q_path = q_parent_path
334350
elif q_parent_path and q_parent_path != q_path:
335351
logger.warning(f"Contradicting query Path in '{s_file}'. Query ID {q_id}'s the child path attribute, <{q_type} Path='{q_path}'>, does not match the parent path attribute, <Query Path='{q_parent_path}'>. The child Path will take precadence.")
336-
352+
337353
# XPath extractions and permutations
338354
q_xpath = q['XPath']
339-
# NOTE:
355+
# NOTE:
340356
# - itertools.product or the list comprehension building a tuple from nested iterations will return an empty list if one of the iterations/lists input is empty
341357
# - Avoid this by replacing empty lists with a non-empty list and the single null/None value to force the product to expand out the empty sets
342358
# - FIXME: expansion is only based on Providers, Event IDs and Levels and this will fail to expand on queries that use other event attributes such as Keywords, Tasks, Opcodes or Event Data.
@@ -644,7 +660,7 @@ def enum_query_combinations(enum, s_file, q_id, q_parent_path, q_type, q):
644660
} for k_event_id,v_event in v_events.items()
645661
]
646662
} for k_provider,v_events in v_providers.items()
647-
]
663+
]
648664
} for k_path,v_providers in query_combinations.items()
649665
]
650666
}
@@ -679,7 +695,7 @@ def enum_query_combinations(enum, s_file, q_id, q_parent_path, q_type, q):
679695
reference_dirs = ['microsoft', 'nsacyber', 'palantir']
680696
if len(custom_subscription_list) > 0:
681697
reference_dirs.append('custom')
682-
698+
683699
# Append expanded reference columns
684700
for i, r_dir in enumerate(reference_dirs):
685701
# Boolean to list if related to core reference or not

custom/custom_eg.xml

510 Bytes
Binary file not shown.

0 commit comments

Comments
 (0)