Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Markdown table fixes #601

Merged
merged 7 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1082,6 +1082,21 @@ def test_table_processing():
htmlstring = '<html><body><article><table><tr><th>head 1</th><th>head 2</th></tr><tr><td>1</td><td>2</td></tr></table></article></body></html>'
assert "---|---|" in extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)

# remove new lines in table cells in text format
htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "cell 1 | cell 2 |" in result

# only one header row is allowed in text format
htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert result.count("---|") == 2

# handle colspan by appending columns in text format
htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
assert "a | b | |" in result


def test_list_processing():
options = DEFAULT_OPTIONS
Expand Down
24 changes: 19 additions & 5 deletions trafilatura/main_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -335,32 +335,43 @@ def handle_paragraphs(element, potential_tags, options):
return None


def define_cell_type(element):
def define_cell_type(element, is_header):
"Determine cell element type and mint new element."
# define tag
cell_element = Element("cell")
if element.tag == "th":
if is_header:
cell_element.set("role", "head")
return cell_element


def handle_table(table_elem, potential_tags, options):
"Process single table element."
newtable = Element("table")
newrow = Element("row")

# strip these structural elements
strip_tags(table_elem, "thead", "tbody", "tfoot")

# calculate maximum number of columns per row, including colspan
max_cols = 0
for tr in table_elem.iter('tr'):
max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))

# explore sub-elements
seen_header_row = False
seen_header = False
row_attrs = {"span": str(max_cols)} if max_cols > 1 else {}
newrow = Element("row", **row_attrs)
for subelement in table_elem.iterdescendants():
if subelement.tag == "tr":
# process existing row
if len(newrow) > 0:
newtable.append(newrow)
newrow = Element("row")
newrow = Element("row", **row_attrs)
seen_header_row = seen_header_row or seen_header
elif subelement.tag in TABLE_ELEMS:
new_child_elem = define_cell_type(subelement)
is_header = subelement.tag == "th" and not seen_header_row
seen_header = seen_header or is_header
new_child_elem = define_cell_type(subelement, is_header)
# process
if len(subelement) == 0:
processed_cell = process_node(subelement, options)
Expand Down Expand Up @@ -398,6 +409,9 @@ def handle_table(table_elem, potential_tags, options):
# cleanup
subelement.tag = "done"

# clean up row attributes
newrow.attrib.pop("span", None)

# end of processing
if len(newrow) > 0:
newtable.append(newrow)
Expand Down
32 changes: 24 additions & 8 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
**{tag: '\n' for tag in ['code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table']}
}
SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
WITH_ATTRIBUTES = {'cell', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}

NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}

Expand Down Expand Up @@ -217,6 +217,7 @@ def replace_element_text(element, include_formatting):
"Determine element text based on just the text of the element. One must deal with the tail separately."
elem_text = element.text or ""
# handle formatting: convert to markdown
children = element.getchildren()
if include_formatting and element.text:
if element.tag == "head":
try:
Expand Down Expand Up @@ -247,6 +248,9 @@ def replace_element_text(element, include_formatting):
elem_text = link_text
else:
LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
# cells
if element.tag == "cell" and elem_text and children and children[0].tag == 'p':
elem_text = f"{elem_text} "
# lists
elif element.tag == "item" and elem_text:
elem_text = f"- {elem_text}\n"
Expand Down Expand Up @@ -291,21 +295,33 @@ def process_element(element, returnlist, include_formatting):
returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
# newlines for textless elements
if element.tag in NEWLINE_ELEMS:
returnlist.append('\n')
# add line after table head
if element.tag == "row":
num_cells = len(element.xpath("./cell[@role='head']"))
if num_cells > 0:
returnlist.append("---|" * len(element.xpath(".//cell")) + "\n")
return # Nothing more to do with textless elements
max_span = int(element.get("span", 1))
cell_count = len(element.xpath(".//cell"))
# row ended so draw extra empty cells to match max_span
returnlist.append("|" * (max_span - cell_count) + "\n")
# if this is a head row, draw the separator below
is_head = bool(element.xpath("./cell[@role='head']"))
if is_head:
returnlist.append("\n" + "---|" * max_span + "\n")
else:
returnlist.append('\n')
if element.tag != 'cell':
# cells still need to append vertical bars
# but nothing more to do with other textless elements
return

# Process text

# Common elements (Now processes end-tag logic correctly)
within_cell = element.xpath("ancestor::cell")
if element.tag == 'p' and include_formatting:
returnlist.append('\n\u2424\n')
if not within_cell:
adbar marked this conversation as resolved.
Show resolved Hide resolved
returnlist.append('\n\u2424\n')
elif element.tag in NEWLINE_ELEMS:
returnlist.extend([NEWLINE_ELEMS[element.tag], '\n'])
if not within_cell:
returnlist.extend([NEWLINE_ELEMS[element.tag], '\n'])
elif element.tag == 'cell':
returnlist.extend(" | ")
elif element.tag == 'comments':
Expand Down
Loading