adbar · adbar · May 30, 2024 · May 17, 2024 · May 17, 2024 · May 17, 2024
diff --git a/tests/unit_tests.py b/tests/unit_tests.py
@@ -1082,6 +1082,21 @@ def test_table_processing():
  htmlstring = '<html><body><article><table><tr><th>head 1</th><th>head 2</th></tr><tr><td>1</td><td>2</td></tr></table></article></body></html>'
  assert "---|---|" in extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
 
+ # remove new lines in table cells in text format
+ htmlstring = '<html><body><article><table><tr><td>cell<br>1</td><td>cell<p>2</p></td></tr></table></article></body></html>'
+ result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+ assert "cell 1 | cell 2 |" in result
+
+ # only one header row is allowed in text format
+ htmlstring = '<html><body><article><table><tr><th>a</th><th>b</th></tr><tr><th>c</th><th>d</th></tr></table></article></body></html>'
+ result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+ assert result.count("---|") == 2
+
+ # handle colspan by appending columns in text format
+ htmlstring = '<html><body><article><table><tr><td colspan="2">a</td><td>b</td></tr><tr><td>c</td><td>d</td><td>e</td></tr></table></article></body></html>'
+ result = extract(htmlstring, no_fallback=True, output_format='txt', config=ZERO_CONFIG, include_tables=True)
+ assert "a | b | |" in result
+
 
 def test_list_processing():
  options = DEFAULT_OPTIONS

diff --git a/trafilatura/main_extractor.py b/trafilatura/main_extractor.py
@@ -335,32 +335,43 @@ def handle_paragraphs(element, potential_tags, options):
  return None
 
 
-def define_cell_type(element):
+def define_cell_type(element, is_header):
  "Determine cell element type and mint new element."
  # define tag
  cell_element = Element("cell")
- if element.tag == "th":
+ if is_header:
  cell_element.set("role", "head")
  return cell_element
 
 
 def handle_table(table_elem, potential_tags, options):
  "Process single table element."
  newtable = Element("table")
- newrow = Element("row")
 
  # strip these structural elements
  strip_tags(table_elem, "thead", "tbody", "tfoot")
 
+ # calculate maximum number of columns per row, including colspan
+ max_cols = 0
+ for tr in table_elem.iter('tr'):
+ max_cols = max(max_cols, sum(int(td.get("colspan", 1)) for td in tr.iter(TABLE_ELEMS)))
+
  # explore sub-elements
+ seen_header_row = False
+ seen_header = False
+ row_attrs = {"span": str(max_cols)} if max_cols > 1 else {}
+ newrow = Element("row", **row_attrs)
  for subelement in table_elem.iterdescendants():
  if subelement.tag == "tr":
  # process existing row
  if len(newrow) > 0:
  newtable.append(newrow)
- newrow = Element("row")
+ newrow = Element("row", **row_attrs)
+ seen_header_row = seen_header_row or seen_header
  elif subelement.tag in TABLE_ELEMS:
- new_child_elem = define_cell_type(subelement)
+ is_header = subelement.tag == "th" and not seen_header_row
+ seen_header = seen_header or is_header
+ new_child_elem = define_cell_type(subelement, is_header)
  # process
  if len(subelement) == 0:
  processed_cell = process_node(subelement, options)
@@ -398,6 +409,9 @@ def handle_table(table_elem, potential_tags, options):
  # cleanup
  subelement.tag = "done"
 
+ # clean up row attributes
+ newrow.attrib.pop("span", None)
+
  # end of processing
  if len(newrow) > 0:
  newtable.append(newrow)

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -42,7 +42,7 @@
  **{tag: '\n' for tag in ['code', 'graphic', 'head', 'lb', 'list', 'p', 'quote', 'row', 'table']}
 }
 SPECIAL_FORMATTING = {'del', 'head', 'hi', 'ref'}
-WITH_ATTRIBUTES = {'cell', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
+WITH_ATTRIBUTES = {'cell', 'row', 'del', 'graphic', 'head', 'hi', 'item', 'list', 'ref'}
 
 NESTING_WHITELIST = {"cell", "figure", "item", "note", "quote"}
 
@@ -217,6 +217,7 @@ def replace_element_text(element, include_formatting):
  "Determine element text based on just the text of the element. One must deal with the tail separately."
  elem_text = element.text or ""
  # handle formatting: convert to markdown
+ children = element.getchildren()
  if include_formatting and element.text:
  if element.tag == "head":
  try:
@@ -247,6 +248,9 @@ def replace_element_text(element, include_formatting):
  elem_text = link_text
  else:
  LOGGER.warning("empty link: %s %s", elem_text, element.attrib)
+ # cells
+ if element.tag == "cell" and elem_text and children and children[0].tag == 'p':
+ elem_text = f"{elem_text} "
  # lists
  elif element.tag == "item" and elem_text:
  elem_text = f"- {elem_text}\n"
@@ -291,21 +295,33 @@ def process_element(element, returnlist, include_formatting):
  returnlist.extend(['![', text.strip(), ']', '(', element.get('src', ''), ')'])
  # newlines for textless elements
  if element.tag in NEWLINE_ELEMS:
- returnlist.append('\n')
  # add line after table head
  if element.tag == "row":
- num_cells = len(element.xpath("./cell[@role='head']"))
- if num_cells > 0:
- returnlist.append("---|" * len(element.xpath(".//cell")) + "\n")
- return # Nothing more to do with textless elements
+ max_span = int(element.get("span", 1))
+ cell_count = len(element.xpath(".//cell"))
+ # row ended so draw extra empty cells to match max_span
+ returnlist.append("|" * (max_span - cell_count) + "\n")
+ # if this is a head row, draw the separator below
+ is_head = bool(element.xpath("./cell[@role='head']"))
+ if is_head:
+ returnlist.append("\n" + "---|" * max_span + "\n")
+ else:
+ returnlist.append('\n')
+ if element.tag != 'cell':
+ # cells still need to append vertical bars
+ # but nothing more to do with other textless elements
+ return
 
  # Process text
 
  # Common elements (Now processes end-tag logic correctly)
+ within_cell = element.xpath("ancestor::cell")
  if element.tag == 'p' and include_formatting:
- returnlist.append('\n\u2424\n')
+ if not within_cell:
+ returnlist.append('\n\u2424\n')
  elif element.tag in NEWLINE_ELEMS:
- returnlist.extend([NEWLINE_ELEMS[element.tag], '\n'])
+ if not within_cell:
+ returnlist.extend([NEWLINE_ELEMS[element.tag], '\n'])
  elif element.tag == 'cell':
  returnlist.extend(" | ")
  elif element.tag == 'comments':