Skip to content

Commit ec209c6

Browse files
authored
Remove IDs from HTML code (#4012)
In this pull request, the parent-child relationship for elements generated with the v2 parser is based on actual element IDs instead of IDs baked somewhere into the HTML code. Together with some extra bug fixing, this allowed the JSON -> HTML script to be simplified significantly.
1 parent b6ab471 commit ec209c6

18 files changed

+1098
-1100
lines changed

CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,12 @@
1+
## 0.17.11-dev0
2+
3+
### Enhancements
4+
5+
### Features
6+
7+
### Fixes
8+
- Invalid element IDs are no longer visible in VLM output. The parent-child hierarchy is now retrieved based on the unstructured element ID instead of the ID injected into the element's HTML code.
9+
110
## 0.17.10
211
- Drop Python 3.9 support as it reaches EOL in October 2025
312
- Update pip-compile script to use Python 3.10 and newer

scripts/html/rendered_html_from_elements.py

Lines changed: 5 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -10,16 +10,12 @@
1010
"""
1111

1212
import argparse
13+
import html
1314
import logging
1415
import os
1516
import select
1617
import sys
17-
from collections import defaultdict
18-
from typing import List, Sequence
1918

20-
from bs4 import BeautifulSoup
21-
22-
from unstructured.documents import elements
2319
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
2420
from unstructured.staging.base import elements_from_json
2521

@@ -28,48 +24,6 @@
2824
logger = logging.getLogger(__name__)
2925

3026

31-
def extract_document_div(html_content: str) -> str:
32-
pos = html_content.find(">")
33-
if pos != -1:
34-
return html_content[: pos + 1]
35-
logger.error("No '>' found in the HTML content.")
36-
raise ValueError("No '>' found in the HTML content.")
37-
38-
39-
def extract_page_div(html_content: str) -> str:
40-
soup = BeautifulSoup(html_content, "html.parser")
41-
page_divs = soup.find_all("div", class_="Page")
42-
if len(page_divs) != 1:
43-
logger.error(
44-
"Expected exactly one <div> element with class 'Page'. Found %d.", len(page_divs)
45-
)
46-
raise ValueError("Expected exactly one <div> element with class 'Page'.")
47-
return str(page_divs[0])
48-
49-
50-
def fold_document_div(
51-
html_document_start: str, html_document_end: str, html_per_page: List[str]
52-
) -> str:
53-
html_document = html_document_start
54-
for page_html in html_per_page:
55-
html_document += page_html
56-
html_document += html_document_end
57-
return html_document
58-
59-
60-
def group_elements_by_page(
61-
unstructured_elements: Sequence[elements.Element],
62-
) -> Sequence[Sequence[elements.Element]]:
63-
pages_dict = defaultdict(list)
64-
65-
for element in unstructured_elements:
66-
page_number = element.metadata.page_number
67-
pages_dict[page_number].append(element)
68-
69-
pages_list = list(pages_dict.values())
70-
return pages_list
71-
72-
7327
def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
7428
"""Renders HTML from a JSON file with unstructured elements.
7529
@@ -91,18 +45,10 @@ def rendered_html(*, filepath: str | None = None, text: str | None = None) -> st
9145
logger.info("Rendering HTML from text.")
9246

9347
unstructured_elements = elements_from_json(filename=filepath, text=text)
94-
unstructured_elements_per_page = group_elements_by_page(unstructured_elements)
95-
# parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
96-
parsed_ontology_per_page = [
97-
unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page
98-
]
99-
html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page]
100-
101-
html_document_start = extract_document_div(html_per_page[0])
102-
html_document_end = "</div>"
103-
html_per_page = [extract_page_div(page) for page in html_per_page]
104-
105-
return fold_document_div(html_document_start, html_document_end, html_per_page)
48+
ontology_root = unstructured_elements_to_ontology(unstructured_elements)
49+
html_document = ontology_root.to_html()
50+
unescaped_html = html.unescape(html_document)
51+
return unescaped_html
10652

10753

10854
def _main():

test_unstructured/documents/html_files/example.html

Lines changed: 23 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,53 +1,53 @@
1-
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
2-
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
3-
<header class="Header" id="45b3d0053468484ba1c7b53998115412">
4-
<h1 class="Title" id="c95473e8a3704fc2b418697f9fddb27b">
1+
<body class="Document">
2+
<div class="Page" data-page-number="1">
3+
<header class="Header">
4+
<h1 class="Title">
55
Header
66
</h1>
7-
<time class="CalendarDate" id="379cbfdc16d44bd6a59e6cfabe6438d5">
7+
<time class="CalendarDate">
88
Date: October 30, 2023
99
</time>
1010
</header>
11-
<form class="Form" id="637c2f6935fb4353a5f73025ce04619d">
12-
<label class="FormField" for="company-name" id="50027cccbe1948c9853ce0de37b635c2">
11+
<form class="Form">
12+
<label class="FormField" for="company-name">
1313
From field name
1414
</label>
15-
<input class="FormFieldValue" id="0032242af75c4b37984ea7fea9aac74c" value="Example value"/>
15+
<input class="FormFieldValue" value="Example value"/>
1616
</form>
17-
<section class="Section" id="592422373ed741b68a077e2003f8ed81">
18-
<table class="Table" id="dc3792d4422e444f90876b56d0cfb20d">
19-
<thead class="TableHeader" id="50a5548a87e84024af590b3d2830d140">
20-
<tr class="TableRow" id="5e473d7742474412be72dc4e2c45bd4a">
21-
<th class="TableCellHeader" id="01800309aa42411c98ae30f85b23f399">
17+
<section class="Section">
18+
<table class="Table">
19+
<thead class="TableHeader">
20+
<tr class="TableRow">
21+
<th class="TableCellHeader">
2222
Description
2323
</th>
24-
<th class="TableCellHeader" id="c2765b63d08946a2851955e79e301de4">
24+
<th class="TableCellHeader">
2525
Row header
2626
</th>
2727
</tr>
2828
</thead>
29-
<tbody class="TableBody" id="e0a9a8ffdd7148ad8b4a274b073d340a">
30-
<tr class="TableRow" id="77e829974632455191330b0b8545d1e3">
31-
<td class="TableCell" id="7fee12d4c5554b7da778d6f8fdec8a57">
29+
<tbody class="TableBody">
30+
<tr class="TableRow">
31+
<td class="TableCell">
3232
Value description
3333
</td>
34-
<td class="TableCell" id="5a7a33b0c57b4eb881a35bce9f87c831">
35-
<span class="Currency" id="87220f9d62c3482e92e7de72a26869cd">
34+
<td class="TableCell">
35+
<span class="Currency">
3636
50 $
3737
</span>
38-
<span class="Measurement" id="0095b9efb90a4cca991e73547c7165f1">
38+
<span class="Measurement">
3939
(1.32 %)
4040
</span>
4141
</td>
4242
</tr>
4343
</tbody>
4444
</table>
4545
</section>
46-
<section class="Section" id="1032242af75c4b37984ea7fea9aac74c">
47-
<h2 class="Subtitle" id="2a4e2c4a689f4f9a8c180b6b521e45c3">
46+
<section class="Section">
47+
<h2 class="Subtitle">
4848
2. Subtitle
4949
</h2>
50-
<p class="NarrativeText" id="5591f7a4df01447e82515ce45f686fbe">
50+
<p class="NarrativeText">
5151
Paragraph text
5252
</p>
5353
</section>

0 commit comments

Comments
 (0)