Skip to content

Remove IDs from HTML code #4012

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
Jun 11, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,12 @@
## 0.17.11-dev0

### Enhancements

### Features

### Fixes
- Invalid element IDs are no longer visible in VLM output. The parent-child hierarchy is now retrieved based on the unstructured element ID instead of the ID injected into the element's HTML code.

## 0.17.10
- Drop Python 3.9 support as it reaches EOL in October 2025
- Update pip-compile script to use Python 3.10 and newer
Expand Down
64 changes: 5 additions & 59 deletions scripts/html/rendered_html_from_elements.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,12 @@
"""

import argparse
import html
import logging
import os
import select
import sys
from collections import defaultdict
from typing import List, Sequence

from bs4 import BeautifulSoup

from unstructured.documents import elements
from unstructured.partition.html.transformations import unstructured_elements_to_ontology
from unstructured.staging.base import elements_from_json

Expand All @@ -28,48 +24,6 @@
logger = logging.getLogger(__name__)


def extract_document_div(html_content: str) -> str:
    """Return the prefix of ``html_content`` up to and including the first ``>``.

    Args:
        html_content: HTML text whose leading tag should be extracted.

    Returns:
        Everything from the start of ``html_content`` through the first ``>``
        character, inclusive.

    Raises:
        ValueError: If ``html_content`` contains no ``>`` character.
    """
    head, closing_bracket, _ = html_content.partition(">")
    if not closing_bracket:
        # partition() yields an empty separator when '>' is absent.
        logger.error("No '>' found in the HTML content.")
        raise ValueError("No '>' found in the HTML content.")
    return head + closing_bracket


def extract_page_div(html_content: str) -> str:
    """Return the single ``<div class="Page">`` element found in ``html_content``.

    The content is parsed with BeautifulSoup's ``html.parser`` and searched for
    ``div`` elements carrying the ``Page`` class.

    Args:
        html_content: HTML text expected to contain exactly one page div.

    Returns:
        The matched page div serialized back to a string.

    Raises:
        ValueError: If the number of matching divs is anything other than one.
    """
    matches = BeautifulSoup(html_content, "html.parser").find_all("div", class_="Page")
    if len(matches) == 1:
        return str(matches[0])
    logger.error(
        "Expected exactly one <div> element with class 'Page'. Found %d.", len(matches)
    )
    raise ValueError("Expected exactly one <div> element with class 'Page'.")


def fold_document_div(
    html_document_start: str, html_document_end: str, html_per_page: List[str]
) -> str:
    """Assemble a full HTML document from its opening, per-page, and closing parts.

    Args:
        html_document_start: Text placed at the very beginning of the result.
        html_document_end: Text placed at the very end of the result.
        html_per_page: Page fragments, concatenated in the given order between
            the start and end parts. May be empty.

    Returns:
        ``html_document_start`` followed by every page fragment and then
        ``html_document_end``, as one string.
    """
    # A single join avoids the repeated re-allocation of ``+=`` inside a loop.
    return "".join([html_document_start, *html_per_page, html_document_end])


def group_elements_by_page(
    unstructured_elements: Sequence[elements.Element],
) -> Sequence[Sequence[elements.Element]]:
    """Bucket elements by their ``metadata.page_number``.

    Args:
        unstructured_elements: Elements to group; each is read via
            ``element.metadata.page_number``.

    Returns:
        A list of element lists, one per distinct page number, ordered by the
        first appearance of each page number in the input. Elements within a
        group keep their input order.
    """
    grouped = {}
    for item in unstructured_elements:
        # Dicts preserve insertion order, so groups come out in first-seen order.
        grouped.setdefault(item.metadata.page_number, []).append(item)
    return [*grouped.values()]


def rendered_html(*, filepath: str | None = None, text: str | None = None) -> str:
"""Renders HTML from a JSON file with unstructured elements.

Expand All @@ -91,18 +45,10 @@ def rendered_html(*, filepath: str | None = None, text: str | None = None) -> st
logger.info("Rendering HTML from text.")

unstructured_elements = elements_from_json(filename=filepath, text=text)
unstructured_elements_per_page = group_elements_by_page(unstructured_elements)
# parsed_ontology = unstructured_elements_to_ontology(unstructured_elements)
parsed_ontology_per_page = [
unstructured_elements_to_ontology(elements) for elements in unstructured_elements_per_page
]
html_per_page = [parsed_ontology.to_html() for parsed_ontology in parsed_ontology_per_page]

html_document_start = extract_document_div(html_per_page[0])
html_document_end = "</div>"
html_per_page = [extract_page_div(page) for page in html_per_page]

return fold_document_div(html_document_start, html_document_end, html_per_page)
ontology_root = unstructured_elements_to_ontology(unstructured_elements)
html_document = ontology_root.to_html()
unescaped_html = html.unescape(html_document)
return unescaped_html


def _main():
Expand Down
46 changes: 23 additions & 23 deletions test_unstructured/documents/html_files/example.html
Original file line number Diff line number Diff line change
@@ -1,53 +1,53 @@
<body class="Document" id="897a8a47377c4ad6aab839a929879537">
<div class="Page" data-page-number="1" id="3a6b156a81764e17be128264241f8136">
<header class="Header" id="45b3d0053468484ba1c7b53998115412">
<h1 class="Title" id="c95473e8a3704fc2b418697f9fddb27b">
<body class="Document">
<div class="Page" data-page-number="1">
<header class="Header">
<h1 class="Title">
Header
</h1>
<time class="CalendarDate" id="379cbfdc16d44bd6a59e6cfabe6438d5">
<time class="CalendarDate">
Date: October 30, 2023
</time>
</header>
<form class="Form" id="637c2f6935fb4353a5f73025ce04619d">
<label class="FormField" for="company-name" id="50027cccbe1948c9853ce0de37b635c2">
<form class="Form">
<label class="FormField" for="company-name">
From field name
</label>
<input class="FormFieldValue" id="0032242af75c4b37984ea7fea9aac74c" value="Example value"/>
<input class="FormFieldValue" value="Example value"/>
</form>
<section class="Section" id="592422373ed741b68a077e2003f8ed81">
<table class="Table" id="dc3792d4422e444f90876b56d0cfb20d">
<thead class="TableHeader" id="50a5548a87e84024af590b3d2830d140">
<tr class="TableRow" id="5e473d7742474412be72dc4e2c45bd4a">
<th class="TableCellHeader" id="01800309aa42411c98ae30f85b23f399">
<section class="Section">
<table class="Table">
<thead class="TableHeader">
<tr class="TableRow">
<th class="TableCellHeader">
Description
</th>
<th class="TableCellHeader" id="c2765b63d08946a2851955e79e301de4">
<th class="TableCellHeader">
Row header
</th>
</tr>
</thead>
<tbody class="TableBody" id="e0a9a8ffdd7148ad8b4a274b073d340a">
<tr class="TableRow" id="77e829974632455191330b0b8545d1e3">
<td class="TableCell" id="7fee12d4c5554b7da778d6f8fdec8a57">
<tbody class="TableBody">
<tr class="TableRow">
<td class="TableCell">
Value description
</td>
<td class="TableCell" id="5a7a33b0c57b4eb881a35bce9f87c831">
<span class="Currency" id="87220f9d62c3482e92e7de72a26869cd">
<td class="TableCell">
<span class="Currency">
50 $
</span>
<span class="Measurement" id="0095b9efb90a4cca991e73547c7165f1">
<span class="Measurement">
(1.32 %)
</span>
</td>
</tr>
</tbody>
</table>
</section>
<section class="Section" id="1032242af75c4b37984ea7fea9aac74c">
<h2 class="Subtitle" id="2a4e2c4a689f4f9a8c180b6b521e45c3">
<section class="Section">
<h2 class="Subtitle">
2. Subtitle
</h2>
<p class="NarrativeText" id="5591f7a4df01447e82515ce45f686fbe">
<p class="NarrativeText">
Paragraph text
</p>
</section>
Expand Down
Loading
Loading