Skip to content

Commit

Permalink
add style cleaning for markdown conversion
Browse files Browse the repository at this point in the history
  • Loading branch information
tahouse committed Feb 23, 2025
1 parent 82dda49 commit 41c2158
Show file tree
Hide file tree
Showing 4 changed files with 41 additions and 8 deletions.
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "streamlit-chat-prompt"
version = "0.3.11"
version = "0.3.12"
description = "A streamlit custom component that allows you to create a chat prompt with paste and image attachment support"
readme = { file = "README.md", content-type = "text/markdown" }
authors = [
Expand Down
8 changes: 5 additions & 3 deletions streamlit_chat_prompt/frontend/src/components/ChatInput.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ export class ChatInput extends StreamlitComponentBase<State, Props> {
this.handlePasteEvent = this.handlePasteEvent.bind(this);
}
private async handlePasteEvent(e: ClipboardEvent) {
if (this.state.disabled) return;
if (this.state.disabled || this.state.clipboardInspector.open) return;

const clipboardData = e.clipboardData;
if (!clipboardData) return;
Expand All @@ -101,8 +101,10 @@ export class ChatInput extends StreamlitComponentBase<State, Props> {
// Check if clipboard inspector is enabled via props
const clipboardInspectorEnabled = this.props.args?.clipboard_inspector_enabled ?? false;

// If more than one type, show the inspector
if (uniqueTypes.size > 1 && clipboardInspectorEnabled) {
// If there is more than one type, or a single type that is neither image nor plain text, show the inspector
if ((uniqueTypes.size > 1 ||
(!uniqueTypes.has('image') && !uniqueTypes.has('text'))) &&
clipboardInspectorEnabled) {
e.preventDefault(); // Prevent default paste
const clipboardInspectorData = inspectClipboard(e);
this.isShowingDialog = true;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -386,11 +386,15 @@ export const ClipboardInspector: React.FC<ClipboardInspectorProps> = ({
Logger.info('component', 'Starting markdown preview generation for HTML:', {
itemId,
htmlLength: html.length,
selectedLanguage: selectedLanguage
selectedLanguage: selectedLanguage,
htmlPreview: html.substring(0, 100) + '...'
});

// Working copy of the HTML; the steps below transform it before markdown conversion
let workingHtml = html;

// Step 1: Extract code blocks and replace with placeholders
const codeBlocks = extractCodeBlocks(html);
const codeBlocks = extractCodeBlocks(workingHtml);

Logger.info('component', 'Extracted code blocks:', codeBlocks.map((block, index) => ({
index,
Expand All @@ -403,8 +407,6 @@ export const ClipboardInspector: React.FC<ClipboardInspectorProps> = ({
isStandalone: block.isStandalone,
})));

let workingHtml = html;

// Step 2: Create temporary placeholders for code blocks
codeBlocks.forEach((block, index) => {
// Only replace full code blocks with placeholders
Expand All @@ -418,6 +420,8 @@ export const ClipboardInspector: React.FC<ClipboardInspectorProps> = ({
workingHtmlPreview: workingHtml.substring(0, 200) + '...',
});

workingHtml = stripHtmlStyling(workingHtml);

// Step 3: Convert remaining HTML to markdown
let markdown = turndownService.turndown(workingHtml);

Expand Down
27 changes: 27 additions & 0 deletions streamlit_chat_prompt/frontend/src/utils/htmlProcessing.ts
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,33 @@ export function isCodeBlock(element: Element): boolean {
return isMonospace || isPreFormatted;
}

/**
 * Removes presentation-only markup (as typically produced by Microsoft Word)
 * from an HTML string so that only structural markup remains.
 *
 * The following are stripped, in this order:
 *  1. HTML comments (`<!-- ... -->`), each one individually.
 *  2. Empty `<o:p></o:p>` elements (including whitespace-only ones).
 *  3. Any remaining namespaced tags such as `<o:p>` or `</w:sdt>`; their
 *     inner content is kept.
 *  4. `style` attributes, double- or single-quoted.
 *  5. `class` attributes whose value starts with `Mso`.
 *
 * @param html - Raw HTML to clean.
 * @returns The cleaned HTML; equal to `html` when nothing matched.
 */
export function stripHtmlStyling(html: string): string {
  const cleanHtml = html
    // Comments go first and one at a time. A pattern that searches for a
    // keyword such as "@font-face" *inside* a lazily matched comment can start
    // at an earlier, unrelated comment and swallow the real markup between the
    // two. Removing them up front also keeps the namespace-tag pattern below
    // from consuming a comment terminator.
    .replace(/<!--[\s\S]*?-->/g, '')
    // Empty o:p elements are dropped together with their whitespace.
    .replace(/<o:p>\s*<\/o:p>/g, '')
    // Every other namespaced opening/closing tag (content is preserved).
    .replace(/<\/?\w+:[^>]*>/g, '')
    // style="..." / style='...'. The captured leading character ensures we do
    // not clip the tail of attributes like data-style="...".
    .replace(/(^|[^\w-])style\s*=\s*(?:"[^"]*"|'[^']*')/gi, '$1')
    // class attributes whose value begins with "Mso".
    .replace(/(^|[^\w-])class\s*=\s*(?:"Mso[^"]*"|'Mso[^']*')/gi, '$1');

  if (html !== cleanHtml) {
    Logger.debug("component", "Cleaned Microsoft Word HTML:", {
      originalLength: html.length,
      originalHtml: html.slice(0, 100),
      cleanedLength: cleanHtml.length,
      cleanedHtml: cleanHtml.slice(0, 100)
    });
  } else {
    Logger.debug("component", "Did not find any Microsoft Word HTML to clean.", {
      originalLength: html.length,
      cleanedLength: cleanHtml.length,
    });
  }

  // When nothing matched, cleanHtml has the same value as html.
  return cleanHtml;
}

export function extractCodeBlocks(html: string): CodeBlock[] {
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
Expand Down

0 comments on commit 41c2158

Please sign in to comment.