Skip to content

Commit 9c070c7

Browse files
authored
Puppeteer / Playwright web crawler bug fixes/improvements (#4998)
* feature/bugfix: added optional css selector to puppeteer web scraper, fixed error when puppeteerLoader does not work. * feature: added button to add empty link in web scraper tools * feature: added custom executable file path as an input to puppeteer to fix issues when puppeteer cannot find/launch the browser. * feature: added new puppeteer features to playwright as well. * fixed review comments
1 parent fddd40a commit 9c070c7

File tree

4 files changed

+125
-38
lines changed

4 files changed

+125
-38
lines changed

packages/components/nodes/documentloaders/Playwright/Playwright.ts

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
1-
import { omit } from 'lodash'
2-
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
3-
import { TextSplitter } from 'langchain/text_splitter'
41
import {
52
Browser,
63
Page,
74
PlaywrightWebBaseLoader,
85
PlaywrightWebBaseLoaderOptions
96
} from '@langchain/community/document_loaders/web/playwright'
7+
import { Document } from '@langchain/core/documents'
8+
import { TextSplitter } from 'langchain/text_splitter'
109
import { test } from 'linkifyjs'
10+
import { omit } from 'lodash'
1111
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
12+
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
1213

1314
class Playwright_DocumentLoaders implements INode {
1415
label: string
@@ -113,6 +114,14 @@ class Playwright_DocumentLoaders implements INode {
113114
additionalParams: true,
114115
description: 'CSS selectors like .div or #div'
115116
},
117+
{
118+
label: 'CSS Selector (Optional)',
119+
name: 'cssSelector',
120+
type: 'string',
121+
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
122+
optional: true,
123+
additionalParams: true
124+
},
116125
{
117126
label: 'Additional Metadata',
118127
name: 'metadata',
@@ -155,8 +164,14 @@ class Playwright_DocumentLoaders implements INode {
155164
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
156165
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
157166
let limit = parseInt(nodeData.inputs?.limit as string)
158-
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as 'load' | 'domcontentloaded' | 'networkidle' | 'commit' | undefined
159-
let waitForSelector = nodeData.inputs?.waitForSelector as string
167+
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as
168+
| 'load'
169+
| 'domcontentloaded'
170+
| 'networkidle'
171+
| 'commit'
172+
| undefined
173+
const waitForSelector = nodeData.inputs?.waitForSelector as string
174+
const cssSelector = nodeData.inputs?.cssSelector as string
160175
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
161176
const output = nodeData.outputs?.output as string
162177
const orgId = options.orgId
@@ -172,26 +187,37 @@ class Playwright_DocumentLoaders implements INode {
172187
throw new Error('Invalid URL')
173188
}
174189

175-
async function playwrightLoader(url: string): Promise<any> {
190+
async function playwrightLoader(url: string): Promise<Document[] | undefined> {
176191
try {
177192
let docs = []
178193
const config: PlaywrightWebBaseLoaderOptions = {
179194
launchOptions: {
180195
args: ['--no-sandbox'],
181-
headless: true
196+
headless: true,
197+
executablePath: process.env.PLAYWRIGHT_EXECUTABLE_FILE_PATH
182198
}
183199
}
184200
if (waitUntilGoToOption) {
185201
config['gotoOptions'] = {
186202
waitUntil: waitUntilGoToOption
187203
}
188204
}
189-
if (waitForSelector) {
205+
if (cssSelector || waitForSelector) {
190206
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
191-
await page.waitForSelector(waitForSelector)
207+
if (waitForSelector) {
208+
await page.waitForSelector(waitForSelector)
209+
}
192210

193-
const result = await page.evaluate(() => document.body.innerHTML)
194-
return result
211+
if (cssSelector) {
212+
const selectorHandle = await page.$(cssSelector)
213+
const result = await page.evaluate(
214+
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
215+
selectorHandle
216+
)
217+
return result
218+
} else {
219+
return await page.evaluate(() => document.body.innerHTML)
220+
}
195221
}
196222
}
197223
const loader = new PlaywrightWebBaseLoader(url, config)
@@ -208,7 +234,7 @@ class Playwright_DocumentLoaders implements INode {
208234
}
209235
}
210236

211-
let docs: IDocument[] = []
237+
let docs: Document[] = []
212238
if (relativeLinksMethod) {
213239
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PlaywrightWebBaseLoader ${relativeLinksMethod}`)
214240
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -225,7 +251,10 @@ class Playwright_DocumentLoaders implements INode {
225251
options.logger.info(`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
226252
if (!pages || pages.length === 0) throw new Error('No relative links found')
227253
for (const page of pages) {
228-
docs.push(...(await playwrightLoader(page)))
254+
const result = await playwrightLoader(page)
255+
if (result) {
256+
docs.push(...result)
257+
}
229258
}
230259
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PlaywrightWebBaseLoader ${relativeLinksMethod}`)
231260
} else if (selectedLinks && selectedLinks.length > 0) {
@@ -234,10 +263,16 @@ class Playwright_DocumentLoaders implements INode {
234263
`[${orgId}]: PlaywrightWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
235264
)
236265
for (const page of selectedLinks.slice(0, limit)) {
237-
docs.push(...(await playwrightLoader(page)))
266+
const result = await playwrightLoader(page)
267+
if (result) {
268+
docs.push(...result)
269+
}
238270
}
239271
} else {
240-
docs = await playwrightLoader(url)
272+
const result = await playwrightLoader(url)
273+
if (result) {
274+
docs.push(...result)
275+
}
241276
}
242277

243278
if (metadata) {

packages/components/nodes/documentloaders/Puppeteer/Puppeteer.ts

Lines changed: 47 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,10 +1,11 @@
1-
import { omit } from 'lodash'
2-
import { ICommonObject, IDocument, INode, INodeData, INodeParams } from '../../../src/Interface'
3-
import { TextSplitter } from 'langchain/text_splitter'
41
import { Browser, Page, PuppeteerWebBaseLoader, PuppeteerWebBaseLoaderOptions } from '@langchain/community/document_loaders/web/puppeteer'
2+
import { Document } from '@langchain/core/documents'
3+
import { TextSplitter } from 'langchain/text_splitter'
54
import { test } from 'linkifyjs'
6-
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
5+
import { omit } from 'lodash'
76
import { PuppeteerLifeCycleEvent } from 'puppeteer'
7+
import { handleEscapeCharacters, INodeOutputsValue, webCrawl, xmlScrape } from '../../../src'
8+
import { ICommonObject, INode, INodeData, INodeParams } from '../../../src/Interface'
89

910
class Puppeteer_DocumentLoaders implements INode {
1011
label: string
@@ -109,6 +110,14 @@ class Puppeteer_DocumentLoaders implements INode {
109110
additionalParams: true,
110111
description: 'CSS selectors like .div or #div'
111112
},
113+
{
114+
label: 'CSS Selector (Optional)',
115+
name: 'cssSelector',
116+
type: 'string',
117+
description: 'Only content inside this selector will be extracted. Leave empty to use the entire page body.',
118+
optional: true,
119+
additionalParams: true
120+
},
112121
{
113122
label: 'Additional Metadata',
114123
name: 'metadata',
@@ -151,8 +160,9 @@ class Puppeteer_DocumentLoaders implements INode {
151160
const relativeLinksMethod = nodeData.inputs?.relativeLinksMethod as string
152161
const selectedLinks = nodeData.inputs?.selectedLinks as string[]
153162
let limit = parseInt(nodeData.inputs?.limit as string)
154-
let waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
155-
let waitForSelector = nodeData.inputs?.waitForSelector as string
163+
const waitUntilGoToOption = nodeData.inputs?.waitUntilGoToOption as PuppeteerLifeCycleEvent
164+
const waitForSelector = nodeData.inputs?.waitForSelector as string
165+
const cssSelector = nodeData.inputs?.cssSelector as string
156166
const _omitMetadataKeys = nodeData.inputs?.omitMetadataKeys as string
157167
const output = nodeData.outputs?.output as string
158168
const orgId = options.orgId
@@ -168,26 +178,37 @@ class Puppeteer_DocumentLoaders implements INode {
168178
throw new Error('Invalid URL')
169179
}
170180

171-
async function puppeteerLoader(url: string): Promise<any> {
181+
async function puppeteerLoader(url: string): Promise<Document[] | undefined> {
172182
try {
173-
let docs = []
183+
let docs: Document[] = []
174184
const config: PuppeteerWebBaseLoaderOptions = {
175185
launchOptions: {
176186
args: ['--no-sandbox'],
177-
headless: 'new'
187+
headless: 'new',
188+
executablePath: process.env.PUPPETEER_EXECUTABLE_FILE_PATH
178189
}
179190
}
180191
if (waitUntilGoToOption) {
181192
config['gotoOptions'] = {
182193
waitUntil: waitUntilGoToOption
183194
}
184195
}
185-
if (waitForSelector) {
196+
if (cssSelector || waitForSelector) {
186197
config['evaluate'] = async (page: Page, _: Browser): Promise<string> => {
187-
await page.waitForSelector(waitForSelector)
198+
if (waitForSelector) {
199+
await page.waitForSelector(waitForSelector)
200+
}
188201

189-
const result = await page.evaluate(() => document.body.innerHTML)
190-
return result
202+
if (cssSelector) {
203+
const selectorHandle = await page.$(cssSelector)
204+
const result = await page.evaluate(
205+
(htmlSelection) => htmlSelection?.innerHTML ?? document.body.innerHTML,
206+
selectorHandle
207+
)
208+
return result
209+
} else {
210+
return await page.evaluate(() => document.body.innerHTML)
211+
}
191212
}
192213
}
193214
const loader = new PuppeteerWebBaseLoader(url, config)
@@ -204,7 +225,7 @@ class Puppeteer_DocumentLoaders implements INode {
204225
}
205226
}
206227

207-
let docs: IDocument[] = []
228+
let docs: Document[] = []
208229
if (relativeLinksMethod) {
209230
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Start PuppeteerWebBaseLoader ${relativeLinksMethod}`)
210231
// if limit is 0 we don't want it to default to 10 so we check explicitly for null or undefined
@@ -221,7 +242,10 @@ class Puppeteer_DocumentLoaders implements INode {
221242
options.logger.info(`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(pages)}, length: ${pages.length}`)
222243
if (!pages || pages.length === 0) throw new Error('No relative links found')
223244
for (const page of pages) {
224-
docs.push(...(await puppeteerLoader(page)))
245+
const result = await puppeteerLoader(page)
246+
if (result) {
247+
docs.push(...result)
248+
}
225249
}
226250
if (process.env.DEBUG === 'true') options.logger.info(`[${orgId}]: Finish PuppeteerWebBaseLoader ${relativeLinksMethod}`)
227251
} else if (selectedLinks && selectedLinks.length > 0) {
@@ -230,10 +254,16 @@ class Puppeteer_DocumentLoaders implements INode {
230254
`[${orgId}]: PuppeteerWebBaseLoader pages: ${JSON.stringify(selectedLinks)}, length: ${selectedLinks.length}`
231255
)
232256
for (const page of selectedLinks.slice(0, limit)) {
233-
docs.push(...(await puppeteerLoader(page)))
257+
const result = await puppeteerLoader(page)
258+
if (result) {
259+
docs.push(...result)
260+
}
234261
}
235262
} else {
236-
docs = await puppeteerLoader(url)
263+
const result = await puppeteerLoader(url)
264+
if (result) {
265+
docs.push(...result)
266+
}
237267
}
238268

239269
if (metadata) {

packages/server/.env.example

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -169,4 +169,12 @@ JWT_REFRESH_TOKEN_EXPIRY_IN_MINUTES=43200
169169
############################################## SECURITY ####################################################
170170
############################################################################################################
171171

172-
# HTTP_DENY_LIST=
172+
# HTTP_DENY_LIST=
173+
174+
175+
############################################################################################################
176+
########################################### DOCUMENT LOADERS ###############################################
177+
############################################################################################################
178+
179+
# PUPPETEER_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'
180+
# PLAYWRIGHT_EXECUTABLE_FILE_PATH='C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe'

packages/ui/src/ui-component/dialog/ManageScrapedLinksDialog.jsx

Lines changed: 19 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import PropTypes from 'prop-types'
2+
import { useEffect, useState } from 'react'
23
import { createPortal } from 'react-dom'
34
import { useDispatch } from 'react-redux'
4-
import { useState, useEffect } from 'react'
55

66
import {
77
Box,
@@ -16,11 +16,11 @@ import {
1616
Stack,
1717
Typography
1818
} from '@mui/material'
19-
import { IconEraser, IconTrash, IconX } from '@tabler/icons-react'
19+
import { IconEraser, IconPlus, IconTrash, IconX } from '@tabler/icons-react'
2020
import PerfectScrollbar from 'react-perfect-scrollbar'
2121

22-
import { BackdropLoader } from '@/ui-component/loading/BackdropLoader'
2322
import { StyledButton } from '@/ui-component/button/StyledButton'
23+
import { BackdropLoader } from '@/ui-component/loading/BackdropLoader'
2424

2525
import scraperApi from '@/api/scraper'
2626

@@ -29,8 +29,8 @@ import useNotifier from '@/utils/useNotifier'
2929
import {
3030
HIDE_CANVAS_DIALOG,
3131
SHOW_CANVAS_DIALOG,
32-
enqueueSnackbar as enqueueSnackbarAction,
33-
closeSnackbar as closeSnackbarAction
32+
closeSnackbar as closeSnackbarAction,
33+
enqueueSnackbar as enqueueSnackbarAction
3434
} from '@/store/actions'
3535

3636
const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
@@ -112,6 +112,10 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
112112
setSelectedLinks(links)
113113
}
114114

115+
const handleAddLink = () => {
116+
setSelectedLinks([...selectedLinks, ''])
117+
}
118+
115119
const handleRemoveAllLinks = () => {
116120
setSelectedLinks([])
117121
}
@@ -160,6 +164,16 @@ const ManageScrapedLinksDialog = ({ show, dialogProps, onCancel, onSave }) => {
160164
</Box>
161165
<Box sx={{ display: 'flex', alignItems: 'center', justifyContent: 'space-between', mb: 1.5 }}>
162166
<Typography sx={{ fontWeight: 500 }}>Scraped Links</Typography>
167+
<Box sx={{ width: 'auto', flexGrow: 1 }}>
168+
<IconButton
169+
sx={{ height: 30, width: 30, marginLeft: '8px' }}
170+
size='small'
171+
color='primary'
172+
onClick={() => handleAddLink()}
173+
>
174+
<IconPlus />
175+
</IconButton>
176+
</Box>
163177
{selectedLinks.length > 0 ? (
164178
<Button
165179
sx={{ height: 'max-content', width: 'max-content' }}

0 commit comments

Comments
 (0)