From a06f2dc95a970bdc5fe458b89e65065d7457ef89 Mon Sep 17 00:00:00 2001 From: Alfred Nutile Date: Mon, 5 Aug 2024 21:47:18 -0400 Subject: [PATCH] Ok FireCrawl added --- app/Domains/Prompts/EventPagePrompt.php | 9 +- app/Domains/Sources/WebSearch/GetPage.php | 10 +- app/Domains/WebParser/DefaultClient.php | 6 +- app/Domains/WebParser/FireCrawlClient.php | 2 +- .../WebParser/Results/FireCrawResultsDto.php | 4 +- .../WebParser/WebContentResultsDto.php | 2 +- app/Jobs/GetWebContentJob.php | 2 + resources/js/Components/Templates.vue | 4 +- tests/Feature/FireCrawlClientTest.php | 2 +- tests/Feature/GetPageTest.php | 5 - tests/fixtures/claude_payload_chat.json | 22 +- tests/fixtures/web_page_prompt.txt | 1046 +++++++++++++++++ 12 files changed, 1067 insertions(+), 47 deletions(-) create mode 100644 tests/fixtures/web_page_prompt.txt diff --git a/app/Domains/Prompts/EventPagePrompt.php b/app/Domains/Prompts/EventPagePrompt.php index bc3ef2ff..93cf8468 100644 --- a/app/Domains/Prompts/EventPagePrompt.php +++ b/app/Domains/Prompts/EventPagePrompt.php @@ -16,8 +16,8 @@ public static function prompt(string $context): string 1. Analyze the provided website HTML content below the tag. -2. Look for information about events within the content. -3. If no event data is found, respond with a single word: false +2. Look for information about sporting events within the content. +3. If no event data is found summarize what is on the page 4. If event data is found, extract the following information for each event: - Event Title - Start Date @@ -37,15 +37,12 @@ public static function prompt(string $context): string "additionalInfo": "Any other relevant data" -If no events are found, return the word false +If no events are found, return the words "No Content Found" and summarize what was on the page $context - -Respond only with Markdown or 'false' if no events are found. Do not include any explanations or additional text in your response. - PROMPT; } } diff --git a/app/Domains/Sources/WebSearch/GetPage.php b/app/Domains/Sources/WebSearch/GetPage.php index 73d2014c..997adced 100644 --- a/app/Domains/Sources/WebSearch/GetPage.php +++ b/app/Domains/Sources/WebSearch/GetPage.php @@ -2,11 +2,11 @@ namespace App\Domains\Sources\WebSearch; -use Facades\App\Domains\WebParser\DefaultClient; -use Facades\App\Domains\WebParser\FireCrawlClient; use App\Domains\WebParser\WebContentResultsDto; use App\Models\Collection; use App\Models\Setting; +use Facades\App\Domains\WebParser\DefaultClient; +use Facades\App\Domains\WebParser\FireCrawlClient; use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Storage; use League\HTMLToMarkdown\Converter\CodeConverter; @@ -15,7 +15,6 @@ use League\HTMLToMarkdown\Converter\TextConverter; use League\HTMLToMarkdown\Environment; use League\HTMLToMarkdown\HtmlConverter; -use Spatie\Browsershot\Browsershot; class GetPage { @@ -36,15 +35,14 @@ public function handle(string $url, bool $parseHtml = true): WebContentResultsDt * @TODO * Make this a driver like the rest of the system */ - if(Setting::getSecret('fire_crawl', 'api_token')) { + if (Setting::getSecret('fire_crawl', 'api_key')) { Log::info('Using FireCrawl'); $results = FireCrawlClient::scrape($url); } else { Log::info('Using Default Browsershot'); /** @var WebContentResultsDto $results */ $results = DefaultClient::scrape($url); - /** @phpstan-ignore-next-line */ - Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf()); + Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->browserShot->pdf()); } return $results; diff --git a/app/Domains/WebParser/DefaultClient.php b/app/Domains/WebParser/DefaultClient.php index 91a886b3..1cf0feb6 100644 --- a/app/Domains/WebParser/DefaultClient.php +++ b/app/Domains/WebParser/DefaultClient.php @@ -2,7 +2,6 @@ namespace App\Domains\WebParser; -use App\Domains\Sources\WebSearch\GetPage; use League\HTMLToMarkdown\Converter\CodeConverter; use League\HTMLToMarkdown\Converter\PreformattedConverter; use League\HTMLToMarkdown\Converter\TableConverter; @@ -14,14 +13,13 @@ class DefaultClient extends BaseClient { - - public function scrape(string $url): WebContentResultsDto { + public function scrape(string $url): WebContentResultsDto + { $results = Browsershot::url($url) ->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows') ->dismissDialogs() ->fullPage(); - $plainResults = $this->parseHtml($results->bodyHtml()); return WebContentResultsDto::from([ diff --git a/app/Domains/WebParser/FireCrawlClient.php b/app/Domains/WebParser/FireCrawlClient.php index 2f5dc8a7..bb476885 100644 --- a/app/Domains/WebParser/FireCrawlClient.php +++ b/app/Domains/WebParser/FireCrawlClient.php @@ -27,7 +27,7 @@ public function scrape(string $url): WebContentResultsDto protected function getClient(): PendingRequest { $url = Setting::getSecret('fire_crawl', 'api_url'); - $token = Setting::getSecret('fire_crawl', 'api_token'); + $token = Setting::getSecret('fire_crawl', 'api_key'); return Http::baseUrl($url)->withHeaders([ 'Authorization' => 'Bearer '.$token, diff --git a/app/Domains/WebParser/Results/FireCrawResultsDto.php b/app/Domains/WebParser/Results/FireCrawResultsDto.php index eefde255..35c6451a 100644 --- a/app/Domains/WebParser/Results/FireCrawResultsDto.php +++ b/app/Domains/WebParser/Results/FireCrawResultsDto.php @@ -10,14 +10,14 @@ class FireCrawResultsDto extends WebContentResultsDto public function __construct( #[MapInputName('data.metadata.title')] public string $title, - #[MapInputName('data.metadata.description')] - public string $description, #[MapInputName('data.markdown')] public string $content, #[MapInputName('data.content')] public string $content_raw, #[MapInputName('data.metadata.sourceURL')] public string $url, + #[MapInputName('data.metadata.description')] + public string $description = '', ) { } diff --git a/app/Domains/WebParser/WebContentResultsDto.php b/app/Domains/WebParser/WebContentResultsDto.php index 1b15288f..65fd2563 100644 --- a/app/Domains/WebParser/WebContentResultsDto.php +++ b/app/Domains/WebParser/WebContentResultsDto.php @@ -9,9 +9,9 @@ class WebContentResultsDto extends Data { public function __construct( public string $title, - public string $description, public string $content, public string $url, + public string $description = '', public ?Browsershot $browserShot = null, ) { diff --git a/app/Jobs/GetWebContentJob.php b/app/Jobs/GetWebContentJob.php index 9fefaf96..c5c24ff7 100644 --- a/app/Jobs/GetWebContentJob.php +++ b/app/Jobs/GetWebContentJob.php @@ -75,6 +75,8 @@ public function handle(): void $prompt = Templatizer::appendContext(true) ->handle($this->source->getPrompt(), $htmlResults->content); + put_fixture('web_page_prompt.txt', $prompt, false); + $results = LlmDriverFacade::driver( $this->source->getDriver() )->completion($prompt); diff --git a/resources/js/Components/Templates.vue b/resources/js/Components/Templates.vue index 96332b5c..379817ea 100644 --- a/resources/js/Components/Templates.vue +++ b/resources/js/Components/Templates.vue @@ -62,11 +62,11 @@ const checkTemplate = (label) => {