From 4b0f100eee9d4e7091d8323677f51d414ed82c71 Mon Sep 17 00:00:00 2001 From: Alfred Nutile Date: Mon, 5 Aug 2024 21:09:21 -0400 Subject: [PATCH] hmm stan might be right on this --- app/Domains/Sources/WebSearch/GetPage.php | 35 ++++---- app/Domains/WebParser/BaseWebParserClient.php | 7 ++ app/Domains/WebParser/DefaultClient.php | 58 +++++++++++++ app/Domains/WebParser/FireCrawlClient.php | 36 ++++++++ .../WebParser/Results/FireCrawResultsDto.php | 24 ++++++ .../WebParser/WebContentResultsDto.php | 19 +++++ app/Http/Controllers/SettingController.php | 17 ++++ app/Jobs/GetWebContentJob.php | 6 +- ...24_08_06_002122_add_fields_to_settings.php | 30 +++++++ .../Pages/Settings/Partials/ClaudeSecrets.vue | 2 +- .../js/Pages/Settings/Partials/FireCrawl.vue | 82 +++++++++++++++++++ .../Pages/Settings/Partials/GroqSecrets.vue | 2 +- .../Settings/Partials/OllamaApiSecrets.vue | 2 +- .../Pages/Settings/Partials/OpenAiSecrets.vue | 2 +- resources/js/Pages/Settings/Show.vue | 6 ++ routes/web.php | 2 + tests/Feature/FireCrawResultsDtoTest.php | 22 +++++ tests/Feature/FireCrawlClientTest.php | 41 ++++++++++ tests/Feature/GetPageTest.php | 39 --------- tests/Feature/Jobs/GetWebContentJobTest.php | 15 +++- tests/fixtures/test_firecrawl_parse.json | 14 ++++ 21 files changed, 397 insertions(+), 64 deletions(-) create mode 100644 app/Domains/WebParser/BaseWebParserClient.php create mode 100644 app/Domains/WebParser/DefaultClient.php create mode 100644 app/Domains/WebParser/FireCrawlClient.php create mode 100644 app/Domains/WebParser/Results/FireCrawResultsDto.php create mode 100644 app/Domains/WebParser/WebContentResultsDto.php create mode 100644 database/migrations/2024_08_06_002122_add_fields_to_settings.php create mode 100644 resources/js/Pages/Settings/Partials/FireCrawl.vue create mode 100644 tests/Feature/FireCrawResultsDtoTest.php create mode 100644 tests/Feature/FireCrawlClientTest.php create mode 100644 tests/fixtures/test_firecrawl_parse.json diff --git 
a/app/Domains/Sources/WebSearch/GetPage.php b/app/Domains/Sources/WebSearch/GetPage.php index 6211ae3f..73d2014c 100644 --- a/app/Domains/Sources/WebSearch/GetPage.php +++ b/app/Domains/Sources/WebSearch/GetPage.php @@ -2,7 +2,12 @@ namespace App\Domains\Sources\WebSearch; +use Facades\App\Domains\WebParser\DefaultClient; +use Facades\App\Domains\WebParser\FireCrawlClient; +use App\Domains\WebParser\WebContentResultsDto; use App\Models\Collection; +use App\Models\Setting; +use Illuminate\Support\Facades\Log; use Illuminate\Support\Facades\Storage; use League\HTMLToMarkdown\Converter\CodeConverter; use League\HTMLToMarkdown\Converter\PreformattedConverter; @@ -24,27 +29,25 @@ public static function make(Collection $collection): self return new static($collection); } - public function handle(string $url, bool $parseHtml = true): string + public function handle(string $url, bool $parseHtml = true): WebContentResultsDto { - $results = Browsershot::url($url) - ->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows') - ->dismissDialogs() - ->fullPage(); - + $name = md5($url).'.pdf'; /** - * @TODO this can repeat + * @TODO + * Make this a driver like the rest of the system */ - $name = md5($url).'.pdf'; - - Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf()); - - $body = $results->bodyHtml(); - - if ($parseHtml) { - $body = $this->parseHtml($body); + if (Setting::getSecret('fire_crawl', 'api_token')) { + Log::info('Using FireCrawl'); + $results = FireCrawlClient::scrape($url); + } else { + Log::info('Using Default Browsershot'); + /** @var WebContentResultsDto $results */ + $results = DefaultClient::scrape($url); + // DefaultClient::scrape passes the Browsershot instance in under the `browserShot` key, so the PDF is rendered from that object rather than from the DTO itself. + // NOTE(review): assumes WebContentResultsDto exposes it as a public `browserShot` property - confirm. + /** @phpstan-ignore-next-line */ + Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->browserShot->pdf()); } - return $body; + return $results; } public function parseHtml(string $html): string diff --git a/app/Domains/WebParser/BaseWebParserClient.php b/app/Domains/WebParser/BaseWebParserClient.php 
new file mode 100644 index 00000000..4738e204 --- /dev/null +++ b/app/Domains/WebParser/BaseWebParserClient.php @@ -0,0 +1,7 @@ +userAgent('DailyAI Studio Browser 1.0, helping users automate workflows') + ->dismissDialogs() + ->fullPage(); + + // Fetch the rendered HTML once and reuse it, so the markdown and the raw copy come from the same render. + $rawHtml = $results->bodyHtml(); + + $plainResults = $this->parseHtml($rawHtml); + + return WebContentResultsDto::from([ + 'title' => str($plainResults)->limit(128)->title()->toString(), + 'description' => str($plainResults)->limit(256)->title()->toString(), + 'content' => $plainResults, + 'content_raw' => $rawHtml, + 'url' => $url, + 'browserShot' => $results, + ]); + } + + public function parseHtml(string $html): string + { + $environment = new Environment([ + 'strip_tags' => true, + 'suppress_errors' => true, + 'hard_break' => true, + 'strip_placeholder_links' => true, + 'remove_nodes' => 'nav footer header script style meta', + ]); + $environment->addConverter(new TableConverter()); + $environment->addConverter(new CodeConverter()); + $environment->addConverter(new PreformattedConverter()); + $environment->addConverter(new TextConverter()); + + $converter = new HtmlConverter($environment); + + $markdown = $converter->convert($html); + + return str($markdown)->trim()->toString(); + + } +} diff --git a/app/Domains/WebParser/FireCrawlClient.php b/app/Domains/WebParser/FireCrawlClient.php new file mode 100644 index 00000000..2f5dc8a7 --- /dev/null +++ b/app/Domains/WebParser/FireCrawlClient.php @@ -0,0 +1,36 @@ +getClient()->post('/scrape', [ + 'url' => $url, + ]); + + if ($results->failed()) { + // json() returns a decoded array, which cannot be concatenated into a string; report the status and raw body instead. + throw new \Exception('FireCrawl API Error '.$results->status().' '.$results->body()); + } + + $data = $results->json(); + + return FireCrawResultsDto::from($data); + } + + protected function getClient(): PendingRequest + { + $url = Setting::getSecret('fire_crawl', 'api_url'); + $token = Setting::getSecret('fire_crawl', 'api_token'); + + return Http::baseUrl($url)->withHeaders([ + 'Authorization' => 'Bearer '.$token, + ]); + } +} diff --git 
a/app/Domains/WebParser/Results/FireCrawResultsDto.php b/app/Domains/WebParser/Results/FireCrawResultsDto.php new file mode 100644 index 00000000..eefde255 --- /dev/null +++ b/app/Domains/WebParser/Results/FireCrawResultsDto.php @@ -0,0 +1,24 @@ +validate([ + 'api_key' => 'string|required', + 'api_url' => 'string|required', + ]); + + $secrets = $setting->secrets; + // GetPage and FireCrawlClient look the token up with Setting::getSecret('fire_crawl', 'api_token'), + // so mirror the submitted api_key under that name; api_key itself is kept as submitted. + $secrets['fire_crawl'] = array_merge($validated, ['api_token' => $validated['api_key']]); + $setting->secrets = $secrets; + $setting->save(); + $setting->updateStep($setting); + $this->clearCache(); + + return back(); + } + public function updateGroq(Request $request, Setting $setting) { $validated = $request->validate([ diff --git a/app/Jobs/GetWebContentJob.php b/app/Jobs/GetWebContentJob.php index 8a252531..9fefaf96 100644 --- a/app/Jobs/GetWebContentJob.php +++ b/app/Jobs/GetWebContentJob.php @@ -73,7 +73,7 @@ public function handle(): void ->handle($this->webResponseDto->url, true); $prompt = Templatizer::appendContext(true) - ->handle($this->source->getPrompt(), $htmlResults); + ->handle($this->source->getPrompt(), $htmlResults->content); $results = LlmDriverFacade::driver( $this->source->getDriver() @@ -112,7 +112,7 @@ public function handle(): void 'source_id' => $this->source->id, 'type' => TypesEnum::HTML, 'subject' => to_utf8($title), - 'document_md5' => md5($htmlResults), + 'document_md5' => md5($htmlResults->content), 'link' => $this->webResponseDto->url, 'collection_id' => $this->source->collection_id, ], @@ -121,7 +121,7 @@ public function handle(): void 'file_path' => $this->webResponseDto->url, 'status_summary' => StatusEnum::Pending, 'meta_data' => $this->webResponseDto->toArray(), - 'original_content' => $htmlResults, + 'original_content' => $htmlResults->content, ] ); diff --git a/database/migrations/2024_08_06_002122_add_fields_to_settings.php b/database/migrations/2024_08_06_002122_add_fields_to_settings.php new file mode 100644 index 00000000..1c489650 --- /dev/null +++ 
b/database/migrations/2024_08_06_002122_add_fields_to_settings.php @@ -0,0 +1,30 @@ +longText('main_prompt')->nullable(); + $table->longText('source_prompt')->nullable(); + $table->longText('output_prompt')->nullable(); + }); + } + + /** + * Reverse the migrations by dropping the prompt columns added in up(). + */ + public function down(): void + { + Schema::table('settings', function (Blueprint $table) { + $table->dropColumn(['main_prompt', 'source_prompt', 'output_prompt']); + }); + } +}; diff --git a/resources/js/Pages/Settings/Partials/ClaudeSecrets.vue b/resources/js/Pages/Settings/Partials/ClaudeSecrets.vue index 9c9aba1e..a2bec1a9 100644 --- a/resources/js/Pages/Settings/Partials/ClaudeSecrets.vue +++ b/resources/js/Pages/Settings/Partials/ClaudeSecrets.vue @@ -26,7 +26,7 @@ const updateSecrets = () => { form.put(route('settings.update.claude', { setting: props.setting.id, }), { - errorBag: 'updateProfileInformation', + errorBag: 'updateClaude', preserveScroll: true, }); }; diff --git a/resources/js/Pages/Settings/Partials/FireCrawl.vue b/resources/js/Pages/Settings/Partials/FireCrawl.vue new file mode 100644 index 00000000..b965173a --- /dev/null +++ b/resources/js/Pages/Settings/Partials/FireCrawl.vue @@ -0,0 +1,82 @@ + + + diff --git a/resources/js/Pages/Settings/Partials/GroqSecrets.vue b/resources/js/Pages/Settings/Partials/GroqSecrets.vue index c3e051bb..aead8620 100644 --- a/resources/js/Pages/Settings/Partials/GroqSecrets.vue +++ b/resources/js/Pages/Settings/Partials/GroqSecrets.vue @@ -26,7 +26,7 @@ const updateSecrets = () => { form.put(route('settings.update.groq', { setting: props.setting.id, }), { - errorBag: 'updateProfileInformation', + errorBag: 'updateGroq', preserveScroll: true, }); }; diff --git a/resources/js/Pages/Settings/Partials/OllamaApiSecrets.vue b/resources/js/Pages/Settings/Partials/OllamaApiSecrets.vue index 54ce5fcf..042e8ebf 100644 --- a/resources/js/Pages/Settings/Partials/OllamaApiSecrets.vue +++ b/resources/js/Pages/Settings/Partials/OllamaApiSecrets.vue @@ -26,7 +26,7 @@ const updateSecrets = () => { 
form.put(route('settings.update.ollama', { setting: props.setting.id, }), { - errorBag: 'updateProfileInformation', + errorBag: 'updateOllama', preserveScroll: true, }); }; diff --git a/resources/js/Pages/Settings/Partials/OpenAiSecrets.vue b/resources/js/Pages/Settings/Partials/OpenAiSecrets.vue index 6644f317..17055bff 100644 --- a/resources/js/Pages/Settings/Partials/OpenAiSecrets.vue +++ b/resources/js/Pages/Settings/Partials/OpenAiSecrets.vue @@ -28,7 +28,7 @@ const updateSecrets = () => { form.put(route('settings.update.open_ai', { setting: props.setting.id, }), { - errorBag: 'updateProfileInformation', + errorBag: 'updateOpenAi', preserveScroll: true, }); }; diff --git a/resources/js/Pages/Settings/Show.vue b/resources/js/Pages/Settings/Show.vue index becc28cc..cd13780a 100644 --- a/resources/js/Pages/Settings/Show.vue +++ b/resources/js/Pages/Settings/Show.vue @@ -5,6 +5,7 @@ import OpenAiSecrets from "@/Pages/Settings/Partials/OpenAiSecrets.vue"; import ClaudeSecrets from "@/Pages/Settings/Partials/ClaudeSecrets.vue"; import OllamaApiSecrets from "@/Pages/Settings/Partials/OllamaApiSecrets.vue"; import GroqSecrets from "@/Pages/Settings/Partials/GroqSecrets.vue"; +import FireCrawl from "@/Pages/Settings/Partials/FireCrawl.vue"; const props = defineProps({ setting: Object, @@ -41,6 +42,11 @@ const props = defineProps({ + +
+ + +
diff --git a/routes/web.php b/routes/web.php index 46fdd421..0ea033bc 100644 --- a/routes/web.php +++ b/routes/web.php @@ -170,6 +170,8 @@ function () { ->name('settings.update.ollama'); Route::put('/settings/{setting}/groq', 'updateGroq') ->name('settings.update.groq'); + Route::put('/settings/{setting}/fire_crawl', 'updateFireCrawl') + ->name('settings.update.fire_crawl'); } ); diff --git a/tests/Feature/FireCrawResultsDtoTest.php b/tests/Feature/FireCrawResultsDtoTest.php new file mode 100644 index 00000000..5e5bc8c0 --- /dev/null +++ b/tests/Feature/FireCrawResultsDtoTest.php @@ -0,0 +1,22 @@ +assertEquals('Mendable | AI for CX and Sales', $dto->title); + $this->assertEquals('AI for CX and Sales', $dto->description); + $this->assertEquals('# Markdown Content', $dto->content); + $this->assertEquals('https://www.mendable.ai/', $dto->url); + } +} diff --git a/tests/Feature/FireCrawlClientTest.php b/tests/Feature/FireCrawlClientTest.php new file mode 100644 index 00000000..e83743f8 --- /dev/null +++ b/tests/Feature/FireCrawlClientTest.php @@ -0,0 +1,41 @@ +create([ + 'secrets' => [ + 'fire_crawl' => [ + 'api_url' => 'https://api.firecrawl.dev', + 'api_token' => 'foo', + ], + ], + ]); + + $data = get_fixture('test_firecrawl_parse.json'); + + Http::fake([ + 'api.firecrawl.dev/*' => Http::response($data, 200), + ]); + + Http::preventStrayRequests(); + + $client = new \App\Domains\WebParser\FireCrawlClient(); + $results = $client->scrape('https://www.mendable.ai/'); + $this->assertEquals('Mendable | AI for CX and Sales', $results->title); + $this->assertEquals('AI for CX and Sales', $results->description); + $this->assertEquals('# Markdown Content', $results->content); + $this->assertEquals('https://www.mendable.ai/', $results->url); + + } +} diff --git a/tests/Feature/GetPageTest.php b/tests/Feature/GetPageTest.php index e2b727e6..b0b5d769 100644 --- a/tests/Feature/GetPageTest.php +++ b/tests/Feature/GetPageTest.php @@ -10,56 +10,17 @@ class GetPageTest extends 
TestCase { - /** - * A basic feature test example. - */ - public function test_get_page(): void - { - Storage::fake('collections'); - $this->markTestSkipped('@TODO mock browser shot'); - $url = 'https://alnutile.medium.com/multiple-openai-functions-php-laravel-466cb72eefb8'; - $url = 'https://alfrednutile.info/ssh-config'; - //$url = 'https://laravel-news.com/replicate-laravel-php-client'; - $html = get_fixture('test_blog.html', false); - $results = GetPage::handle($url); - } public function test_iterator() { - - $html = get_fixture('test.html', false); $html = get_fixture('test_blog.html', false); - $json = get_fixture('web_search_html_converted_to_json_ollama.json', false); - $collection = Collection::factory()->create(); $results = GetPage::make($collection)->parseHtml($html); $this->assertNotEmpty($results); - } - public function test_ideas_for_markdown(): void - { - //Storage::fake('collections'); - $this->markTestSkipped('@TODO mock browser shot'); - $html = get_fixture('test.html', false); - $html = get_fixture('test_blog.html', false); - - $markdown = str($html)->markdown()->toString(); - - $converter = new HtmlConverter( - [ - 'strip_tags' => true, - 'suppress_errors' => true, - 'hard_break' => true, - 'strip_placeholder_links' => true, - 'remove_nodes' => 'footer header script style meta', - ] - ); - - $markdown = $converter->convert($html); - } } diff --git a/tests/Feature/Jobs/GetWebContentJobTest.php b/tests/Feature/Jobs/GetWebContentJobTest.php index f65d88c0..eda3dcfe 100644 --- a/tests/Feature/Jobs/GetWebContentJobTest.php +++ b/tests/Feature/Jobs/GetWebContentJobTest.php @@ -3,6 +3,7 @@ namespace Tests\Feature\Jobs; use App\Domains\Sources\WebSearch\Response\WebResponseDto; +use App\Domains\WebParser\WebContentResultsDto; use App\Jobs\GetWebContentJob; use App\Models\Document; use App\Models\Source; @@ -37,7 +38,12 @@ public function test_job_html(): void $html = get_fixture('test_medium_2.html', false); - 
GetPage::shouldReceive('make->handle')->once()->andReturn($html); + GetPage::shouldReceive('make->handle')->once()->andReturn(WebContentResultsDto::from([ + 'title' => 'Example', + 'description' => 'Example description', + 'content' => $html, + 'url' => 'https://example.com', + ])); LlmDriverFacade::shouldReceive('driver->onQueue')->andReturn('default'); @@ -78,7 +84,12 @@ public function test_array(): void $html = get_fixture('test_medium_2.html', false); - GetPage::shouldReceive('make->handle')->once()->andReturn($html); + GetPage::shouldReceive('make->handle')->once()->andReturn(WebContentResultsDto::from([ + 'title' => 'Example', + 'description' => 'Example description', + 'content' => $html, + 'url' => 'https://example.com', + ])); LlmDriverFacade::shouldReceive('driver->onQueue')->andReturn('default'); diff --git a/tests/fixtures/test_firecrawl_parse.json b/tests/fixtures/test_firecrawl_parse.json new file mode 100644 index 00000000..0b9b750a --- /dev/null +++ b/tests/fixtures/test_firecrawl_parse.json @@ -0,0 +1,14 @@ +{ + "success": true, + "data": { + "content": "Raw Content ", + "markdown": "# Markdown Content", + "provider": "web-scraper", + "metadata": { + "title": "Mendable | AI for CX and Sales", + "description": "AI for CX and Sales", + "language": null, + "sourceURL": "https://www.mendable.ai/" + } + } +}