Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add FireCrawl Scraper #44

Merged
merged 5 commits into from
Aug 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion Modules/LlmDriver/app/ClaudeClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ public function chat(array $messages): CompletionResponse

$payload = $this->modifyPayload($payload);

put_fixture('claude_payload_chat.json', $payload);

$results = $this->getClient()->post('/messages', $payload);

if (! $results->ok()) {
Expand Down Expand Up @@ -303,7 +305,6 @@ public function functionPromptChat(array $messages, array $only = []): array

$results = $this->getClient()->post('/messages', [
'model' => $model,
'system' => 'Return a markdown response.',
'max_tokens' => $maxTokens,
'messages' => $messages,
'tools' => $this->getFunctions(),
Expand Down
1 change: 1 addition & 0 deletions app/Domains/Messages/RoleEnum.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ enum RoleEnum: string
case User = 'user';
case System = 'system';
case Assistant = 'assistant';
case Tool = 'tool';
}
9 changes: 3 additions & 6 deletions app/Domains/Prompts/EventPagePrompt.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ public static function prompt(string $context): string

<INSTRUCTIONS>
1. Analyze the provided website HTML content below the <CONTENT> tag.
2. Look for information about events within the content.
3. If no event data is found, respond with a single word: false
2. Look for information about sporting events within the content.
3. If no event data is found summarize what is on the page
4. If event data is found, extract the following information for each event:
- Event Title
- Start Date
Expand All @@ -37,15 +37,12 @@ public static function prompt(string $context): string
"additionalInfo": "Any other relevant data"


If no events are found, return the word false
If no events are found, return the words "No Content Found" and summarize what was on the page


<CONTENT>
$context
</CONTENT>

Respond only with Markdown or 'false' if no events are found. Do not include any explanations or additional text in your response.

PROMPT;
}
}
1 change: 0 additions & 1 deletion app/Domains/Prompts/SummarizePrompt.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ public static function prompt(string $originalPrompt, string $context): string
**Format**
Deliver the response in a concise, clear Markdown format (Text). Use quotes as needed from the context.

[DO NOT INCLUDE THE ABOVE IN THE RESPONSE]

**The User's Query**:
```$originalPrompt```
Expand Down
35 changes: 18 additions & 17 deletions app/Domains/Sources/WebSearch/GetPage.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@

namespace App\Domains\Sources\WebSearch;

use App\Domains\WebParser\WebContentResultsDto;
use App\Models\Collection;
use App\Models\Setting;
use Facades\App\Domains\WebParser\DefaultClient;
use Facades\App\Domains\WebParser\FireCrawlClient;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use League\HTMLToMarkdown\Converter\CodeConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\Converter\TextConverter;
use League\HTMLToMarkdown\Environment;
use League\HTMLToMarkdown\HtmlConverter;
use Spatie\Browsershot\Browsershot;

class GetPage
{
Expand All @@ -24,27 +28,24 @@ public static function make(Collection $collection): self
return new static($collection);
}

public function handle(string $url, bool $parseHtml = true): string
public function handle(string $url, bool $parseHtml = true): WebContentResultsDto
{
$results = Browsershot::url($url)
->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
->dismissDialogs()
->fullPage();

$name = md5($url).'.pdf';
/**
* @TODO this can repeat
* @TODO
* Make this a driver like the rest of the system
*/
$name = md5($url).'.pdf';

Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf());

$body = $results->bodyHtml();

if ($parseHtml) {
$body = $this->parseHtml($body);
if (Setting::getSecret('fire_crawl', 'api_key')) {
Log::info('Using FireCrawl');
$results = FireCrawlClient::scrape($url);
} else {
Log::info('Using Default Browsershot');
/** @var WebContentResultsDto $results */
$results = DefaultClient::scrape($url);
Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->browserShot->pdf());
}

return $body;
return $results;
}

public function parseHtml(string $html): string
Expand Down
7 changes: 7 additions & 0 deletions app/Domains/WebParser/BaseWebParserClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?php

namespace App\Domains\WebParser;

/**
 * Abstract base type for web parser clients in this namespace.
 *
 * It declares no members of its own; concrete clients supply their own
 * scraping behaviour.
 */
abstract class BaseWebParserClient
{
}
56 changes: 56 additions & 0 deletions app/Domains/WebParser/DefaultClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?php

namespace App\Domains\WebParser;

use League\HTMLToMarkdown\Converter\CodeConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\Converter\TextConverter;
use League\HTMLToMarkdown\Environment;
use League\HTMLToMarkdown\HtmlConverter;
use LlmLaraHub\LlmDriver\BaseClient;
use Spatie\Browsershot\Browsershot;

/**
 * Web parser client that loads a page through Browsershot and converts the
 * page body from HTML to Markdown.
 */
class DefaultClient extends BaseClient
{
    /**
     * Load the given URL and return its content wrapped in a DTO.
     *
     * The configured Browsershot instance is handed back on the DTO
     * (`browserShot`) so the caller can produce further output from it.
     *
     * @param  string  $url  Address of the page to load.
     * @return WebContentResultsDto Markdown content plus a title/description
     *                              derived from the start of that content.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $results = Browsershot::url($url)
            ->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
            ->dismissDialogs()
            ->fullPage();

        // Fetch the rendered body exactly once: every bodyHtml() call makes
        // Browsershot render the page again, so reuse the result below.
        $html = $results->bodyHtml();

        $plainResults = $this->parseHtml($html);

        return WebContentResultsDto::from([
            // No dedicated title/description is extracted here; both are
            // taken from the leading characters of the Markdown content.
            'title' => str($plainResults)->limit(128)->title()->toString(),
            'description' => str($plainResults)->limit(256)->title()->toString(),
            'content' => $plainResults,
            'content_raw' => $html,
            'url' => $url,
            'browserShot' => $results,
        ]);
    }

    /**
     * Convert an HTML string to trimmed Markdown.
     *
     * Tags are stripped and the nav, footer, header, script, style and meta
     * nodes are removed before conversion.
     *
     * @param  string  $html  Raw HTML markup.
     * @return string Markdown with surrounding whitespace trimmed.
     */
    public function parseHtml(string $html): string
    {
        $environment = new Environment([
            'strip_tags' => true,
            'suppress_errors' => true,
            'hard_break' => true,
            'strip_placeholder_links' => true,
            'remove_nodes' => 'nav footer header script style meta',
        ]);
        $environment->addConverter(new TableConverter());
        $environment->addConverter(new CodeConverter());
        $environment->addConverter(new PreformattedConverter());
        $environment->addConverter(new TextConverter());

        $converter = new HtmlConverter($environment);

        $markdown = $converter->convert($html);

        return str($markdown)->trim()->toString();
    }
}
36 changes: 36 additions & 0 deletions app/Domains/WebParser/FireCrawlClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace App\Domains\WebParser;

use App\Domains\WebParser\Results\FireCrawResultsDto;
use App\Models\Setting;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;

/**
 * Web parser client that delegates scraping to the FireCrawl HTTP API.
 */
class FireCrawlClient extends BaseWebParserClient
{
    /**
     * Ask FireCrawl to scrape a URL and map the JSON reply onto a DTO.
     *
     * @param  string  $url  Address of the page to scrape.
     * @return WebContentResultsDto Built via FireCrawResultsDto from the decoded reply.
     *
     * @throws \Exception When the HTTP response reports a failure.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $results = $this->getClient()->post('/scrape', [
            'url' => $url,
        ]);

        if ($results->failed()) {
            // json() returns an array (or null), which cannot be concatenated
            // onto a string; report the status code and raw body instead so
            // the API's error detail is preserved in the message.
            throw new \Exception(
                'FireCrawl API Error '.$results->status().': '.$results->body()
            );
        }

        $data = $results->json();

        return FireCrawResultsDto::from($data);
    }

    /**
     * Build the HTTP client for FireCrawl.
     *
     * The base URL and bearer token are read from the `fire_crawl` secrets
     * (`api_url` and `api_key`).
     */
    protected function getClient(): PendingRequest
    {
        $url = Setting::getSecret('fire_crawl', 'api_url');
        $token = Setting::getSecret('fire_crawl', 'api_key');

        return Http::baseUrl($url)->withHeaders([
            'Authorization' => 'Bearer '.$token,
        ]);
    }
}
24 changes: 24 additions & 0 deletions app/Domains/WebParser/Results/FireCrawResultsDto.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?php

namespace App\Domains\WebParser\Results;

use App\Domains\WebParser\WebContentResultsDto;
use Spatie\LaravelData\Attributes\MapInputName;

/**
 * Result DTO populated from a FireCrawl scrape reply.
 *
 * Each constructor argument is mapped from a nested key of the reply using
 * dot notation (see the MapInputName attributes).
 */
class FireCrawResultsDto extends WebContentResultsDto
{
    /**
     * @param  string  $title  Mapped from `data.metadata.title`.
     * @param  string  $content  Mapped from `data.markdown`.
     * @param  string  $content_raw  Mapped from `data.content`.
     * @param  string  $url  Mapped from `data.metadata.sourceURL`.
     * @param  string  $description  Mapped from `data.metadata.description`; defaults to an empty string.
     */
    public function __construct(
        #[MapInputName('data.metadata.title')]
        public string $title,
        #[MapInputName('data.markdown')]
        public string $content,
        #[MapInputName('data.content')]
        public string $content_raw,
        #[MapInputName('data.metadata.sourceURL')]
        public string $url,
        #[MapInputName('data.metadata.description')]
        public string $description = '',
    ) {
        // Run the parent constructor so the inherited typed property
        // $browserShot is initialised (to null). Without this it stays
        // uninitialised and any read of it throws an Error.
        parent::__construct(
            title: $title,
            content: $content,
            url: $url,
            description: $description,
        );
    }
}
19 changes: 19 additions & 0 deletions app/Domains/WebParser/WebContentResultsDto.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?php

namespace App\Domains\WebParser;

use Spatie\Browsershot\Browsershot;
use Spatie\LaravelData\Data;

/**
 * Data object describing the content obtained for a single web page.
 */
class WebContentResultsDto extends Data
{
/**
 * @param string $title Title for the page.
 * @param string $content Page content (presumably Markdown; confirm against the producing clients).
 * @param string $url Address of the page.
 * @param string $description Optional description; defaults to an empty string.
 * @param Browsershot|null $browserShot Optional Browsershot instance; defaults to null.
 */
public function __construct(
public string $title,
public string $content,
public string $url,
public string $description = '',
public ?Browsershot $browserShot = null,
) {

}
}
17 changes: 17 additions & 0 deletions app/Http/Controllers/SettingController.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,23 @@ public function updateOllama(Request $request, Setting $setting)
return back();
}

/**
 * Store the FireCrawl credentials on the given setting.
 *
 * Validates `api_key` and `api_url` (both required strings), writes them
 * under the `fire_crawl` key of the setting's secrets, saves the setting,
 * updates its step, clears the cache and redirects back.
 */
public function updateFireCrawl(Request $request, Setting $setting)
{
    $rules = [
        'api_key' => 'string|required',
        'api_url' => 'string|required',
    ];

    $credentials = $request->validate($rules);

    // Work on a copy of the secrets and assign it back as a whole so the
    // attribute is written once with the new entry in place.
    $allSecrets = $setting->secrets;
    $allSecrets['fire_crawl'] = $credentials;
    $setting->secrets = $allSecrets;

    $setting->save();
    $setting->updateStep($setting);

    $this->clearCache();

    return back();
}

public function updateGroq(Request $request, Setting $setting)
{
$validated = $request->validate([
Expand Down
8 changes: 5 additions & 3 deletions app/Jobs/GetWebContentJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ public function handle(): void
->handle($this->webResponseDto->url, true);

$prompt = Templatizer::appendContext(true)
->handle($this->source->getPrompt(), $htmlResults);
->handle($this->source->getPrompt(), $htmlResults->content);

put_fixture('web_page_prompt.txt', $prompt, false);

$results = LlmDriverFacade::driver(
$this->source->getDriver()
Expand Down Expand Up @@ -112,7 +114,7 @@ public function handle(): void
'source_id' => $this->source->id,
'type' => TypesEnum::HTML,
'subject' => to_utf8($title),
'document_md5' => md5($htmlResults),
'document_md5' => md5($htmlResults->content),
'link' => $this->webResponseDto->url,
'collection_id' => $this->source->collection_id,
],
Expand All @@ -121,7 +123,7 @@ public function handle(): void
'file_path' => $this->webResponseDto->url,
'status_summary' => StatusEnum::Pending,
'meta_data' => $this->webResponseDto->toArray(),
'original_content' => $htmlResults,
'original_content' => $htmlResults->content,
]
);

Expand Down
5 changes: 3 additions & 2 deletions app/Models/Message.php
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,12 @@ public function run(): void
$meta_data->driver = $chat->getDriver();
$message->updateQuietly(['meta_data' => $meta_data]);

if ($message->meta_data?->tool === 'completion') {
if ($message->meta_data?->tool === 'chat') {
Log::info('[LaraChain] Running Simple Completion');

$messages = $chat->getChatResponse();
$response = LlmDriverFacade::driver($chat->getDriver())->chat($messages);
$response = LlmDriverFacade::driver($chat->getDriver())
->chat($messages);
$response = $response->content;

$chat->addInput(
Expand Down
30 changes: 30 additions & 0 deletions database/migrations/2024_08_06_002122_add_fields_to_settings.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php

use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;

return new class extends Migration
{
    /**
     * Run the migrations.
     *
     * Adds three nullable long-text prompt columns to the settings table.
     */
    public function up(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->longText('main_prompt')->nullable();
            $table->longText('source_prompt')->nullable();
            $table->longText('output_prompt')->nullable();
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the columns added by up() so a rollback restores the previous
     * schema instead of silently leaving them behind.
     */
    public function down(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            // Dropped in a single call so all three go in one table alteration.
            $table->dropColumn([
                'main_prompt',
                'source_prompt',
                'output_prompt',
            ]);
        });
    }
};
22 changes: 22 additions & 0 deletions resources/js/Components/ScrollButton.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<template>
<button @click="scrollToBottom"

class="fixed top-4 right-4 btn btn-secondary btn-circle text-white">
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="3" stroke="currentColor" class="size-6">
<path stroke-linecap="round" stroke-linejoin="round" d="M19.5 13.5 12 21m0 0-7.5-7.5M12 21V3" />
</svg>

</button>
</template>

<script setup>
// Fixed-position circular button that smooth-scrolls a target element into view.
import { ref } from 'vue'

// Holds the element to scroll to. Starts as null and is not bound to any
// element in this template.
// NOTE(review): presumably the parent assigns it through the exposed handle — confirm.
const bottomTarget = ref(null)

// Click handler: scrolls the target into view; does nothing while the target is unset.
const scrollToBottom = () => {
bottomTarget.value?.scrollIntoView({ behavior: 'smooth' })
}

// Make bottomTarget reachable from a parent component's template ref.
defineExpose({ bottomTarget })
</script>
Loading
Loading