Skip to content

Commit

Permalink
Merge pull request #44 from LlmLaraHub/add_firecrawl
Browse files Browse the repository at this point in the history
Add FireCrawl Scraper
  • Loading branch information
alnutile authored Aug 6, 2024
2 parents 9dcb4eb + 98210b1 commit b0340e5
Show file tree
Hide file tree
Showing 33 changed files with 1,689 additions and 210 deletions.
3 changes: 2 additions & 1 deletion Modules/LlmDriver/app/ClaudeClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ public function chat(array $messages): CompletionResponse

$payload = $this->modifyPayload($payload);

put_fixture('claude_payload_chat.json', $payload);

$results = $this->getClient()->post('/messages', $payload);

if (! $results->ok()) {
Expand Down Expand Up @@ -303,7 +305,6 @@ public function functionPromptChat(array $messages, array $only = []): array

$results = $this->getClient()->post('/messages', [
'model' => $model,
'system' => 'Return a markdown response.',
'max_tokens' => $maxTokens,
'messages' => $messages,
'tools' => $this->getFunctions(),
Expand Down
1 change: 1 addition & 0 deletions app/Domains/Messages/RoleEnum.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,5 @@ enum RoleEnum: string
case User = 'user';
case System = 'system';
case Assistant = 'assistant';
case Tool = 'tool';
}
9 changes: 3 additions & 6 deletions app/Domains/Prompts/EventPagePrompt.php
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ public static function prompt(string $context): string
<INSTRUCTIONS>
1. Analyze the provided website HTML content below the <CONTENT> tag.
2. Look for information about events within the content.
3. If no event data is found, respond with a single word: false
2. Look for information about sporting events within the content.
3. If no event data is found summarize what is on the page
4. If event data is found, extract the following information for each event:
- Event Title
- Start Date
Expand All @@ -37,15 +37,12 @@ public static function prompt(string $context): string
"additionalInfo": "Any other relevant data"
If no events are found, return the word false
If no events are found, return the words "No Content Found" and summarize what was on the page
<CONTENT>
$context
</CONTENT>
Respond only with Markdown or 'false' if no events are found. Do not include any explanations or additional text in your response.
PROMPT;
}
}
1 change: 0 additions & 1 deletion app/Domains/Prompts/SummarizePrompt.php
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,6 @@ public static function prompt(string $originalPrompt, string $context): string
**Format**
Deliver the response in a concise, clear Markdown format (Text). Use quotes as needed from the context.
[DO NOT INCLUDE THE ABOVE IN THE RESPONSE]
**The User's Query**:
```$originalPrompt```
Expand Down
35 changes: 18 additions & 17 deletions app/Domains/Sources/WebSearch/GetPage.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,19 @@

namespace App\Domains\Sources\WebSearch;

use App\Domains\WebParser\WebContentResultsDto;
use App\Models\Collection;
use App\Models\Setting;
use Facades\App\Domains\WebParser\DefaultClient;
use Facades\App\Domains\WebParser\FireCrawlClient;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use League\HTMLToMarkdown\Converter\CodeConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\Converter\TextConverter;
use League\HTMLToMarkdown\Environment;
use League\HTMLToMarkdown\HtmlConverter;
use Spatie\Browsershot\Browsershot;

class GetPage
{
Expand All @@ -24,27 +28,24 @@ public static function make(Collection $collection): self
return new static($collection);
}

public function handle(string $url, bool $parseHtml = true): string
public function handle(string $url, bool $parseHtml = true): WebContentResultsDto
{
$results = Browsershot::url($url)
->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
->dismissDialogs()
->fullPage();

$name = md5($url).'.pdf';
/**
* @TODO this can repeat
* @TODO
* Make this a driver like the rest of the system
*/
$name = md5($url).'.pdf';

Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf());

$body = $results->bodyHtml();

if ($parseHtml) {
$body = $this->parseHtml($body);
if (Setting::getSecret('fire_crawl', 'api_key')) {
Log::info('Using FireCrawl');
$results = FireCrawlClient::scrape($url);
} else {
Log::info('Using Default Browsershot');
/** @var WebContentResultsDto $results */
$results = DefaultClient::scrape($url);
Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->browserShot->pdf());
}

return $body;
return $results;
}

public function parseHtml(string $html): string
Expand Down
7 changes: 7 additions & 0 deletions app/Domains/WebParser/BaseWebParserClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?php

namespace App\Domains\WebParser;

/**
 * Shared abstract ancestor for web-page scraping clients.
 *
 * The class currently declares no members of its own; concrete clients
 * such as FireCrawlClient extend it to share a common parent type.
 *
 * NOTE(review): no `scrape()` contract is declared here, so the method
 * signature is only a convention between clients — confirm whether it
 * should be enforced by this base class.
 */
abstract class BaseWebParserClient
{
}
56 changes: 56 additions & 0 deletions app/Domains/WebParser/DefaultClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
<?php

namespace App\Domains\WebParser;

use League\HTMLToMarkdown\Converter\CodeConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\Converter\TextConverter;
use League\HTMLToMarkdown\Environment;
use League\HTMLToMarkdown\HtmlConverter;
use LlmLaraHub\LlmDriver\BaseClient;
use Spatie\Browsershot\Browsershot;

/**
 * Default web scraper: renders a page with Browsershot and converts the
 * resulting body HTML into Markdown.
 *
 * NOTE(review): this class extends the LLM driver BaseClient, while
 * FireCrawlClient extends BaseWebParserClient — confirm which parent is
 * intended for web parser clients.
 */
class DefaultClient extends BaseClient
{
    /**
     * Fetch the given URL in a headless browser and wrap the outcome in a DTO.
     *
     * The configured Browsershot instance is returned inside the DTO
     * (`browserShot`) so callers can produce further artefacts from it.
     *
     * @param  string  $url  Address of the page to scrape.
     * @return WebContentResultsDto Markdown content plus a title and description
     *                              derived from the first characters of that content.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $browser = Browsershot::url($url)
            ->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
            ->dismissDialogs()
            ->fullPage();

        // Read the body once and reuse it: asking Browsershot for the HTML a
        // second time would render the page again and could even return
        // different markup than the content that was parsed.
        $html = $browser->bodyHtml();

        $markdown = $this->parseHtml($html);

        return WebContentResultsDto::from([
            'title' => str($markdown)->limit(128)->title()->toString(),
            'description' => str($markdown)->limit(256)->title()->toString(),
            'content' => $markdown,
            'content_raw' => $html,
            'url' => $url,
            'browserShot' => $browser,
        ]);
    }

    /**
     * Convert an HTML fragment into trimmed Markdown.
     *
     * The converter is configured to strip remaining tags and to remove
     * nav, footer, header, script, style and meta nodes; table, code,
     * preformatted and text converters are registered explicitly.
     *
     * @param  string  $html  Raw HTML to convert.
     * @return string Markdown with surrounding whitespace removed.
     */
    public function parseHtml(string $html): string
    {
        $environment = new Environment([
            'strip_tags' => true,
            'suppress_errors' => true,
            'hard_break' => true,
            'strip_placeholder_links' => true,
            'remove_nodes' => 'nav footer header script style meta',
        ]);
        $environment->addConverter(new TableConverter());
        $environment->addConverter(new CodeConverter());
        $environment->addConverter(new PreformattedConverter());
        $environment->addConverter(new TextConverter());

        $converter = new HtmlConverter($environment);

        $markdown = $converter->convert($html);

        return str($markdown)->trim()->toString();
    }
}
36 changes: 36 additions & 0 deletions app/Domains/WebParser/FireCrawlClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace App\Domains\WebParser;

use App\Domains\WebParser\Results\FireCrawResultsDto;
use App\Models\Setting;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;

/**
 * Web scraper backed by the FireCrawl HTTP API.
 *
 * The base URL and API key are read from the `fire_crawl` secrets on Setting.
 */
class FireCrawlClient extends BaseWebParserClient
{
    /**
     * Ask FireCrawl to scrape a single URL and map the JSON response to a DTO.
     *
     * @param  string  $url  Address of the page to scrape.
     * @return WebContentResultsDto A FireCrawResultsDto built from the response payload.
     *
     * @throws \Exception When the API responds with a client or server error.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $response = $this->getClient()->post('/scrape', [
            'url' => $url,
        ]);

        if ($response->failed()) {
            // Use the raw body: json() decodes to an array, and concatenating
            // an array onto a string yields "Array" plus an
            // "Array to string conversion" warning instead of the API message.
            throw new \Exception('FireCrawl API Error '.$response->body());
        }

        return FireCrawResultsDto::from($response->json());
    }

    /**
     * Build an HTTP client pointed at the configured FireCrawl endpoint with
     * the bearer token attached.
     *
     * NOTE(review): assumes both secrets are set; callers visible in this
     * change only check `api_key` before using the client — confirm `api_url`
     * is always present too.
     */
    protected function getClient(): PendingRequest
    {
        $url = Setting::getSecret('fire_crawl', 'api_url');
        $token = Setting::getSecret('fire_crawl', 'api_key');

        return Http::baseUrl($url)->withHeaders([
            'Authorization' => 'Bearer '.$token,
        ]);
    }
}
24 changes: 24 additions & 0 deletions app/Domains/WebParser/Results/FireCrawResultsDto.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?php

namespace App\Domains\WebParser\Results;

use App\Domains\WebParser\WebContentResultsDto;
use Spatie\LaravelData\Attributes\MapInputName;

/**
 * Result DTO populated from a FireCrawl `/scrape` response.
 *
 * Each property is mapped from a nested key of the response payload via
 * MapInputName (dot notation), e.g. `data.metadata.title` -> `$title`.
 */
class FireCrawResultsDto extends WebContentResultsDto
{
    public function __construct(
        #[MapInputName('data.metadata.title')]
        public string $title,
        #[MapInputName('data.markdown')]
        public string $content,
        #[MapInputName('data.content')]
        public string $content_raw,
        #[MapInputName('data.metadata.sourceURL')]
        public string $url,
        #[MapInputName('data.metadata.description')]
        public string $description = '',
    ) {
        // The parent constructor is not invoked, so the inherited typed
        // property would otherwise stay uninitialized and throw on first read.
        // FireCrawl results never carry a Browsershot instance.
        $this->browserShot = null;
    }
}
19 changes: 19 additions & 0 deletions app/Domains/WebParser/WebContentResultsDto.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?php

namespace App\Domains\WebParser;

use Spatie\Browsershot\Browsershot;
use Spatie\LaravelData\Data;

/**
 * Normalised result of scraping a single web page.
 *
 * Holds the page title, its extracted content, the source URL, an optional
 * description and, when the page was rendered through Browsershot, the
 * Browsershot instance that produced it.
 */
class WebContentResultsDto extends Data
{
    public function __construct(
        public string $title,
        public string $content,
        public string $url,
        public string $description = '',
        public ?Browsershot $browserShot = null,
    ) {}
}
17 changes: 17 additions & 0 deletions app/Http/Controllers/SettingController.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,23 @@ public function updateOllama(Request $request, Setting $setting)
return back();
}

/**
 * Validate and persist the FireCrawl credentials on the given Setting.
 *
 * The validated `api_key` / `api_url` pair replaces the `fire_crawl` entry
 * of the setting's secrets; afterwards the setting's step is updated and
 * the controller cache is cleared before redirecting back.
 */
public function updateFireCrawl(Request $request, Setting $setting)
{
    $credentials = $request->validate([
        'api_key' => 'string|required',
        'api_url' => 'string|required',
    ]);

    // Read-modify-write: the secrets attribute is replaced as a whole so the
    // model registers the change.
    $allSecrets = $setting->secrets;
    $allSecrets['fire_crawl'] = $credentials;

    $setting->secrets = $allSecrets;
    $setting->save();

    $setting->updateStep($setting);
    $this->clearCache();

    return back();
}

public function updateGroq(Request $request, Setting $setting)
{
$validated = $request->validate([
Expand Down
8 changes: 5 additions & 3 deletions app/Jobs/GetWebContentJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,9 @@ public function handle(): void
->handle($this->webResponseDto->url, true);

$prompt = Templatizer::appendContext(true)
->handle($this->source->getPrompt(), $htmlResults);
->handle($this->source->getPrompt(), $htmlResults->content);

put_fixture('web_page_prompt.txt', $prompt, false);

$results = LlmDriverFacade::driver(
$this->source->getDriver()
Expand Down Expand Up @@ -112,7 +114,7 @@ public function handle(): void
'source_id' => $this->source->id,
'type' => TypesEnum::HTML,
'subject' => to_utf8($title),
'document_md5' => md5($htmlResults),
'document_md5' => md5($htmlResults->content),
'link' => $this->webResponseDto->url,
'collection_id' => $this->source->collection_id,
],
Expand All @@ -121,7 +123,7 @@ public function handle(): void
'file_path' => $this->webResponseDto->url,
'status_summary' => StatusEnum::Pending,
'meta_data' => $this->webResponseDto->toArray(),
'original_content' => $htmlResults,
'original_content' => $htmlResults->content,
]
);

Expand Down
5 changes: 3 additions & 2 deletions app/Models/Message.php
Original file line number Diff line number Diff line change
Expand Up @@ -223,11 +223,12 @@ public function run(): void
$meta_data->driver = $chat->getDriver();
$message->updateQuietly(['meta_data' => $meta_data]);

if ($message->meta_data?->tool === 'completion') {
if ($message->meta_data?->tool === 'chat') {
Log::info('[LaraChain] Running Simple Completion');

$messages = $chat->getChatResponse();
$response = LlmDriverFacade::driver($chat->getDriver())->chat($messages);
$response = LlmDriverFacade::driver($chat->getDriver())
->chat($messages);
$response = $response->content;

$chat->addInput(
Expand Down
30 changes: 30 additions & 0 deletions database/migrations/2024_08_06_002122_add_fields_to_settings.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php

use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;

return new class extends Migration
{
    /**
     * Run the migrations.
     *
     * Adds three nullable long-text prompt columns to the settings table.
     */
    public function up(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->longText('main_prompt')->nullable();
            $table->longText('source_prompt')->nullable();
            $table->longText('output_prompt')->nullable();
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the columns added by up() so a rollback restores the previous
     * schema and the migration can be re-run cleanly.
     */
    public function down(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->dropColumn(['main_prompt', 'source_prompt', 'output_prompt']);
        });
    }
};
22 changes: 22 additions & 0 deletions resources/js/Components/ScrollButton.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<!--
  ScrollButton: a circular button pinned to the top-right corner of the
  viewport (fixed top-4 right-4). Clicking it calls scrollToBottom from the
  component's script. The inline SVG draws the arrow icon shown on the button.
-->
<template>
<button @click="scrollToBottom"

class="fixed top-4 right-4 btn btn-secondary btn-circle text-white">
<svg xmlns="http://www.w3.org/2000/svg" fill="none" viewBox="0 0 24 24" stroke-width="3" stroke="currentColor" class="size-6">
<path stroke-linecap="round" stroke-linejoin="round" d="M19.5 13.5 12 21m0 0-7.5-7.5M12 21V3" />
</svg>

</button>
</template>

<script setup>
import { ref } from 'vue'

// Holds the element to scroll to. Nothing in this component's template binds
// it; it is exposed below — presumably so a parent can supply the target
// (TODO confirm against the component's usage).
const bottomTarget = ref(null)

// Smoothly scroll the target into view; does nothing while no target is set.
function scrollToBottom() {
    const target = bottomTarget.value
    if (target == null) {
        return
    }
    target.scrollIntoView({ behavior: 'smooth' })
}

defineExpose({ bottomTarget })
</script>
Loading

0 comments on commit b0340e5

Please sign in to comment.