-
Notifications
You must be signed in to change notification settings - Fork 24
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #44 from LlmLaraHub/add_firecrawl
Add FireCrawl Scraper
- Loading branch information
Showing
33 changed files
with
1,689 additions
and
210 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
<?php | ||
|
||
namespace App\Domains\WebParser; | ||
|
||
/**
 * Common parent type for web-parser clients in this namespace.
 *
 * The class declares no members of its own; it only gives concrete
 * clients a shared base type to extend.
 */
abstract class BaseWebParserClient
{
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
<?php | ||
|
||
namespace App\Domains\WebParser; | ||
|
||
use League\HTMLToMarkdown\Converter\CodeConverter; | ||
use League\HTMLToMarkdown\Converter\PreformattedConverter; | ||
use League\HTMLToMarkdown\Converter\TableConverter; | ||
use League\HTMLToMarkdown\Converter\TextConverter; | ||
use League\HTMLToMarkdown\Environment; | ||
use League\HTMLToMarkdown\HtmlConverter; | ||
use LlmLaraHub\LlmDriver\BaseClient; | ||
use Spatie\Browsershot\Browsershot; | ||
|
||
/**
 * Web-parser client that renders a page with Browsershot and converts the
 * rendered body HTML into Markdown.
 *
 * NOTE(review): this class extends the LLM driver's BaseClient, while the
 * sibling FireCrawlClient extends BaseWebParserClient — confirm which parent
 * is intended.
 */
class DefaultClient extends BaseClient
{
    /**
     * Render the given URL and return its content as a DTO.
     *
     * The title and description are derived from the converted Markdown
     * (limited to 128 and 256 characters respectively, then title-cased).
     *
     * @param  string  $url  Address of the page to scrape.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $browser = Browsershot::url($url)
            ->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
            ->dismissDialogs()
            ->fullPage();

        // Fetch the rendered body exactly once: every bodyHtml() call asks
        // Browsershot to render the page again, which is slow and could make
        // 'content' and 'content_raw' come from two different renders.
        $bodyHtml = $browser->bodyHtml();

        $plainResults = $this->parseHtml($bodyHtml);

        return WebContentResultsDto::from([
            'title' => str($plainResults)->limit(128)->title()->toString(),
            'description' => str($plainResults)->limit(256)->title()->toString(),
            'content' => $plainResults,
            'content_raw' => $bodyHtml,
            'url' => $url,
            'browserShot' => $browser,
        ]);
    }

    /**
     * Convert an HTML fragment to trimmed Markdown.
     *
     * The environment is built by hand and only the table, code,
     * preformatted and text converters are registered on it.
     *
     * @param  string  $html  Raw HTML to convert.
     * @return string Markdown with leading/trailing whitespace removed.
     */
    public function parseHtml(string $html): string
    {
        $environment = new Environment([
            'strip_tags' => true,
            'suppress_errors' => true,
            'hard_break' => true,
            'strip_placeholder_links' => true,
            'remove_nodes' => 'nav footer header script style meta',
        ]);
        $environment->addConverter(new TableConverter());
        $environment->addConverter(new CodeConverter());
        $environment->addConverter(new PreformattedConverter());
        $environment->addConverter(new TextConverter());

        $converter = new HtmlConverter($environment);

        $markdown = $converter->convert($html);

        return str($markdown)->trim()->toString();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,36 @@ | ||
<?php | ||
|
||
namespace App\Domains\WebParser; | ||
|
||
use App\Domains\WebParser\Results\FireCrawResultsDto; | ||
use App\Models\Setting; | ||
use Illuminate\Http\Client\PendingRequest; | ||
use Illuminate\Support\Facades\Http; | ||
|
||
/**
 * Web-parser client that delegates scraping to the FireCrawl HTTP API.
 */
class FireCrawlClient extends BaseWebParserClient
{
    /**
     * Ask FireCrawl to scrape the given URL and map the JSON response to a DTO.
     *
     * @param  string  $url  Address of the page to scrape.
     *
     * @throws \Exception When the API responds with a client or server error.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $results = $this->getClient()->post('/scrape', [
            'url' => $url,
        ]);

        if ($results->failed()) {
            // json() returns an array (or null), which cannot be concatenated
            // into a string; report the status code and raw body instead so
            // the real API error is visible.
            throw new \Exception(sprintf(
                'FireCrawl API Error (HTTP %d): %s',
                $results->status(),
                $results->body()
            ));
        }

        $data = $results->json();

        return FireCrawResultsDto::from($data);
    }

    /**
     * Build an HTTP client pointed at the configured FireCrawl endpoint.
     *
     * The base URL and bearer token are read from the 'fire_crawl' secret
     * ('api_url' and 'api_key').
     */
    protected function getClient(): PendingRequest
    {
        $url = Setting::getSecret('fire_crawl', 'api_url');
        $token = Setting::getSecret('fire_crawl', 'api_key');

        return Http::baseUrl($url)->withHeaders([
            'Authorization' => 'Bearer '.$token,
        ]);
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
<?php | ||
|
||
namespace App\Domains\WebParser\Results; | ||
|
||
use App\Domains\WebParser\WebContentResultsDto; | ||
use Spatie\LaravelData\Attributes\MapInputName; | ||
|
||
/**
 * Scrape result built from a FireCrawl API response.
 *
 * Each property is mapped from a dotted path inside the response payload
 * via MapInputName.
 */
class FireCrawResultsDto extends WebContentResultsDto
{
    public function __construct(
        #[MapInputName('data.metadata.title')]
        public string $title,
        #[MapInputName('data.markdown')]
        public string $content,
        #[MapInputName('data.content')]
        public string $content_raw,
        #[MapInputName('data.metadata.sourceURL')]
        public string $url,
        #[MapInputName('data.metadata.description')]
        public string $description = '',
    ) {
        // The parent constructor is not invoked here, so the inherited typed
        // property would otherwise stay uninitialized and throw on first
        // read. FireCrawl results never carry a Browsershot instance.
        $this->browserShot = null;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<?php | ||
|
||
namespace App\Domains\WebParser; | ||
|
||
use Spatie\Browsershot\Browsershot; | ||
use Spatie\LaravelData\Data; | ||
|
||
/**
 * Result of scraping a single web page.
 */
class WebContentResultsDto extends Data
{
    /**
     * @param  string  $title  Page title.
     * @param  string  $content  Converted page content.
     * @param  string  $url  Address the content was scraped from.
     * @param  string  $description  Optional short description.
     * @param  Browsershot|null  $browserShot  Browsershot instance used for the scrape, when there is one.
     * @param  string  $content_raw  Unconverted page body. Appended last with a
     *                               default so existing positional callers keep
     *                               working; without this property a
     *                               'content_raw' value passed to from() was
     *                               silently dropped.
     */
    public function __construct(
        public string $title,
        public string $content,
        public string $url,
        public string $description = '',
        public ?Browsershot $browserShot = null,
        public string $content_raw = '',
    ) {
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
30 changes: 30 additions & 0 deletions
30
database/migrations/2024_08_06_002122_add_fields_to_settings.php
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
<?php | ||
|
||
use Illuminate\Database\Migrations\Migration; | ||
use Illuminate\Database\Schema\Blueprint; | ||
use Illuminate\Support\Facades\Schema; | ||
|
||
return new class extends Migration
{
    /**
     * Run the migrations.
     *
     * Adds three nullable long-text prompt columns to the settings table.
     */
    public function up(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->longText('main_prompt')->nullable();
            $table->longText('source_prompt')->nullable();
            $table->longText('output_prompt')->nullable();
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the columns added by up() so that a rollback followed by a fresh
     * migrate does not fail on already-existing columns.
     */
    public function down(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->dropColumn(['main_prompt', 'source_prompt', 'output_prompt']);
        });
    }
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<template>
    <button
        class="fixed top-4 right-4 btn btn-secondary btn-circle text-white"
        @click="scrollToBottom"
    >
        <svg
            xmlns="http://www.w3.org/2000/svg"
            fill="none"
            viewBox="0 0 24 24"
            stroke-width="3"
            stroke="currentColor"
            class="size-6"
        >
            <path
                stroke-linecap="round"
                stroke-linejoin="round"
                d="M19.5 13.5 12 21m0 0-7.5-7.5M12 21V3"
            />
        </svg>
    </button>
</template>
|
||
<script setup>
import { ref } from 'vue'

// Holds the element to scroll to. It is exposed below and is not bound to
// anything inside this component's own template.
const bottomTarget = ref(null)

// Click handler: smoothly scrolls the target into view, if one is set.
function scrollToBottom() {
    const target = bottomTarget.value
    if (target === null || target === undefined) {
        return
    }
    target.scrollIntoView({ behavior: 'smooth' })
}

defineExpose({ bottomTarget })
</script>
Oops, something went wrong.