Skip to content

Commit

Permalink
hmm stan might be right on this
Browse files Browse the repository at this point in the history
  • Loading branch information
alnutile committed Aug 6, 2024
1 parent b433057 commit 4b0f100
Show file tree
Hide file tree
Showing 21 changed files with 397 additions and 64 deletions.
35 changes: 19 additions & 16 deletions app/Domains/Sources/WebSearch/GetPage.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,12 @@

namespace App\Domains\Sources\WebSearch;

use Facades\App\Domains\WebParser\DefaultClient;
use Facades\App\Domains\WebParser\FireCrawlClient;
use App\Domains\WebParser\WebContentResultsDto;
use App\Models\Collection;
use App\Models\Setting;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use League\HTMLToMarkdown\Converter\CodeConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
Expand All @@ -24,27 +29,25 @@ public static function make(Collection $collection): self
return new static($collection);
}

public function handle(string $url, bool $parseHtml = true): string
public function handle(string $url, bool $parseHtml = true): WebContentResultsDto
{
$results = Browsershot::url($url)
->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
->dismissDialogs()
->fullPage();

$name = md5($url).'.pdf';
/**
* @TODO this can repeat
* @TODO
* Make this a driver like the rest of the system
*/
$name = md5($url).'.pdf';

Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf());

$body = $results->bodyHtml();

if ($parseHtml) {
$body = $this->parseHtml($body);
if(Setting::getSecret('fire_crawl', 'api_token')) {
Log::info('Using FireCrawl');
$results = FireCrawlClient::scrape($url);
} else {
Log::info('Using Default Browsershot');
/** @var WebContentResultsDto $results */
$results = DefaultClient::scrape($url);
/** @phpstan-ignore-next-line */
Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf());
}

return $body;
return $results;
}

public function parseHtml(string $html): string
Expand Down
7 changes: 7 additions & 0 deletions app/Domains/WebParser/BaseWebParserClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<?php

namespace App\Domains\WebParser;

/**
 * Shared parent type for web-parser clients.
 *
 * The class declares no members of its own; it only provides a common
 * ancestor that concrete clients can extend.
 */
abstract class BaseWebParserClient
{
    // Intentionally empty.
}
58 changes: 58 additions & 0 deletions app/Domains/WebParser/DefaultClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
<?php

namespace App\Domains\WebParser;

use App\Domains\Sources\WebSearch\GetPage;
use League\HTMLToMarkdown\Converter\CodeConverter;
use League\HTMLToMarkdown\Converter\PreformattedConverter;
use League\HTMLToMarkdown\Converter\TableConverter;
use League\HTMLToMarkdown\Converter\TextConverter;
use League\HTMLToMarkdown\Environment;
use League\HTMLToMarkdown\HtmlConverter;
use LlmLaraHub\LlmDriver\BaseClient;
use Spatie\Browsershot\Browsershot;

/**
 * Default web scraper: renders a page with Browsershot and converts the
 * resulting HTML body to Markdown.
 *
 * NOTE(review): this class extends the LLM driver's BaseClient, whereas
 * FireCrawlClient extends BaseWebParserClient — confirm which parent is
 * intended.
 */
class DefaultClient extends BaseClient
{
    /**
     * Scrape a URL and wrap the outcome in a WebContentResultsDto.
     *
     * The title and description are derived from the first 128 / 256
     * characters of the converted Markdown, title-cased.
     *
     * @param  string  $url  Address of the page to render.
     * @return WebContentResultsDto  DTO carrying the Markdown content, the
     *                               source URL and the Browsershot instance.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $results = Browsershot::url($url)
            ->userAgent('DailyAI Studio Browser 1.0, helping users automate workflows')
            ->dismissDialogs()
            ->fullPage();

        // Fetch the rendered HTML once and reuse it: every bodyHtml() call
        // renders the page again in the headless browser.
        $rawHtml = $results->bodyHtml();

        $plainResults = $this->parseHtml($rawHtml);

        return WebContentResultsDto::from([
            'title' => str($plainResults)->limit(128)->title()->toString(),
            'description' => str($plainResults)->limit(256)->title()->toString(),
            'content' => $plainResults,
            // NOTE(review): WebContentResultsDto declares no `content_raw`
            // constructor parameter in the version visible here — confirm
            // whether this key is actually retained.
            'content_raw' => $rawHtml,
            'url' => $url,
            'browserShot' => $results,
        ]);
    }

    /**
     * Convert an HTML fragment to trimmed Markdown.
     *
     * Tags are stripped, conversion errors are suppressed, and the nodes
     * listed under `remove_nodes` are dropped before conversion.
     *
     * @param  string  $html  Raw HTML to convert.
     * @return string  Markdown with surrounding whitespace removed.
     */
    public function parseHtml(string $html): string
    {
        $environment = new Environment([
            'strip_tags' => true,
            'suppress_errors' => true,
            'hard_break' => true,
            'strip_placeholder_links' => true,
            'remove_nodes' => 'nav footer header script style meta',
        ]);
        $environment->addConverter(new TableConverter());
        $environment->addConverter(new CodeConverter());
        $environment->addConverter(new PreformattedConverter());
        $environment->addConverter(new TextConverter());

        $converter = new HtmlConverter($environment);

        $markdown = $converter->convert($html);

        return str($markdown)->trim()->toString();
    }
}
36 changes: 36 additions & 0 deletions app/Domains/WebParser/FireCrawlClient.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
<?php

namespace App\Domains\WebParser;

use App\Domains\WebParser\Results\FireCrawResultsDto;
use App\Models\Setting;
use Illuminate\Http\Client\PendingRequest;
use Illuminate\Support\Facades\Http;

/**
 * Web-parser client that delegates scraping to the FireCrawl HTTP API.
 */
class FireCrawlClient extends BaseWebParserClient
{
    /**
     * Ask FireCrawl to scrape a URL and map the JSON response onto a DTO.
     *
     * @param  string  $url  Address of the page to scrape.
     * @return WebContentResultsDto  Populated from the API response.
     *
     * @throws \Exception  When the API responds with a client or server error.
     */
    public function scrape(string $url): WebContentResultsDto
    {
        $results = $this->getClient()->post('/scrape', [
            'url' => $url,
        ]);

        if ($results->failed()) {
            // json() returns a decoded array (or null), which cannot be
            // concatenated into a string; report the status and raw body so
            // the real API error is visible.
            throw new \Exception(
                'FireCrawl API Error '.$results->status().' '.$results->body()
            );
        }

        $data = $results->json();

        return FireCrawResultsDto::from($data);
    }

    /**
     * Build an HTTP client pointed at the configured FireCrawl endpoint,
     * authenticated with the stored token as a Bearer header.
     */
    protected function getClient(): PendingRequest
    {
        $url = Setting::getSecret('fire_crawl', 'api_url');
        $token = Setting::getSecret('fire_crawl', 'api_token');

        return Http::baseUrl($url)->withHeaders([
            'Authorization' => 'Bearer '.$token,
        ]);
    }
}
24 changes: 24 additions & 0 deletions app/Domains/WebParser/Results/FireCrawResultsDto.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?php

namespace App\Domains\WebParser\Results;

use App\Domains\WebParser\WebContentResultsDto;
use Spatie\LaravelData\Attributes\MapInputName;

/**
 * DTO populated from a FireCrawl scrape response.
 *
 * Each property is mapped from a dotted path in the response payload via
 * MapInputName.
 */
class FireCrawResultsDto extends WebContentResultsDto
{
    /**
     * @param  string  $title  Mapped from `data.metadata.title`.
     * @param  string  $description  Mapped from `data.metadata.description`.
     * @param  string  $content  Mapped from `data.markdown`.
     * @param  string  $content_raw  Mapped from `data.content`.
     * @param  string  $url  Mapped from `data.metadata.sourceURL`.
     */
    public function __construct(
        #[MapInputName('data.metadata.title')]
        public string $title,
        #[MapInputName('data.metadata.description')]
        public string $description,
        #[MapInputName('data.markdown')]
        public string $content,
        #[MapInputName('data.content')]
        public string $content_raw,
        #[MapInputName('data.metadata.sourceURL')]
        public string $url,
    ) {
        // Run the parent constructor so the inherited typed property
        // $browserShot is initialised (to null); otherwise reading it
        // throws "must not be accessed before initialization".
        parent::__construct($title, $description, $content, $url);
    }
}
19 changes: 19 additions & 0 deletions app/Domains/WebParser/WebContentResultsDto.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?php

namespace App\Domains\WebParser;

use Spatie\Browsershot\Browsershot;
use Spatie\LaravelData\Data;

/**
 * Result of scraping a single web page.
 */
class WebContentResultsDto extends Data
{
    /**
     * @param  string  $title  Title of the page.
     * @param  string  $description  Short description of the page.
     * @param  string  $content  Page content.
     * @param  string  $url  Address the content was obtained from.
     * @param  Browsershot|null  $browserShot  Browsershot instance, when one
     *                                         was used; null otherwise.
     */
    public function __construct(
        public string $title,
        public string $description,
        public string $content,
        public string $url,
        public ?Browsershot $browserShot = null,
    ) {}
}
17 changes: 17 additions & 0 deletions app/Http/Controllers/SettingController.php
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,23 @@ public function updateOllama(Request $request, Setting $setting)
return back();
}

/**
 * Validate and persist the FireCrawl credentials for a setting record,
 * then advance the setup step and clear the cache.
 *
 * @param  Request  $request  Must carry `api_key` and `api_url` strings.
 * @param  Setting  $setting  Record whose `secrets` are updated.
 */
public function updateFireCrawl(Request $request, Setting $setting)
{
    $validated = $request->validate([
        'api_key' => 'string|required',
        'api_url' => 'string|required',
    ]);

    $secrets = $setting->secrets;

    // The settings form submits the token as `api_key`, but the scraping
    // code looks it up via Setting::getSecret('fire_crawl', 'api_token').
    // Store it under both names so the form can still pre-fill `api_key`
    // and the readers find `api_token`.
    // NOTE(review): assumes getSecret() reads secrets[group][key] — confirm.
    $secrets['fire_crawl'] = $validated + [
        'api_token' => $validated['api_key'],
    ];

    $setting->secrets = $secrets;
    $setting->save();
    $setting->updateStep($setting);
    $this->clearCache();

    return back();
}

public function updateGroq(Request $request, Setting $setting)
{
$validated = $request->validate([
Expand Down
6 changes: 3 additions & 3 deletions app/Jobs/GetWebContentJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ public function handle(): void
->handle($this->webResponseDto->url, true);

$prompt = Templatizer::appendContext(true)
->handle($this->source->getPrompt(), $htmlResults);
->handle($this->source->getPrompt(), $htmlResults->content);

$results = LlmDriverFacade::driver(
$this->source->getDriver()
Expand Down Expand Up @@ -112,7 +112,7 @@ public function handle(): void
'source_id' => $this->source->id,
'type' => TypesEnum::HTML,
'subject' => to_utf8($title),
'document_md5' => md5($htmlResults),
'document_md5' => md5($htmlResults->content),
'link' => $this->webResponseDto->url,
'collection_id' => $this->source->collection_id,
],
Expand All @@ -121,7 +121,7 @@ public function handle(): void
'file_path' => $this->webResponseDto->url,
'status_summary' => StatusEnum::Pending,
'meta_data' => $this->webResponseDto->toArray(),
'original_content' => $htmlResults,
'original_content' => $htmlResults->content,
]
);

Expand Down
30 changes: 30 additions & 0 deletions database/migrations/2024_08_06_002122_add_fields_to_settings.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
<?php

use Illuminate\Database\Migrations\Migration;
use Illuminate\Database\Schema\Blueprint;
use Illuminate\Support\Facades\Schema;

return new class extends Migration
{
    /**
     * Run the migrations.
     *
     * Adds three nullable long-text prompt columns to the settings table.
     */
    public function up(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->longText('main_prompt')->nullable();
            $table->longText('source_prompt')->nullable();
            $table->longText('output_prompt')->nullable();
        });
    }

    /**
     * Reverse the migrations.
     *
     * Drops the columns added by up() so a rollback restores the previous
     * schema and the migration can be run again afterwards.
     */
    public function down(): void
    {
        Schema::table('settings', function (Blueprint $table) {
            $table->dropColumn([
                'main_prompt',
                'source_prompt',
                'output_prompt',
            ]);
        });
    }
};
2 changes: 1 addition & 1 deletion resources/js/Pages/Settings/Partials/ClaudeSecrets.vue
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const updateSecrets = () => {
form.put(route('settings.update.claude', {
setting: props.setting.id,
}), {
errorBag: 'updateProfileInformation',
errorBag: 'updateClaude',
preserveScroll: true,
});
};
Expand Down
82 changes: 82 additions & 0 deletions resources/js/Pages/Settings/Partials/FireCrawl.vue
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
<script setup>
// Settings partial for storing the FireCrawl API token and base URL.
import { ref } from 'vue';
import { Link, router, useForm } from '@inertiajs/vue3';
import ActionMessage from '@/Components/ActionMessage.vue';
import FormSection from '@/Components/FormSection.vue';
import InputError from '@/Components/InputError.vue';
import InputLabel from '@/Components/InputLabel.vue';
import PrimaryButton from '@/Components/PrimaryButton.vue';
import SecondaryButton from '@/Components/SecondaryButton.vue';
import TextInput from '@/Components/TextInput.vue';
import SecretInput from "@/Components/SecretInput.vue";

const props = defineProps({
    setting: Object,
});

// Pre-fill from any stored secrets; the URL falls back to the hosted API.
const form = useForm({
    _method: 'PUT',
    api_key: props.setting.secrets?.fire_crawl?.api_key ,
    api_url: props.setting.secrets?.fire_crawl?.api_url ?? "https://api.firecrawl.dev/v0",
});

// Submit the credentials to the FireCrawl settings route.
const updateSecrets = () => {
    form.put(route('settings.update.fire_crawl', {
        setting: props.setting.id,
    }), {
        errorBag: 'updateFireCrawlInformation',
        preserveScroll: true,
    });
};
</script>

<template>
    <FormSection @submitted="updateSecrets">
        <template #title>
            Add FireCrawl Token and Url
        </template>

        <template #description>
            This service can boost the web scraping quality over the
            default scraper built in.
            <a
                class="underline"
                href="https://docs.firecrawl.dev/features/scrape" target="_blank">here</a>
        </template>

        <template #form>
            <!-- Api Token -->
            <!-- Each label targets its own field id; previously both used
                 "name", so the token label pointed at the URL input. -->
            <!-- NOTE(review): assumes SecretInput forwards `id` to its
                 input element — confirm. -->
            <div class="col-span-6 sm:col-span-4">
                <InputLabel for="fire_crawl_api_key" value="Api Token" />
                <SecretInput id="fire_crawl_api_key" v-model="form.api_key" class="mt-1 block w-full" />
                <InputError :message="form.errors.api_key" class="mt-2" />
            </div>

            <!-- Api Url -->
            <div class="col-span-6 sm:col-span-4">
                <InputLabel for="fire_crawl_api_url" value="Api Url" />
                <TextInput
                    id="fire_crawl_api_url"
                    v-model="form.api_url"
                    type="text"
                    class="mt-1 block w-full"
                />
                <InputError :message="form.errors.api_url" class="mt-2" />
            </div>
        </template>

        <template #actions>
            <ActionMessage :on="form.recentlySuccessful" class="me-3">
                Saved.
            </ActionMessage>

            <PrimaryButton :class="{ 'opacity-25': form.processing }" :disabled="form.processing">
                Save
            </PrimaryButton>
        </template>
    </FormSection>
</template>
2 changes: 1 addition & 1 deletion resources/js/Pages/Settings/Partials/GroqSecrets.vue
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const updateSecrets = () => {
form.put(route('settings.update.groq', {
setting: props.setting.id,
}), {
errorBag: 'updateProfileInformation',
errorBag: 'updateGroq',
preserveScroll: true,
});
};
Expand Down
2 changes: 1 addition & 1 deletion resources/js/Pages/Settings/Partials/OllamaApiSecrets.vue
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ const updateSecrets = () => {
form.put(route('settings.update.ollama', {
setting: props.setting.id,
}), {
errorBag: 'updateProfileInformation',
errorBag: 'updateOllama',
preserveScroll: true,
});
};
Expand Down
Loading

0 comments on commit 4b0f100

Please sign in to comment.