Skip to content

Commit

Permalink
start the web source
Browse files Browse the repository at this point in the history
  • Loading branch information
alnutile committed May 5, 2024
1 parent 081b47e commit 83c83ec
Show file tree
Hide file tree
Showing 23 changed files with 300 additions and 281 deletions.
6 changes: 6 additions & 0 deletions Modules/LlmDriver/app/BaseClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -239,4 +239,10 @@ public function onQueue(): string
{
return 'api_request';
}

/**
 * Resolve the maximum token size configured for the given LLM driver.
 *
 * Reads `max_tokens` from the `llmdriver.drivers.<driver>` config entry and
 * falls back to 8192 when the driver entry or the key is missing, or when the
 * configured value is not numeric.
 *
 * @param  string  $driver  Driver key under `llmdriver.drivers`.
 * @return int Maximum token size for that driver.
 */
public function getMaxTokenSize(string $driver): int
{
    $settings = config("llmdriver.drivers.$driver");
    $maxTokens = data_get($settings, 'max_tokens', 8192);

    // The configured value may be a numeric string, or null when the key
    // exists but is unset; normalise it so the int return type always holds.
    return is_numeric($maxTokens) ? (int) $maxTokens : 8192;
}
}
8 changes: 8 additions & 0 deletions Modules/LlmDriver/tests/Feature/MockClientTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -66,4 +66,12 @@ public function test_Chat(): void
$this->assertInstanceOf(CompletionResponse::class, $results);

}

/**
 * The mock client reports the max token size configured for the claude driver.
 */
public function test_max_token(): void
{
    $client = new MockClient();

    $maxTokens = $client->getMaxTokenSize('claude');

    // assertSame pins the int type as well as the value; assertEquals would
    // also accept a loosely equal value such as the string "4096".
    $this->assertSame(4096, $maxTokens);
}
}
2 changes: 1 addition & 1 deletion app/Domains/Sources/WebSearch/WebSearchProvider.php
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ public function register(): void
$driver = config('llmdriver.sources.search_driver');
$client = new WebSearchDriverClient();

return $client->driver($driver);
return $client;
});
}

Expand Down
83 changes: 67 additions & 16 deletions app/Domains/Sources/WebSearchSource.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,30 +4,81 @@

use App\Domains\Sources\WebSearch\Response\SearchResponseDto;
use App\Domains\Sources\WebSearch\WebSearchFacade;
use App\Jobs\GetWebContentJob;
use App\Models\Source;
use Illuminate\Support\Facades\Bus;
use Illuminate\Support\Facades\Log;
use LlmLaraHub\LlmDriver\LlmDriverFacade;

class WebSearchSource extends BaseSource
{
    /**
     * Run a web search for the source and queue one content job per result.
     *
     * Flow: ask the source's LLM driver to clean up the query, send the result
     * to the web search driver named in the source meta data, add a
     * GetWebContentJob per web result to a batch, then stamp `last_run`.
     *
     * Any \Exception raised along the way is logged and not rethrown, in which
     * case `last_run` is not updated.
     *
     * @param  Source  $source  Source whose `details` hold the search query and
     *                          whose `meta_data` may hold `limit` and `driver`.
     */
    public function handle(Source $source): void
    {
        Log::info('[LaraChain] - Running WebSearchSource');

        try {
            $meta_data = $source->meta_data;
            $limit = data_get($meta_data, 'limit', 5);
            $driver = data_get($meta_data, 'driver', 'mock');

            $search = $this->optimizeSearchQuery($source, (string) $source->details);

            Log::info('[LaraChain] Starting web search ', [
                'content reworked' => $search,
            ]);

            /** @var SearchResponseDto $results */
            $results = WebSearchFacade::driver($driver)->search(
                search: $search,
                options: [
                    'limit' => $limit,
                ]
            );

            Log::info('[Larachain] Getting Content from websearch');

            $jobs = [];

            foreach ($results->getWeb() as $web) {
                $jobs[] = new GetWebContentJob($source, $web);
            }

            Bus::batch($jobs)
                ->name("Getting Web Content for Source - {$source->title}")
                ->onQueue(LlmDriverFacade::driver($source->getDriver())->onQueue())
                ->allowFailures()
                ->dispatch();

            // Only mark the run once the batch has been handed to the queue.
            $source->last_run = now();
            $source->save();
        } catch (\Exception $e) {
            Log::error('[LaraChain] - Error running WebSearchSource', [
                'error' => $e->getMessage(),
            ]);
        }
    }

    /**
     * Ask the source's LLM driver to rewrite the raw query for the search API.
     *
     * The completion is trimmed before use; when it comes back empty the
     * original query is returned so the search still has something to send.
     *
     * @param  Source  $source  Source whose driver performs the completion.
     * @param  string  $search  Raw query as entered on the source.
     * @return string Query to pass to the web search driver.
     */
    private function optimizeSearchQuery(Source $source, string $search): string
    {
        $prompt = <<<PROMPT
The user is asking to search the web but I want you to review the query and clean it up so I can pass
it to an api to get results: Just return their content but with your rework so I can pass it right to the
search api. ONLY return the updated query I will pass this directly to the API via code:
### START USER QUERY
$search
### END USER QUERY
PROMPT;

        Log::info('[LaraChain] Asking LLM to optimize search query');

        $completion = LlmDriverFacade::driver($source->getDriver())
            ->completion($prompt);

        $optimized = trim((string) $completion->content);

        return $optimized === '' ? $search : $optimized;
    }
}
38 changes: 0 additions & 38 deletions app/Http/Controllers/SearchSourceController.php

This file was deleted.

70 changes: 55 additions & 15 deletions app/Jobs/GetWebContentJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,12 @@

namespace App\Jobs;

use App\Domains\Documents\StatusEnum;
use App\Domains\Documents\TypesEnum;
use App\Domains\Sources\WebSearch\Response\WebResponseDto;
use App\Models\Document;
use App\Models\DocumentChunk;
use App\Models\Source;
use Facades\App\Domains\Sources\WebSearch\GetPage;
use Illuminate\Bus\Batchable;
use Illuminate\Bus\Queueable;
Expand All @@ -13,6 +16,7 @@
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Log;
use LlmLaraHub\LlmDriver\LlmDriverFacade;

class GetWebContentJob implements ShouldQueue
{
Expand All @@ -22,7 +26,7 @@ class GetWebContentJob implements ShouldQueue
* Create a new job instance.
*/
public function __construct(
public Document $document,
public Source $source,
public WebResponseDto $webResponseDto
) {
//
Expand All @@ -39,31 +43,67 @@ public function handle(): void
return;
}

Log::info("[Larachain] GetWebContentJob - {$this->document->id} - URL: {$this->webResponseDto->url}");
$html = GetPage::make($this->document->collection)->handle($this->webResponseDto->url);
Log::info("[Larachain] GetWebContentJob - {$this->source->title} - URL: {$this->webResponseDto->url}");
$html = GetPage::make($this->source->collection)->handle($this->webResponseDto->url);

$results = GetPage::make($this->document->collection)->parseHtml($html);
$results = GetPage::make($this->source->collection)->parseHtml($html);

$page_number = 1;
$guid = md5($this->webResponseDto->url);

Log::info("[Larachain] GetWebContentJob - {$this->document->id} - GUID: $guid");
$DocumentChunk = DocumentChunk::updateOrCreate(
/**
* Document can reference a source
*/
$document = Document::updateOrCreate(
[
'guid' => $guid,
'document_id' => $this->document->id,
'source_id' => $this->source->id,
'type' => TypesEnum::HTML
],
[
'content' => $results,
'sort_order' => $page_number,
'status' => StatusEnum::Pending,
'status_summary' => StatusEnum::Pending,
'file_path' => $this->webResponseDto->url,
'collection_id' => $this->source->collection_id,
'meta_data' => $this->webResponseDto->toArray(),
]
);

Log::info('[Larachain] adding to new batch');
$this->batch()->add([
new VectorlizeDataJob($DocumentChunk),
new SummarizeDataJob($DocumentChunk),
]);

/**
* @TODO
* Use the token_counter to break the string up until every piece
* fits within that limit.
* In the meantime, just do the chunking below.
*/

$maxTokenSize = LlmDriverFacade::driver($this->source->getDriver())
->getMaxTokenSize($this->source->getDriver());

$page_number = 1;

$chunks = chunk_string($results, $maxTokenSize);

foreach ($chunks as $chunk) {
$DocumentChunk = DocumentChunk::updateOrCreate(
[
'guid' => $guid . '-' . $page_number,
'document_id' => $document->id,
],
[
'content' => $chunk,
'sort_order' => $page_number,
]
);

Log::info('[Larachain] adding to new batch');

$this->batch()->add([
new VectorlizeDataJob($DocumentChunk),
new SummarizeDataJob($DocumentChunk),
]);

$page_number++;
}
return;
}
}
85 changes: 0 additions & 85 deletions app/Jobs/KickOffWebSearchCreationJob.php

This file was deleted.

6 changes: 6 additions & 0 deletions app/Models/Document.php
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ class Document extends Model implements HasDrivers, TaggableContract
protected $casts = [
'type' => TypesEnum::class,
'status' => StatusEnum::class,
'meta_data' => 'array',
'summary_status' => StatusEnum::class,
];

Expand Down Expand Up @@ -113,4 +114,9 @@ public function getEmbeddingDriver(): string
{
return $this->collection->embedding_driver->value;
}

/**
 * Relation to the Source model this document belongs to.
 */
public function source(): BelongsTo
{
    $relation = $this->belongsTo(Source::class);

    return $relation;
}
}
Loading

0 comments on commit 83c83ec

Please sign in to comment.