Skip to content

Commit

Permalink
search getting closer
Browse files Browse the repository at this point in the history
  • Loading branch information
alnutile committed May 2, 2024
1 parent 49b2535 commit f6c875f
Show file tree
Hide file tree
Showing 16 changed files with 663 additions and 14 deletions.
Binary file modified .DS_Store
Binary file not shown.
6 changes: 1 addition & 5 deletions Modules/LlmDriver/app/GroqClient.php
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ protected function getClient()
throw new \Exception('Groq API Token not found');
}

return Http::retry(3, 6000)->withToken($api_token)->withHeaders([
return Http::retry(3, 6000)->timeout(120)->withToken($api_token)->withHeaders([
'content-type' => 'application/json',
])->baseUrl($this->baseUrl);
}
Expand All @@ -125,8 +125,6 @@ public function functionPromptChat(array $messages, array $only = []): array

$messages = $this->insertFunctionsIntoMessageArray($messages);

put_fixture('groq_functions_prompt_real.json', $messages);

$results = $this->getClient()->post('/chat/completions', [
'model' => $model,
'max_tokens' => $maxTokens,
Expand All @@ -141,8 +139,6 @@ public function functionPromptChat(array $messages, array $only = []): array
throw new \Exception('Groq API Error '.$error);
}

put_fixture('groq_functions_response_real.json', $results->json());

foreach ($results->json()['choices'] as $content) {
$functionArray = data_get($content, 'message.content', []);
$functionArray = json_decode($functionArray, true);
Expand Down
107 changes: 105 additions & 2 deletions app/Domains/Sources/WebSearch/GetPage.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,21 +2,124 @@

namespace App\Domains\Sources\WebSearch;

use App\Models\Collection;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;
use LlmLaraHub\LlmDriver\LlmDriverFacade;
use LlmLaraHub\LlmDriver\Responses\CompletionResponse;
use Spatie\Browsershot\Browsershot;
use SundanceSolutions\LarachainTokenCount\Facades\LarachainTokenCount;

class GetPage
{
/**
 * Bind the page fetcher to the collection whose id and LLM driver it uses.
 */
public function __construct(public Collection $collection) {}

/**
 * Named constructor: build an instance for the given collection.
 *
 * Uses late static binding, so a subclass calling make() gets an
 * instance of itself.
 */
public static function make(Collection $collection): self
{
    /** @phpstan-ignore-next-line */
    $page = new static($collection);

    return $page;
}

/**
 * Render the given URL with Browsershot, store a PDF of the page on the
 * "collections" disk, and return the page's body HTML.
 *
 * @param string $url Address of the page to render.
 * @return string Whatever Browsershot's bodyHtml() yields for the page.
 */
public function handle(string $url): string
{

// Configure the headless-browser render: dismiss dialogs, capture the full page.
$results = Browsershot::url($url)
->dismissDialogs()
->fullPage();

// NOTE(review): this listing comes from a rendered diff without +/- markers.
// The md5-named put() directly below is presumably the REMOVED version and the
// afterLast('/')-named put() after it the replacement; as written both run and
// the PDF is generated twice. Confirm against the committed file.
$name = md5($url).'.pdf';
Storage::disk('collections')->put($name, $results->pdf());
// NOTE(review): a URL ending in "/" would presumably give an empty base name
// (".pdf"), and two URLs sharing a last path segment would overwrite each
// other inside the same collection folder. Verify this is acceptable.
$name = str($url)->afterLast('/')->toString().'.pdf';

// Files are namespaced by collection id on the "collections" disk.
Storage::disk('collections')->put($this->collection->id.'/'.$name, $results->pdf());

return $results->bodyHtml();
}

/**
 * Convert a page's HTML into structured data by asking the collection's LLM driver.
 *
 * Script tags and everything outside the <body> element are dropped first
 * (token counts are logged before and after), then two completions run:
 * one converting the HTML to JSON, and a second asking the model to clean
 * that JSON up so it can be decoded.
 *
 * @param string $html Raw HTML of the page.
 * @return array|string The decoded JSON from the second completion.
 *
 * @throws \JsonException When the model's final answer does not decode to a value.
 */
public function parseHtml(string $html): array|string
{
    $tokenCount = LarachainTokenCount::count($html);

    Log::info("[LaraChain] Token Count of html: $tokenCount");

    // preg_replace() returns null on a PCRE failure (e.g. backtrack limit on a
    // very large page). Keep the unstripped HTML in that case instead of
    // silently sending an empty document to the model.
    $withoutScripts = preg_replace('#<script(.*?)>(.*?)</script>#is', '', $html);

    if ($withoutScripts === null) {
        Log::warning('[LaraChain] Could not strip script tags: '.preg_last_error_msg());
    } else {
        $html = $withoutScripts;
    }

    $html = str($html)->after('<body')->beforeLast('</body>')->toString();

    $tokenCountAfter = LarachainTokenCount::count($html);

    Log::info("[LaraChain] Token Count After of html: $tokenCountAfter");

    // The heredoc bodies are indented together with their closing markers, so
    // PHP strips that indentation and the prompt text is unchanged.
    $prompt = <<<PROMPT
    I need you to convert this HTML To JSON.
    The HTML below needs to be an array of objects.
    Each object will have a key type and the key content.
    The key type would have the type of data eg title, narrative, image, video, etc.
    Ignore footer, header, ads, etc.
    The content would have the related content.
    ONLY RETURN JSON NO INTRO TEXT ETC.
    No Comments like "Here is the JSON array of objects" I am going to pass it to json_decode in PHP.
    So the response would be:
    [
    {
    "type": "title",
    "content": "The title of the page"
    },
    {
    "type": "narrative",
    "content": "The narrative of the page"
    },
    {
    "type": "image",
    "content": "full_url_path_to_image"
    },
    {
    "type": "video",
    "content": "url_path_to_video"
    }
    ]
    ### END EXAMPLE
    ### START HTML
    $html
    ### END HTML
    PROMPT;

    Log::info('[LaraChain] Prompt for HTML to JSON: ', [$prompt]);

    /** @var CompletionResponse $results */
    $results = LlmDriverFacade::driver($this->collection->getDriver())
        ->completion($prompt);

    $content = $results->content;

    // Second pass: have the model repair its own output so it is decodable.
    $prompt = <<<PROMPT
    As a JSON verification assistant you will review this
    encoded json, clean it up and turn it back as
    encoded json that the next line of code will pass to
    json_decode in PHP so it has to be valid.
    Do not comment on it, do not tell me what you did
    Do not add fluff like "Here is the cleaned up JSON"
    Just return the cleaned up JSON.
    ```json
    $content
    ```
    PROMPT;

    /** @var CompletionResponse $results */
    $results = LlmDriverFacade::driver($this->collection->getDriver())
        ->completion($prompt);

    Log::info('[LaraChain] Results from json converion: ', [$results->content]);

    $decoded = json_decode($results->content, true);

    // json_decode() yields null for invalid JSON, which would otherwise
    // violate the declared return type and surface as an opaque TypeError.
    if ($decoded === null) {
        throw new \JsonException(
            '[LaraChain] LLM did not return decodable JSON: '.json_last_error_msg()
        );
    }

    return $decoded;
}
}
11 changes: 11 additions & 0 deletions app/helpers.php
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,17 @@ function put_fixture($file_name, $content = [], $json = true)
}
}

if (! function_exists('token_counter')) {
    /**
     * Rough token estimate: the number of whitespace-separated words in a message.
     *
     * An empty or whitespace-only message counts as zero words.
     *
     * @param string $message Text to measure.
     * @return int Number of non-empty whitespace-delimited chunks.
     */
    function token_counter(string $message): int
    {
        // PREG_SPLIT_NO_EMPTY drops the empty chunk that an empty or
        // whitespace-only string would otherwise produce (which counted as 1),
        // and makes an explicit trim() unnecessary.
        $words = preg_split('/\s+/', $message, -1, PREG_SPLIT_NO_EMPTY);

        // preg_split() returns false on a PCRE error; count(false) would throw.
        return $words === false ? 0 : count($words);
    }
}

if (! function_exists('notify_ui')) {
function notify_ui(HasDrivers $model, string $message)
{
Expand Down
2 changes: 2 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,11 @@
"roach-php/core": "^3.2",
"roach-php/laravel": "^3.1",
"smalot/pdfparser": "^2.9",
"soundasleep/html2text": "^2.1",
"spatie/browsershot": "^4.0",
"spatie/laravel-data": "^4.4",
"spatie/laravel-markdown": "^2.3",
"sundance-solutions/larachain-token-count": "dev-main",
"tightenco/ziggy": "^2.0",
"voku/stop-words": "^2.0",
"wamania/php-stemmer": "^3.0",
Expand Down
133 changes: 131 additions & 2 deletions composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Binary file modified tests/.DS_Store
Binary file not shown.
Loading

0 comments on commit f6c875f

Please sign in to comment.