Skip to content

Commit

Permalink
working on an idea for search
Browse files Browse the repository at this point in the history
  • Loading branch information
alnutile committed May 1, 2024
1 parent af81c97 commit 3b978df
Show file tree
Hide file tree
Showing 11 changed files with 2,326 additions and 82 deletions.
13 changes: 13 additions & 0 deletions app/Domains/Sources/WebSearch/ConfigurationLoader.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
<?php

namespace App\Domains\Sources\WebSearch;

use RoachPHP\Support\Configurable;
use RoachPHP\Support\ConfigurableInterface;

/**
 * Configurable object for the web-search source that exposes a public list of URLs.
 *
 * Option handling is provided by Roach's Configurable trait; this class only
 * adds the $urls property on top of it.
 */
class ConfigurationLoader implements ConfigurableInterface
{
    use Configurable;

    /**
     * URLs associated with this configuration (presumably URL strings — confirm against callers).
     *
     * Defaults to an empty list: a typed property without a default stays
     * uninitialized and throws an Error when read before being assigned.
     *
     * @var array
     */
    public array $urls = [];
}
22 changes: 22 additions & 0 deletions app/Domains/Sources/WebSearch/GetPage.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<?php

namespace App\Domains\Sources\WebSearch;

use Illuminate\Support\Facades\Storage;
use Spatie\Browsershot\Browsershot;

/**
 * Loads a URL through Browsershot, stores a PDF rendering of it on the
 * "collections" disk and hands back the page's body HTML.
 */
class GetPage
{
    /**
     * @param  string  $url  Address passed to Browsershot.
     * @return string Result of Browsershot's bodyHtml() for that address.
     */
    public function handle(string $url): string
    {
        $browser = Browsershot::url($url)->dismissDialogs()->fullPage();

        // The PDF is stored under the md5 hash of the URL.
        $hash = md5($url);
        $pdfName = $hash.'.pdf';

        $disk = Storage::disk('collections');
        $disk->put($pdfName, $browser->pdf());

        return $browser->bodyHtml();
    }
}
67 changes: 67 additions & 0 deletions app/Spiders/GenericSiteSpider.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
<?php

namespace App\Spiders;

use Generator;
use Illuminate\Support\Facades\Storage;
use League\HTMLToMarkdown\HtmlConverter;
use RoachPHP\Downloader\Middleware\ExecuteJavascriptMiddleware;
use RoachPHP\Downloader\Middleware\RequestDeduplicationMiddleware;
use RoachPHP\Extensions\LoggerExtension;
use RoachPHP\Extensions\StatsCollectorExtension;
use RoachPHP\Http\Response;
use RoachPHP\Spider\BasicSpider;
use RoachPHP\Spider\ParseResult;
use RoachPHP\Support\Configurable;

/**
 * Roach spider that stores every crawled page, as raw HTML and as Markdown,
 * on the "collections" disk under the id of the collection found in the
 * spider context.
 */
class GenericSiteSpider extends BasicSpider
{
    use Configurable;

    /** Start URLs; empty here, so they are expected to be supplied from outside the class. */
    public array $startUrls = [];

    public array $downloaderMiddleware = [
        RequestDeduplicationMiddleware::class,
    ];

    public array $spiderMiddleware = [

    ];

    // NOTE(review): ExecuteJavascriptMiddleware is imported from the
    // Downloader\Middleware namespace. Confirm that the base spider actually
    // reads a $responseMiddleware property — Roach's documentation registers
    // this middleware through $downloaderMiddleware instead.
    public array $responseMiddleware = [
        ExecuteJavascriptMiddleware::class,
    ];

    public array $itemProcessors = [
        //
    ];

    public array $extensions = [
        LoggerExtension::class,
        StatsCollectorExtension::class,
    ];

    public int $concurrency = 2;

    // Presumably seconds between requests — confirm against the Roach documentation.
    public int $requestDelay = 1;

    /**
     * Persists the response body as "<title>.html" and its Markdown conversion
     * as "<title>.md" inside the collection's folder, then yields the raw body
     * as an item.
     *
     * The title is the snake-cased part of the URI after the last "/". When
     * that part is empty (the URI ends with "/"), the md5 hash of the full URI
     * is used instead, so such pages no longer overwrite each other under the
     * names ".html" and ".md".
     *
     * @return Generator<ParseResult>
     */
    public function parse(Response $response): Generator
    {
        // Requires a 'collection' entry in the spider context; only its id is used.
        $collection = $this->context['collection'];

        $uri = $response->getUri();

        // The segment after the last "/" cannot contain a slash, so the former
        // replace("\/", '_') step could never match and has been dropped.
        $title = str($uri)->afterLast('/')->snake()->toString();

        if ($title === '') {
            $title = md5($uri);
        }

        $body = $response->getBody();
        $name = $title.'.html';

        $converter = new HtmlConverter(['strip_tags' => true]);
        $markdown = $converter->convert($body);
        $markdownName = $title.'.md';

        Storage::disk('collections')->put($collection->id.'/'.$name, $body);
        Storage::disk('collections')->put($collection->id.'/'.$markdownName, $markdown);

        yield $this->item([
            'body' => $body,
        ]);
    }
}
4 changes: 4 additions & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,11 +18,15 @@
"laravel/reverb": "@beta",
"laravel/sanctum": "^4.0",
"laravel/tinker": "^2.9",
"league/html-to-markdown": "^5.1",
"nwidart/laravel-modules": "^11.0",
"openai-php/laravel": "^0.8.1",
"owenvoke/blade-fontawesome": "^2.3",
"phpoffice/phppresentation": "dev-fix-pptx",
"roach-php/core": "^3.2",
"roach-php/laravel": "^3.1",
"smalot/pdfparser": "^2.9",
"spatie/browsershot": "^4.0",
"spatie/laravel-data": "^4.4",
"spatie/laravel-markdown": "^2.3",
"tightenco/ziggy": "^2.0",
Expand Down
Loading

0 comments on commit 3b978df

Please sign in to comment.