Skip to content

Commit

Permalink
this updates the unstructured flow so it now has a consistent dto str…
Browse files Browse the repository at this point in the history
…ucture that I will move to the PDF parser then more into the model then the query
  • Loading branch information
alnutile committed Apr 22, 2024
1 parent 74297e2 commit 662d862
Show file tree
Hide file tree
Showing 11 changed files with 73 additions and 89 deletions.
47 changes: 26 additions & 21 deletions app/Domains/Documents/Transformers/BaseTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,27 +4,32 @@

use App\Domains\UnStructured\StructuredDto;
use App\Domains\UnStructured\StructuredTypeEnum;
use Nette\Schema\Elements\Structure;


/**
* @NOTE
* @NOTE
* the properties so far are things that might be shared page to page
*
*/
abstract class BaseTransformer {
abstract class BaseTransformer
{
public string $creator = '';

public string $last_updated_by = '';

public string $path_to_file = '';

public mixed $updated_at = '';

public string $coordinates = '';

public string $keywords = '';

public string $creator = "";
public string $last_updated_by = "";
public string $path_to_file = "";
public string $updated_at = "";
public string $coordinates = "";
public string $keywords = "";
public string $category = "";
public string $description = "";
public string $subject = "";
public string $title = "";
public string $category = '';

public string $description = '';

public string $subject = '';

public string $title = '';

public function output(
StructuredTypeEnum $type,
Expand All @@ -33,13 +38,13 @@ public function output(
mixed $guid,
mixed $element_depth,
bool $is_continuation = false,
) : StructuredDto {
): StructuredDto {

return StructuredDto::from([
'type' => $type,
'content' => $content,
'title' => $this->title,
'created_by' => $this->creator,
'created_by' => $this->creator,
'last_updated_by' => $this->last_updated_by,
'page' => $page_number,
'guid' => $guid,
Expand All @@ -52,13 +57,13 @@ public function output(
'subject' => $this->subject,
'keywords' => $this->keywords,
'category' => $this->category,
'parent_id' => null
'parent_id' => null,
]);

}

public function getTitleFormatted() : string {
public function getTitleFormatted(): string
{
return str($this->title)->snake()->toString();
}

}
}
1 change: 0 additions & 1 deletion app/Domains/Documents/Transformers/PdfTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
use Illuminate\Support\Facades\Bus;
use Illuminate\Support\Facades\Log;
use LlmLaraHub\LlmDriver\LlmDriverFacade;
use LlmLaraHub\TagFunction\Functions\TaggingFunction;
use Smalot\PdfParser\Parser;

class PdfTransformer
Expand Down
11 changes: 4 additions & 7 deletions app/Domains/Documents/Transformers/PowerPointTransformer.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,16 +2,13 @@

namespace App\Domains\Documents\Transformers;

use App\Domains\Collections\CollectionStatusEnum;
use App\Events\CollectionStatusEvent;
use App\Jobs\SummarizeDataJob;
use App\Domains\UnStructured\StructuredDto;
use App\Jobs\SummarizeDataJob;
use App\Jobs\VectorlizeDataJob;
use App\Models\Document;
use App\Models\DocumentChunk;
use Illuminate\Support\Facades\Log;
use PhpOffice\PhpPresentation\IOFactory;
use PhpOffice\PhpPresentation\Shape\RichText;

class PowerPointTransformer
{
Expand All @@ -33,7 +30,7 @@ public function handle(Document $document): array

$chunks = [];
while ($results->valid()) {
/** @var StructuredDto $dto */
/** @var StructuredDto $dto */
$dto = $results->current();
$DocumentChunk = DocumentChunk::updateOrCreate(
[
Expand All @@ -43,7 +40,7 @@ public function handle(Document $document): array
[
'content' => $dto->content,
'sort_order' => $dto->page,
'meta_data' => $dto->toArray()
'meta_data' => $dto->toArray(),
]
);

Expand All @@ -54,7 +51,7 @@ public function handle(Document $document): array

$results->next();
}

Log::info('PowerPointTransformer:handle', ['chunks' => count($chunks)]);

return $chunks;
Expand Down
21 changes: 9 additions & 12 deletions app/Domains/Documents/Transformers/ProcessPpt.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@

namespace App\Domains\Documents\Transformers;

use App\Domains\UnStructured\StructuredDto;
use App\Domains\UnStructured\StructuredTypeEnum;
use Generator;
use Illuminate\Support\Facades\File as FacadesFile;
Expand Down Expand Up @@ -35,11 +34,9 @@ public function handle(string $pathToFile): Generator
$this->category = $documentProperties->getCategory();
$this->description = $documentProperties->getDescription();
$this->updated_at = $documentProperties->getModified();


foreach ($oPHPPresentation->getAllSlides() as $page_number => $page) {


$page_number = $page_number + 1;

try {
Expand All @@ -52,7 +49,7 @@ public function handle(string $pathToFile): Generator
if ($shape instanceof RichText) {
$pageContent = $shape->getPlainText();
$guid = $shape->getHashCode();

$content = $this->output(
type: StructuredTypeEnum::Narrative,
content: $pageContent,
Expand All @@ -65,19 +62,19 @@ public function handle(string $pathToFile): Generator
yield $content;
} elseif ($shape instanceof Table) {
$table = $shape->getRows();
$this->title = "Table";
$this->subject = "Table";
$this->title = 'Table';
$this->subject = 'Table';

$content = $this->output(
type: StructuredTypeEnum::Table,
content: "table data",
content: 'table data',
page_number: $page_number,
guid: $shape->getHashCode(),
element_depth: 0,
is_continuation: false,
);
yield $content;

foreach ($table as $rowNumber => $row) {
foreach ($row->getCells() as $cellNumber => $cell) {
$pageContent = $cell->getPlainText();
Expand All @@ -87,7 +84,7 @@ public function handle(string $pathToFile): Generator
content: $pageContent,
page_number: $page_number,
guid: $row->getHashCode(),
element_depth: $rowNumber . $cellNumber,
element_depth: $rowNumber.$cellNumber,
is_continuation: true,
);
yield $content;
Expand All @@ -100,12 +97,12 @@ public function handle(string $pathToFile): Generator
$nameAndType = $this->title.'.'.$mimtype;
$this->path_to_file = storage_path('app/temp/'.$nameAndType);
FacadesFile::put($this->path_to_file, $contents);
$this->subject = "image";
$this->description = "Image of type " . $mimtype;
$this->subject = 'image';
$this->description = 'Image of type '.$mimtype;

$content = $this->output(
type: StructuredTypeEnum::Image,
content: "see image", //ocr at this point or after
content: 'see image', //ocr at this point or after
page_number: $page_number,
guid: $shape->getHashCode(),
element_depth: $shapeCount,
Expand Down
35 changes: 17 additions & 18 deletions app/Domains/UnStructured/StructuredDto.php
Original file line number Diff line number Diff line change
@@ -1,31 +1,30 @@
<?php
<?php

namespace App\Domains\UnStructured;

use Spatie\LaravelData\Data;

class StructuredDto extends Data {

class StructuredDto extends Data
{
public function __construct(
public StructuredTypeEnum $type,
public string $content,
public string $title,
public string $created_by,
public string $last_updated_by,
public string $page,
public string $content,
public string $title,
public string $created_by,
public string $last_updated_by,
public string $page,
public string $guid,
public string $file_name,
public string $updated_at,
public string $coordinates,
public string $element_depth,
public bool $is_continuation = false,
public string|null $parent_id,
public string|null $description,
public string|null $subject,
public string|null $keywords,
public string|null $category,
)
{

public bool $is_continuation,
public ?string $parent_id,
public ?string $description,
public ?string $subject,
public ?string $keywords,
public ?string $category,
) {

}
}
}
10 changes: 4 additions & 6 deletions app/Domains/UnStructured/StructuredTypeEnum.php
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
<?php
<?php

namespace App\Domains\UnStructured;

use Spatie\LaravelData\Data;

enum StructuredTypeEnum : string {

enum StructuredTypeEnum: string
{
case Narrative = 'narrative';
case Title = 'title';
case Table = 'table';
case TableRow = 'table_row';
case Image = 'image';
case Footer = 'footer';
}
}
24 changes: 11 additions & 13 deletions app/Jobs/ProcessFileJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,7 @@
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Bus;
use Illuminate\Support\Facades\Log;
use Laravel\Pennant\Feature;
use LlmLaraHub\LlmDriver\LlmDriverFacade;
use PhpOffice\PhpPresentation\IOFactory;

class ProcessFileJob implements ShouldQueue
{
Expand All @@ -37,17 +35,17 @@ public function handle(): void
$document = $this->document;

if ($document->type === TypesEnum::Pptx) {
Log::info('Processing PPTX Document');
$batch = Bus::batch([
new ParsePowerPointJob($this->document),
])
->name('Process PPTX Document - '.$document->id)
->finally(function (Batch $batch) use ($document) {
DocumentParsedEvent::dispatch($document);
})
->allowFailures()
->onQueue(LlmDriverFacade::driver($document->getDriver())->onQueue())
->dispatch();
Log::info('Processing PPTX Document');
$batch = Bus::batch([
new ParsePowerPointJob($this->document),
])
->name('Process PPTX Document - '.$document->id)
->finally(function (Batch $batch) use ($document) {
DocumentParsedEvent::dispatch($document);
})
->allowFailures()
->onQueue(LlmDriverFacade::driver($document->getDriver())->onQueue())
->dispatch();

} elseif ($document->type === TypesEnum::PDF) {
Log::info('Processing PDF Document');
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
public function up(): void
{
Schema::table('document_chunks', function (Blueprint $table) {
$table->json("meta_data")->nullable();
$table->json('meta_data')->nullable();
});
}

Expand Down
5 changes: 0 additions & 5 deletions tests/Feature/ProcessPptTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,13 @@
namespace Tests\Feature;

use App\Domains\Documents\Transformers\ProcessPpt;
use App\Models\Document;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Foundation\Testing\WithFaker;
use Illuminate\Support\Facades\File;
use Tests\TestCase;

class ProcessPptTest extends TestCase
{

use SharedSetupForPptFile;


protected function tearDown(): void
{
if (File::exists($this->document->pathToFile())) {
Expand Down
3 changes: 1 addition & 2 deletions tests/Feature/SharedSetupForPptFile.php
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ trait SharedSetupForPptFile
{
public Document $document;

protected function setupFile() : Document
protected function setupFile(): Document
{
$document = Document::factory()->create([
'file_path' => 'example.ppt',
Expand All @@ -35,5 +35,4 @@ protected function setupFile() : Document

return $document;
}

}
3 changes: 0 additions & 3 deletions tests/Feature/StructuredDtoTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@

use App\Domains\UnStructured\StructuredDto;
use App\Domains\UnStructured\StructuredTypeEnum;
use Illuminate\Foundation\Testing\RefreshDatabase;
use Illuminate\Foundation\Testing\WithFaker;
use Nette\Schema\Elements\Structure;
use Tests\TestCase;

class StructuredDtoTest extends TestCase
Expand Down

0 comments on commit 662d862

Please sign in to comment.