Skip to content

Commit

Permalink
start adding docx
Browse files Browse the repository at this point in the history
  • Loading branch information
alnutile committed Jun 27, 2024
1 parent 3367bcf commit fca2a5a
Show file tree
Hide file tree
Showing 15 changed files with 403 additions and 4 deletions.
Binary file modified .DS_Store
Binary file not shown.
82 changes: 82 additions & 0 deletions app/Domains/Documents/Transformers/DocXTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
<?php

namespace App\Domains\Documents\Transformers;

use App\Domains\Collections\CollectionStatusEnum;
use App\Helpers\TextChunker;
use App\Jobs\VectorlizeDataJob;
use App\Models\Document;
use App\Models\DocumentChunk;
use Illuminate\Support\Facades\Log;
use PhpOffice\PhpWord\IOFactory;

/**
 * Turns a DOCX file attached to a Document into DocumentChunk rows and the
 * jobs that vectorize them.
 */
class DocXTransformer
{
    protected Document $document;

    /**
     * Read the document's DOCX file, split its text into chunks, persist each
     * chunk and build the vectorization jobs for them.
     *
     * @param  Document  $document  The document whose file should be parsed.
     * @return array<int, array<int, VectorlizeDataJob>> One single-job array per chunk.
     *
     * @throws \Exception When the Word2007 reader reports it cannot read the file.
     */
    public function handle(Document $document): array
    {
        $this->document = $document;

        $filePath = $this->document->pathToFile();

        $parser = IOFactory::createReader('Word2007');

        if (! $parser->canRead($filePath)) {
            throw new \Exception('Can not read the document '.$filePath);
        }

        // Keep the parsed Word file in its own variable: reusing $document here
        // would shadow the Document model, which is still needed further down.
        $wordDocument = $parser->load($filePath);

        $content = [];

        foreach ($wordDocument->getSections() as $section) {
            foreach ($section->getElements() as $element) {
                /**
                 * @TODO
                 * what type of section
                 * text is easy
                 * what about images
                 * what about tables
                 * what about lists
                 */
                $text = $this->textFromElement($element);

                if ($text !== null) {
                    $content[] = $text;
                }
            }
        }

        $contentFlattened = implode(' ', $content);
        $size = config('llmdriver.chunking.default_size');
        $chunkedChunks = TextChunker::handle($contentFlattened, $size);

        // Every chunk is stored with the same sort_order; section_number is
        // what distinguishes the chunks of one document.
        $page = 1;

        $chunks = [];

        foreach ($chunkedChunks as $chunkSection => $chunkContent) {
            $documentChunk = DocumentChunk::updateOrCreate(
                [
                    'document_id' => $this->document->id,
                    'sort_order' => $page,
                    'section_number' => $chunkSection,
                ],
                [
                    'guid' => md5($chunkContent),
                    'content' => $chunkContent,
                    'meta_data' => [$chunkContent],
                ]
            );

            $chunks[] = [
                new VectorlizeDataJob($documentChunk),
            ];
        }

        // The collection lives on the Document model, not on the parsed Word file.
        notify_collection_ui($this->document->collection, CollectionStatusEnum::PROCESSING, 'Processing Document');

        Log::info('DocXTransformer:handle', ['chunks' => count($chunks)]);

        return $chunks;
    }

    /**
     * Pull plain text out of a single section element.
     *
     * Elements that expose no getText() method are skipped instead of causing
     * a fatal "undefined method" error.
     *
     * @param  object  $element  An element returned by a section's getElements().
     * @return string|null The element's text, or null when it has none.
     */
    protected function textFromElement(object $element): ?string
    {
        if (! method_exists($element, 'getText')) {
            return null;
        }

        $text = $element->getText();

        // NOTE(review): per the PHPWord API some elements (e.g. Title) can return
        // a nested text object rather than a string; unwrap it one level.
        if (is_object($text)) {
            $text = method_exists($text, 'getText') ? $text->getText() : null;
        }

        return is_string($text) ? $text : null;
    }
}
44 changes: 44 additions & 0 deletions app/Jobs/ParseDocxJob.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
<?php

namespace App\Jobs;

use App\Models\Document;
use Facades\App\Domains\Documents\Transformers\DocXTransformer;
use Illuminate\Bus\Batchable;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;

/**
 * Queued, batchable job that runs the DocX transformer for one document and
 * appends the resulting jobs to the batch it is running in.
 */
class ParseDocxJob implements ShouldQueue
{
    use Batchable;
    use Dispatchable;
    use InteractsWithQueue;
    use Queueable;
    use SerializesModels;

    /**
     * @param  Document  $document  The document whose DOCX file will be parsed.
     */
    public function __construct(public Document $document)
    {
    }

    /**
     * Transform the document and add every returned entry to the current
     * batch. Nothing is done when the batch has been cancelled.
     */
    public function handle(): void
    {
        if (! $this->batch()->cancelled()) {
            foreach (DocXTransformer::handle($this->document) as $vectorizeJobs) {
                $this->batch()->add($vectorizeJobs);
            }
        }
    }
}
10 changes: 10 additions & 0 deletions app/Jobs/ProcessFileJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,16 @@ public function handle(): void
DocumentProcessingCompleteJob::class,
],
],
TypesEnum::Docx->value => [
'jobs' => [
ParseDocxJob::class,
],
'finally' => [
SummarizeDocumentJob::class,
TagDocumentJob::class,
DocumentProcessingCompleteJob::class,
],
],
TypesEnum::Txt->value => [
'jobs' => [
ProcessTextFilesJob::class,
Expand Down
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"openai-php/laravel": "^0.8.1",
"owenvoke/blade-fontawesome": "^2.3",
"phpoffice/phppresentation": "dev-fix-pptx",
"phpoffice/phpword": "^1.2",
"pusher/pusher-php-server": "^7.2",
"roach-php/core": "^3.2",
"roach-php/laravel": "^3.1",
Expand Down
164 changes: 163 additions & 1 deletion composer.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

9 changes: 9 additions & 0 deletions database/factories/DocumentFactory.php
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,15 @@ public function email(): Factory
});
}

/**
 * Factory state that sets the document's type to TypesEnum::Docx.
 */
public function docx(): Factory
{
    return $this->state(fn (array $attributes) => [
        'type' => TypesEnum::Docx,
    ]);
}

public function pptx(): Factory
{
return $this->state(function (array $attributes) {
Expand Down
Binary file modified tests/.DS_Store
Binary file not shown.
Loading

0 comments on commit fca2a5a

Please sign in to comment.