Skip to content

Commit

Permalink
CSV upload working now for xlsx
Browse files Browse the repository at this point in the history
  • Loading branch information
alnutile committed Jul 8, 2024
1 parent 8b33b02 commit 3033d26
Show file tree
Hide file tree
Showing 22 changed files with 506 additions and 34 deletions.
Binary file modified .DS_Store
Binary file not shown.
5 changes: 5 additions & 0 deletions Modules/LlmDriver/app/Functions/StandardsChecker.php
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ public function handle(

foreach ($chunk as $document) {
if ($document->summary) {
/**
* @NOTE
* This assumes a small amount of incoming content to check
* The user may upload a blog post that is 20 paragraphs or more.
*/
$prompt = StandardsCheckerPrompt::prompt(
$document->summary, $usersInput->content
);
Expand Down
33 changes: 4 additions & 29 deletions Modules/TagFunction/app/TagManager.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,14 +2,10 @@

namespace LlmLaraHub\TagFunction;

use App\Domains\Agents\VerifyPromptInputDto;
use App\Domains\Agents\VerifyPromptOutputDto;
use App\Domains\Collections\CollectionStatusEnum;
use App\Models\Document;
use Facades\App\Domains\Agents\VerifyResponseAgent;
use Illuminate\Support\Collection;
use Illuminate\Support\Facades\Log;
use Laravel\Pennant\Feature;
use LlmLaraHub\LlmDriver\LlmDriverFacade;
use LlmLaraHub\LlmDriver\Responses\CompletionResponse;

Expand Down Expand Up @@ -37,35 +33,14 @@ public function handle(Document $document): void

$this->tagsAsString = $response->content;

if (Feature::active('verification_prompt_tags')) {
$verifyPrompt = <<<'PROMPT'
This was the response from the LLM to get Tags from the content.
Please verify the json is good if not fix it so what you return is just JSON
and remove from tags any text that is not needed and any
tags that are not correct.
PROMPT;

$dto = VerifyPromptInputDto::from(
[
'chattable' => $document,
'originalPrompt' => $prompt,
'context' => $summary,
'llmResponse' => $this->tagsAsString,
'verifyPrompt' => $verifyPrompt,
]
);

/** @var VerifyPromptOutputDto $response */
$response = VerifyResponseAgent::verify($dto);

$this->tagsAsString = $response->response;

}

$this->tags = collect(explode(',', $this->tagsAsString));

$this->tags->take(3)
->map(function ($tag) use ($document) {
$tag = str($tag)
->remove('Here Are 3 Tags:')
->trim()
->toString();
$document->addTag($tag);
});

Expand Down
13 changes: 10 additions & 3 deletions Modules/TagFunction/app/TagPrompt.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,11 +7,18 @@ class TagPrompt
public static function prompt(string $context): string
{
return <<<PROMPT
This is the summary of the document, Can you make some tags I can use. Limit to 1-3 tags.
Please return them as a string of text with each tag separated by a comma for example:
**ROLE**
You are an assistant to help tag the content
**TASK**
You are going to tag the content, limited to 3 tags
**FORMAT**
Each tag should will be separated by a comma. No other text should be returned.
EXAMPLE FORMAT:
Tag 1, Tag Two Test, Tag Three Test
And nothing else. Here is the summary:
### START SUMMARY
$context
### END SUMMARY
Expand Down
99 changes: 99 additions & 0 deletions app/Domains/Documents/Transformers/CSVTransformer.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
<?php

namespace App\Domains\Documents\Transformers;

use App\Domains\Collections\CollectionStatusEnum;
use App\Domains\Documents\StatusEnum;
use App\Domains\Documents\TypesEnum;
use App\Helpers\TextChunker;
use App\Imports\DocumentsImport;
use App\Models\Document;
use App\Models\DocumentChunk;
use Illuminate\Support\Facades\Log;
use Illuminate\Support\Facades\Storage;

/**
 * Turns an uploaded CSV document into one Document per row and, for rows
 * that were newly created, the DocumentChunk records that go with them.
 */
class CSVTransformer
{
    /** The uploaded source document currently being transformed. */
    protected Document $document;

    /** Type stamped on every row document created by this transformer. */
    protected TypesEnum $mimeType = TypesEnum::CSV;

    /**
     * Read the CSV behind $document, persist each row as its own Document
     * (and as a JSON file on the "collections" disk), chunk newly created
     * rows, then delete the source document.
     *
     * Rows whose Document already existed are not re-chunked; they are
     * only flipped to StatusEnum::Complete.
     *
     * @param  Document  $document  The uploaded CSV document.
     * @return array<int, DocumentChunk> Chunks created or updated for new rows.
     */
    public function handle(Document $document): array
    {
        $this->document = $document;

        $filePath = $this->document->pathToFile();

        // Signature reminder: toCollection($filePath = null, string $disk = null, string $readerType = null)
        $collection = (new DocumentsImport())
            ->toCollection($filePath, null, \Maatwebsite\Excel\Excel::CSV);

        // first() yields null on an empty result; fall back to "no rows"
        // instead of iterating over null.
        $rows = $collection->first() ?? [];

        // Loop-invariant, so read it once rather than once per row.
        $size = config('llmdriver.chunking.default_size');

        $chunks = [];

        /**
         * Going to turn into a document then chunks
         */
        foreach ($rows as $rowNumber => $row) {
            $file_name = 'row_'.$rowNumber.'_'.$document->file_path;

            $encoded = json_encode($row);

            // json_encode() signals failure with false; skip the row rather
            // than persisting `false` as file content / summary.
            if ($encoded === false) {
                Log::warning('CSVTransformer:handle skipped row that could not be JSON encoded', [
                    'document_id' => $document->id,
                    'row' => $rowNumber,
                    'error' => json_last_error_msg(),
                ]);

                continue;
            }

            Storage::disk('collections')
                ->put((string) $document->collection->id.'/'.$file_name, $encoded);

            $documentRow = Document::updateOrCreate([
                'collection_id' => $document->collection_id,
                'file_path' => $file_name,
                'type' => $this->mimeType,
            ], [
                'status' => StatusEnum::Pending,
                'summary' => $encoded,
                'meta_data' => $row,
                'original_content' => $encoded,
                'subject' => "Row $rowNumber import from ".$document->file_path,
            ]);

            if (! $documentRow->wasRecentlyCreated) {
                // Row was imported before: nothing to chunk, mark it done.
                $documentRow->updateQuietly([
                    'status' => StatusEnum::Complete,
                ]);

                continue;
            }

            // Only chunk when the result is actually going to be stored.
            $chunked_chunks = TextChunker::handle($encoded, $size);

            foreach ($chunked_chunks as $chunkSection => $chunkContent) {
                $guid = md5($chunkContent);

                $chunks[] = DocumentChunk::updateOrCreate(
                    [
                        'document_id' => $documentRow->id,
                        'sort_order' => $rowNumber,
                        'section_number' => $chunkSection,
                    ],
                    [
                        'guid' => $guid,
                        'content' => $chunkContent,
                        'meta_data' => $row,
                        'original_content' => $encoded,
                    ]
                );
            }
        }

        notify_collection_ui($document->collection, CollectionStatusEnum::PROCESSING, 'Processing Documents');

        Log::info('CSVTransformer:handle', ['chunks' => count($chunks)]);

        // The uploaded file itself is replaced by its per-row documents.
        $document->delete();

        return $chunks;
    }
}
1 change: 1 addition & 0 deletions app/Domains/Documents/TypesEnum.php
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@ public static function mimeTypeToType(string $mimeType): TypesEnum
'application/vnd.ms-excel' => TypesEnum::Xls,
'application/vnd.ms-powerpoint' => TypesEnum::Ppt,
'text/plain' => TypesEnum::Txt,
'text/csv' => TypesEnum::CSV,
'text/html' => TypesEnum::HTML,
default => TypesEnum::PDF,
};
Expand Down
1 change: 0 additions & 1 deletion app/Http/Controllers/CollectionController.php
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,6 @@ public function filesUpload(Collection $collection)
foreach ($validated['files'] as $file) {
$mimetype = $file->getMimeType();

//if pptx
Log::info('[LaraChain] - Mimetype', [
'mimetype' => $mimetype,
]);
Expand Down
18 changes: 18 additions & 0 deletions app/Imports/DocumentsImport.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
<?php

namespace App\Imports;

use Illuminate\Support\Collection;
use Maatwebsite\Excel\Concerns\Importable;
use Maatwebsite\Excel\Concerns\ToCollection;
use Maatwebsite\Excel\Concerns\WithHeadingRow;

/**
 * Import definition used to read an uploaded file into collections via the
 * Importable trait (e.g. `(new DocumentsImport())->toCollection(...)`).
 *
 * Implements WithHeadingRow; NOTE(review): per Laravel Excel this should key
 * each row by the first row's headings - confirm against the package docs.
 */
class DocumentsImport implements ToCollection, WithHeadingRow
{
    use Importable;

    /**
     * Required by the ToCollection contract; hands the rows back unchanged.
     *
     * @param  Collection  $collection  Rows supplied by the importer.
     * @return Collection The same collection instance.
     */
    public function collection(Collection $collection): Collection
    {
        return $collection;
    }
}
66 changes: 66 additions & 0 deletions app/Jobs/ProcessCSVJob.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
<?php

namespace App\Jobs;

use App\Models\Document;
use Facades\App\Domains\Documents\Transformers\CSVTransformer;
use Illuminate\Bus\Batch;
use Illuminate\Bus\Batchable;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Bus;
use LlmLaraHub\LlmDriver\LlmDriverFacade;
use LlmLaraHub\TagFunction\Jobs\TagDocumentJob;

/**
 * Queued job that expands an uploaded CSV document into per-row documents
 * (via CSVTransformer) and dispatches the vectorize -> summarize/tag/complete
 * pipeline for each of those row documents.
 */
class ProcessCSVJob implements ShouldQueue
{
    use Batchable;
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /**
     * Create a new job instance.
     *
     * @param  Document  $document  The uploaded CSV document to expand.
     */
    public function __construct(public Document $document)
    {
        //
    }

    /**
     * Execute the job.
     *
     * Dispatches ONE batch per row document containing all of that
     * document's chunk-vectorizing jobs, so the follow-up "Part 2" batch
     * (summarize, tag, mark complete) runs once per document instead of
     * once per chunk.
     */
    public function handle(): void
    {
        // batch() is null when the job was dispatched outside a batch;
        // the nullsafe call treats that as "not cancelled".
        if ($this->batch()?->cancelled()) {
            return;
        }

        $chunks = CSVTransformer::handle($this->document);

        // Group chunks by owning document, preserving first-seen order.
        $chunksByDocument = [];
        foreach ($chunks as $chunk) {
            $chunksByDocument[$chunk->document_id][] = $chunk;
        }

        foreach ($chunksByDocument as $documentChunks) {
            $document = $documentChunks[0]->document;

            $vectorizeJobs = [];
            foreach ($documentChunks as $chunk) {
                $vectorizeJobs[] = new VectorlizeDataJob($chunk);
            }

            Bus::batch($vectorizeJobs)
                ->name(sprintf('Process %s Document Chunks - %d', $document->type->value, $document->id))
                ->finally(function (Batch $batch) use ($document) {
                    // Runs after every vectorize job of this document has
                    // finished; failures are tolerated (allowFailures below).
                    Bus::batch([
                        new SummarizeDocumentJob($document),
                        new TagDocumentJob($document),
                        new DocumentProcessingCompleteJob($document),
                    ])->name(sprintf('Part 2 of Process for %s Document - %d',
                        $document->type->value, $document->id))
                        ->allowFailures()
                        ->onQueue(LlmDriverFacade::driver($document->getDriver())->onQueue())
                        ->dispatch();
                })
                ->allowFailures()
                ->onQueue(LlmDriverFacade::driver($document->getDriver())->onQueue())
                ->dispatch();
        }
    }
}
6 changes: 6 additions & 0 deletions app/Jobs/ProcessFileJob.php
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,12 @@ public function handle(): void
$document = $this->document;

$options = [
TypesEnum::CSV->value => [
'jobs' => [
ProcessCSVJob::class,
],
'finally' => [], //going to make new docs from each row
],
TypesEnum::Pptx->value => [
'jobs' => [
ParsePowerPointJob::class,
Expand Down
1 change: 1 addition & 0 deletions composer.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
"laravel/sanctum": "^4.0",
"laravel/tinker": "^2.9",
"league/html-to-markdown": "^5.1",
"maatwebsite/excel": "^3.1",
"nwidart/laravel-modules": "^11.0",
"opcodesio/log-viewer": "^3.10",
"openai-php/laravel": "^0.8.1",
Expand Down
Loading

0 comments on commit 3033d26

Please sign in to comment.