Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Draft] Feature: automatic document translation #6386

Closed
wants to merge 5 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 3 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,6 @@ types-Pygments = "*"
types-colorama = "*"
types-psycopg2 = "*"
types-setuptools = "*"

[translation]
bergamot = "0.4.5"
28 changes: 28 additions & 0 deletions docs/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -825,6 +825,34 @@ they use underscores instead of dashes.
{"deskew": true, "optimize": 3, "unpaper_args": "--pre-rotate 90"}
```

## Translations {#translations}

Paperless uses [Bergamot](https://browser.mt/) to automatically translate
documents locally. When this feature is enabled, you can also search documents
by their translated text.

!!! warning

Translation is an optional Paperless feature. To use it, install Bergamot
with `pip install bergamot` or
`pipenv sync --categories translation`.

#### [`PAPERLESS_TRANSLATION_TARGET=<string>`](#PAPERLESS_TRANSLATION_TARGET) {#PAPERLESS_TRANSLATION_TARGET}

: Use this parameter to enable automatic translation of documents not already
in the target language. For example, specify `en` to translate all non-English
documents to English.

The default is blank, which means nothing will be translated.

#### [`PAPERLESS_TRANSLATION_MODELS=<string>`](#PAPERLESS_TRANSLATION_MODELS) {#PAPERLESS_TRANSLATION_MODELS}

: Use this parameter to tell Bergamot which language models to use. To list
the available models and download them, use `bergamot ls` and
`bergamot download`. For example, if you download the Ukrainian to English
model using `bergamot download -m uk-en-tiny`, you can specify `uk-en-tiny`.
To use multiple models, separate them with commas.

## Software tweaks {#software_tweaks}

#### [`PAPERLESS_TASK_WORKERS=<num>`](#PAPERLESS_TASK_WORKERS) {#PAPERLESS_TASK_WORKERS}
Expand Down
2 changes: 2 additions & 0 deletions paperless.conf.example
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,8 @@
#PAPERLESS_OCR_USER_ARGS={}
#PAPERLESS_CONVERT_MEMORY_LIMIT=0
#PAPERLESS_CONVERT_TMPDIR=/var/tmp/paperless
#PAPERLESS_TRANSLATION_MODELS=uk-en-tiny,cs-en-base
#PAPERLESS_TRANSLATION_TARGET=en

# Software tweaks

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,10 @@
<div>
<textarea class="form-control" id="content" rows="20" formControlName='content' [class.rtl]="isRTL"></textarea>
</div>
<h3>Translation</h3>
<div>
<textarea class="form-control" id="translation" rows="20" formControlName='translation' [class.rtl]="isRTL"></textarea>
</div>
</ng-template>
</li>

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ const doc: Document = {
storage_path: 31,
tags: [41, 42, 43],
content: 'text content',
translation: 'text content',
added: new Date('May 4, 2014 03:24:00'),
created: new Date('May 4, 2014 03:24:00'),
modified: new Date('May 4, 2014 03:24:00'),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ export class DocumentDetailComponent
documentForm: FormGroup = new FormGroup({
title: new FormControl(''),
content: new FormControl(''),
translation: new FormControl(''),
created_date: new FormControl(),
correspondent: new FormControl(),
document_type: new FormControl(),
Expand Down Expand Up @@ -404,6 +405,7 @@ export class DocumentDetailComponent
this.store = new BehaviorSubject({
title: doc.title,
content: doc.content,
translation: doc.translation,
created_date: doc.created_date,
correspondent: doc.correspondent,
document_type: doc.document_type,
Expand Down
2 changes: 2 additions & 0 deletions src-ui/src/app/data/document.ts
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@ export interface Document extends ObjectWithPermissions {

content?: string

translation?: string

tags$?: Observable<Tag[]>

tags?: number[]
Expand Down
6 changes: 5 additions & 1 deletion src/documents/filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,7 +104,11 @@ def filter(self, qs, value):
class TitleContentFilter(Filter):
    """
    Filter documents whose title, translation or content contains the value.

    All three lookups are case-insensitive substring matches (``icontains``)
    combined with OR.
    """

    def filter(self, qs, value):
        """
        Return ``qs`` narrowed to documents matching ``value``.

        When ``value`` is falsy the queryset is returned unchanged.
        """
        # No search value: return the queryset unfiltered.
        if not value:
            return qs

        return qs.filter(
            Q(title__icontains=value)
            | Q(translation__icontains=value)
            | Q(content__icontains=value),
        )

Expand Down
23 changes: 23 additions & 0 deletions src/documents/migrations/1047_document_translation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Generated by Django 4.2.11 on 2024-04-11 18:24

from django.db import migrations
from django.db import models


# Schema migration that adds the "translation" text field to the "document"
# model.
class Migration(migrations.Migration):

# This migration is applied after documents migration 1046.
dependencies = [
("documents", "1046_workflowaction_remove_all_correspondents_and_more"),
]

operations = [
# Add the new field; blank=True allows it to be left empty.
migrations.AddField(
model_name="document",
name="translation",
field=models.TextField(
blank=True,
help_text="The translated version of the content field. This field can also be used for searching.",
verbose_name="translation",
),
),
]
9 changes: 9 additions & 0 deletions src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,15 @@ class Document(ModelWithOwner):
),
)

# Translated version of the `content` field; blank=True allows it to be empty.
translation = models.TextField(
_("translation"),
blank=True,
help_text=_(
"The translated version of the content field. This field can "
"also be used for searching.",
),
)

mime_type = models.CharField(_("mime type"), max_length=256, editable=False)

tags = models.ManyToManyField(
Expand Down
1 change: 1 addition & 0 deletions src/documents/serialisers.py
Original file line number Diff line number Diff line change
Expand Up @@ -772,6 +772,7 @@ class Meta:
"storage_path",
"title",
"content",
"translation",
"tags",
"created",
"created_date",
Expand Down
36 changes: 35 additions & 1 deletion src/documents/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from django.db import transaction
from django.db.models.signals import post_save
from filelock import FileLock
from langdetect import detect
from whoosh.writing import AsyncWriter

from documents import index
Expand Down Expand Up @@ -49,6 +50,34 @@
logger = logging.getLogger("paperless.tasks")


def translate_content(content):
    """
    Translate ``content`` into the configured target language with Bergamot.

    The source language is guessed with ``langdetect``. The first model listed
    in ``settings.TRANSLATION_MODELS`` (comma separated) whose name starts
    with the detected language code is used, e.g. ``uk-en-tiny`` for
    Ukrainian text.

    Returns the translated text, or an empty string when:

    * ``content`` is empty or only whitespace,
    * the language of ``content`` cannot be detected,
    * ``content`` is already in ``settings.TRANSLATION_TARGET_LANGUAGE``,
    * no configured model translates from the detected language, or
    * Bergamot returns no result.
    """
    from langdetect import LangDetectException

    if not content or not content.strip():
        return ""

    try:
        original_language = detect(content)
    except LangDetectException:
        # langdetect raises when the text has no features it can work with
        # (e.g. only digits or punctuation); treat that as "nothing to do".
        logger.debug("Could not detect language of content, not translating")
        return ""

    # Avoid translating if we already have the target language
    if original_language == settings.TRANSLATION_TARGET_LANGUAGE:
        return ""

    # Tolerate an unset setting, stray whitespace and empty entries.
    model_names = [
        name.strip()
        for name in (settings.TRANSLATION_MODELS or "").split(",")
        if name.strip()
    ]

    # Model names look like "<source>-<target>-<size>". Match on the source
    # prefix only: a plain substring test would also select models that
    # translate *into* the detected language (e.g. "en-uk-tiny" for "uk").
    source_prefix = f"{original_language}-"
    model_name = next(
        (name for name in model_names if name.startswith(source_prefix)),
        None,
    )
    if model_name is None:
        return ""

    # bergamot is an optional dependency, so only import it once we know a
    # translation is actually going to happen.
    import bergamot

    service = bergamot.Service(bergamot.ServiceConfig())
    translation_model = service.modelFromConfigPath(
        bergamot.REPOSITORY.modelConfigPath("browsermt", model_name),
    )
    responses = service.translate(
        translation_model,
        bergamot.VectorString([content]),
        bergamot.ResponseOptions(),
    )
    # A single input yields at most one response; fall back to "" instead of
    # leaking StopIteration if the service returns nothing.
    return next((response.target.text for response in responses), "")


@shared_task
def index_optimize():
ix = index.open_index()
Expand Down Expand Up @@ -243,9 +272,14 @@ def update_document_archive_file(document_id):
archive_filename=True,
)
oldDocument = Document.objects.get(pk=document.pk)
content = parser.get_text()
translation = (
translate_content(content) if settings.TRANSLATION_MODEL else ""
)
Document.objects.filter(pk=document.pk).update(
archive_checksum=checksum,
content=parser.get_text(),
content=content,
translation=translation,
archive_filename=document.archive_filename,
)
newDocument = Document.objects.get(pk=document.pk)
Expand Down