Skip to content
This repository has been archived by the owner on Feb 19, 2021. It is now read-only.

Add insensitive search for documents #545

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions src/documents/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,11 @@
from django.utils.safestring import mark_safe
from djangoql.admin import DjangoQLSearchMixin

from documents.actions import (
add_tag_to_selected,
remove_correspondent_from_selected,
remove_tag_from_selected,
set_correspondent_on_selected
)
from documents.actions import (add_tag_to_selected,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the interests of consistency, please don't reformat imports. If anything, imports should conform to an isort configuration of:

isort -m 3 --dont-skip __init__.py --virtual-env ${virtualenv}/.. --quiet ${file_name}

remove_correspondent_from_selected,
remove_tag_from_selected,
set_correspondent_on_selected)
from paperless.utils import slugify

from .models import Correspondent, Document, Log, Tag

Expand Down Expand Up @@ -173,7 +172,12 @@ class Media:
"all": ("paperless.css",)
}

search_fields = ("correspondent__name", "title", "content", "tags__name")
search_fields = (
"correspondent__name",
"searchable_title",
"searchable_content",
"tags__name",
)
readonly_fields = ("added", "file_type", "storage_type",)
list_display = ("title", "created", "added", "thumbnail", "correspondent",
"tags_")
Expand Down Expand Up @@ -338,6 +342,10 @@ def _html_tag(kind, inside=None, **kwargs):

return format_html("<{} {}/>", kind, attributes)

def get_search_results(self, request, queryset, search_term):
search_term = slugify(search_term)
return super().get_search_results(request, queryset, search_term)


class LogAdmin(CommonAdmin):

Expand Down
44 changes: 44 additions & 0 deletions src/documents/migrations/0023_document_searchable_content.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
# Generated by Django 2.0.13 on 2019-05-30 14:50
import unicodedata

from django.db import migrations, models

from paperless.utils import slugify as slugifyOCR
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While it can be tempting to import stuff from modules into a migration to keep your code DRY, this will later bite us in the ass should we decide to rename/remove this function. It will break all migrations for all time as a result.

Instead, if you have logic you wish to make available in a migration, copy it verbatim into the migration.



class Migration(migrations.Migration):
dependencies = [("documents", "0022_auto_20181007_1420")]

reversible = True

def casefold_forwards(apps, schema_editor):
Document = apps.get_model("documents", "Document")
for doc in Document.objects.all():
if doc.title is not None:
doc.searchable_title = slugifyOCR(doc.title)
if doc.content is not None:
doc.searchable_content = slugifyOCR(doc.content)
doc.save()

def casefold_backwards(apps, schema_editor):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Protip: you don't need to create an empty backwards method. You can just do migrations.RunPython(casefold_forwards, migrations.RunPython.noop) below instead.

pass

def database_backwards(self, app_label, schema_editor, from_state, to_state):
migrations.RemoveField(model_name="document", name="searchable_content"),
migrations.RemoveField(model_name="document", name="searchable_title"),

operations = [
migrations.AddField(
model_name="document",
name="searchable_content",
field=models.TextField(blank=True, db_index=True, editable=False),
),
migrations.AddField(
model_name="document",
name="searchable_title",
field=models.CharField(
max_length=128, blank=True, db_index=True, editable=False
),
),
migrations.RunPython(casefold_forwards, casefold_backwards),
]
22 changes: 22 additions & 0 deletions src/documents/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
from django.utils.text import slugify
from fuzzywuzzy import fuzz

from paperless.utils import slugify as slugifyOCR

from .managers import LogManager

try:
Expand Down Expand Up @@ -221,6 +223,19 @@ class Document(models.Model):
"primarily used for searching."
)

searchable_content = models.TextField(
db_index=True,
blank=True,
editable=False,
)

searchable_title = models.CharField(
max_length=128,
blank=True,
db_index=True,
editable=False,
)

file_type = models.CharField(
max_length=4,
editable=False,
Expand Down Expand Up @@ -266,6 +281,13 @@ def __str__(self):
return "{}: {}".format(created, self.correspondent or self.title)
return str(created)

def save(self, *args, **kwargs):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you for doing this in .save() and not in a signal. This is much more appropriate.

if self.title is not None:
self.searchable_title = slugifyOCR(self.title)
if self.content is not None:
self.searchable_content = slugifyOCR(self.content)
return super().save(*args, **kwargs)

@property
def source_path(self):

Expand Down
25 changes: 25 additions & 0 deletions src/documents/tests/test_document_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,28 @@ def test_file_deletion(self):
mock_unlink.assert_any_call(file_path)
mock_unlink.assert_any_call(thumb_path)
self.assertEqual(mock_unlink.call_count, 2)

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hooray for tests!

def test_searchable_title_and_content(self):
document = Document.objects.create(
title="Title",
content="Content",
checksum="azerty1"
)
self.assertEqual(document.title, "Title")
self.assertEqual(document.content, "Content")
self.assertEqual(document.searchable_title, "title")
self.assertEqual(document.searchable_content, "content")

document = Document.objects.create(
title="Zürich Weiß",
content="Telefónica ééé aaa",
checksum="azerty2"
)
self.assertEqual(document.searchable_title, "zurich weiss")
self.assertEqual(document.searchable_content, "telefonica eee aaa")

document = Document.objects.create(checksum="azerty3")
self.assertEqual(document.title, '')
self.assertEqual(document.content, '')
self.assertEqual(document.searchable_title, '')
self.assertEqual(document.searchable_content, '')
9 changes: 9 additions & 0 deletions src/paperless/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
import unicodedata


def slugify(content):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can we come up with a more appropriate name for this than slugify()? Something like make_searchable() would be more apt as this doesn't turn the content into a slug at all.

return (
unicodedata.normalize("NFKD", content.casefold())
.encode("ASCII", "ignore")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hm, I'm not sure if this is the right thing to do. In light of your change only searching the new searchable fields, this straight up breaks search for languages that share no characters with the ASCII codepage, right? Judging from the official Python docs, "ignore" would simply drop non-ASCII characters, resulting in a potentially empty search-term.

As far as I can tell not ignoring these characters here would break the entire feature, your Zürich Weiß example would, after case-folding first, produce zu\xcc\x88rich weiss, which obviously cannot be searched for with zurich weiss.

Maybe the simplest fix is to search both the new ASCII-fied fields and the regular fields?

.decode("utf-8")
)