-
Notifications
You must be signed in to change notification settings - Fork 498
Add insensitive search for documents #545
base: master
Are you sure you want to change the base?
Changes from 3 commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
# Generated by Django 2.0.13 on 2019-05-30 14:50 | ||
import unicodedata | ||
|
||
from django.db import migrations, models | ||
|
||
from paperless.utils import slugify as slugifyOCR | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. While it can be tempting to import stuff from modules into a migration to keep your code DRY, this will later bite us in the ass should we decide to rename/remove this function. It will break all migrations for all time as a result. Instead, if you have logic you wish to make available in a migration, copy it verbatim into the migration. |
||
|
||
|
||
def casefold_forwards(apps, schema_editor):
    """Fill the new searchable_* columns for every existing document.

    Args:
        apps: historical app registry supplied by ``RunPython``.
        schema_editor: schema editor supplied by ``RunPython`` (unused).
    """
    # NOTE(review): slugifyOCR is imported from paperless.utils at the top of
    # this migration. Migrations that import application code break if that
    # code is later renamed or removed -- consider inlining the helper here.
    Document = apps.get_model("documents", "Document")
    # Casefolding/normalising can lengthen text (e.g. "ß" becomes "ss"), so
    # the derived title is clipped to what the column can hold.
    title_limit = Document._meta.get_field("searchable_title").max_length
    for doc in Document.objects.all():
        if doc.title is not None:
            doc.searchable_title = slugifyOCR(doc.title)[:title_limit]
        if doc.content is not None:
            doc.searchable_content = slugifyOCR(doc.content)
        # Only the derived columns change, so only those are written back.
        doc.save(update_fields=["searchable_title", "searchable_content"])


class Migration(migrations.Migration):
    """Add the searchable_title/searchable_content columns and backfill them."""

    dependencies = [("documents", "0022_auto_20181007_1420")]

    # AddField is reversed automatically (the column is dropped), and the data
    # step has nothing to undo, so RunPython.noop is all the reverse path needs.
    operations = [
        # NOTE(review): a B-tree index on a TextField holding full document
        # text may exceed the index row size limit on some databases (e.g.
        # PostgreSQL) for long documents -- confirm db_index is wanted here.
        migrations.AddField(
            model_name="document",
            name="searchable_content",
            field=models.TextField(blank=True, db_index=True, editable=False),
        ),
        migrations.AddField(
            model_name="document",
            name="searchable_title",
            field=models.CharField(
                max_length=128, blank=True, db_index=True, editable=False
            ),
        ),
        migrations.RunPython(casefold_forwards, migrations.RunPython.noop),
    ]
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,6 +14,8 @@ | |
from django.utils.text import slugify | ||
from fuzzywuzzy import fuzz | ||
|
||
from paperless.utils import slugify as slugifyOCR | ||
|
||
from .managers import LogManager | ||
|
||
try: | ||
|
@@ -221,6 +223,19 @@ class Document(models.Model): | |
"primarily used for searching." | ||
) | ||
|
||
searchable_content = models.TextField( | ||
db_index=True, | ||
blank=True, | ||
editable=False, | ||
) | ||
|
||
searchable_title = models.CharField( | ||
max_length=128, | ||
blank=True, | ||
db_index=True, | ||
editable=False, | ||
) | ||
|
||
file_type = models.CharField( | ||
max_length=4, | ||
editable=False, | ||
|
@@ -266,6 +281,13 @@ def __str__(self): | |
return "{}: {}".format(created, self.correspondent or self.title) | ||
return str(created) | ||
|
||
def save(self, *args, **kwargs):
    """Refresh the derived search columns, then persist the document.

    ``searchable_title`` and ``searchable_content`` are recomputed from
    ``title`` and ``content`` on every save so they never drift apart.
    All positional and keyword arguments are forwarded to the parent
    ``save()`` unchanged, and its return value is returned.
    """
    if self.title is not None:
        # Casefolding/normalising can lengthen text (e.g. "ß" becomes "ss"),
        # so clip the derived value to the column's max_length instead of
        # letting the database reject or silently truncate it.
        limit = self._meta.get_field("searchable_title").max_length
        self.searchable_title = slugifyOCR(self.title)[:limit]
    if self.content is not None:
        self.searchable_content = slugifyOCR(self.content)
    return super().save(*args, **kwargs)
|
||
@property | ||
def source_path(self): | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,3 +21,28 @@ def test_file_deletion(self): | |
mock_unlink.assert_any_call(file_path) | ||
mock_unlink.assert_any_call(thumb_path) | ||
self.assertEqual(mock_unlink.call_count, 2) | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Hooray for tests! |
||
def test_searchable_title_and_content(self):
    """The searchable_* fields mirror title/content, casefolded and unaccented."""
    # Plain ASCII input: only the case changes.
    plain = Document.objects.create(
        title="Title", content="Content", checksum="azerty1"
    )
    self.assertEqual(plain.title, "Title")
    self.assertEqual(plain.content, "Content")
    self.assertEqual(plain.searchable_title, "title")
    self.assertEqual(plain.searchable_content, "content")

    # Accented input: diacritics are dropped and "ß" expands to "ss".
    accented = Document.objects.create(
        title="Zürich Weiß", content="Telefónica ééé aaa", checksum="azerty2"
    )
    self.assertEqual(accented.searchable_title, "zurich weiss")
    self.assertEqual(accented.searchable_content, "telefonica eee aaa")

    # No title/content supplied: every field stays an empty string.
    blank = Document.objects.create(checksum="azerty3")
    for attribute in (
        "title",
        "content",
        "searchable_title",
        "searchable_content",
    ):
        self.assertEqual(getattr(blank, attribute), '')
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
import unicodedata | ||
|
||
|
||
def slugify(content):
    """Return a case- and accent-insensitive form of *content* for searching.

    The text is casefolded, decomposed with NFKD, and stripped of combining
    marks, so "Zürich Weiß" becomes "zurich weiss". Characters that have no
    decomposition (e.g. Cyrillic or CJK text) are kept as they are rather
    than discarded, so documents in non-Latin scripts remain searchable.

    Despite the name, no URL slug is produced: whitespace and punctuation
    are left untouched.

    Args:
        content: the string to normalise.

    Returns:
        The normalised string.
    """
    decomposed = unicodedata.normalize("NFKD", content.casefold())
    # combining() is non-zero only for combining marks such as the accents
    # NFKD splits off; dropping just those leaves every base character intact.
    return "".join(ch for ch in decomposed if not unicodedata.combining(ch))
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In the interests of consistency, please don't reformat imports. If anything, imports should conform to an isort configuration of: