
Commit 37d2946

CHECK-2437 add support for using analyzers by language (#258)
* CHECK-2437 add support for using analyzers by language
* CHECK-2437 remove old dependencies from half-implementation of analyzers
* CHECK-2437 shift es client
* CHECK-2437 add tests for new use case
* CHECK-2437 add fix for tests to actually pass
1 parent 030fc10 commit 37d2946


9 files changed: +266 -78 lines changed


app/main/controller/bulk_similarity_controller.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 from app.main.lib.fields import JsonObject
-from app.main.lib.elasticsearch import language_to_analyzer
 from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.lib.text_similarity import get_document_body
 from app.main.lib import similarity

app/main/controller/bulk_update_similarity_controller.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 from app.main.lib.fields import JsonObject
-from app.main.lib.elasticsearch import language_to_analyzer
 from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.controller.bulk_similarity_controller import BulkSimilarityResource

app/main/lib/elasticsearch.py

Lines changed: 37 additions & 64 deletions
@@ -5,6 +5,8 @@
 from elasticsearch.helpers import scan
 
 from flask import request, current_app as app
+
+from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
 def get_all_documents_matching_context(context):
     matches, clause_count = generate_matches(context)
     es = Elasticsearch(app.config['ELASTICSEARCH_URL'], timeout=30)
@@ -68,31 +70,41 @@ def merge_contexts(body, found_doc):
         body["contexts"].append(context)
     return body
 
-def store_document(body, doc_id):
-    es = Elasticsearch(app.config['ELASTICSEARCH_URL'])
-    if doc_id:
-        try:
-            found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id)
-        except elasticsearch.exceptions.NotFoundError:
-            found_doc = None
-        if found_doc:
-            result = es.update(
-                id=doc_id,
-                body={"doc": merge_contexts(body, found_doc)},
-                index=app.config['ELASTICSEARCH_SIMILARITY']
-            )
-        else:
-            result = es.index(
-                id=doc_id,
-                body=body,
-                index=app.config['ELASTICSEARCH_SIMILARITY']
-            )
-    else:
-        result = es.index(
-            body=body,
-            index=app.config['ELASTICSEARCH_SIMILARITY']
-        )
-    # es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY'])
+def update_or_create_document(body, doc_id, index):
+    es = Elasticsearch(app.config['ELASTICSEARCH_URL'], timeout=30)
+    result = None
+    if doc_id:
+        try:
+            found_doc = es.get(index=index, id=doc_id)
+        except elasticsearch.exceptions.NotFoundError:
+            found_doc = None
+        if found_doc:
+            result = es.update(
+                id=doc_id,
+                body={"doc": merge_contexts(body, found_doc)},
+                index=index
+            )
+        else:
+            result = es.index(
+                id=doc_id,
+                body=body,
+                index=index
+            )
+    else:
+        result = es.index(
+            body=body,
+            index=index
+        )
+    return result
+
+def store_document(body, doc_id, language=None):
+    indices = [app.config['ELASTICSEARCH_SIMILARITY']]
+    if language and language in SUPPORTED_LANGUAGES:
+        indices.append(app.config['ELASTICSEARCH_SIMILARITY']+"_"+language)
+    results = []
+    for index in indices:
+        results.append(update_or_create_document(body, doc_id, index))
+    result = results[0]
     success = False
     if result['result'] == 'created' or result['result'] == 'updated':
         success = True
@@ -128,42 +140,3 @@ def delete_document(doc_id, context, quiet):
             }
         else:
             return False
-
-def language_to_analyzer(lang):
-    analyzer_dict = {
-        'ar': 'arabic',
-        'hy': 'armenian',
-        'eu': 'basque',
-        'bn': 'bengali',
-        'pt-br': 'brazilian', # TODO
-        'bg': 'bulgarian',
-        'ca': 'catalan',
-        'cjk': 'cjk', # TODO
-        'cs': 'czech',
-        'da': 'danish',
-        'nl': 'dutch',
-        'en': 'english',
-        'fi': 'finnish',
-        'fr': 'french',
-        'gl': 'galician',
-        'de': 'german',
-        'gr': 'greek',
-        'hi': 'hindi',
-        'hu': 'hungarian',
-        'id': 'indonesian',
-        'ga': 'irish',
-        'it': 'italian',
-        'lv': 'latvian',
-        'lt': 'lithuanian',
-        'no': 'norwegian',
-        'fa': 'persian',
-        'pt': 'portuguese',
-        'ro': 'romanian',
-        'ru': 'russian',
-        'ku': 'sorani',
-        'es': 'spanish',
-        'sv': 'swedish',
-        'tr': 'turkish',
-        'th': 'thai'
-    }
-    return analyzer_dict.get(lang, 'standard')
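
The net effect of this file's change is that writes now fan out: every document still lands in the default similarity index, and documents with a supported language also land in a per-language index. A minimal sketch of that routing decision, using an illustrative base index name rather than the app's real config value:

SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"]

def target_indices(base_index, language=None):
    # Always write to the default index; add the per-language index only
    # when the language has a dedicated analyzer configuration.
    indices = [base_index]
    if language and language in SUPPORTED_LANGUAGES:
        indices.append(base_index + "_" + language)
    return indices

print(target_indices("alegre_similarity"))        # ['alegre_similarity']
print(target_indices("alegre_similarity", "pt"))  # ['alegre_similarity', 'alegre_similarity_pt']
print(target_indices("alegre_similarity", "fr"))  # ['alegre_similarity'] (no dedicated analyzer)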

app/main/lib/language_analyzers.py

Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
+import json
+from elasticsearch import Elasticsearch
+from flask import request, current_app as app
+SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"]
+#via https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer
+SETTINGS_BY_LANGUAGE = {
+    "en": {
+        "analysis": {
+            "filter": {
+                "english_stop": {
+                    "type": "stop",
+                    "stopwords": "_english_"
+                },
+                "english_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["example"]
+                },
+                "english_stemmer": {
+                    "type": "stemmer",
+                    "language": "english"
+                },
+                "english_possessive_stemmer": {
+                    "type": "stemmer",
+                    "language": "possessive_english"
+                }
+            },
+            "analyzer": {
+                "rebuilt_english": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "english_possessive_stemmer",
+                        "lowercase",
+                        "english_stop",
+                        "english_keywords",
+                        "english_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "es": {
+        "analysis": {
+            "filter": {
+                "spanish_stop": {
+                    "type": "stop",
+                    "stopwords": "_spanish_"
+                },
+                "spanish_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["ejemplo"]
+                },
+                "spanish_stemmer": {
+                    "type": "stemmer",
+                    "language": "light_spanish"
+                }
+            },
+            "analyzer": {
+                "rebuilt_spanish": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "spanish_stop",
+                        "spanish_keywords",
+                        "spanish_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "pt": {
+        "analysis": {
+            "filter": {
+                "portuguese_stop": {
+                    "type": "stop",
+                    "stopwords": "_portuguese_"
+                },
+                "portuguese_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["exemplo"]
+                },
+                "portuguese_stemmer": {
+                    "type": "stemmer",
+                    "language": "light_portuguese"
+                }
+            },
+            "analyzer": {
+                "rebuilt_portuguese": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "portuguese_stop",
+                        "portuguese_keywords",
+                        "portuguese_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "hi": {
+        "analysis": {
+            "filter": {
+                "hindi_stop": {
+                    "type": "stop",
+                    "stopwords": "_hindi_"
+                },
+                "hindi_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["उदाहरण"]
+                },
+                "hindi_stemmer": {
+                    "type": "stemmer",
+                    "language": "hindi"
+                }
+            },
+            "analyzer": {
+                "rebuilt_hindi": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "decimal_digit",
+                        "hindi_keywords",
+                        "indic_normalization",
+                        "hindi_normalization",
+                        "hindi_stop",
+                        "hindi_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "bn": {
+        "analysis": {
+            "filter": {
+                "bengali_stop": {
+                    "type": "stop",
+                    "stopwords": "_bengali_"
+                },
+                "bengali_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["উদাহরণ"]
+                },
+                "bengali_stemmer": {
+                    "type": "stemmer",
+                    "language": "bengali"
+                }
+            },
+            "analyzer": {
+                "rebuilt_bengali": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "decimal_digit",
+                        "bengali_keywords",
+                        "indic_normalization",
+                        "bengali_normalization",
+                        "bengali_stop",
+                        "bengali_stemmer"
+                    ]
+                }
+            }
+        }
+    }
+}
+
+def init_indices():
+    es = Elasticsearch(app.config['ELASTICSEARCH_URL'])
+    indices = es.cat.indices(h='index', s='index').split()
+    for lang in SUPPORTED_LANGUAGES:
+        index_name = app.config['ELASTICSEARCH_SIMILARITY']+"_"+lang
+        if index_name not in indices:
+            es.indices.create(index=index_name)
+        es.indices.close(index=index_name)
+        es.indices.put_mapping(
+            body=json.load(open('./elasticsearch/alegre_similarity_base.json')),
+            # include_type_name=True,
+            index=index_name
+        )
+        es.indices.put_settings(
+            body=SETTINGS_BY_LANGUAGE['pt'],
+            # include_type_name=True,
+            index=index_name
+        )
+        es.indices.open(index=index_name)
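
To see what one of these rebuilt analyzers actually does to text, its settings block can be attached to a scratch index and exercised through the _analyze API. This is a sketch only: the cluster URL, scratch index name, and sample sentence are assumptions, and it expects this repo on PYTHONPATH plus a reachable Elasticsearch node.

from elasticsearch import Elasticsearch

from app.main.lib.language_analyzers import SETTINGS_BY_LANGUAGE

es = Elasticsearch("http://localhost:9200")  # assumed local cluster
index_name = "analyzer_smoke_test_pt"        # throwaway index, not an app index

# Create a scratch index carrying the Portuguese analysis settings from this file.
es.indices.create(index=index_name, body={"settings": SETTINGS_BY_LANGUAGE["pt"]})

# Run the custom analyzer directly to see lowercasing, stopword removal and stemming.
response = es.indices.analyze(
    index=index_name,
    body={"analyzer": "rebuilt_portuguese", "text": "Os exemplos foram analisados"},
)
print([t["token"] for t in response["tokens"]])

es.indices.delete(index=index_name)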

app/main/lib/similarity.py

Lines changed: 3 additions & 2 deletions
@@ -14,7 +14,7 @@ def get_body_for_text_document(params):
         models = models|set(params['models'])
     if not models:
         models = ['elasticsearch']
-    body = {'content': params.get('text'), 'created_at': params.get("created_at", datetime.now()), 'limit': params.get("limit", DEFAULT_SEARCH_LIMIT), 'models': list(models)}
+    body = {'language': params.get('language'), 'content': params.get('text'), 'created_at': params.get("created_at", datetime.now()), 'limit': params.get("limit", DEFAULT_SEARCH_LIMIT), 'models': list(models)}
     for key in ['context', 'threshold', 'fuzzy']:
         if key in params:
             body[key] = params[key]
@@ -51,7 +51,8 @@ def add_item(item, similarity_type):
         response = add_image(item)
     elif similarity_type == "text":
         doc_id = item.pop("doc_id", None)
-        response = add_text(item, doc_id)
+        language = item.pop("language", None)
+        response = add_text(item, doc_id, language)
     app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for delete was {response}")
     return response
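
For context, the add_item branch above now strips both doc_id and language off the incoming item before calling add_text, so neither ends up in the stored body. A tiny standalone illustration with a made-up payload (the field values are assumptions, not from this commit):

item = {
    "doc_id": "abc-123",
    "text": "um exemplo de texto",
    "language": "pt",
    "models": ["elasticsearch"],
    "context": {"team_id": 1},
}

doc_id = item.pop("doc_id", None)      # -> "abc-123"
language = item.pop("language", None)  # -> "pt"
# add_text(item, doc_id, language) would receive the language explicitly;
# here we just show what remains on the item afterwards.
print(doc_id, language, sorted(item))  # abc-123 pt ['context', 'models', 'text']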
