
Commit 37d2946

CHECK-2437 add support for using analyzers by language (#258)
* CHECK-2437 add support for using analyzers by language
* CHECK-2437 remove old dependencies from half-implementation of analyzers
* CHECK-2437 shift es client
* CHECK-2437 add tests for new use case
* CHECK-2437 add fix for tests to actually pass
1 parent 030fc10 commit 37d2946


9 files changed: +266 -78 lines changed


app/main/controller/bulk_similarity_controller.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 from app.main.lib.fields import JsonObject
-from app.main.lib.elasticsearch import language_to_analyzer
 from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.lib.text_similarity import get_document_body
 from app.main.lib import similarity

app/main/controller/bulk_update_similarity_controller.py

Lines changed: 0 additions & 1 deletion
@@ -3,7 +3,6 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 from app.main.lib.fields import JsonObject
-from app.main.lib.elasticsearch import language_to_analyzer
 from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.controller.bulk_similarity_controller import BulkSimilarityResource

app/main/lib/elasticsearch.py

Lines changed: 37 additions & 64 deletions
@@ -5,6 +5,8 @@
 from elasticsearch.helpers import scan
 
 from flask import request, current_app as app
+
+from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
 def get_all_documents_matching_context(context):
     matches, clause_count = generate_matches(context)
     es = Elasticsearch(app.config['ELASTICSEARCH_URL'], timeout=30)
@@ -68,31 +70,41 @@ def merge_contexts(body, found_doc):
         body["contexts"].append(context)
     return body
 
-def store_document(body, doc_id):
-    es = Elasticsearch(app.config['ELASTICSEARCH_URL'])
-    if doc_id:
-        try:
-            found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id)
-        except elasticsearch.exceptions.NotFoundError:
-            found_doc = None
-        if found_doc:
-            result = es.update(
-                id=doc_id,
-                body={"doc": merge_contexts(body, found_doc)},
-                index=app.config['ELASTICSEARCH_SIMILARITY']
-            )
-        else:
-            result = es.index(
-                id=doc_id,
-                body=body,
-                index=app.config['ELASTICSEARCH_SIMILARITY']
-            )
-    else:
-        result = es.index(
-            body=body,
-            index=app.config['ELASTICSEARCH_SIMILARITY']
-        )
-    # es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY'])
+def update_or_create_document(body, doc_id, index):
+    es = Elasticsearch(app.config['ELASTICSEARCH_URL'], timeout=30)
+    result = None
+    if doc_id:
+        try:
+            found_doc = es.get(index=index, id=doc_id)
+        except elasticsearch.exceptions.NotFoundError:
+            found_doc = None
+        if found_doc:
+            result = es.update(
+                id=doc_id,
+                body={"doc": merge_contexts(body, found_doc)},
+                index=index
+            )
+        else:
+            result = es.index(
+                id=doc_id,
+                body=body,
+                index=index
+            )
+    else:
+        result = es.index(
+            body=body,
+            index=index
+        )
+    return result
+
+def store_document(body, doc_id, language=None):
+    indices = [app.config['ELASTICSEARCH_SIMILARITY']]
+    if language and language in SUPPORTED_LANGUAGES:
+        indices.append(app.config['ELASTICSEARCH_SIMILARITY']+"_"+language)
+    results = []
+    for index in indices:
+        results.append(update_or_create_document(body, doc_id, index))
+    result = results[0]
     success = False
     if result['result'] == 'created' or result['result'] == 'updated':
         success = True
@@ -128,42 +140,3 @@ def delete_document(doc_id, context, quiet):
             }
         else:
             return False
-
-def language_to_analyzer(lang):
-    analyzer_dict = {
-        'ar': 'arabic',
-        'hy': 'armenian',
-        'eu': 'basque',
-        'bn': 'bengali',
-        'pt-br': 'brazilian', # TODO
-        'bg': 'bulgarian',
-        'ca': 'catalan',
-        'cjk': 'cjk', # TODO
-        'cs': 'czech',
-        'da': 'danish',
-        'nl': 'dutch',
-        'en': 'english',
-        'fi': 'finnish',
-        'fr': 'french',
-        'gl': 'galician',
-        'de': 'german',
-        'gr': 'greek',
-        'hi': 'hindi',
-        'hu': 'hungarian',
-        'id': 'indonesian',
-        'ga': 'irish',
-        'it': 'italian',
-        'lv': 'latvian',
-        'lt': 'lithuanian',
-        'no': 'norwegian',
-        'fa': 'persian',
-        'pt': 'portuguese',
-        'ro': 'romanian',
-        'ru': 'russian',
-        'ku': 'sorani',
-        'es': 'spanish',
-        'sv': 'swedish',
-        'tr': 'turkish',
-        'th': 'thai'
-    }
-    return analyzer_dict.get(lang, 'standard')
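
The net effect of this file's change is that writes now fan out: every document still lands in the default similarity index, and documents with a supported language also land in a per-language index. A minimal sketch of that routing decision, using an illustrative base index name rather than the app's real config value:

SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"]

def target_indices(base_index, language=None):
    # Always write to the default index; add the per-language index only
    # when the language has a dedicated analyzer configuration.
    indices = [base_index]
    if language and language in SUPPORTED_LANGUAGES:
        indices.append(base_index + "_" + language)
    return indices

print(target_indices("alegre_similarity"))        # ['alegre_similarity']
print(target_indices("alegre_similarity", "pt"))  # ['alegre_similarity', 'alegre_similarity_pt']
print(target_indices("alegre_similarity", "fr"))  # ['alegre_similarity'] (no dedicated analyzer)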

app/main/lib/language_analyzers.py

Lines changed: 183 additions & 0 deletions
@@ -0,0 +1,183 @@
+import json
+from elasticsearch import Elasticsearch
+from flask import request, current_app as app
+SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"]
+#via https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer
+SETTINGS_BY_LANGUAGE = {
+    "en": {
+        "analysis": {
+            "filter": {
+                "english_stop": {
+                    "type": "stop",
+                    "stopwords": "_english_"
+                },
+                "english_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["example"]
+                },
+                "english_stemmer": {
+                    "type": "stemmer",
+                    "language": "english"
+                },
+                "english_possessive_stemmer": {
+                    "type": "stemmer",
+                    "language": "possessive_english"
+                }
+            },
+            "analyzer": {
+                "rebuilt_english": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "english_possessive_stemmer",
+                        "lowercase",
+                        "english_stop",
+                        "english_keywords",
+                        "english_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "es": {
+        "analysis": {
+            "filter": {
+                "spanish_stop": {
+                    "type": "stop",
+                    "stopwords": "_spanish_"
+                },
+                "spanish_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["ejemplo"]
+                },
+                "spanish_stemmer": {
+                    "type": "stemmer",
+                    "language": "light_spanish"
+                }
+            },
+            "analyzer": {
+                "rebuilt_spanish": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "spanish_stop",
+                        "spanish_keywords",
+                        "spanish_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "pt": {
+        "analysis": {
+            "filter": {
+                "portuguese_stop": {
+                    "type": "stop",
+                    "stopwords": "_portuguese_"
+                },
+                "portuguese_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["exemplo"]
+                },
+                "portuguese_stemmer": {
+                    "type": "stemmer",
+                    "language": "light_portuguese"
+                }
+            },
+            "analyzer": {
+                "rebuilt_portuguese": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "portuguese_stop",
+                        "portuguese_keywords",
+                        "portuguese_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "hi": {
+        "analysis": {
+            "filter": {
+                "hindi_stop": {
+                    "type": "stop",
+                    "stopwords": "_hindi_"
+                },
+                "hindi_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["उदाहरण"]
+                },
+                "hindi_stemmer": {
+                    "type": "stemmer",
+                    "language": "hindi"
+                }
+            },
+            "analyzer": {
+                "rebuilt_hindi": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "decimal_digit",
+                        "hindi_keywords",
+                        "indic_normalization",
+                        "hindi_normalization",
+                        "hindi_stop",
+                        "hindi_stemmer"
+                    ]
+                }
+            }
+        }
+    },
+    "bn": {
+        "analysis": {
+            "filter": {
+                "bengali_stop": {
+                    "type": "stop",
+                    "stopwords": "_bengali_"
+                },
+                "bengali_keywords": {
+                    "type": "keyword_marker",
+                    "keywords": ["উদাহরণ"]
+                },
+                "bengali_stemmer": {
+                    "type": "stemmer",
+                    "language": "bengali"
+                }
+            },
+            "analyzer": {
+                "rebuilt_bengali": {
+                    "tokenizer": "standard",
+                    "filter": [
+                        "lowercase",
+                        "decimal_digit",
+                        "bengali_keywords",
+                        "indic_normalization",
+                        "bengali_normalization",
+                        "bengali_stop",
+                        "bengali_stemmer"
+                    ]
+                }
+            }
+        }
+    }
+}
+
+def init_indices():
+    es = Elasticsearch(app.config['ELASTICSEARCH_URL'])
+    indices = es.cat.indices(h='index', s='index').split()
+    for lang in SUPPORTED_LANGUAGES:
+        index_name = app.config['ELASTICSEARCH_SIMILARITY']+"_"+lang
+        if index_name not in indices:
+            es.indices.create(index=index_name)
+        es.indices.close(index=index_name)
+        es.indices.put_mapping(
+            body=json.load(open('./elasticsearch/alegre_similarity_base.json')),
+            # include_type_name=True,
+            index=index_name
+        )
+        es.indices.put_settings(
+            body=SETTINGS_BY_LANGUAGE['pt'],
+            # include_type_name=True,
+            index=index_name
+        )
+        es.indices.open(index=index_name)
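
To see what one of these rebuilt analyzers actually does to text, its settings block can be attached to a scratch index and exercised through the _analyze API. This is a sketch only: the cluster URL, scratch index name, and sample sentence are assumptions, and it expects this repo on PYTHONPATH plus a reachable Elasticsearch node.

from elasticsearch import Elasticsearch

from app.main.lib.language_analyzers import SETTINGS_BY_LANGUAGE

es = Elasticsearch("http://localhost:9200")  # assumed local cluster
index_name = "analyzer_smoke_test_pt"        # throwaway index, not an app index

# Create a scratch index carrying the Portuguese analysis settings from this file.
es.indices.create(index=index_name, body={"settings": SETTINGS_BY_LANGUAGE["pt"]})

# Run the custom analyzer directly to see lowercasing, stopword removal and stemming.
response = es.indices.analyze(
    index=index_name,
    body={"analyzer": "rebuilt_portuguese", "text": "Os exemplos foram analisados"},
)
print([t["token"] for t in response["tokens"]])

es.indices.delete(index=index_name)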

app/main/lib/similarity.py

Lines changed: 3 additions & 2 deletions
@@ -14,7 +14,7 @@ def get_body_for_text_document(params):
         models = models|set(params['models'])
     if not models:
         models = ['elasticsearch']
-    body = {'content': params.get('text'), 'created_at': params.get("created_at", datetime.now()), 'limit': params.get("limit", DEFAULT_SEARCH_LIMIT), 'models': list(models)}
+    body = {'language': params.get('language'), 'content': params.get('text'), 'created_at': params.get("created_at", datetime.now()), 'limit': params.get("limit", DEFAULT_SEARCH_LIMIT), 'models': list(models)}
     for key in ['context', 'threshold', 'fuzzy']:
         if key in params:
             body[key] = params[key]
@@ -51,7 +51,8 @@ def add_item(item, similarity_type):
         response = add_image(item)
     elif similarity_type == "text":
         doc_id = item.pop("doc_id", None)
-        response = add_text(item, doc_id)
+        language = item.pop("language", None)
+        response = add_text(item, doc_id, language)
     app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for delete was {response}")
     return response
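
For context, the add_item branch above now strips both doc_id and language off the incoming item before calling add_text, so neither ends up in the stored body. A tiny standalone illustration with a made-up payload (the field values are assumptions, not from this commit):

item = {
    "doc_id": "abc-123",
    "text": "um exemplo de texto",
    "language": "pt",
    "models": ["elasticsearch"],
    "context": {"team_id": 1},
}

doc_id = item.pop("doc_id", None)      # -> "abc-123"
language = item.pop("language", None)  # -> "pt"
# add_text(item, doc_id, language) would receive the language explicitly;
# here we just show what remains on the item afterwards.
print(doc_id, language, sorted(item))  # abc-123 pt ['context', 'models', 'text']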
