meedan · DGaffney · Sep 28, 2022 · Sep 23, 2022 · Sep 23, 2022 · Sep 23, 2022
@@ -3,7  3,6 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 from app.main.lib.fields import JsonObject
-from app.main.lib.elasticsearch import language_to_analyzer
 from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.lib.text_similarity import get_document_body
 from app.main.lib import similarity

@@ -3,7  3,6 @@
 from elasticsearch import Elasticsearch
 from elasticsearch import helpers
 from app.main.lib.fields import JsonObject
-from app.main.lib.elasticsearch import language_to_analyzer
 from app.main.lib.shared_models.shared_model import SharedModel
 from app.main.controller.bulk_similarity_controller import BulkSimilarityResource
 

@@ -5,6  5,8 @@
 from elasticsearch.helpers import scan
 
 from flask import request, current_app as app
 
 from app.main.lib.language_analyzers import SUPPORTED_LANGUAGES
 def get_all_documents_matching_context(context):
   matches, clause_count = generate_matches(context)
   es = Elasticsearch(app.config['ELASTICSEARCH_URL'], timeout=30)
@@ -68,31  70,41 @@ def merge_contexts(body, found_doc):
             body["contexts"].append(context)
     return body
 
-def store_document(body, doc_id):
-    es = Elasticsearch(app.config['ELASTICSEARCH_URL'])
-    if doc_id:
-        try:
-            found_doc = es.get(index=app.config['ELASTICSEARCH_SIMILARITY'], id=doc_id)
-        except elasticsearch.exceptions.NotFoundError:
-            found_doc = None
-        if found_doc:
-            result = es.update(
-                id=doc_id,
-                body={"doc": merge_contexts(body, found_doc)},
-                index=app.config['ELASTICSEARCH_SIMILARITY']
-            )
-        else:
-            result = es.index(
-                id=doc_id,
-                body=body,
-                index=app.config['ELASTICSEARCH_SIMILARITY']
-            )
-    else:
-        result = es.index(
-            body=body,
-            index=app.config['ELASTICSEARCH_SIMILARITY']
-        )
-    # es.indices.refresh(index=app.config['ELASTICSEARCH_SIMILARITY'])
 def update_or_create_document(body, doc_id, index):
     """Write ``body`` into ``index``, merging with the stored document if one exists.

     Args:
         body: Document payload to store.
         doc_id: Id to store the document under; when falsy the body is
             indexed without an explicit id.
         index: Name of the Elasticsearch index to write to.

     Returns:
         The response of the ``index`` or ``update`` call that was made.
     """
     client = Elasticsearch(app.config['ELASTICSEARCH_URL'], timeout=30)

     # No id supplied: nothing to look up, index the body as-is.
     if not doc_id:
         return client.index(body=body, index=index)

     try:
         existing = client.get(index=index, id=doc_id)
     except elasticsearch.exceptions.NotFoundError:
         existing = None

     # Unknown id: create the document under the requested id.
     if not existing:
         return client.index(id=doc_id, body=body, index=index)

     # Known id: fold the stored contexts into the new body and update in place.
     return client.update(
         id=doc_id,
         body={"doc": merge_contexts(body, existing)},
         index=index
     )
 
 def store_document(body, doc_id, language=None):
     indices = [app.config['ELASTICSEARCH_SIMILARITY']]
     if language and language in SUPPORTED_LANGUAGES:
       indices.append(app.config['ELASTICSEARCH_SIMILARITY'] "_" language)
     results = []
     for index in indices:
       results.append(update_or_create_document(body, doc_id, index))
     result = results[0]
     success = False
     if result['result'] == 'created' or result['result'] == 'updated':
         success = True
@@ -128,42  140,3 @@ def delete_document(doc_id, context, quiet):
             }
         else:
             return False
-
-def language_to_analyzer(lang):
-    analyzer_dict = {
-        'ar': 'arabic',
-        'hy': 'armenian',
-        'eu': 'basque',
-        'bn': 'bengali',
-        'pt-br': 'brazilian', # TODO
-        'bg': 'bulgarian',
-        'ca': 'catalan',
-        'cjk': 'cjk', # TODO
-        'cs': 'czech',
-        'da': 'danish',
-        'nl': 'dutch',
-        'en': 'english',
-        'fi': 'finnish',
-        'fr': 'french',
-        'gl': 'galician',
-        'de': 'german',
-        'gr': 'greek',
-        'hi': 'hindi',
-        'hu': 'hungarian',
-        'id': 'indonesian',
-        'ga': 'irish',
-        'it': 'italian',
-        'lv': 'latvian',
-        'lt': 'lithuanian',
-        'no': 'norwegian',
-        'fa': 'persian',
-        'pt': 'portuguese',
-        'ro': 'romanian',
-        'ru': 'russian',
-        'ku': 'sorani',
-        'es': 'spanish',
-        'sv': 'swedish',
-        'tr': 'turkish',
-        'th': 'thai'
-    }
-    return analyzer_dict.get(lang, 'standard')
@@ -0,0  1,183 @@
 import json
 from elasticsearch import Elasticsearch
 from flask import request, current_app as app
 # Language codes that have an entry in SETTINGS_BY_LANGUAGE below; init_indices()
 # creates one "<ELASTICSEARCH_SIMILARITY>_<code>" index for each of them.
 SUPPORTED_LANGUAGES = ["en", "pt", "es", "hi", "bn"]
 # Elasticsearch index settings, keyed by language code. Each entry declares the
 # custom token filters for that language (stop words, keyword marker, stemmer)
 # and a "rebuilt_<language>" analyzer that runs them, in the listed order, after
 # the standard tokenizer.
 # Adapted from https://www.elastic.co/guide/en/elasticsearch/reference/current/analysis-lang-analyzer.html#bengali-analyzer
 # NOTE(review): the keyword_marker lists ("example", "ejemplo", ...) look like the
 # placeholder values from that documentation page; words listed there are
 # protected from stemming -- confirm whether they should be kept.
 SETTINGS_BY_LANGUAGE = {
   "en": {
     "analysis": {
       "filter": {
         "english_stop": {
           "type":       "stop",
           "stopwords":  "_english_" 
         },
         "english_keywords": {
           "type":       "keyword_marker",
           "keywords":   ["example"] 
         },
         "english_stemmer": {
           "type":       "stemmer",
           "language":   "english"
         },
         "english_possessive_stemmer": {
           "type":       "stemmer",
           "language":   "possessive_english"
         }
       },
       "analyzer": {
         "rebuilt_english": {
           "tokenizer":  "standard",
           "filter": [
             "english_possessive_stemmer",
             "lowercase",
             "english_stop",
             "english_keywords",
             "english_stemmer"
           ]
         }
       }
     }
   },
   "es": {
     "analysis": {
       "filter": {
         "spanish_stop": {
           "type":       "stop",
           "stopwords":  "_spanish_" 
         },
         "spanish_keywords": {
           "type":       "keyword_marker",
           "keywords":   ["ejemplo"] 
         },
         "spanish_stemmer": {
           "type":       "stemmer",
           "language":   "light_spanish"
         }
       },
       "analyzer": {
         "rebuilt_spanish": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
             "spanish_stop",
             "spanish_keywords",
             "spanish_stemmer"
           ]
         }
       }
     }
   },
   "pt": {
     "analysis": {
       "filter": {
         "portuguese_stop": {
           "type":       "stop",
           "stopwords":  "_portuguese_" 
         },
         "portuguese_keywords": {
           "type":       "keyword_marker",
           "keywords":   ["exemplo"] 
         },
         "portuguese_stemmer": {
           "type":       "stemmer",
           "language":   "light_portuguese"
         }
       },
       "analyzer": {
         "rebuilt_portuguese": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
             "portuguese_stop",
             "portuguese_keywords",
             "portuguese_stemmer"
           ]
         }
       }
     }
   },
   "hi": {
     "analysis": {
       "filter": {
         "hindi_stop": {
           "type":       "stop",
           "stopwords":  "_hindi_" 
         },
         "hindi_keywords": {
           "type":       "keyword_marker",
           "keywords":   ["उदाहरण"] 
         },
         "hindi_stemmer": {
           "type":       "stemmer",
           "language":   "hindi"
         }
       },
       "analyzer": {
         "rebuilt_hindi": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
             "decimal_digit",
             "hindi_keywords",
             "indic_normalization",
             "hindi_normalization",
             "hindi_stop",
             "hindi_stemmer"
           ]
         }
       }
     }
   },
   "bn": {
     "analysis": {
       "filter": {
         "bengali_stop": {
           "type":       "stop",
           "stopwords":  "_bengali_" 
         },
         "bengali_keywords": {
           "type":       "keyword_marker",
           "keywords":   ["উদাহরণ"] 
         },
         "bengali_stemmer": {
           "type":       "stemmer",
           "language":   "bengali"
         }
       },
       "analyzer": {
         "rebuilt_bengali": {
           "tokenizer":  "standard",
           "filter": [
             "lowercase",
             "decimal_digit",
             "bengali_keywords",
             "indic_normalization",
             "bengali_normalization",
             "bengali_stop",
             "bengali_stemmer"
           ]
         }
       }
     }
   }
 }
 
 def init_indices():
     """Create (when missing) and configure one similarity index per supported language.

     For every code in ``SUPPORTED_LANGUAGES`` the index named
     ``<ELASTICSEARCH_SIMILARITY>_<code>`` is created if it does not exist yet,
     closed, given the shared base mapping and that language's analysis
     settings from ``SETTINGS_BY_LANGUAGE``, and then reopened.

     Raises:
         OSError: if the base mapping file cannot be read.
         Any exception raised by the Elasticsearch client is propagated; the
         index is reopened first if the failure happened while it was closed.
     """
     es = Elasticsearch(app.config['ELASTICSEARCH_URL'])
     existing_indices = es.cat.indices(h='index', s='index').split()

     # The base mapping is the same for every language: read it once and make
     # sure the file handle is closed.
     with open('./elasticsearch/alegre_similarity_base.json', encoding='utf-8') as mapping_file:
         base_mapping = json.load(mapping_file)

     for lang in SUPPORTED_LANGUAGES:
         index_name = app.config['ELASTICSEARCH_SIMILARITY'] + "_" + lang
         if index_name not in existing_indices:
             es.indices.create(index=index_name)

         # Analysis settings can only be updated while the index is closed.
         es.indices.close(index=index_name)
         try:
             es.indices.put_mapping(body=base_mapping, index=index_name)
             # Use the settings of the language being configured (previously
             # the Portuguese settings were applied to every index).
             es.indices.put_settings(
                 body=SETTINGS_BY_LANGUAGE[lang],
                 index=index_name
             )
         finally:
             # Never leave the index closed, even if configuration failed.
             es.indices.open(index=index_name)
@@ -14,7  14,7 @@ def get_body_for_text_document(params):
         models = models|set(params['models'])
     if not models:
         models = ['elasticsearch']
-    body = {'content': params.get('text'), 'created_at': params.get("created_at", datetime.now()), 'limit': params.get("limit", DEFAULT_SEARCH_LIMIT), 'models': list(models)}
     body = {'language': params.get('language'), 'content': params.get('text'), 'created_at': params.get("created_at", datetime.now()), 'limit': params.get("limit", DEFAULT_SEARCH_LIMIT), 'models': list(models)}
     for key in ['context', 'threshold', 'fuzzy']:
         if key in params:
             body[key] = params[key]
@@ -51,7  51,8 @@ def add_item(item, similarity_type):
     response = add_image(item)
   elif similarity_type == "text":
     doc_id = item.pop("doc_id", None)
-    response = add_text(item, doc_id)
     language = item.pop("language", None)
     response = add_text(item, doc_id, language)
   app.logger.info(f"[Alegre Similarity] [Item {item}, Similarity type: {similarity_type}] response for delete was {response}")
   return response