diff --git a/README.md b/README.md index 84e6ae3e7ed6714191873fa5ab0ab072f6ad0fbc..6aa636d67b62aa1d91fb3654da69fd4c034341a0 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,7 @@ docker run \ elasticsearch:7.13.3 ``` 1. Make sure you are at the virtual env (run `. venv/bin/activate`) -1. To index your content, run: `python3 src/main.py "PATH_TO_YOUR_CONTENT"` +1. To index your content, run: `python src/main.py "PATH_TO_YOUR_CONTENT"` 1. run `export FLASK_ENV=development` 1. run `export FLASK_APP=src/app` 1. run `flask run` diff --git a/src/app.py b/src/app.py index c11afd0f74d6d2c83fcb3cdb3b108d9efe2baa58..99de8a6af9cc419a6b8d3b260e050f05c28d7b54 100644 --- a/src/app.py +++ b/src/app.py @@ -1,5 +1,6 @@ -from flask import Flask, json, jsonify, Response -from searcher import search_query +from flask import Flask, json, Response, make_response +import searcher +import gzip from indexer import connect_elastic from dotenv import load_dotenv import os @@ -15,12 +16,12 @@ es = connect_elastic(es_host, es_port) @app.route('//search/') def search(language, query): - result = search_query(es, query, language) + result = searcher.search_query(es, query, language) data = { 'search_results': result, } try: - resp = jsonify(data) + resp = gzip_json_response(data) except: resp = json.dumps({'error': 'Unable to process at the moment'}) return Response(response=resp, @@ -31,4 +32,20 @@ def search(language, query): }) resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', '*') - return resp \ No newline at end of file + return resp + +@app.route('//autocomplete') +def autocomplete(language): + result = searcher.autocomplete(es, language) + data = { + 'autocomplete': result, + } + return gzip_json_response(data) + +def gzip_json_response(data: dict) -> Response: + content = gzip.compress(json.dumps(data).encode('utf8'), 5) + response = make_response(content) + response.headers['Content-length'] = len(content) + response.headers['Content-Encoding'] = 'gzip' + response.headers['Content-Type'] = 'application/json' + return response \ No newline at end of file diff --git a/src/searcher.py b/src/searcher.py index 63d3ab08aef7d596bafbf6c271809d958aa2023c..6509dcbfa2bad9f29d6683400bf734b95346fe6b 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -22,7 +22,13 @@ def get_current_hash(es: Elasticsearch) -> Optional[str]: if oldest_index == None: return None - return oldest_index.split('-')[1] + oldest_index_params = oldest_index.split('-') + + if len(oldest_index_params) is not 2: + print(f'Your index "{oldest_index}" is not a valid index name') + return None + + return oldest_index_params[1] def get_oldest_index(es: Elasticsearch, indexes: List[str]) -> Optional[str]: if len(indexes) == 0: @@ -47,13 +53,8 @@ def get_oldest_index(es: Elasticsearch, indexes: List[str]) -> Optional[str]: def search_query(es: Elasticsearch, user_query: str, language: str): - current_hash = get_current_hash_by_language(es, language) - if current_hash == None: - return [] - - index_name = indexer.get_index_name(language, current_hash) - - if not es.indices.exists(index_name): + index_name = get_index_name_from_lang(es=es, language=language) + if index_name is None: return [] query = { @@ -80,6 +81,46 @@ def search_query(es: Elasticsearch, user_query: str, language: str): results.append(h['_source']) return results +def autocomplete(es: Elasticsearch, language: str) -> List[str]: + + index_name = get_index_name_from_lang(es=es, language=language) + if index_name is None: + return [] + return autocomplete_for_index(es=es, index_name=index_name) + +def get_index_name_from_lang(es: Elasticsearch, language: str) -> Optional[str]: + current_hash = get_current_hash_by_language(es, language) + if current_hash == None: + return None + + index_name = indexer.get_index_name(language, current_hash) + + if not es.indices.exists(index_name): + return None + + return index_name + +def autocomplete_for_index(es: Elasticsearch, index_name: str) -> Optional[List[str]]: + query = { + "query": { + "match_all": {} + }, + "size": 10000, + "_source": ["title", "subtitles"] + } + + res = es.search(index=index_name, body=query) + results = [] + for h in res['hits']['hits']: + results.append(h['_source']['title']) + results += h['_source']['subtitles'] + + if '' in results: + results.remove('') + + results = list(set(results)) + return results + if __name__ == "__main__": import os from indexer import connect_elastic