From 27b77b4e89044f53bddb2c3730ae584d8d14088f Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 25 Aug 2021 16:24:15 -0300 Subject: [PATCH 1/7] Indexation in parallel with webservice --- .example.env | 3 --- Dockerfile | 2 +- README.md | 1 + src/app.py | 8 +++++++- src/config.ini | 8 ++++++++ src/main.py | 17 ++++++++++------- 6 files changed, 27 insertions(+), 12 deletions(-) delete mode 100644 .example.env create mode 100644 src/config.ini diff --git a/.example.env b/.example.env deleted file mode 100644 index c86789f..0000000 --- a/.example.env +++ /dev/null @@ -1,3 +0,0 @@ -ALLOW_ORIGINS="*" -ES_HOST="localhost" -ES_PORT=9200 \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 8add389..1be2d79 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,4 +14,4 @@ RUN pip install --no-cache-dir -r requirements.txt COPY src . -CMD [ "uwsgi", "--socket", "0.0.0.0:5000", "--protocol=http", "--wsgi-file", "./wsgi.py", "--callable", "application" ] +CMD [ "uwsgi", "--ini", "config.ini" ] diff --git a/README.md b/README.md index f5ddc2d..84e6ae3 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Full-text search for your Jekyll blog with ElasticSearch. 1. run `python3 -m venv venv` to create the virtual env 1. run `. venv/bin/activate` to be able to use the python packages 1. run `pip install -r requirements.txt` to install all packages +1. Update the file `./src/config.ini` according to your needs To be able to use the linting inside the virtual environment, it is recomended to use the python inside the virtual env folder. 
Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode diff --git a/src/app.py b/src/app.py index 24a1195..1df6ac6 100644 --- a/src/app.py +++ b/src/app.py @@ -1,13 +1,19 @@ -from flask import Flask, jsonify, request +from flask import Flask, jsonify from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv +import main +from multiprocessing import Process import os app = Flask(__name__) load_dotenv() +folder = os.getenv('INDEX_FOLDER') +task = Process(target=main.index_folder, args=(folder,)) +task.start() + es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) diff --git a/src/config.ini b/src/config.ini new file mode 100644 index 0000000..854763a --- /dev/null +++ b/src/config.ini @@ -0,0 +1,8 @@ +[uwsgi] +socket=0.0.0.0:5000 +protocol=http +module = wsgi:application +master = true +processes = 8 + +env=INDEX_FOLDER=/e_docs_website \ No newline at end of file diff --git a/src/main.py b/src/main.py index 19d3f00..ef709a5 100644 --- a/src/main.py +++ b/src/main.py @@ -6,13 +6,7 @@ import os load_dotenv() -if __name__ == "__main__": - # provide blog base directory as arg - if len(sys.argv) != 2: - raise BaseException('You must pass the project folder to be crawled, and only it.') - - base_dir = str(sys.argv[1]) - +def index_folder(base_dir: str): print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) @@ -34,3 +28,12 @@ if __name__ == "__main__": indexer.index_posts(es, posts) print("Finished indexing posts") + +if __name__ == "__main__": + # provide blog base directory as arg + if len(sys.argv) != 2: + raise BaseException('You must pass the project folder to be crawled, and only it.') + + base_dir = str(sys.argv[1]) + + index_folder(base_dir) \ No newline at end of file -- GitLab From 93654f3f576d14088198e80da4efd68ec9a4256d Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 27 Aug 2021 14:35:42 -0300 
Subject: [PATCH 2/7] Indexations with versions --- .gitignore | 3 ++- src/indexer.py | 24 ++++++++++++++++++------ src/main.py | 17 +++++++++++++---- src/searcher.py | 13 ++++++++++--- src/shared_state.py | 14 ++++++++++++++ 5 files changed, 57 insertions(+), 14 deletions(-) create mode 100644 src/shared_state.py diff --git a/.gitignore b/.gitignore index 8a23f93..a735305 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,4 @@ __pycache__/ .env -venv \ No newline at end of file +venv +shared_state_hash.txt \ No newline at end of file diff --git a/src/indexer.py b/src/indexer.py index e2ad0bc..4b11ddc 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,5 +1,7 @@ from elasticsearch import Elasticsearch from time import sleep +from typing import List +import post doc_type = "post" @@ -17,14 +19,22 @@ def connect_elastic(host='localhost', port=9200): raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.") -def refresh_index(es, languages): +def create_indexes(es: Elasticsearch, languages: List[str], hash: str): for lang in languages: - if es.indices.exists(index=lang): - es.indices.delete(index=lang) - es.indices.create(index=lang) + index_name = get_index_name(lang, hash) + es.indices.create(index=index_name) -def index_posts(es, posts): +def get_index_name(language: str, hash: str) -> str: + return f"{language}-{hash}" + +def delete_indexes(es: Elasticsearch, languages: List[str], hash: str): + for lang in languages: + index_name = get_index_name(lang, hash) + if es.indices.exists(index=index_name): + es.indices.delete(index=index_name) + +def index_posts(es: Elasticsearch, posts: List[post.Post], hash: str): for post in posts: doc = { "title": post.title, @@ -34,5 +44,7 @@ def index_posts(es, posts): "lang": post.lang, } - es.index(index=post.lang, doc_type=doc_type, id=post.id, body=doc) + index_name = get_index_name(post.lang, hash) + + es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) print("Created doc for " + 
post.url) diff --git a/src/main.py b/src/main.py index ef709a5..987d443 100644 --- a/src/main.py +++ b/src/main.py @@ -3,6 +3,8 @@ from find_posts import create_posts import indexer from dotenv import load_dotenv import os +import uuid +import shared_state load_dotenv() @@ -23,11 +25,18 @@ def index_folder(base_dir: str): unique_languages = set([post.lang for post in posts ]) - indexer.refresh_index(es, unique_languages) - print("Current indexes updated") - - indexer.index_posts(es, posts) + new_hash = str(uuid.uuid4()) + indexer.create_indexes(es, unique_languages, new_hash) + indexer.index_posts(es, posts, new_hash) print("Finished indexing posts") + + old_hash = shared_state.get_hash() + shared_state.set_hash(new_hash) + print(f"Updated shared hash to {new_hash}") + + if old_hash is not None: + print(f"Deleting old values from hash {old_hash}") + indexer.delete_indexes(es, unique_languages, old_hash) if __name__ == "__main__": # provide blog base directory as arg diff --git a/src/searcher.py b/src/searcher.py index 5b066e2..b7dc963 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -1,9 +1,16 @@ from elasticsearch.client import Elasticsearch - +import indexer +import shared_state def search_query(es: Elasticsearch, user_query: str, language: str): - if not es.indices.exists(language): + current_hash = shared_state.get_hash() + if current_hash == None: + return [] + + index_name = indexer.get_index_name(language, current_hash) + + if not es.indices.exists(index_name): return [] query = { @@ -24,7 +31,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str): "_source": ["title", "url", "description", "lang", "body"] } - res = es.search(index=language, body=query) + res = es.search(index=index_name, body=query) results = [] for h in res['hits']['hits']: results.append(h['_source']) diff --git a/src/shared_state.py b/src/shared_state.py new file mode 100644 index 0000000..6e34790 --- /dev/null +++ b/src/shared_state.py @@ -0,0 +1,14 @@ +from 
typing import Optional +import os + +HASH_FILE_NAME = 'shared_state_hash.txt' + +def get_hash() -> Optional[str]: + if not os.path.exists(HASH_FILE_NAME): + return None + with open(HASH_FILE_NAME, 'r') as f: + return f.read().strip() + +def set_hash(hash: str): + with open(HASH_FILE_NAME, 'w') as f: + f.write(hash) -- GitLab From 2841879418ca79f98bcf51a16e5d3c819f69a277 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 1 Sep 2021 11:32:47 -0300 Subject: [PATCH 3/7] Calculate hash based on folder contents --- requirements.txt | 1 + src/config.ini | 2 +- src/main.py | 25 ++++++++++++++++++------- 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/requirements.txt b/requirements.txt index 6cca06f..0b9242d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ certifi==2021.5.30 click==8.0.1 elasticsearch==7.13.3 Flask==2.0.1 +folder-hash==1.0.0 itsdangerous==2.0.1 Jinja2==3.0.1 MarkupSafe==2.0.1 diff --git a/src/config.ini b/src/config.ini index 854763a..be3bbba 100644 --- a/src/config.ini +++ b/src/config.ini @@ -5,4 +5,4 @@ module = wsgi:application master = true processes = 8 -env=INDEX_FOLDER=/e_docs_website \ No newline at end of file +env=INDEX_FOLDER=/public_html \ No newline at end of file diff --git a/src/main.py b/src/main.py index 987d443..5525dfd 100644 --- a/src/main.py +++ b/src/main.py @@ -3,12 +3,25 @@ from find_posts import create_posts import indexer from dotenv import load_dotenv import os +import subprocess import uuid import shared_state +from folder_hash.hasher import Hasher load_dotenv() def index_folder(base_dir: str): + if not os.path.exists(base_dir): + print(f"No folder was found at {base_dir}") + return + + hasher = Hasher(base_dir, "md5") + folder_hash = hasher.get_hash() + old_hash = shared_state.get_hash() + if folder_hash == old_hash: + print(f"The folder {base_dir} was already indexed. 
Hash: {folder_hash}") + return + print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) @@ -23,16 +36,14 @@ def index_folder(base_dir: str): print("ElasticSearch connection established") - unique_languages = set([post.lang for post in posts ]) + unique_languages = set([post.lang for post in posts]) - new_hash = str(uuid.uuid4()) - indexer.create_indexes(es, unique_languages, new_hash) - indexer.index_posts(es, posts, new_hash) + indexer.create_indexes(es, unique_languages, folder_hash) + indexer.index_posts(es, posts, folder_hash) print("Finished indexing posts") - old_hash = shared_state.get_hash() - shared_state.set_hash(new_hash) - print(f"Updated shared hash to {new_hash}") + shared_state.set_hash(folder_hash) + print(f"Updated shared hash to {folder_hash}") if old_hash is not None: print(f"Deleting old values from hash {old_hash}") -- GitLab From 79eb9b972d097120ddd9b22d1728e91db2ba2062 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 2 Sep 2021 11:09:54 -0300 Subject: [PATCH 4/7] Fix index already exists; Catch jsonify error; Do not index on startup --- src/app.py | 19 +++++++++++++------ src/indexer.py | 3 ++- 2 files changed, 15 insertions(+), 7 deletions(-) diff --git a/src/app.py b/src/app.py index 1df6ac6..dcbf0b0 100644 --- a/src/app.py +++ b/src/app.py @@ -1,4 +1,5 @@ -from flask import Flask, jsonify +from flask import Flask, json, jsonify, Response +from werkzeug.exceptions import HTTPException from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv @@ -10,10 +11,6 @@ app = Flask(__name__) load_dotenv() -folder = os.getenv('INDEX_FOLDER') -task = Process(target=main.index_folder, args=(folder,)) -task.start() - es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) @@ -25,6 +22,16 @@ def search(language, query): data = { 'search_results': result, } - resp = jsonify(data) + try: + resp = jsonify(data) + except: + resp = json.dumps({'error': 'Unable to 
process at the moment'}) + return Response(response=resp, + status=500, + content_type='application/json', + headers={ + 'Access-Control-Allow-Origin': os.getenv('ALLOW_ORIGINS', '*') + }) + resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', '*') return resp \ No newline at end of file diff --git a/src/indexer.py b/src/indexer.py index 4b11ddc..cb556fe 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -23,7 +23,8 @@ def create_indexes(es: Elasticsearch, languages: List[str], hash: str): for lang in languages: index_name = get_index_name(lang, hash) - es.indices.create(index=index_name) + if not es.indices.exists(index=index_name): + es.indices.create(index=index_name) def get_index_name(language: str, hash: str) -> str: return f"{language}-{hash}" -- GitLab From 3cff37bcfdfceeb8ea059fee6cb144bb9c2c22d2 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 2 Sep 2021 16:12:01 -0300 Subject: [PATCH 5/7] Not using shared file to save current hash --- src/indexer.py | 7 +++---- src/main.py | 35 ++++++++++++++------------------- src/searcher.py | 47 +++++++++++++++++++++++++++++++++++++++++++-- src/shared_state.py | 14 -------------- 4 files changed, 62 insertions(+), 41 deletions(-) delete mode 100644 src/shared_state.py diff --git a/src/indexer.py b/src/indexer.py index cb556fe..a4843ce 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -29,10 +29,9 @@ def create_indexes(es: Elasticsearch, languages: List[str], hash: str): def get_index_name(language: str, hash: str) -> str: return f"{language}-{hash}" -def delete_indexes(es: Elasticsearch, languages: List[str], hash: str): - for lang in languages: - index_name = get_index_name(lang, hash) - if es.indices.exists(index=index_name): +def delete_all_indexes_except(es: Elasticsearch, except_hash: str): + for index_name in es.indices.get('*'): + if except_hash not in index_name: es.indices.delete(index=index_name) def index_posts(es: Elasticsearch, posts: List[post.Post], hash: str): diff 
--git a/src/main.py b/src/main.py index 5525dfd..d6a0f3b 100644 --- a/src/main.py +++ b/src/main.py @@ -3,9 +3,7 @@ from find_posts import create_posts import indexer from dotenv import load_dotenv import os -import subprocess -import uuid -import shared_state +import searcher from folder_hash.hasher import Hasher load_dotenv() @@ -15,39 +13,34 @@ def index_folder(base_dir: str): print(f"No folder was found at {base_dir}") return + es_host = os.getenv('ES_HOST', 'localhost') + es_port = os.getenv('ES_PORT', 9200) + + print(f"Connecting to elastic search at: {es_host}:{es_port}") + + es = indexer.connect_elastic(es_host, es_port) + print("ElasticSearch connection established") + hasher = Hasher(base_dir, "md5") folder_hash = hasher.get_hash() - old_hash = shared_state.get_hash() - if folder_hash == old_hash: + current_hash = searcher.get_current_hash(es) + if folder_hash == current_hash: print(f"The folder {base_dir} was already indexed. Hash: {folder_hash}") - return + return print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) print("Posts created ({})".format(len(posts))) - es_host = os.getenv('ES_HOST', 'localhost') - es_port = os.getenv('ES_PORT', 9200) - - print(f"Connecting to {es_host}:{es_port}") - - es = indexer.connect_elastic(es_host, es_port) - print("ElasticSearch connection established") - - unique_languages = set([post.lang for post in posts]) indexer.create_indexes(es, unique_languages, folder_hash) indexer.index_posts(es, posts, folder_hash) print("Finished indexing posts") - shared_state.set_hash(folder_hash) - print(f"Updated shared hash to {folder_hash}") - - if old_hash is not None: - print(f"Deleting old values from hash {old_hash}") - indexer.delete_indexes(es, unique_languages, old_hash) + print(f"Deleting all indexes except {folder_hash}") + indexer.delete_all_indexes_except(es, folder_hash) if __name__ == "__main__": # provide blog base directory as arg diff --git a/src/searcher.py b/src/searcher.py index b7dc963..e0e7f67 
100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -1,10 +1,53 @@ from elasticsearch.client import Elasticsearch +import math +from typing import List, Optional import indexer -import shared_state + +def get_current_hash_by_language(es: Elasticsearch, language: str) -> Optional[str]: + all_indexes_in_language = [index for index in es.indices.get('*') if index.startswith(language)] + + if len(all_indexes_in_language) == 0: + return None + + oldest_index = get_oldest_index(es, all_indexes_in_language) + + if oldest_index == None: + return None + + return oldest_index.split('-')[1] + +def get_current_hash(es: Elasticsearch) -> Optional[str]: + oldest_index = get_oldest_index(es, [index for index in es.indices.get('*')]) + + if oldest_index == None: + return None + + return oldest_index.split('-')[1] + +def get_oldest_index(es: Elasticsearch, indexes: List[str]) -> Optional[str]: + if len(indexes) == 0: + return None + oldest_index = None + oldest_index_date = math.inf + + for i in indexes: + + if not es.indices.exists(i): + continue + + index = es.indices.get(i) + created_date = int(index[i]['settings']['index']['creation_date']) + + is_older = oldest_index_date > created_date + if is_older: + oldest_index_date = created_date + oldest_index = i + + return oldest_index def search_query(es: Elasticsearch, user_query: str, language: str): - current_hash = shared_state.get_hash() + current_hash = get_current_hash_by_language(es, language) if current_hash == None: return [] diff --git a/src/shared_state.py b/src/shared_state.py deleted file mode 100644 index 6e34790..0000000 --- a/src/shared_state.py +++ /dev/null @@ -1,14 +0,0 @@ -from typing import Optional -import os - -HASH_FILE_NAME = 'shared_state_hash.txt' - -def get_hash() -> Optional[str]: - if not os.path.exists(HASH_FILE_NAME): - return None - with open(HASH_FILE_NAME, 'r') as f: - return f.read().strip() - -def set_hash(hash: str): - with open(HASH_FILE_NAME, 'w') as f: - f.write(hash) -- GitLab From 
d7d58b69151b1ce66d0ac9652653333b7c81ff94 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 2 Sep 2021 16:26:24 -0300 Subject: [PATCH 6/7] Removing unused imports --- src/app.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/app.py b/src/app.py index dcbf0b0..c11afd0 100644 --- a/src/app.py +++ b/src/app.py @@ -1,10 +1,7 @@ from flask import Flask, json, jsonify, Response -from werkzeug.exceptions import HTTPException from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv -import main -from multiprocessing import Process import os app = Flask(__name__) -- GitLab From b66cb686745f803a13c4b1853b0c675bf7257fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arnau=20V=C3=A0zquez?= Date: Fri, 3 Sep 2021 08:02:59 +0000 Subject: [PATCH 7/7] Update .gitignore --- .gitignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.gitignore b/.gitignore index a735305..820e4a8 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,3 @@ __pycache__/ .env venv -shared_state_hash.txt \ No newline at end of file -- GitLab