diff --git a/.example.env b/.example.env deleted file mode 100644 index c86789f387ca2a1c1e2ea7ae851c7fba2caecebe..0000000000000000000000000000000000000000 --- a/.example.env +++ /dev/null @@ -1,3 +0,0 @@ -ALLOW_ORIGINS="*" -ES_HOST="localhost" -ES_PORT=9200 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 8a23f93d7cf877b58d14f2b3b4999cb33c57e903..820e4a8501a63237fcfe78ef0db71a391478d1ce 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,3 @@ __pycache__/ .env -venv \ No newline at end of file +venv diff --git a/Dockerfile b/Dockerfile index 8add38921dba7fcd6ae35b702df7c45d94ca80b0..1be2d79cb8e87fe38646565380d4e73552e5ebf2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -14,4 +14,4 @@ RUN pip install --no-cache-dir -r requirements.txt COPY src . -CMD [ "uwsgi", "--socket", "0.0.0.0:5000", "--protocol=http", "--wsgi-file", "./wsgi.py", "--callable", "application" ] +CMD [ "uwsgi", "--ini", "config.ini" ] diff --git a/README.md b/README.md index f5ddc2d2a3511b9c6324a436e66a1a9ce7409817..84e6ae3e7ed6714191873fa5ab0ab072f6ad0fbc 100644 --- a/README.md +++ b/README.md @@ -7,6 +7,7 @@ Full-text search for your Jekyll blog with ElasticSearch. 1. run `python3 -m venv venv` to create the virtual env 1. run `. venv/bin/activate` to be able to use the python packages 1. run `pip install -r requirements.txt` to install all packages +1. Update the file `./src/config.ini` according to your needs To be able to use the linting inside the virtual environment, it is recomended to use the python inside the virtual env folder. Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode diff --git a/requirements.txt b/requirements.txt index 6cca06f521b7b1905c233947152c5e200a65a4f1..0b9242dfc3c325d34b1e694d4a8c52579aa2dc21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,6 +3,7 @@ certifi==2021.5.30 click==8.0.1 elasticsearch==7.13.3 Flask==2.0.1 +folder-hash==1.0.0 itsdangerous==2.0.1 Jinja2==3.0.1 MarkupSafe==2.0.1 diff --git a/src/app.py b/src/app.py index 24a119537f87b7f18e260fb690f4fd1467790c45..c11afd0f74d6d2c83fcb3cdb3b108d9efe2baa58 100644 --- a/src/app.py +++ b/src/app.py @@ -1,4 +1,4 @@ -from flask import Flask, jsonify, request +from flask import Flask, json, jsonify, Response from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv @@ -19,6 +19,16 @@ def search(language, query): data = { 'search_results': result, } - resp = jsonify(data) + try: + resp = jsonify(data) + except: + resp = json.dumps({'error': 'Unable to process at the moment'}) + return Response(response=resp, + status=500, + content_type='application/json', + headers={ + 'Access-Control-Allow-Origin': os.getenv('ALLOW_ORIGINS', '*') + }) + resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', '*') return resp \ No newline at end of file diff --git a/src/config.ini b/src/config.ini new file mode 100644 index 0000000000000000000000000000000000000000..be3bbbad9a4d2d3eac4cdbf0d18ca6d79a7f0355 --- /dev/null +++ b/src/config.ini @@ -0,0 +1,8 @@ +[uwsgi] +socket=0.0.0.0:5000 +protocol=http +module = wsgi:application +master = true +processes = 8 + +env=INDEX_FOLDER=/public_html \ No newline at end of file diff --git a/src/indexer.py b/src/indexer.py index e2ad0bcd1ef3335d29ef05f4602e35d4964f9f1b..a4843ceb70a2e77150d695f13076015e59a87e91 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,5 +1,7 @@ from elasticsearch import Elasticsearch from time import sleep +from typing import List +import post doc_type = "post" @@ -17,14 +19,22 @@ def connect_elastic(host='localhost', port=9200): raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.") -def refresh_index(es, languages): +def create_indexes(es: Elasticsearch, languages: List[str], hash: str): for lang in languages: - if es.indices.exists(index=lang): - es.indices.delete(index=lang) - es.indices.create(index=lang) + index_name = get_index_name(lang, hash) + if not es.indices.exists(index=index_name): + es.indices.create(index=index_name) -def index_posts(es, posts): +def get_index_name(language: str, hash: str) -> str: + return f"{language}-{hash}" + +def delete_all_indexes_except(es: Elasticsearch, except_hash: str): + for index_name in es.indices.get('*'): + if except_hash not in index_name: + es.indices.delete(index=index_name) + +def index_posts(es: Elasticsearch, posts: List[post.Post], hash: str): for post in posts: doc = { "title": post.title, @@ -34,5 +44,7 @@ def index_posts(es, posts): "lang": post.lang, } - es.index(index=post.lang, doc_type=doc_type, id=post.id, body=doc) + index_name = get_index_name(post.lang, hash) + + es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) print("Created doc for " + post.url) diff --git a/src/main.py b/src/main.py index 19d3f008aac99db90045294e8b6df76cbd21db00..d6a0f3b6597a95811008fe32376fad11aa2b0c8a 100644 --- a/src/main.py +++ b/src/main.py @@ -3,34 +3,50 @@ from find_posts import create_posts import indexer from dotenv import load_dotenv import os +import searcher +from folder_hash.hasher import Hasher load_dotenv() -if __name__ == "__main__": - # provide blog base directory as arg - if len(sys.argv) != 2: - raise BaseException('You must pass the project folder to be crawled, and only it.') - - base_dir = str(sys.argv[1]) - - print("Finding posts in %s" % base_dir) - - posts = create_posts(base_dir) - print("Posts created ({})".format(len(posts))) +def index_folder(base_dir: str): + if not os.path.exists(base_dir): + print(f"No folder was found at {base_dir}") + return es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) - print(f"Connecting to {es_host}:{es_port}") + print(f"Connecting to elastic search at: {es_host}:{es_port}") es = indexer.connect_elastic(es_host, es_port) print("ElasticSearch connection established") + hasher = Hasher(base_dir, "md5") + folder_hash = hasher.get_hash() + current_hash = searcher.get_current_hash(es) + if folder_hash == current_hash: + print(f"The folder {base_dir} was already indexed. Hash: {folder_hash}") + return - unique_languages = set([post.lang for post in posts ]) + print("Finding posts in %s" % base_dir) - indexer.refresh_index(es, unique_languages) - print("Current indexes updated") + posts = create_posts(base_dir) + print("Posts created ({})".format(len(posts))) + + unique_languages = set([post.lang for post in posts]) - indexer.index_posts(es, posts) + indexer.create_indexes(es, unique_languages, folder_hash) + indexer.index_posts(es, posts, folder_hash) print("Finished indexing posts") + + print(f"Deleting all indexes except {folder_hash}") + indexer.delete_all_indexes_except(es, folder_hash) + +if __name__ == "__main__": + # provide blog base directory as arg + if len(sys.argv) != 2: + raise BaseException('You must pass the project folder to be crawled, and only it.') + + base_dir = str(sys.argv[1]) + + index_folder(base_dir) \ No newline at end of file diff --git a/src/searcher.py b/src/searcher.py index 5b066e28ef157b064b8e7cf23e6c2a4e2588d79d..e0e7f67f40670d84d7c7625310e3661d4441360d 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -1,9 +1,59 @@ from elasticsearch.client import Elasticsearch +import math +from typing import List, Optional +import indexer +def get_current_hash_by_language(es: Elasticsearch, language: str) -> Optional[str]: + all_indexes_in_language = [index for index in es.indices.get('*') if index.startswith(language)] + + if len(all_indexes_in_language) == 0: + return None + + oldest_index = get_oldest_index(es, all_indexes_in_language) + + if oldest_index == None: + return None + + return oldest_index.split('-')[1] + +def get_current_hash(es: Elasticsearch) -> Optional[str]: + oldest_index = get_oldest_index(es, [index for index in es.indices.get('*')]) + + if oldest_index == None: + return None + + return oldest_index.split('-')[1] + +def get_oldest_index(es: Elasticsearch, indexes: List[str]) -> Optional[str]: + if len(indexes) == 0: + return None + oldest_index = None + oldest_index_date = math.inf + + for i in indexes: + + if not es.indices.exists(i): + continue + + index = es.indices.get(i) + created_date = int(index[i]['settings']['index']['creation_date']) + + is_older = oldest_index_date > created_date + if is_older: + oldest_index_date = created_date + oldest_index = i + + return oldest_index def search_query(es: Elasticsearch, user_query: str, language: str): - if not es.indices.exists(language): + current_hash = get_current_hash_by_language(es, language) + if current_hash == None: + return [] + + index_name = indexer.get_index_name(language, current_hash) + + if not es.indices.exists(index_name): return [] query = { @@ -24,7 +74,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str): "_source": ["title", "url", "description", "lang", "body"] } - res = es.search(index=language, body=query) + res = es.search(index=index_name, body=query) results = [] for h in res['hits']['hits']: results.append(h['_source'])