diff --git a/src/app.py b/src/app.py
index 259c08dac0190dbe20bb9133e2a7e9eac11245bf..24a119537f87b7f18e260fb690f4fd1467790c45 100644
--- a/src/app.py
+++ b/src/app.py
@@ -1,4 +1,4 @@
-from flask import Flask, jsonify
+from flask import Flask, jsonify, request
 from searcher import search_query
 from indexer import connect_elastic
 from dotenv import load_dotenv
@@ -13,9 +13,9 @@ es_port = os.getenv('ES_PORT', 9200)
 
 es = connect_elastic(es_host, es_port)
 
-@app.route('/search/<query>')
-def search(query):
-    result = search_query(es, query)
+@app.route('/<language>/search/<query>')
+def search(language, query):
+    result = search_query(es, query, language)
     data = {
         'search_results': result,
     }
diff --git a/src/find_posts.py b/src/find_posts.py
index 13c43c9aad945ecc51ab5b423388d36d2872d53b..b1d1183edd14018c407125d8974eef6622a35b2d 100644
--- a/src/find_posts.py
+++ b/src/find_posts.py
@@ -70,6 +70,7 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool:
 
 def create_posts(base_dir):
     paths = find_post_paths(base_dir)
+    posts = []
 
     for path in paths:
         htmltree = get_htmltree_from_file(path)
@@ -96,11 +97,13 @@ def create_posts(base_dir):
         id = path.replace(base_dir, "").replace("/", "-")
         url = path.replace(base_dir, "")
 
-        yield Post(
+        posts.append(Post(
             id=id,
             title=title,
             url=url,
             body=body,
             description=description,
             lang=lang
-        )
+        ))
+
+    return posts
diff --git a/src/indexer.py b/src/indexer.py
index bc3a5b2797a1a9cb8840771d9dd719edefbb9fe0..6e3faab9308c4481af96851780406147713797f8 100644
--- a/src/indexer.py
+++ b/src/indexer.py
@@ -1,7 +1,6 @@
 from elasticsearch import Elasticsearch
 from time import sleep
 
-index_name = "blog"
 doc_type = "post"
 
 def connect_elastic(host='localhost', port=9200):
@@ -18,10 +17,12 @@ def connect_elastic(host='localhost', port=9200):
 
     raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.")
 
-def refresh_index(es):
-    if es.indices.exists(index=index_name):
-        es.indices.delete(index=index_name)
-    es.indices.create(index=index_name)
+def refresh_index(es, languages):
+
+    for lang in languages:
+        if es.indices.exists(index=lang):
+            es.indices.delete(index=lang)
+        es.indices.create(index=lang)
 
 def index_posts(es, posts):
     for post in posts:
@@ -32,5 +33,5 @@ def index_posts(es, posts):
             "lang": post.lang,
         }
 
-        es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc)
+        es.index(index=post.lang, doc_type=doc_type, id=post.id, body=doc)
         print("Created doc for " + post.url)
diff --git a/src/main.py b/src/main.py
index eb8dd1089e09b4541a716b846095ba4f1475a7d0..19d3f008aac99db90045294e8b6df76cbd21db00 100644
--- a/src/main.py
+++ b/src/main.py
@@ -1,5 +1,4 @@
 import sys
-from os.path import expanduser
 from find_posts import create_posts
 import indexer
 from dotenv import load_dotenv
@@ -17,7 +16,7 @@ if __name__ == "__main__":
 
     print("Finding posts in %s" % base_dir)
     posts = create_posts(base_dir)
-    print("Posts created")
+    print("Posts created ({})".format(len(posts)))
 
     es_host = os.getenv('ES_HOST', 'localhost')
     es_port = os.getenv('ES_PORT', 9200)
@@ -27,8 +26,11 @@ if __name__ == "__main__":
     es = indexer.connect_elastic(es_host, es_port)
     print("ElasticSearch connection established")
 
-    indexer.refresh_index(es)
-    print("Current blog index removed")
+
+    unique_languages = set([post.lang for post in posts])
+
+    indexer.refresh_index(es, unique_languages)
+    print("Current indexes updated")
 
     indexer.index_posts(es, posts)
     print("Finished indexing posts")
diff --git a/src/searcher.py b/src/searcher.py
index 881b33ac73bbeee29fbdc79626e41c1f7fb719e4..4af6be0b1b69a0c4d1801eaecdfba800aea3d64a 100644
--- a/src/searcher.py
+++ b/src/searcher.py
@@ -1,4 +1,10 @@
-def search_query(es, user_query):
+from elasticsearch.client import Elasticsearch
+
+
+def search_query(es: Elasticsearch, user_query: str, language: str):
+
+    if not es.indices.exists(language):
+        return []
 
     query = {
         "query": {
@@ -7,7 +13,7 @@ def search_query(es, user_query):
                 "type": "best_fields",
                 "fuzziness": "AUTO",
                 "tie_breaker": 0.3,
-                "fields": ["title^3", "body"]
+                "fields": ["title^3", "body"],
             }
         },
         "highlight": {
@@ -18,7 +24,7 @@
         "_source": ["title", "url", "body", "lang"]
     }
 
-    res = es.search(index="blog", body=query)
+    res = es.search(index=language, body=query)
    results = []
    for h in res['hits']['hits']:
        results.append(h['_source'])
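
For reference, a minimal sketch of how the reworked per-language endpoint could be exercised once the language indexes have been built. The localhost:5000 address, the "en" index, and the sample query term are illustrative assumptions, not part of this change, and it presumes the handler JSON-encodes the data dict shown in the src/app.py hunk.

# Minimal sketch, assuming the Flask app from src/app.py is running on
# localhost:5000 and indexer.refresh_index/index_posts have created an
# "en" index. The query term "elasticsearch" is only an example.
import requests

resp = requests.get("http://localhost:5000/en/search/elasticsearch")
resp.raise_for_status()

# Each entry mirrors the "_source" fields requested in searcher.search_query:
# title, url, body and lang.
for hit in resp.json()["search_results"]:
    print(hit["title"], "->", hit["url"])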