From 848f5090af8099728af782c5c8b152de12916a54 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 16 Aug 2021 16:10:12 -0300 Subject: [PATCH 1/4] Search by language --- src/app.py | 5 +++-- src/searcher.py | 23 +++++++++++++++-------- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/app.py b/src/app.py index 259c08d..0d1aa1f 100644 --- a/src/app.py +++ b/src/app.py @@ -1,4 +1,4 @@ -from flask import Flask, jsonify +from flask import Flask, jsonify, request from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv @@ -15,7 +15,8 @@ es = connect_elastic(es_host, es_port) @app.route('/search/') def search(query): - result = search_query(es, query) + language = request.args.get("lang", "en") + result = search_query(es, query, language) data = { 'search_results': result, } diff --git a/src/searcher.py b/src/searcher.py index 881b33a..e5442d4 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -1,14 +1,21 @@ -def search_query(es, user_query): +def search_query(es, user_query, language): query = { "query": { - "multi_match": { - "query": user_query, - "type": "best_fields", - "fuzziness": "AUTO", - "tie_breaker": 0.3, - "fields": ["title^3", "body"] - } + "bool": { + "must": { + "multi_match": { + "query": user_query, + "type": "best_fields", + "fuzziness": "AUTO", + "tie_breaker": 0.3, + "fields": ["title^3", "body"], + } + }, + "filter": [ + { "term": { "lang": language } } + ] + }, }, "highlight": { "fields" : { -- GitLab From 7eea21f145f9812301aa5be80bba0cd253623a53 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 18 Aug 2021 14:24:15 -0300 Subject: [PATCH 2/4] Splitting languages into different indexes --- src/app.py | 5 ++--- src/find_posts.py | 7 +++++-- src/indexer.py | 13 +++++++------ src/main.py | 8 ++++++-- src/searcher.py | 23 ++++++++--------------- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/src/app.py b/src/app.py index 0d1aa1f..24a1195 100644 ---
a/src/app.py +++ b/src/app.py @@ -13,9 +13,8 @@ es_port = os.getenv('ES_PORT', 9200) es = connect_elastic(es_host, es_port) -@app.route('/search/') -def search(query): - language = request.args.get("lang", "en") +@app.route('//search/') +def search(language, query): result = search_query(es, query, language) data = { 'search_results': result, diff --git a/src/find_posts.py b/src/find_posts.py index 13c43c9..b1d1183 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -70,6 +70,7 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: def create_posts(base_dir): paths = find_post_paths(base_dir) + posts = [] for path in paths: htmltree = get_htmltree_from_file(path) @@ -96,11 +97,13 @@ def create_posts(base_dir): id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") - yield Post( + posts.append(Post( id=id, title=title, url=url, body=body, description=description, lang=lang - ) + )) + + return posts diff --git a/src/indexer.py b/src/indexer.py index bc3a5b2..6e3faab 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,7 +1,6 @@ from elasticsearch import Elasticsearch from time import sleep -index_name = "blog" doc_type = "post" def connect_elastic(host='localhost', port=9200): @@ -18,10 +17,12 @@ def connect_elastic(host='localhost', port=9200): raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.") -def refresh_index(es): - if es.indices.exists(index=index_name): - es.indices.delete(index=index_name) - es.indices.create(index=index_name) +def refresh_index(es, languages): + + for lang in languages: + if es.indices.exists(index=lang): + es.indices.delete(index=lang) + es.indices.create(index=lang) def index_posts(es, posts): for post in posts: @@ -32,5 +33,5 @@ def index_posts(es, posts): "lang": post.lang, } - es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) + es.index(index=post.lang, doc_type=doc_type, id=post.id, body=doc) print("Created doc for " + post.url) diff --git 
a/src/main.py b/src/main.py index eb8dd10..b2aa0f0 100644 --- a/src/main.py +++ b/src/main.py @@ -18,6 +18,7 @@ if __name__ == "__main__": posts = create_posts(base_dir) print("Posts created") + print("Posts n: ", len(posts)) es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) @@ -27,8 +28,11 @@ if __name__ == "__main__": es = indexer.connect_elastic(es_host, es_port) print("ElasticSearch connection established") - indexer.refresh_index(es) - print("Current blog index removed") + + unique_languages = set([post.lang for post in posts ]) + + indexer.refresh_index(es, unique_languages) + print("Current indexes updated") indexer.index_posts(es, posts) print("Finished indexing posts") diff --git a/src/searcher.py b/src/searcher.py index e5442d4..3d2ac4f 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -2,20 +2,13 @@ def search_query(es, user_query, language): query = { "query": { - "bool": { - "must": { - "multi_match": { - "query": user_query, - "type": "best_fields", - "fuzziness": "AUTO", - "tie_breaker": 0.3, - "fields": ["title^3", "body"], - } - }, - "filter": [ - { "term": { "lang": language } } - ] - }, + "multi_match": { + "query": user_query, + "type": "best_fields", + "fuzziness": "AUTO", + "tie_breaker": 0.3, + "fields": ["title^3", "body"], + } }, "highlight": { "fields" : { @@ -25,7 +18,7 @@ def search_query(es, user_query, language): "_source": ["title", "url", "body", "lang"] } - res = es.search(index="blog", body=query) + res = es.search(index=language, body=query) results = [] for h in res['hits']['hits']: results.append(h['_source']) -- GitLab From 8797b34c24836337f8e2e787a1ed88d32b1559be Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 18 Aug 2021 15:23:47 -0300 Subject: [PATCH 3/4] Check if index exists before attempting to query it --- src/searcher.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/searcher.py b/src/searcher.py index 3d2ac4f..4af6be0 100644 --- 
a/src/searcher.py +++ b/src/searcher.py @@ -1,4 +1,10 @@ -def search_query(es, user_query, language): +from elasticsearch.client import Elasticsearch + + +def search_query(es: Elasticsearch, user_query: str, language: str): + + if not es.indices.exists(language): + return [] query = { "query": { -- GitLab From d43bbe9a994bcb0916b37f09e04d4aa5d33a309c Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Tue, 24 Aug 2021 11:20:18 -0300 Subject: [PATCH 4/4] Updating log message --- src/main.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/main.py b/src/main.py index b2aa0f0..19d3f00 100644 --- a/src/main.py +++ b/src/main.py @@ -1,5 +1,4 @@ import sys -from os.path import expanduser from find_posts import create_posts import indexer from dotenv import load_dotenv @@ -17,8 +16,7 @@ if __name__ == "__main__": print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) - print("Posts created") - print("Posts n: ", len(posts)) + print("Posts created ({})".format(len(posts))) es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) -- GitLab