Loading src/app.py +4 −4 Original line number Diff line number Diff line from flask import Flask, jsonify from flask import Flask, jsonify, request from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv Loading @@ -13,9 +13,9 @@ es_port = os.getenv('ES_PORT', 9200) es = connect_elastic(es_host, es_port) @app.route('/search/<query>') def search(query): result = search_query(es, query) @app.route('/<language>/search/<query>') def search(language, query): result = search_query(es, query, language) data = { 'search_results': result, } Loading src/find_posts.py +5 −2 Original line number Diff line number Diff line Loading @@ -70,6 +70,7 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: def create_posts(base_dir): paths = find_post_paths(base_dir) posts = [] for path in paths: htmltree = get_htmltree_from_file(path) Loading @@ -96,11 +97,13 @@ def create_posts(base_dir): id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") yield Post( posts.append(Post( id=id, title=title, url=url, body=body, description=description, lang=lang ) )) return posts src/indexer.py +7 −6 Original line number Diff line number Diff line from elasticsearch import Elasticsearch from time import sleep index_name = "blog" doc_type = "post" def connect_elastic(host='localhost', port=9200): Loading @@ -18,10 +17,12 @@ def connect_elastic(host='localhost', port=9200): raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.") def refresh_index(es): if es.indices.exists(index=index_name): es.indices.delete(index=index_name) es.indices.create(index=index_name) def refresh_index(es, languages): for lang in languages: if es.indices.exists(index=lang): es.indices.delete(index=lang) es.indices.create(index=lang) def index_posts(es, posts): for post in posts: Loading @@ -32,5 +33,5 @@ def index_posts(es, posts): "lang": post.lang, } es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) es.index(index=post.lang, doc_type=doc_type, id=post.id, body=doc) print("Created doc for " + post.url) src/main.py +6 −4 Original line number Diff line number Diff line import sys from os.path import expanduser from find_posts import create_posts import indexer from dotenv import load_dotenv Loading @@ -17,7 +16,7 @@ if __name__ == "__main__": print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) print("Posts created") print("Posts created ({})".format(len(posts))) es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) Loading @@ -27,8 +26,11 @@ if __name__ == "__main__": es = indexer.connect_elastic(es_host, es_port) print("ElasticSearch connection established") indexer.refresh_index(es) print("Current blog index removed") unique_languages = set([post.lang for post in posts ]) indexer.refresh_index(es, unique_languages) print("Current indexes updated") indexer.index_posts(es, posts) print("Finished indexing posts") src/searcher.py +9 −3 Original line number Diff line number Diff line def search_query(es, user_query): from elasticsearch.client import Elasticsearch def search_query(es: Elasticsearch, user_query: str, language: str): if not es.indices.exists(language): return [] query = { "query": { Loading @@ -7,7 +13,7 @@ def search_query(es, user_query): "type": "best_fields", "fuzziness": "AUTO", "tie_breaker": 0.3, "fields": ["title^3", "body"] "fields": ["title^3", "body"], } }, "highlight": { Loading @@ -18,7 +24,7 @@ def search_query(es, user_query): "_source": ["title", "url", "body", "lang"] } res = es.search(index="blog", body=query) res = es.search(index=language, body=query) results = [] for h in res['hits']['hits']: results.append(h['_source']) Loading Loading
src/app.py +4 −4 Original line number Diff line number Diff line from flask import Flask, jsonify from flask import Flask, jsonify, request from searcher import search_query from indexer import connect_elastic from dotenv import load_dotenv Loading @@ -13,9 +13,9 @@ es_port = os.getenv('ES_PORT', 9200) es = connect_elastic(es_host, es_port) @app.route('/search/<query>') def search(query): result = search_query(es, query) @app.route('/<language>/search/<query>') def search(language, query): result = search_query(es, query, language) data = { 'search_results': result, } Loading
src/find_posts.py +5 −2 Original line number Diff line number Diff line Loading @@ -70,6 +70,7 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: def create_posts(base_dir): paths = find_post_paths(base_dir) posts = [] for path in paths: htmltree = get_htmltree_from_file(path) Loading @@ -96,11 +97,13 @@ def create_posts(base_dir): id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") yield Post( posts.append(Post( id=id, title=title, url=url, body=body, description=description, lang=lang ) )) return posts
src/indexer.py +7 −6 Original line number Diff line number Diff line from elasticsearch import Elasticsearch from time import sleep index_name = "blog" doc_type = "post" def connect_elastic(host='localhost', port=9200): Loading @@ -18,10 +17,12 @@ def connect_elastic(host='localhost', port=9200): raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.") def refresh_index(es): if es.indices.exists(index=index_name): es.indices.delete(index=index_name) es.indices.create(index=index_name) def refresh_index(es, languages): for lang in languages: if es.indices.exists(index=lang): es.indices.delete(index=lang) es.indices.create(index=lang) def index_posts(es, posts): for post in posts: Loading @@ -32,5 +33,5 @@ def index_posts(es, posts): "lang": post.lang, } es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) es.index(index=post.lang, doc_type=doc_type, id=post.id, body=doc) print("Created doc for " + post.url)
src/main.py +6 −4 Original line number Diff line number Diff line import sys from os.path import expanduser from find_posts import create_posts import indexer from dotenv import load_dotenv Loading @@ -17,7 +16,7 @@ if __name__ == "__main__": print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) print("Posts created") print("Posts created ({})".format(len(posts))) es_host = os.getenv('ES_HOST', 'localhost') es_port = os.getenv('ES_PORT', 9200) Loading @@ -27,8 +26,11 @@ if __name__ == "__main__": es = indexer.connect_elastic(es_host, es_port) print("ElasticSearch connection established") indexer.refresh_index(es) print("Current blog index removed") unique_languages = set([post.lang for post in posts ]) indexer.refresh_index(es, unique_languages) print("Current indexes updated") indexer.index_posts(es, posts) print("Finished indexing posts")
src/searcher.py +9 −3 Original line number Diff line number Diff line def search_query(es, user_query): from elasticsearch.client import Elasticsearch def search_query(es: Elasticsearch, user_query: str, language: str): if not es.indices.exists(language): return [] query = { "query": { Loading @@ -7,7 +13,7 @@ def search_query(es, user_query): "type": "best_fields", "fuzziness": "AUTO", "tie_breaker": 0.3, "fields": ["title^3", "body"] "fields": ["title^3", "body"], } }, "highlight": { Loading @@ -18,7 +24,7 @@ def search_query(es, user_query): "_source": ["title", "url", "body", "lang"] } res = es.search(index="blog", body=query) res = es.search(index=language, body=query) results = [] for h in res['hits']['hits']: results.append(h['_source']) Loading