diff --git a/src/find_posts.py b/src/find_posts.py
index b1d1183edd14018c407125d8974eef6622a35b2d..363c80b23ced4d89489930acea03c485526fe713 100644
--- a/src/find_posts.py
+++ b/src/find_posts.py
@@ -20,6 +20,10 @@ def get_title_from_htmltree(htmltree: BeautifulSoup):
         return None
     return title.text.strip()
 
+def get_subtitles_from_htmltree(htmltree: BeautifulSoup):
+    """Return the stripped text of every h2-h6 heading found in htmltree."""
+    return [h.text.strip() for h in htmltree.select('h2,h3,h4,h5,h6')]
+
 def get_body_from_htmltree(htmltree: BeautifulSoup):
     post_elem = htmltree.select_one('[data-elasticsearch-body]')
 
@@ -88,6 +92,8 @@ def create_posts(base_dir):
             logging.warning(f"No element for body found in '{path}'")
             continue
 
+        subtitles = get_subtitles_from_htmltree(htmltree)
+
         description = get_description_from_htmltree(htmltree)
         if description == None:
             description = body
@@ -100,6 +106,7 @@ def create_posts(base_dir):
         posts.append(Post(
             id=id,
             title=title,
+            subtitles=subtitles,
             url=url,
             body=body,
             description=description,
diff --git a/src/indexer.py b/src/indexer.py
index 023af245498d145f766c80443a757a7b3f689b24..a7066cab6d1df5c2d3ea90803f7e91f86c79e521 100644
--- a/src/indexer.py
+++ b/src/indexer.py
@@ -36,6 +36,7 @@ def index_posts(es: Elasticsearch, posts: List[post.Post], hash: str):
     for post in posts:
         doc = {
             "title": post.title,
+            "subtitles": post.subtitles,
             "url": post.url,
             "description": post.description,
             "body": post.body,
diff --git a/src/post.py b/src/post.py
index db4a9443c40362bd01274540d39af10d8ce70815..e03c5218da056dc5479be9bb090cfee04c4fefa7 100644
--- a/src/post.py
+++ b/src/post.py
@@ -1,7 +1,8 @@
 class Post:
-    def __init__(self, id, title, url, body, description, lang):
+    def __init__(self, id, title, subtitles, url, body, description, lang):
         self.id = id
         self.title = title
+        self.subtitles = subtitles
         self.url = url
         self.body = body
 
diff --git a/src/searcher.py b/src/searcher.py
index e0e7f67f40670d84d7c7625310e3661d4441360d..63d3ab08aef7d596bafbf6c271809d958aa2023c 100644
--- a/src/searcher.py
+++ b/src/searcher.py
@@ -63,7 +63,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str):
                 "type": "best_fields",
                 "fuzziness": "AUTO",
                 "tie_breaker": 0.3,
-                "fields": ["title^3", "description^2", "body"],
+                "fields": ["title^10", "subtitles^9", "description^2", "body"],
             }
         },
         "highlight": {
@@ -71,7 +71,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str):
                 "body" : {}
             }
         },
-        "_source": ["title", "url", "description", "lang", "body"]
+        "_source": ["title", "subtitles", "url", "description", "lang", "body"]
     }
 
     res = es.search(index=index_name, body=query)