From c9f192a4df48902425504ab46ae1c00ad5aa4138 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 7 Oct 2021 17:14:49 -0300 Subject: [PATCH 1/3] Parse subtitles --- src/find_posts.py | 12 ++++++++++++ src/indexer.py | 1 + src/post.py | 3 ++- src/searcher.py | 4 ++-- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/src/find_posts.py b/src/find_posts.py index b1d1183..7aac2a9 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -2,6 +2,7 @@ import glob from bs4 import BeautifulSoup from post import Post import logging +from multiprocessing import Pool DEFAULT_LANG = 'en' @@ -20,6 +21,14 @@ def get_title_from_htmltree(htmltree: BeautifulSoup): return None return title.text.strip() +def get_subtitles_from_htmltree(htmltree: BeautifulSoup): + subtitles = [] + headings = htmltree.select('h2,h3,h4,h5,h6') + for h in headings: + subtitles.append(h.text.strip()) + + return subtitles + def get_body_from_htmltree(htmltree: BeautifulSoup): post_elem = htmltree.select_one('[data-elasticsearch-body]') @@ -88,6 +97,8 @@ def create_posts(base_dir): logging.warning(f"No element for body found in '{path}'") continue + subtittles = get_subtitles_from_htmltree(htmltree) + description = get_description_from_htmltree(htmltree) if description == None: description = body @@ -100,6 +111,7 @@ def create_posts(base_dir): posts.append(Post( id=id, title=title, + subtittles=subtittles, url=url, body=body, description=description, diff --git a/src/indexer.py b/src/indexer.py index 023af24..a7066ca 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -36,6 +36,7 @@ def index_posts(es: Elasticsearch, posts: List[post.Post], hash: str): for post in posts: doc = { "title": post.title, + "subtitles": post.subtittles, "url": post.url, "description": post.description, "body": post.body, diff --git a/src/post.py b/src/post.py index db4a944..e03c521 100644 --- a/src/post.py +++ b/src/post.py @@ -1,7 +1,8 @@ class Post: - def __init__(self, id, title, url, body, description, 
lang): + def __init__(self, id, title, subtittles, url, body, description, lang): self.id = id self.title = title + self.subtittles = subtittles self.url = url self.body = body diff --git a/src/searcher.py b/src/searcher.py index e0e7f67..d02420a 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -63,7 +63,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str): "type": "best_fields", "fuzziness": "AUTO", "tie_breaker": 0.3, - "fields": ["title^3", "description^2", "body"], + "fields": ["title^10", "subtitles^9", "description^2", "body"], } }, "highlight": { @@ -71,7 +71,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str): "body" : {} } }, - "_source": ["title", "url", "description", "lang", "body"] + "_source": ["title", "subtitles", "url", "description", "lang"] } res = es.search(index=index_name, body=query) -- GitLab From 136926d57d8f63a2843b68af00940d40bcf04812 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 7 Oct 2021 17:18:18 -0300 Subject: [PATCH 2/3] (fix) added body on results --- src/searcher.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/searcher.py b/src/searcher.py index d02420a..63d3ab0 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -71,7 +71,7 @@ def search_query(es: Elasticsearch, user_query: str, language: str): "body" : {} } }, - "_source": ["title", "subtitles", "url", "description", "lang"] + "_source": ["title", "subtitles", "url", "description", "lang", "body"] } res = es.search(index=index_name, body=query) -- GitLab From d898173ef664f29b2dc49e1028c3b101496856be Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 8 Oct 2021 13:58:03 -0300 Subject: [PATCH 3/3] Remove unused import --- src/find_posts.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/find_posts.py b/src/find_posts.py index 7aac2a9..363c80b 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -2,7 +2,6 @@ import glob from bs4 import BeautifulSoup from post import Post 
import logging -from multiprocessing import Pool DEFAULT_LANG = 'en' -- GitLab