From b3934d171a1f40713839d5445120dca9f84e33f1 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 3 Sep 2021 16:34:46 -0300 Subject: [PATCH 1/2] Allow for patterns to exclude files from indexation --- src/find_posts.py | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/src/find_posts.py b/src/find_posts.py index b1d1183..04bf117 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -1,15 +1,29 @@ import glob +from typing import List, Pattern from bs4 import BeautifulSoup from post import Post import logging +import re DEFAULT_LANG = 'en' -def find_post_paths(base_dir): +def find_post_paths(base_dir: str) -> List[str]: files = glob.glob(base_dir + "/**/*.html", recursive=True) files = [f.replace("\\", "/") for f in files] return files +def filter_by_regex(items: List[str], patterns: List[Pattern[str]]) -> List[str]: + items_to_return = [] + for item in items: + should_add = True + for pattern in patterns: + if pattern.match(item) != None: + should_add = False + break + if should_add: + items_to_return.append(item) + return items_to_return + def get_title_from_htmltree(htmltree: BeautifulSoup): title = htmltree.select_one('[data-elasticsearch-title]') if title == None: @@ -70,6 +84,10 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: def create_posts(base_dir): paths = find_post_paths(base_dir) + patterns_to_exclude = [ + re.compile(r'.*\/devices\/.+\/(?!index).+\.html'), + ] + paths = filter_by_regex(paths, patterns_to_exclude) posts = [] for path in paths: -- GitLab From 8b7b8092cf8a2ccba41730c20bcd584ac6eb679b Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Tue, 7 Sep 2021 14:15:39 -0300 Subject: [PATCH 2/2] Exclude pattern as CLI args --- src/find_posts.py | 6 +----- src/main.py | 18 +++++++++++------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/find_posts.py b/src/find_posts.py index 04bf117..86273d8 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -3,7 +3,6 @@ from typing import List, Pattern from bs4 import BeautifulSoup from post import Post import logging -import re DEFAULT_LANG = 'en' @@ -82,11 +81,8 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: return False return True -def create_posts(base_dir): +def create_posts(base_dir: str, patterns_to_exclude: List[Pattern]): paths = find_post_paths(base_dir) - patterns_to_exclude = [ - re.compile(r'.*\/devices\/.+\/(?!index).+\.html'), - ] paths = filter_by_regex(paths, patterns_to_exclude) posts = [] for path in paths: diff --git a/src/main.py b/src/main.py index d6a0f3b..383015d 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,6 @@ +import re import sys +from typing import List, Pattern from find_posts import create_posts import indexer from dotenv import load_dotenv @@ -8,7 +10,7 @@ from folder_hash.hasher import Hasher load_dotenv() -def index_folder(base_dir: str): +def index_folder(base_dir: str, patterns_to_exclude: List[Pattern]): if not os.path.exists(base_dir): print(f"No folder was found at {base_dir}") return @@ -30,23 +32,25 @@ def index_folder(base_dir: str): print("Finding posts in %s" % base_dir) - posts = create_posts(base_dir) - print("Posts created ({})".format(len(posts))) + posts = create_posts(base_dir, patterns_to_exclude) + print("Posts created") unique_languages = set([post.lang for post in posts]) indexer.create_indexes(es, unique_languages, folder_hash) indexer.index_posts(es, posts, folder_hash) - print("Finished indexing posts") + print("Finished indexing posts (total of {})".format(len(posts))) print(f"Deleting all indexes except {folder_hash}") indexer.delete_all_indexes_except(es, folder_hash) if __name__ == "__main__": # provide blog base directory as arg - if len(sys.argv) != 2: - raise BaseException('You must pass the project folder to be crawled, and only it.') + if len(sys.argv) < 3: + raise BaseException('You must pass the project folder to be crawled.') base_dir = str(sys.argv[1]) - index_folder(base_dir) \ No newline at end of file + patterns_to_exclude = [re.compile(p) for p in sys.argv[2:]] + + index_folder(base_dir, patterns_to_exclude) \ No newline at end of file -- GitLab