diff --git a/src/find_posts.py b/src/find_posts.py index b1d1183edd14018c407125d8974eef6622a35b2d..86273d84db035c16ea78f25d83a6e52490ba679d 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -1,15 +1,28 @@ import glob +from typing import List, Pattern from bs4 import BeautifulSoup from post import Post import logging DEFAULT_LANG = 'en' -def find_post_paths(base_dir): +def find_post_paths(base_dir: str) -> List[str]: files = glob.glob(base_dir + "/**/*.html", recursive=True) files = [f.replace("\\", "/") for f in files] return files +def filter_by_regex(items: List[str], patterns: List[Pattern[str]]) -> List[str]: + items_to_return = [] + for item in items: + should_add = True + for pattern in patterns: + if pattern.match(item) is not None: + should_add = False + break + if should_add: + items_to_return.append(item) + return items_to_return + def get_title_from_htmltree(htmltree: BeautifulSoup): title = htmltree.select_one('[data-elasticsearch-title]') if title == None: @@ -68,8 +81,9 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: return False return True -def create_posts(base_dir): +def create_posts(base_dir: str, patterns_to_exclude: List[Pattern]): paths = find_post_paths(base_dir) + paths = filter_by_regex(paths, patterns_to_exclude) posts = [] for path in paths: diff --git a/src/main.py b/src/main.py index d6a0f3b6597a95811008fe32376fad11aa2b0c8a..383015d9f3ef68fc725faf89d1938c64549f6be5 100644 --- a/src/main.py +++ b/src/main.py @@ -1,4 +1,6 @@ +import re import sys +from typing import List, Pattern from find_posts import create_posts import indexer from dotenv import load_dotenv @@ -8,7 +10,7 @@ from folder_hash.hasher import Hasher load_dotenv() -def index_folder(base_dir: str): +def index_folder(base_dir: str, patterns_to_exclude: List[Pattern]): if not os.path.exists(base_dir): print(f"No folder was found at {base_dir}") return @@ -30,23 +32,25 @@ def index_folder(base_dir: str): print("Finding posts in %s" % base_dir) - posts = 
create_posts(base_dir) - print("Posts created ({})".format(len(posts))) + posts = create_posts(base_dir, patterns_to_exclude) + print("Posts created") unique_languages = set([post.lang for post in posts]) indexer.create_indexes(es, unique_languages, folder_hash) indexer.index_posts(es, posts, folder_hash) - print("Finished indexing posts") + print("Finished indexing posts (total of {})".format(len(posts))) print(f"Deleting all indexes except {folder_hash}") indexer.delete_all_indexes_except(es, folder_hash) if __name__ == "__main__": # provide blog base directory as arg - if len(sys.argv) != 2: - raise BaseException('You must pass the project folder to be crawled, and only it.') + if len(sys.argv) < 2: + raise BaseException('You must pass the project folder to be crawled, optionally followed by regex patterns to exclude.') base_dir = str(sys.argv[1]) - index_folder(base_dir) \ No newline at end of file + patterns_to_exclude = [re.compile(p) for p in sys.argv[2:]] + + index_folder(base_dir, patterns_to_exclude) \ No newline at end of file