diff --git a/requirements.txt b/requirements.txt index 530118cd688833b0ecb40567636fb8a86f55a792..9039d37131b4b4b33a1b1683c9f932ab17b499fb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ certifi==2021.5.30 click==8.0.1 elasticsearch==7.13.3 Flask==2.0.1 -folder-hash==1.0.0 isort==5.10.1 itsdangerous==2.0.1 Jinja2==3.0.1 diff --git a/src/indexer.py b/src/indexer.py index 815a982d38ceee7bdf76c39a2c747d2b831407f0..3a0ca776437fd9692756ad3f1290578eb5be0b6c 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -45,7 +45,18 @@ def delete_all_indexes_except(elastic_search: Elasticsearch, except_hash: str): def index_posts(elastic_search: Elasticsearch, posts: List[PostModule.Post], unique_hash: str): """ Indexes the posts into a elastic search instance """ + # Define the specific URLs you want to exclude from indexing + excluded_urls = [ + '/index.html', + '/fr/index.html', + '/es/index.html', + '/de/index.html', + '/it/index.html', + ] for post in posts: + # Check if the post URL is in the list of excluded URLs; if so, skip indexing this post + if post.url in excluded_urls: + continue doc = { 'title': post.title, 'subtitles': post.subtittles, diff --git a/src/main.py b/src/main.py index 8ec48e8b637d38377d6b61457743b3d10b7b82de..357cf39a875f89dc5d030a58bcfb89d9210196df 100644 --- a/src/main.py +++ b/src/main.py @@ -1,8 +1,8 @@ """ Indexer entry point """ import sys import os +from hashlib import md5 from dotenv import load_dotenv -from folder_hash.hasher import Hasher import indexer from find_posts import create_posts import searcher @@ -23,13 +23,15 @@ def index_folder(base_dir: str): elastic_search = indexer.connect_elastic(es_host, es_port) print('ElasticSearch connection established') - - hasher = Hasher(base_dir, 'md5') - folder_hash = hasher.get_hash() - current_hash = searcher.get_current_hash(elastic_search) + try: + base_dir_encoded = base_dir.encode() + folder_hash = md5(base_dir_encoded).hexdigest() + current_hash = searcher.get_current_hash(elastic_search) + except UnicodeEncodeError as encode_error: + print(f'Error encoding base_dir: {encode_error}') + return if folder_hash == current_hash: - print( - f'The folder {base_dir} was already indexed. Hash: {folder_hash}') + print(f'The folder {base_dir} was already indexed. Hash: {folder_hash}') return print('Finding posts in', base_dir)