From f0081dbd374b6d0996dbe373a613b0b3a39520fd Mon Sep 17 00:00:00 2001 From: Akhil Date: Mon, 11 Mar 2024 16:32:58 +0530 Subject: [PATCH 01/10] use hashlib to md5 path --- src/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.py b/src/main.py index 8ec48e8..ce5dbee 100644 --- a/src/main.py +++ b/src/main.py @@ -2,7 +2,7 @@ import sys import os from dotenv import load_dotenv -from folder_hash.hasher import Hasher +from hashlib import md5 import indexer from find_posts import create_posts import searcher @@ -24,8 +24,8 @@ def index_folder(base_dir: str): elastic_search = indexer.connect_elastic(es_host, es_port) print('ElasticSearch connection established') - hasher = Hasher(base_dir, 'md5') - folder_hash = hasher.get_hash() + base_dir_encoded = base_dir.encode() + folder_hash = md5(base_dir_encoded).hexdigest() current_hash = searcher.get_current_hash(elastic_search) if folder_hash == current_hash: print( -- GitLab From 0dabb1f64468d0209bb961702d516a136877381d Mon Sep 17 00:00:00 2001 From: Akhil Date: Mon, 11 Mar 2024 16:39:04 +0530 Subject: [PATCH 02/10] Remove folder-hash library --- requirements.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 530118c..9039d37 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,7 +4,6 @@ certifi==2021.5.30 click==8.0.1 elasticsearch==7.13.3 Flask==2.0.1 -folder-hash==1.0.0 isort==5.10.1 itsdangerous==2.0.1 Jinja2==3.0.1 -- GitLab From 74302643e2e2f7b1357322b06c315cba8e84454a Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Tue, 12 Mar 2024 01:04:16 +0530 Subject: [PATCH 03/10] fix lint issue --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index ce5dbee..cf0f9af 100644 --- a/src/main.py +++ b/src/main.py @@ -1,8 +1,8 @@ """ Indexer entry point """ import sys import os -from dotenv import load_dotenv from hashlib import md5 +from dotenv import load_dotenv import indexer from find_posts import create_posts import searcher -- GitLab From 9c48b9cb9b82b7f7c511a7349429563ab021a4b5 Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Tue, 12 Mar 2024 01:05:48 +0530 Subject: [PATCH 04/10] remove urls from indexing --- src/indexer.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/src/indexer.py b/src/indexer.py index 815a982..747d5d2 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -45,7 +45,18 @@ def delete_all_indexes_except(elastic_search: Elasticsearch, except_hash: str): def index_posts(elastic_search: Elasticsearch, posts: List[PostModule.Post], unique_hash: str): """ Indexes the posts into a elastic search instance """ + # Define the specific URLs you want to exclude from indexing + excluded_urls = [ + "/index.html", + "/fr/index.html", + "/es/index.html", + "/de/index.html", + "/it/index.html", + ] for post in posts: + # Check if the post URL is in the list of excluded URLs; if so, skip indexing this post + if post.url in excluded_urls: + continue doc = { 'title': post.title, 'subtitles': post.subtittles, -- GitLab From 3b3a252e5926e4d452246ae0b943ba71ac26f449 Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Tue, 12 Mar 2024 01:09:06 +0530 Subject: [PATCH 05/10] remove urls from indexing --- src/indexer.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/indexer.py b/src/indexer.py index 747d5d2..e6269d1 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -47,11 +47,11 @@ def index_posts(elastic_search: Elasticsearch, posts: List[PostModule.Post], uni """ Indexes the posts into a elastic search instance """ # Define the specific URLs you want to exclude from indexing excluded_urls = [ - "/index.html", - "/fr/index.html", - "/es/index.html", - "/de/index.html", - "/it/index.html", + '/index.html', + '/fr/index.html', + '/es/index.html', + '/de/index.html', + '/it/index.html', ] for post in posts: # Check if the post URL is in the list of excluded URLs; if so, skip indexing this post -- GitLab From 3c6b924a7abc9b5e8c413a2fd6bc0574eca1d64c Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Tue, 12 Mar 2024 01:09:56 +0530 Subject: [PATCH 06/10] remove whitespace --- src/indexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/indexer.py b/src/indexer.py index e6269d1..3a0ca77 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -56,7 +56,7 @@ def index_posts(elastic_search: Elasticsearch, posts: List[PostModule.Post], uni for post in posts: # Check if the post URL is in the list of excluded URLs; if so, skip indexing this post if post.url in excluded_urls: - continue + continue doc = { 'title': post.title, 'subtitles': post.subtittles, -- GitLab From d3bb8b650958464329033c604189cde4f27b458a Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Wed, 13 Mar 2024 12:04:45 +0530 Subject: [PATCH 07/10] error handling --- src/main.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/main.py b/src/main.py index cf0f9af..b500a5b 100644 --- a/src/main.py +++ b/src/main.py @@ -1,8 +1,8 @@ """ Indexer entry point """ import sys import os -from hashlib import md5 from dotenv import load_dotenv +from hashlib import md5 import indexer from find_posts import create_posts import searcher @@ -23,13 +23,15 @@ def index_folder(base_dir: str): elastic_search = indexer.connect_elastic(es_host, es_port) print('ElasticSearch connection established') - - base_dir_encoded = base_dir.encode() - folder_hash = md5(base_dir_encoded).hexdigest() - current_hash = searcher.get_current_hash(elastic_search) + try: + base_dir_encoded = base_dir.encode() + folder_hash = md5(base_dir_encoded).hexdigest() + current_hash = searcher.get_current_hash(elastic_search) + except Exception as e: + print(f'Error processing directory hash or retrieving current hash: {e}') + return if folder_hash == current_hash: - print( - f'The folder {base_dir} was already indexed. Hash: {folder_hash}') + print(f'The folder {base_dir} was already indexed. Hash: {folder_hash}') return print('Finding posts in', base_dir) -- GitLab From 8b1a8ff8a11ffe8dfeca0483c2197ed04e4191af Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Wed, 13 Mar 2024 12:10:27 +0530 Subject: [PATCH 08/10] lint fix --- src/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main.py b/src/main.py index b500a5b..2532942 100644 --- a/src/main.py +++ b/src/main.py @@ -1,8 +1,8 @@ """ Indexer entry point """ import sys import os -from dotenv import load_dotenv from hashlib import md5 +from dotenv import load_dotenv import indexer from find_posts import create_posts import searcher -- GitLab From a371b8529e08b5fad1cb19bc52b3ca31687fef19 Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Wed, 13 Mar 2024 12:23:56 +0530 Subject: [PATCH 09/10] lint fix --- src/main.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main.py b/src/main.py index 2532942..f460d9b 100644 --- a/src/main.py +++ b/src/main.py @@ -27,9 +27,9 @@ def index_folder(base_dir: str): base_dir_encoded = base_dir.encode() folder_hash = md5(base_dir_encoded).hexdigest() current_hash = searcher.get_current_hash(elastic_search) - except Exception as e: - print(f'Error processing directory hash or retrieving current hash: {e}') - return + except Exception as exception: + print(f'Error processing directory hash or retrieving current hash: {exception}') + return if folder_hash == current_hash: print(f'The folder {base_dir} was already indexed. Hash: {folder_hash}') return -- GitLab From 8e29b95af7d0740d516f5064472acd32530825c1 Mon Sep 17 00:00:00 2001 From: Avinash Gusain Date: Wed, 13 Mar 2024 12:29:46 +0530 Subject: [PATCH 10/10] error handling fix --- src/main.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main.py b/src/main.py index f460d9b..357cf39 100644 --- a/src/main.py +++ b/src/main.py @@ -27,8 +27,8 @@ def index_folder(base_dir: str): base_dir_encoded = base_dir.encode() folder_hash = md5(base_dir_encoded).hexdigest() current_hash = searcher.get_current_hash(elastic_search) - except Exception as exception: - print(f'Error processing directory hash or retrieving current hash: {exception}') + except UnicodeEncodeError as encode_error: + print(f'Error encoding base_dir: {encode_error}') return if folder_hash == current_hash: print(f'The folder {base_dir} was already indexed. Hash: {folder_hash}') -- GitLab