From 8ae84f58f72ffae06e9691dd3f40f7bfbe6b9c25 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 19 Nov 2021 13:28:00 -0300 Subject: [PATCH 1/7] Updates to conform to pep8 --- src/app.py | 34 ++++-- src/find_posts.py | 89 ++++++++------ src/indexer.py | 64 ++++++---- src/main.py | 56 +++++---- src/post.py | 10 +- src/searcher.py | 303 +++++++++++++++++++++++++--------------------- src/wsgi.py | 5 +- 7 files changed, 316 insertions(+), 245 deletions(-) diff --git a/src/app.py b/src/app.py index 99de8a6..9b88b2e 100644 --- a/src/app.py +++ b/src/app.py @@ -1,51 +1,59 @@ +""" Handles connection with the outside world by REST api """ +import gzip +import os from flask import Flask, json, Response, make_response +from dotenv import load_dotenv import searcher -import gzip from indexer import connect_elastic -from dotenv import load_dotenv -import os app = Flask(__name__) load_dotenv() es_host = os.getenv('ES_HOST', 'localhost') -es_port = os.getenv('ES_PORT', 9200) +es_port = os.getenv('ES_PORT', '9200') es = connect_elastic(es_host, es_port) + @app.route('//search/') def search(language, query): + """ Endpoint to search into all contents of the specific language """ result = searcher.search_query(es, query, language) data = { 'search_results': result, } try: resp = gzip_json_response(data) - except: + except BaseException: # pylint: disable=w0703 resp = json.dumps({'error': 'Unable to process at the moment'}) - return Response(response=resp, - status=500, + headers = { + 'Access-Control-Allow-Origin': os.getenv('ALLOW_ORIGINS', '*') + } + return Response(response=resp, + status=500, content_type='application/json', - headers={ - 'Access-Control-Allow-Origin': os.getenv('ALLOW_ORIGINS', '*') - }) - - resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', '*') + headers=headers) + resp.headers['Access-Control-Allow-Origin'] = os.getenv( + 'ALLOW_ORIGINS', '*') return resp + @app.route('//autocomplete') def autocomplete(language): + """ 
Endpoint to return a list of autocomplete words """ result = searcher.autocomplete(es, language) data = { 'autocomplete': result, } return gzip_json_response(data) + def gzip_json_response(data: dict) -> Response: + """ Converts a dictionary into a flask response with it's content compressed (gzip) """ content = gzip.compress(json.dumps(data).encode('utf8'), 5) response = make_response(content) response.headers['Content-length'] = len(content) response.headers['Content-Encoding'] = 'gzip' response.headers['Content-Type'] = 'application/json' - return response \ No newline at end of file + return response diff --git a/src/find_posts.py b/src/find_posts.py index 363c80b..c664922 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -1,74 +1,91 @@ +""" Parse the content from html """ import glob +import logging from bs4 import BeautifulSoup from post import Post -import logging DEFAULT_LANG = 'en' + def find_post_paths(base_dir): - files = glob.glob(base_dir + "/**/*.html", recursive=True) - files = [f.replace("\\", "/") for f in files] + """ Get all html files inside a folder """ + files = glob.glob(base_dir + '/**/*.html', recursive=True) + files = [f.replace('\\', '/') for f in files] return files + def get_title_from_htmltree(htmltree: BeautifulSoup): + """ Grab the title from a set of possible places """ title = htmltree.select_one('[data-elasticsearch-title]') - if title == None: - title = htmltree.find('h1', { "class" : "post-title" }) - if title == None: + if title is None: + title = htmltree.find('h1', {'class': 'post-title'}) + if title is None: title = htmltree.find('h1') - if title == None: + if title is None: return None return title.text.strip() + def get_subtitles_from_htmltree(htmltree: BeautifulSoup): + """ Grab the subtitles from all h2,h3,h4,h5,h6 elements """ subtitles = [] headings = htmltree.select('h2,h3,h4,h5,h6') - for h in headings: - subtitles.append(h.text.strip()) + for heading in headings: + subtitles.append(heading.text.strip()) 
return subtitles + def get_body_from_htmltree(htmltree: BeautifulSoup): + """ Get the body content of the page """ post_elem = htmltree.select_one('[data-elasticsearch-body]') - if post_elem == None: - post_elem = htmltree.find("div", {"class": "post"}) - if post_elem == None: + if post_elem is None: + post_elem = htmltree.find('div', {'class': 'post'}) + if post_elem is None: return None - post_elem.find(attrs={"class": "post-title"}).decompose() - post_elem.find(attrs={"class": "post-date"}).decompose() + post_elem.find(attrs={'class': 'post-title'}).decompose() + post_elem.find(attrs={'class': 'post-date'}).decompose() paras = post_elem.find_all(text=True) - body = " ".join(p.strip() for p in paras).replace(" ", " ").strip() + body = ' '.join(p.strip() for p in paras).replace(' ', ' ').strip() return body + def get_htmltree_from_file(path: str) -> BeautifulSoup: - with open(path, encoding="utf8") as f: - contents = f.read() + """ Parse an html file into a html struct """ + with open(path, encoding='utf8') as file: + contents = file.read() return BeautifulSoup(contents, 'html.parser') + def get_lang_from_htmltree(htmltree: BeautifulSoup) -> str: + """ Returns the language set in the html tag """ html = htmltree.select_one('html') lang = html.get('lang') - return DEFAULT_LANG if lang == None else lang + return DEFAULT_LANG if lang is None else lang + def get_description_from_htmltree(htmltree: BeautifulSoup) -> str: - metatag = htmltree.select_one('meta[name="description"]') - if metatag == None: + """ Gather the description of the page from the meta tag description """ + metatag = htmltree.select_one('meta[name=\'description\']') + if metatag is None: return None description = metatag.get('content') return description + def should_crawl_page(htmltree: BeautifulSoup) -> bool: + """ Determines if the page should be crawled """ - metatag = htmltree.select_one('meta[name="robots"]') - if metatag == None: + metatag = htmltree.select_one('meta[name=\'robots\']') + if 
metatag is None: return True metatag_content = metatag.get('content') - if metatag_content == None: + if metatag_content is None: return True options = metatag_content.split(',') @@ -76,7 +93,9 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool: return False return True + def create_posts(base_dir): + """ Returns a list posts crawled from the html content """ paths = find_post_paths(base_dir) posts = [] for path in paths: @@ -88,33 +107,33 @@ def create_posts(base_dir): continue title = get_title_from_htmltree(htmltree) - if title == None: - logging.warning(f"No element for title found in '{path}'") + if title is None: + logging.warning('No element for title found in "%s"', path) continue body = get_body_from_htmltree(htmltree) - if body == None: - logging.warning(f"No element for body found in '{path}'") + if body is None: + logging.warning('No element for body found in "%s"', path) continue subtittles = get_subtitles_from_htmltree(htmltree) description = get_description_from_htmltree(htmltree) - if description == None: + if description is None: description = body lang = get_lang_from_htmltree(htmltree) - id = path.replace(base_dir, "").replace("/", "-") - url = path.replace(base_dir, "") + page_id = path.replace(base_dir, '').replace('/', '-') + url = path.replace(base_dir, '') posts.append(Post( - id=id, - title=title, + id=page_id, + title=title, subtittles=subtittles, - url=url, - body=body, + url=url, + body=body, description=description, lang=lang )) - + return posts diff --git a/src/indexer.py b/src/indexer.py index a7066ca..fc55010 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -1,49 +1,61 @@ -from elasticsearch import Elasticsearch +""" Indexer of posts """ from time import sleep from typing import List -import post +from elasticsearch import Elasticsearch +import post as PostModule -def connect_elastic(host='localhost', port=9200): - MAXIMUM_ATTEMPS = 12 +def connect_elastic(host='localhost', port=9200): + """ Returns an connected elastic 
search instance with a small retry policy """ + maximum_attemps = 12 connection = Elasticsearch([{'host': host, 'port': port}]) - for _ in range(MAXIMUM_ATTEMPS): + for _ in range(maximum_attemps): is_connected = connection.ping() if not is_connected: sleep(5) continue - + return connection - raise TimeoutError(f"Could not connect to elasticsearch server at {host}:{port}.") + raise TimeoutError( + f'Could not connect to elasticsearch server at {host}:{port}.') -def create_indexes(es: Elasticsearch, languages: List[str], hash: str): +def create_indexes(elastic_search: Elasticsearch, languages: List[str], unique_hash: str): + """ Create the indexes for the languages using a unique hash """ for lang in languages: - index_name = get_index_name(lang, hash) - if not es.indices.exists(index=index_name): - es.indices.create(index=index_name) + index_name = get_index_name(lang, unique_hash) + if not elastic_search.indices.exists(index=index_name): + elastic_search.indices.create(index=index_name) + -def get_index_name(language: str, hash: str) -> str: - return f"{language}-{hash}" +def get_index_name(language: str, unique_hash: str) -> str: + """ Returns the index name of a language with the hash""" + return f'{language}-{unique_hash}' -def delete_all_indexes_except(es: Elasticsearch, except_hash: str): - for index_name in es.indices.get('*'): + +def delete_all_indexes_except(elastic_search: Elasticsearch, except_hash: str): + """ Clear the elastic search instance of all content, + with the exception of the content marked with the hash + """ + for index_name in elastic_search.indices.get('*'): if except_hash not in index_name: - es.indices.delete(index=index_name) + elastic_search.indices.delete(index=index_name) + -def index_posts(es: Elasticsearch, posts: List[post.Post], hash: str): +def index_posts(elastic_search: Elasticsearch, posts: List[PostModule.Post], unique_hash: str): + """ Indexes the posts into a elastic search instance """ for post in posts: doc = { - 
"title": post.title, - "subtitles": post.subtittles, - "url": post.url, - "description": post.description, - "body": post.body, - "lang": post.lang, + 'title': post.title, + 'subtitles': post.subtittles, + 'url': post.url, + 'description': post.description, + 'body': post.body, + 'lang': post.lang, } - index_name = get_index_name(post.lang, hash) + index_name = get_index_name(post.lang, unique_hash) - es.index(index=index_name, id=post.id, body=doc) - print("Created doc for " + post.url) + elastic_search.index(index=index_name, id=post.id, body=doc) + print('Created doc for', post.url) diff --git a/src/main.py b/src/main.py index d6a0f3b..0364432 100644 --- a/src/main.py +++ b/src/main.py @@ -1,52 +1,58 @@ +""" Indexer entry point """ import sys -from find_posts import create_posts -import indexer -from dotenv import load_dotenv import os -import searcher +from dotenv import load_dotenv from folder_hash.hasher import Hasher +import indexer +from find_posts import create_posts +import searcher load_dotenv() + def index_folder(base_dir: str): + """ Performs the indexation of the folder """ if not os.path.exists(base_dir): - print(f"No folder was found at {base_dir}") + print(f'No folder was found at {base_dir}') return es_host = os.getenv('ES_HOST', 'localhost') - es_port = os.getenv('ES_PORT', 9200) + es_port = os.getenv('ES_PORT', '9200') - print(f"Connecting to elastic search at: {es_host}:{es_port}") + print(f'Connecting to elastic search at: {es_host}:{es_port}') - es = indexer.connect_elastic(es_host, es_port) - print("ElasticSearch connection established") + elastic_search = indexer.connect_elastic(es_host, es_port) + print('ElasticSearch connection established') - hasher = Hasher(base_dir, "md5") + hasher = Hasher(base_dir, 'md5') folder_hash = hasher.get_hash() - current_hash = searcher.get_current_hash(es) + current_hash = searcher.get_current_hash(elastic_search) if folder_hash == current_hash: - print(f"The folder {base_dir} was already indexed. 
Hash: {folder_hash}") - return + print( + f'The folder {base_dir} was already indexed. Hash: {folder_hash}') + return - print("Finding posts in %s" % base_dir) + print('Finding posts in', base_dir) posts = create_posts(base_dir) - print("Posts created ({})".format(len(posts))) + print('Posts created (%d)' % len(posts)) unique_languages = set([post.lang for post in posts]) - indexer.create_indexes(es, unique_languages, folder_hash) - indexer.index_posts(es, posts, folder_hash) - print("Finished indexing posts") - - print(f"Deleting all indexes except {folder_hash}") - indexer.delete_all_indexes_except(es, folder_hash) + indexer.create_indexes(elastic_search, unique_languages, folder_hash) + indexer.index_posts(elastic_search, posts, folder_hash) + print('Finished indexing posts') + + print(f'Deleting all indexes except {folder_hash}') + indexer.delete_all_indexes_except(elastic_search, folder_hash) + -if __name__ == "__main__": +if __name__ == '__main__': # provide blog base directory as arg if len(sys.argv) != 2: - raise BaseException('You must pass the project folder to be crawled, and only it.') + raise BaseException( + 'You must pass the project folder to be crawled, and only it.') - base_dir = str(sys.argv[1]) + BASE_DIRECTORY = str(sys.argv[1]) - index_folder(base_dir) \ No newline at end of file + index_folder(BASE_DIRECTORY) diff --git a/src/post.py b/src/post.py index e03c521..1528caf 100644 --- a/src/post.py +++ b/src/post.py @@ -1,13 +1,15 @@ +""" Post module """ class Post: + """ Post data model """ def __init__(self, id, title, subtittles, url, body, description, lang): self.id = id self.title = title self.subtittles = subtittles self.url = url self.body = body - + if len(description) > 200: - description = description[0:200] + "..." + description = description[0:200] + '...' 
self.description = description - - self.lang = lang \ No newline at end of file + + self.lang = lang diff --git a/src/searcher.py b/src/searcher.py index 6509dcb..11baee6 100644 --- a/src/searcher.py +++ b/src/searcher.py @@ -1,152 +1,175 @@ -from elasticsearch.client import Elasticsearch +""" Search module to interact with elastic search """ import math from typing import List, Optional +from elasticsearch.client import Elasticsearch import indexer -def get_current_hash_by_language(es: Elasticsearch, language: str) -> Optional[str]: - all_indexes_in_language = [index for index in es.indices.get('*') if index.startswith(language)] - - if len(all_indexes_in_language) == 0: - return None - - oldest_index = get_oldest_index(es, all_indexes_in_language) - - if oldest_index == None: - return None - - return oldest_index.split('-')[1] - -def get_current_hash(es: Elasticsearch) -> Optional[str]: - oldest_index = get_oldest_index(es, [index for index in es.indices.get('*')]) - - if oldest_index == None: - return None - - oldest_index_params = oldest_index.split('-') - - if len(oldest_index_params) is not 2: - print(f'Your index "{oldest_index}" is not a valid index name') - return None - - return oldest_index_params[1] - -def get_oldest_index(es: Elasticsearch, indexes: List[str]) -> Optional[str]: - if len(indexes) == 0: - return None - oldest_index = None - oldest_index_date = math.inf - - for i in indexes: - - if not es.indices.exists(i): - continue - - index = es.indices.get(i) - created_date = int(index[i]['settings']['index']['creation_date']) - - is_older = oldest_index_date > created_date - if is_older: - oldest_index_date = created_date - oldest_index = i - - return oldest_index - -def search_query(es: Elasticsearch, user_query: str, language: str): - - index_name = get_index_name_from_lang(es=es, language=language) - if index_name is None: - return [] - - query = { - "query": { - "multi_match": { - "query": user_query, - "type": "best_fields", - "fuzziness": 
"AUTO", - "tie_breaker": 0.3, - "fields": ["title^10", "subtitles^9", "description^2", "body"], - } - }, - "highlight": { - "fields" : { - "body" : {} - } - }, - "_source": ["title", "subtitles", "url", "description", "lang", "body"] - } - - res = es.search(index=index_name, body=query) - results = [] - for h in res['hits']['hits']: - results.append(h['_source']) - return results - -def autocomplete(es: Elasticsearch, language: str) -> List[str]: - - index_name = get_index_name_from_lang(es=es, language=language) - if index_name is None: - return [] - return autocomplete_for_index(es=es, index_name=index_name) - -def get_index_name_from_lang(es: Elasticsearch, language: str) -> Optional[str]: - current_hash = get_current_hash_by_language(es, language) - if current_hash == None: - return None - - index_name = indexer.get_index_name(language, current_hash) - - if not es.indices.exists(index_name): - return None - - return index_name - -def autocomplete_for_index(es: Elasticsearch, index_name: str) -> Optional[List[str]]: - query = { - "query": { - "match_all": {} - }, - "size": 10000, - "_source": ["title", "subtitles"] - } - - res = es.search(index=index_name, body=query) - results = [] - for h in res['hits']['hits']: - results.append(h['_source']['title']) - results += h['_source']['subtitles'] - - if '' in results: - results.remove('') - - results = list(set(results)) - return results - -if __name__ == "__main__": - import os - from indexer import connect_elastic - from dotenv import load_dotenv - load_dotenv() - es_host = os.getenv('ES_HOST', 'localhost') - es_port = os.getenv('ES_PORT', 9200) - - es = connect_elastic(es_host, es_port) - print(search_query(es, "map")) + +def get_current_hash_by_language( + elastic_search: Elasticsearch, language: str) -> Optional[str]: + """ Get the oldest hash of the parsed folder using the language """ + all_indexes_in_language = [ + index for index in elastic_search.indices.get('*') if index.startswith(language)] + + if 
len(all_indexes_in_language) == 0: + return None + + oldest_index = get_oldest_index(elastic_search, all_indexes_in_language) + + if oldest_index is None: + return None + + return oldest_index.split('-')[1] + + +def get_current_hash(elastic_search: Elasticsearch) -> Optional[str]: + """ Returns the hash to be used on the search """ + oldest_index = get_oldest_index( + elastic_search, list(elastic_search.indices.get('*'))) + + if oldest_index is None: + return None + + oldest_index_params = oldest_index.split('-') + + if len(oldest_index_params) != 2: + print(f'Your index "{oldest_index}" is not a valid index name') + return None + + return oldest_index_params[1] + + +def get_oldest_index(elastic_search: Elasticsearch, indexes: List[str]) -> Optional[str]: + """ Get the oldest indext inside the list """ + if len(indexes) == 0: + return None + oldest_index = None + oldest_index_date = math.inf + + for i in indexes: + + if not elastic_search.indices.exists(i): + continue + + index = elastic_search.indices.get(i) + created_date = int(index[i]['settings']['index']['creation_date']) + + is_older = oldest_index_date > created_date + if is_older: + oldest_index_date = created_date + oldest_index = i + + return oldest_index + + +def search_query(elastic_search: Elasticsearch, user_query: str, language: str): + """ Performs the search using a query on a specific language """ + + index_name = get_index_name_from_lang( + elastic_search=elastic_search, language=language) + if index_name is None: + return [] + + query = { + 'query': { + 'multi_match': { + 'query': user_query, + 'type': 'best_fields', + 'fuzziness': 'AUTO', + 'tie_breaker': 0.3, + 'fields': ['title^10', 'subtitles^9', 'description^2', 'body'], + } + }, + 'highlight': { + 'fields': { + 'body': {} + } + }, + '_source': ['title', 'subtitles', 'url', 'description', 'lang', 'body'] + } + + res = elastic_search.search(index=index_name, body=query) + results = [] + for hit in res['hits']['hits']: + 
results.append(hit['_source']) + return results + + +def autocomplete(elastic_search: Elasticsearch, language: str) -> List[str]: + """ Get the Autocomplete list """ + + index_name = get_index_name_from_lang( + elastic_search=elastic_search, language=language) + if index_name is None: + return [] + return autocomplete_for_index(elastic_search=elastic_search, index_name=index_name) + + +def get_index_name_from_lang(elastic_search: Elasticsearch, + language: str) -> Optional[str]: + """ Get the index name from a content language """ + current_hash = get_current_hash_by_language(elastic_search, language) + if current_hash is None: + return None + + index_name = indexer.get_index_name(language, current_hash) + + if not elastic_search.indices.exists(index_name): + return None + + return index_name + + +def autocomplete_for_index(elastic_search: Elasticsearch, + index_name: str) -> Optional[List[str]]: + """ Get the autocomplete for a specific index """ + query = { + 'query': { + 'match_all': {} + }, + 'size': 10000, + '_source': ['title', 'subtitles'] + } + + res = elastic_search.search(index=index_name, body=query) + results = [] + for hit in res['hits']['hits']: + results.append(hit['_source']['title']) + results += hit['_source']['subtitles'] + + if '' in results: + results.remove('') + + results = list(set(results)) + return results + + +if __name__ == '__main__': + import os + from indexer import connect_elastic + from dotenv import load_dotenv + load_dotenv() + es_host = os.getenv('ES_HOST', 'localhost') + es_port = os.getenv('ES_PORT', '9200') + + es = connect_elastic(es_host, es_port) + print(search_query(es, 'map', 'en')) # POST /blog/post/_search # { -# "query": { -# "multi_match": { -# "query": "python", -# "type": "best_fields", -# "fuzziness": "AUTO", -# "tie_breaker": 0.3, -# "fields": ["title^3", "body"] +# 'query': { +# 'multi_match': { +# 'query': 'python', +# 'type': 'best_fields', +# 'fuzziness': 'AUTO', +# 'tie_breaker': 0.3, +# 'fields': ['title^3', 
'body'] # } # }, -# "highlight": { -# "fields" : { -# "body" : {} +# 'highlight': { +# 'fields' : { +# 'body' : {} # } # }, -# "_source": ["title", "url"] +# '_source': ['title', 'url'] # } diff --git a/src/wsgi.py b/src/wsgi.py index 1043a11..e6493c4 100644 --- a/src/wsgi.py +++ b/src/wsgi.py @@ -1,4 +1,5 @@ +""" Application entrypoint """ from app import app as application -if __name__ == "__main__": - application.run() \ No newline at end of file +if __name__ == '__main__': + application.run() -- GitLab From 0b6a32d645131894ad71bb0bb582348307434de8 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 29 Nov 2021 07:37:55 -0300 Subject: [PATCH 2/7] Fix last missing errors --- src/find_posts.py | 2 +- src/indexer.py | 2 +- src/main.py | 2 +- src/post.py | 6 +++--- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/find_posts.py b/src/find_posts.py index c664922..0f95aee 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -127,7 +127,7 @@ def create_posts(base_dir): url = path.replace(base_dir, '') posts.append(Post( - id=page_id, + post_id=page_id, title=title, subtittles=subtittles, url=url, diff --git a/src/indexer.py b/src/indexer.py index fc55010..815a982 100644 --- a/src/indexer.py +++ b/src/indexer.py @@ -57,5 +57,5 @@ def index_posts(elastic_search: Elasticsearch, posts: List[PostModule.Post], uni index_name = get_index_name(post.lang, unique_hash) - elastic_search.index(index=index_name, id=post.id, body=doc) + elastic_search.index(index=index_name, id=post.post_id, body=doc) print('Created doc for', post.url) diff --git a/src/main.py b/src/main.py index 0364432..53f1c1a 100644 --- a/src/main.py +++ b/src/main.py @@ -37,7 +37,7 @@ def index_folder(base_dir: str): posts = create_posts(base_dir) print('Posts created (%d)' % len(posts)) - unique_languages = set([post.lang for post in posts]) + unique_languages = set(post.lang for post in posts) indexer.create_indexes(elastic_search, unique_languages, folder_hash) 
indexer.index_posts(elastic_search, posts, folder_hash) diff --git a/src/post.py b/src/post.py index 1528caf..066704e 100644 --- a/src/post.py +++ b/src/post.py @@ -1,8 +1,8 @@ """ Post module """ -class Post: +class Post: # pylint: disable=R0903 """ Post data model """ - def __init__(self, id, title, subtittles, url, body, description, lang): - self.id = id + def __init__(self, post_id, title, subtittles, url, body, description, lang): # pylint: disable=R0913 + self.post_id = post_id self.title = title self.subtittles = subtittles self.url = url -- GitLab From 74284ab15084a66268b186893f3b09748001ece2 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Mon, 29 Nov 2021 07:57:19 -0300 Subject: [PATCH 3/7] Pylint quotes as dependency --- requirements.txt | 1 + src/main.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0b9242d..86f6656 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,6 +8,7 @@ itsdangerous==2.0.1 Jinja2==3.0.1 MarkupSafe==2.0.1 python-dotenv==0.18.0 +pylint-quotes==0.2.3 soupsieve==2.2.1 urllib3==1.26.6 uWSGI==2.0.19.1 diff --git a/src/main.py b/src/main.py index 53f1c1a..8ec48e8 100644 --- a/src/main.py +++ b/src/main.py @@ -35,7 +35,7 @@ def index_folder(base_dir: str): print('Finding posts in', base_dir) posts = create_posts(base_dir) - print('Posts created (%d)' % len(posts)) + print(f'Posts created ({len(posts)})') unique_languages = set(post.lang for post in posts) -- GitLab From 811f818ca27a146e0f4b473e2b52c31e4fa9c752 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 2 Dec 2021 17:12:59 -0300 Subject: [PATCH 4/7] Linting on CI pipeline --- .gitlab-ci.yml | 10 ++++++++++ requirements.txt | 11 ++++++++++- 2 files changed, 20 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ece5b10..eb94e90 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -3,11 +3,21 @@ variables: DOCKER_DRIVER: overlay2 stages: + - lint - build - deploy 
default: image: docker:20.10 +lint:branch: + stage: lint + tags: + - generic_privileged + image: python:3.9 + script: + - pip install pylint pylint-quotes + - pylint --load-plugins pylint_quotes ./src + # Build stage .build:docker: stage: build diff --git a/requirements.txt b/requirements.txt index 86f6656..530118c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,15 +1,24 @@ +astroid==2.9.0 beautifulsoup4==4.9.3 certifi==2021.5.30 click==8.0.1 elasticsearch==7.13.3 Flask==2.0.1 folder-hash==1.0.0 +isort==5.10.1 itsdangerous==2.0.1 Jinja2==3.0.1 +lazy-object-proxy==1.6.0 MarkupSafe==2.0.1 -python-dotenv==0.18.0 +mccabe==0.6.1 +platformdirs==2.4.0 +pylint==2.12.1 pylint-quotes==0.2.3 +python-dotenv==0.18.0 soupsieve==2.2.1 +toml==0.10.2 +typing-extensions==4.0.1 urllib3==1.26.6 uWSGI==2.0.19.1 Werkzeug==2.0.1 +wrapt==1.13.3 -- GitLab From 1174f7a994f815bee6168c27f5d747642b4365d1 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 2 Dec 2021 17:16:13 -0300 Subject: [PATCH 5/7] Install dependencies before --- .gitlab-ci.yml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index eb94e90..00bee56 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,7 +15,9 @@ lint:branch: - generic_privileged image: python:3.9 script: - - pip install pylint pylint-quotes + - python -m venv venv + - ". venv/bin/activate" + - pip install -r requirements.txt - pylint --load-plugins pylint_quotes ./src # Build stage -- GitLab From ab759da9fe7a83543049945eb6b1dcf1ab982f41 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 2 Dec 2021 18:33:52 -0300 Subject: [PATCH 6/7] Added READme instruction on git hook --- README.md | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/README.md b/README.md index 6aa636d..309cd6a 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,14 @@ Full-text search for your Jekyll blog with ElasticSearch. 
To be able to use the linting inside the virtual environment, it is recomended to use the python inside the virtual env folder. Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode +This project uses [pylint](https://pylint.org/) to help with the codestyle. If you wish to save some server resources, you can put the following shell script into the `.git/hooks/pre-commit` file (create one if it does not exist): +```shell +. venv/bin/activate +pylint --load-plugins pylint_quotes ./src +``` + +So every time you try to commit something, it will validate the linter locally beforehand. + ## Development 1. You need a running copy of elasticsearch: ```bash -- GitLab From f5b128332d70a3c522eba6cca6e6c3ab5cdf7972 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 8 Dec 2021 07:42:40 -0300 Subject: [PATCH 7/7] Removing the redundant use of virtual env --- .gitlab-ci.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 00bee56..4a50909 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -15,8 +15,6 @@ lint:branch: - generic_privileged image: python:3.9 script: - - python -m venv venv - - ". venv/bin/activate" - pip install -r requirements.txt - pylint --load-plugins pylint_quotes ./src -- GitLab