From f82630e44d998fec935ecd34b2d51dcf5c9a835f Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 14 Jul 2021 15:41:53 -0300 Subject: [PATCH 01/11] Index and search e docs --- README.md | 23 ++++++++++++++ app.py | 14 +++++++++ find_posts.py | 18 +++++++---- requirements.txt | 82 ++++++++++++++++++++++++++++++++++++++++++++++++ searcher.py | 22 +++++++------ 5 files changed, 143 insertions(+), 16 deletions(-) create mode 100644 app.py create mode 100644 requirements.txt diff --git a/README.md b/README.md index 58d21f0..e6bb5a2 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,29 @@ Full-text search for your Jekyll blog with ElasticSearch. +## Installation +- Clone the project +- run `python3 -m venv venv` to create the virtual env +- run `pip install -r requirements.txt` to install all packages +- run `. venv/bin/activate` to be able to use the python packages + +## Development +- You need a running copy of elasticsearch: +```bash +docker run \ +--name elasticsearch \ +--rm \ +-p 9200:9200 \ +-p 9300:9300 \ +-e "discovery.type=single-node" \ +elasticsearch:7.13.3 +``` +- Now, with the e documentation site already generated (`_site` folder), run: `python3 main.py "PATH_TO_USER_DOCS/htdocs/_site"` where `PATH_TO_USER_DOCS` is the location of the user documentation +- Make sure you are at the virtual env +- run `export FLASK_ENV=development` +- run `flask run` + + ## Features - Parses the html from your Jekyll `_site` directory using BeautifulSoup to get more accurate content instead of using the raw Markdown. diff --git a/app.py b/app.py new file mode 100644 index 0000000..488cc5c --- /dev/null +++ b/app.py @@ -0,0 +1,14 @@ +from flask import Flask, jsonify +from searcher import search_query + +app = Flask(__name__) + +@app.route('/search/') +def search(query): + result = search_query(query) + data = { + 'search_results': result, + } + resp = jsonify(data) + resp.headers['Access-Control-Allow-Origin'] = '*' + return resp \ No newline at end of file diff --git a/find_posts.py b/find_posts.py index 826f447..32c5ce2 100644 --- a/find_posts.py +++ b/find_posts.py @@ -3,7 +3,7 @@ from bs4 import BeautifulSoup from post import Post def find_post_paths(base_dir): - files = glob.glob(base_dir + "/20*/*/*/*.html") + files = glob.glob(base_dir + "/**/*.html", recursive=True) files = [f.replace("\\", "/") for f in files] return files @@ -12,12 +12,18 @@ def parse_post(path): contents = f.read() soup = BeautifulSoup(contents, 'html.parser') - title = soup.find('h1', { "class" : "post-title" }).text.strip() - - post_elem = soup.find("div", {"class": "post"}) - post_elem.find(attrs={"class": "post-title"}).decompose() - post_elem.find(attrs={"class": "post-date"}).decompose() + title = soup.find('title').text.strip() + post_elem = soup.find("div", {"class": "post"}) + if post_elem == None: + post_elem = soup.find("article", {"class": "post"}) + if post_elem == None: + post_elem = soup.find("div", {"class": "container"}) + if post_elem == None: + post_elem = soup.find("div", {"class": "wrapper"}) + if post_elem == None: + raise BaseException("No element found in '{}'".format(path)) + paras = post_elem.find_all(text=True) body = " ".join(p.strip() for p in paras).replace(" ", " ").strip() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c555b75 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,82 @@ +apturl==0.5.2 +attrs==19.3.0 +beautifulsoup4==4.9.3 +blinker==1.4 +Brlapi==0.7.0 +cached-property==1.5.1 +certifi==2019.11.28 +chardet==3.0.4 +Click==7.0 +colorama==0.4.3 +command-not-found==0.3 +cryptography==2.8 +cupshelpers==1.0 +dbus-python==1.2.16 +defer==1.0.6 +distro==1.4.0 +distro-info===0.23ubuntu1 +docker==4.1.0 +dockerpty==0.4.1 +docopt==0.6.2 +elasticsearch==7.13.3 +elasticsearch7==7.13.3 +entrypoints==0.3 +Flask==1.1.1 +httplib2==0.14.0 +idna==2.8 +importlib-metadata==1.5.0 +itsdangerous==1.1.0 +Jinja2==2.10.1 +jsonschema==3.2.0 +keyring==18.0.1 +language-selector==0.1 +launchpadlib==1.10.13 +lazr.restfulclient==0.14.2 +lazr.uri==1.0.3 +louis==3.12.0 +macaroonbakery==1.3.1 +MarkupSafe==1.1.0 +more-itertools==4.2.0 +netifaces==0.10.4 +oauthlib==3.1.0 +olefile==0.46 +pbr==5.6.0 +pexpect==4.6.0 +Pillow==7.0.0 +protobuf==3.6.1 +pycairo==1.16.2 +pycups==1.9.73 +PyGObject==3.36.0 +pyinotify==0.9.6 +PyJWT==1.7.1 +pymacaroons==0.13.0 +PyNaCl==1.3.0 +pyOpenSSL==19.0.0 +pyRFC3339==1.1 +pyrsistent==0.15.5 +python-apt==2.0.0+ubuntu0.20.4.5 +python-dateutil==2.7.3 +python-debian===0.1.36ubuntu1 +pytz==2019.3 +pyxdg==0.26 +PyYAML==5.3.1 +reportlab==3.5.34 +requests==2.22.0 +requests-unixsocket==0.2.0 +SecretStorage==2.3.1 +simplejson==3.16.0 +six==1.14.0 +soupsieve==2.2.1 +systemd-python==234 +testresources==2.0.1 +texttable==1.6.2 +ubuntu-advantage-tools==27.0 +ubuntu-drivers-common==0.0.0 +ufw==0.36 +unattended-upgrades==0.1 +urllib3==1.25.8 +wadllib==1.3.3 +websocket-client==0.53.0 +Werkzeug==0.16.1 +xkit==0.0.0 +zipp==1.0.0 \ No newline at end of file diff --git a/searcher.py b/searcher.py index 9f4eb0a..5368c5d 100644 --- a/searcher.py +++ b/searcher.py @@ -1,10 +1,9 @@ from elasticsearch import Elasticsearch -es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) +def search_query(user_query): + es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) -user_query = "python" - -query = { + query = { "query": { "multi_match": { "query": user_query, @@ -19,14 +18,17 @@ query = { "body" : {} } }, - "_source": ["title", "url"] -} + "_source": ["title", "url", "body"] + } -res = es.search(index="blog", body=query) -print("Found %d Hits:" % res['hits']['total']) + res = es.search(index="blog", body=query) + results = [] + for h in res['hits']['hits']: + results.append(h['_source']) + return results -for hit in res['hits']['hits']: - print(hit["_source"]) +if __name__ == "__main__": + print(search_query("map")) # POST /blog/post/_search # { -- GitLab From f91ba428f2d215bb7e7c747a586cf3bf86747122 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 15 Jul 2021 12:09:02 -0300 Subject: [PATCH 02/11] Fixing too many host dependency; Using .env files --- .example.env | 1 + .gitignore | 4 ++- README.md | 20 ++++++----- app.py | 6 +++- indexer.py | 6 ++-- main.py | 8 ++--- requirements.txt | 88 +++++------------------------------------------- 7 files changed, 35 insertions(+), 98 deletions(-) create mode 100644 .example.env diff --git a/.example.env b/.example.env new file mode 100644 index 0000000..4fe5945 --- /dev/null +++ b/.example.env @@ -0,0 +1 @@ +ALLOW_ORIGINS='*' \ No newline at end of file diff --git a/.gitignore b/.gitignore index ba0430d..8a23f93 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -__pycache__/ \ No newline at end of file +__pycache__/ +.env +venv \ No newline at end of file diff --git a/README.md b/README.md index e6bb5a2..af58e68 100644 --- a/README.md +++ b/README.md @@ -3,13 +3,15 @@ Full-text search for your Jekyll blog with ElasticSearch. ## Installation -- Clone the project -- run `python3 -m venv venv` to create the virtual env -- run `pip install -r requirements.txt` to install all packages -- run `. venv/bin/activate` to be able to use the python packages +1. Clone the project +1. run `python3 -m venv venv` to create the virtual env +1. run `pip install -r requirements.txt` to install all packages +1. run `. venv/bin/activate` to be able to use the python packages + +To be able to use the linting inside the virtual environment, it is recomended to use the python inside the virtual env folder. Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode ## Development -- You need a running copy of elasticsearch: +1. You need a running copy of elasticsearch: ```bash docker run \ --name elasticsearch \ @@ -19,10 +21,10 @@ docker run \ -e "discovery.type=single-node" \ elasticsearch:7.13.3 ``` -- Now, with the e documentation site already generated (`_site` folder), run: `python3 main.py "PATH_TO_USER_DOCS/htdocs/_site"` where `PATH_TO_USER_DOCS` is the location of the user documentation -- Make sure you are at the virtual env -- run `export FLASK_ENV=development` -- run `flask run` +1. Make sure you are at the virtual env (run `. venv/bin/activate`) +1. To index your content, run: `python3 main.py "PATH_TO_YOUR_CONTENT"` +1. run `export FLASK_ENV=development` +1. run `flask run` ## Features diff --git a/app.py b/app.py index 488cc5c..eab00b3 100644 --- a/app.py +++ b/app.py @@ -1,8 +1,12 @@ from flask import Flask, jsonify from searcher import search_query +from dotenv import load_dotenv +import os app = Flask(__name__) +load_dotenv() + @app.route('/search/') def search(query): result = search_query(query) @@ -10,5 +14,5 @@ def search(query): 'search_results': result, } resp = jsonify(data) - resp.headers['Access-Control-Allow-Origin'] = '*' + resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', 'localhost') return resp \ No newline at end of file diff --git a/indexer.py b/indexer.py index 8fcd94c..c40590c 100644 --- a/indexer.py +++ b/indexer.py @@ -1,12 +1,10 @@ from elasticsearch import Elasticsearch -import os -import re index_name = "blog" doc_type = "post" -def connect_elastic(host="localhost", port=9200): - return Elasticsearch([{'host': 'localhost', 'port': 9200}]) +def connect_elastic(host='localhost', port=9200): + return Elasticsearch([{'host': host, 'port': port}]) def refresh_index(es): if es.indices.exists(index=index_name): diff --git a/main.py b/main.py index 992bddf..fbca80a 100644 --- a/main.py +++ b/main.py @@ -5,10 +5,10 @@ import indexer if __name__ == "__main__": # provide blog base directory as arg - if len(sys.argv) > 1: - base_dir = str(sys.argv[1]) - else: - base_dir = expanduser("~") + "/blog/_site" + if len(sys.argv) != 2: + raise BaseException('You must pass the project folder to be crawled, and only it.') + + base_dir = str(sys.argv[1]) print("Finding posts in %s" % base_dir) diff --git a/requirements.txt b/requirements.txt index c555b75..265688d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,82 +1,12 @@ -apturl==0.5.2 -attrs==19.3.0 beautifulsoup4==4.9.3 -blinker==1.4 -Brlapi==0.7.0 -cached-property==1.5.1 -certifi==2019.11.28 -chardet==3.0.4 -Click==7.0 -colorama==0.4.3 -command-not-found==0.3 -cryptography==2.8 -cupshelpers==1.0 -dbus-python==1.2.16 -defer==1.0.6 -distro==1.4.0 -distro-info===0.23ubuntu1 -docker==4.1.0 -dockerpty==0.4.1 -docopt==0.6.2 +certifi==2021.5.30 +click==8.0.1 elasticsearch==7.13.3 -elasticsearch7==7.13.3 -entrypoints==0.3 -Flask==1.1.1 -httplib2==0.14.0 -idna==2.8 -importlib-metadata==1.5.0 -itsdangerous==1.1.0 -Jinja2==2.10.1 -jsonschema==3.2.0 -keyring==18.0.1 -language-selector==0.1 -launchpadlib==1.10.13 -lazr.restfulclient==0.14.2 -lazr.uri==1.0.3 -louis==3.12.0 -macaroonbakery==1.3.1 -MarkupSafe==1.1.0 -more-itertools==4.2.0 -netifaces==0.10.4 -oauthlib==3.1.0 -olefile==0.46 -pbr==5.6.0 -pexpect==4.6.0 -Pillow==7.0.0 -protobuf==3.6.1 -pycairo==1.16.2 -pycups==1.9.73 -PyGObject==3.36.0 -pyinotify==0.9.6 -PyJWT==1.7.1 -pymacaroons==0.13.0 -PyNaCl==1.3.0 -pyOpenSSL==19.0.0 -pyRFC3339==1.1 -pyrsistent==0.15.5 -python-apt==2.0.0+ubuntu0.20.4.5 -python-dateutil==2.7.3 -python-debian===0.1.36ubuntu1 -pytz==2019.3 -pyxdg==0.26 -PyYAML==5.3.1 -reportlab==3.5.34 -requests==2.22.0 -requests-unixsocket==0.2.0 -SecretStorage==2.3.1 -simplejson==3.16.0 -six==1.14.0 +Flask==2.0.1 +itsdangerous==2.0.1 +Jinja2==3.0.1 +MarkupSafe==2.0.1 +python-dotenv==0.18.0 soupsieve==2.2.1 -systemd-python==234 -testresources==2.0.1 -texttable==1.6.2 -ubuntu-advantage-tools==27.0 -ubuntu-drivers-common==0.0.0 -ufw==0.36 -unattended-upgrades==0.1 -urllib3==1.25.8 -wadllib==1.3.3 -websocket-client==0.53.0 -Werkzeug==0.16.1 -xkit==0.0.0 -zipp==1.0.0 \ No newline at end of file +urllib3==1.26.6 +Werkzeug==2.0.1 -- GitLab From 2a96d8caa785ddbce9d5863b1f1e1dfd747e35ce Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 15 Jul 2021 13:58:25 -0300 Subject: [PATCH 03/11] Reusing ES instance on server --- .example.env | 4 +++- app.py | 8 +++++++- main.py | 9 ++++++++- searcher.py | 3 +-- 4 files changed, 19 insertions(+), 5 deletions(-) diff --git a/.example.env b/.example.env index 4fe5945..c86789f 100644 --- a/.example.env +++ b/.example.env @@ -1 +1,3 @@ -ALLOW_ORIGINS='*' \ No newline at end of file +ALLOW_ORIGINS="*" +ES_HOST="localhost" +ES_PORT=9200 \ No newline at end of file diff --git a/app.py b/app.py index eab00b3..1fa1a59 100644 --- a/app.py +++ b/app.py @@ -1,5 +1,6 @@ from flask import Flask, jsonify from searcher import search_query +from indexer import connect_elastic from dotenv import load_dotenv import os @@ -7,9 +8,14 @@ app = Flask(__name__) load_dotenv() +es_host = os.getenv('ES_HOST', 'localhost') +es_port = os.getenv('ES_PORT', 9200) + +es = connect_elastic(es_host, es_port) + @app.route('/search/') def search(query): - result = search_query(query) + result = search_query(es, query) data = { 'search_results': result, } diff --git a/main.py b/main.py index fbca80a..c567800 100644 --- a/main.py +++ b/main.py @@ -2,6 +2,10 @@ import sys from os.path import expanduser from find_posts import create_posts import indexer +from dotenv import load_dotenv +import os + +load_dotenv() if __name__ == "__main__": # provide blog base directory as arg @@ -15,7 +19,10 @@ if __name__ == "__main__": posts = create_posts(base_dir) print("Posts created") - es = indexer.connect_elastic() + es_host = os.getenv('ES_HOST', 'localhost') + es_port = os.getenv('ES_PORT', 9200) + + es = indexer.connect_elastic(es_host, es_port) print("ElasticSearch connection established") indexer.refresh_index(es) diff --git a/searcher.py b/searcher.py index 5368c5d..01cba1d 100644 --- a/searcher.py +++ b/searcher.py @@ -1,7 +1,6 @@ from elasticsearch import Elasticsearch -def search_query(user_query): - es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) +def search_query(es, user_query): query = { "query": { -- GitLab From 77aaaa080824fe866b3a48f3a1490885974adb39 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 16 Jul 2021 15:26:15 -0300 Subject: [PATCH 04/11] Custom attribute to be searched by indexer --- find_posts.py | 47 +++++++++++++++++++++++++++-------------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/find_posts.py b/find_posts.py index 32c5ce2..71d3dba 100644 --- a/find_posts.py +++ b/find_posts.py @@ -7,37 +7,44 @@ def find_post_paths(base_dir): files = [f.replace("\\", "/") for f in files] return files -def parse_post(path): - with open(path, encoding="utf8") as f: - contents = f.read() +def get_title_from_htmltree(htmltree: BeautifulSoup): + title = htmltree.select_one('[data-elasticsearch-title]') + if title == None: + title = htmltree.find('h1', { "class" : "post-title" }) + return title.text.strip() - soup = BeautifulSoup(contents, 'html.parser') - title = soup.find('title').text.strip() +def get_body_from_htmltree(htmltree: BeautifulSoup): + post_elem = htmltree.select_one('[data-elasticsearch-body]') - post_elem = soup.find("div", {"class": "post"}) - if post_elem == None: - post_elem = soup.find("article", {"class": "post"}) - if post_elem == None: - post_elem = soup.find("div", {"class": "container"}) - if post_elem == None: - post_elem = soup.find("div", {"class": "wrapper"}) + if post_elem == None: + post_elem = htmltree.find("div", {"class": "post"}) if post_elem == None: - raise BaseException("No element found in '{}'".format(path)) - - paras = post_elem.find_all(text=True) + return None + post_elem.find(attrs={"class": "post-title"}).decompose() + post_elem.find(attrs={"class": "post-date"}).decompose() - body = " ".join(p.strip() for p in paras).replace(" ", " ").strip() - # remove special characters + paras = post_elem.find_all(text=True) - return (title, body) + body = " ".join(p.strip() for p in paras).replace(" ", " ").strip() - raise "Could not read file: " + path + return body +def get_htmltree_from_file(path: str) -> BeautifulSoup: + with open(path, encoding="utf8") as f: + contents = f.read() + return BeautifulSoup(contents, 'html.parser') def create_posts(base_dir): paths = find_post_paths(base_dir) for path in paths: + + htmltree = get_htmltree_from_file(path) + title = get_title_from_htmltree(htmltree) + body = get_body_from_htmltree(htmltree) + if body == None: + raise BaseException("No element for body found in '{}'".format(path)) + id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") - (title, body) = parse_post(path) + yield Post(id, title, url, body) -- GitLab From 1309aa159b8522eb5126451b688ee68b95d186df Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Fri, 16 Jul 2021 15:40:35 -0300 Subject: [PATCH 05/11] Logging instead of raising Exceptions --- find_posts.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/find_posts.py b/find_posts.py index 71d3dba..09510a7 100644 --- a/find_posts.py +++ b/find_posts.py @@ -1,6 +1,7 @@ import glob from bs4 import BeautifulSoup from post import Post +import logging def find_post_paths(base_dir): files = glob.glob(base_dir + "/**/*.html", recursive=True) @@ -11,6 +12,8 @@ def get_title_from_htmltree(htmltree: BeautifulSoup): title = htmltree.select_one('[data-elasticsearch-title]') if title == None: title = htmltree.find('h1', { "class" : "post-title" }) + if title == None: + return None return title.text.strip() def get_body_from_htmltree(htmltree: BeautifulSoup): @@ -40,9 +43,13 @@ def create_posts(base_dir): htmltree = get_htmltree_from_file(path) title = get_title_from_htmltree(htmltree) + if title == None: + logging.warning(f"No element for title found in '{path}'") + continue body = get_body_from_htmltree(htmltree) if body == None: - raise BaseException("No element for body found in '{}'".format(path)) + logging.warning(f"No element for body found in '{path}'") + continue id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") -- GitLab From 45bfd81d4961e5c9e7e52f1c8da84aa5dd5a26bc Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Tue, 20 Jul 2021 11:38:13 -0300 Subject: [PATCH 06/11] Parse and return html page language --- find_posts.py | 11 ++++++++++- indexer.py | 3 ++- post.py | 5 +++-- searcher.py | 14 ++++++++++---- 4 files changed, 25 insertions(+), 8 deletions(-) diff --git a/find_posts.py b/find_posts.py index 09510a7..cbeac05 100644 --- a/find_posts.py +++ b/find_posts.py @@ -3,6 +3,8 @@ from bs4 import BeautifulSoup from post import Post import logging +DEFAULT_LANG = 'en' + def find_post_paths(base_dir): files = glob.glob(base_dir + "/**/*.html", recursive=True) files = [f.replace("\\", "/") for f in files] @@ -37,6 +39,11 @@ def get_htmltree_from_file(path: str) -> BeautifulSoup: contents = f.read() return BeautifulSoup(contents, 'html.parser') +def get_lang_from_htmltree(htmltree: BeautifulSoup) -> str: + html = htmltree.select_one('html') + lang = html.get('lang') + return DEFAULT_LANG if lang == None else lang + def create_posts(base_dir): paths = find_post_paths(base_dir) for path in paths: @@ -51,7 +58,9 @@ def create_posts(base_dir): logging.warning(f"No element for body found in '{path}'") continue + lang = get_lang_from_htmltree(htmltree) + id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") - yield Post(id, title, url, body) + yield Post(id, title, url, body, lang) diff --git a/indexer.py b/indexer.py index c40590c..fe4501e 100644 --- a/indexer.py +++ b/indexer.py @@ -16,7 +16,8 @@ def index_posts(es, posts): doc = { "title": post.title, "url": post.url, - "body": post.body + "body": post.body, + "lang": post.lang, } es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) diff --git a/post.py b/post.py index aaef2b5..196f844 100644 --- a/post.py +++ b/post.py @@ -1,6 +1,7 @@ class Post: - def __init__(self, id, title, url, body): + def __init__(self, id, title, url, body, lang): self.id = id self.title = title self.url = url - self.body = body \ No newline at end of file + self.body = body + self.lang = lang \ No newline at end of file diff --git a/searcher.py b/searcher.py index 01cba1d..881b33a 100644 --- a/searcher.py +++ b/searcher.py @@ -1,5 +1,3 @@ -from elasticsearch import Elasticsearch - def search_query(es, user_query): query = { @@ -17,7 +15,7 @@ def search_query(es, user_query): "body" : {} } }, - "_source": ["title", "url", "body"] + "_source": ["title", "url", "body", "lang"] } res = es.search(index="blog", body=query) @@ -27,7 +25,15 @@ def search_query(es, user_query): return results if __name__ == "__main__": - print(search_query("map")) + import os + from indexer import connect_elastic + from dotenv import load_dotenv + load_dotenv() + es_host = os.getenv('ES_HOST', 'localhost') + es_port = os.getenv('ES_PORT', 9200) + + es = connect_elastic(es_host, es_port) + print(search_query(es, "map")) # POST /blog/post/_search # { -- GitLab From a27a380a4d6cfc05c84fa8287ad2691daf213eea Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 21 Jul 2021 10:30:14 -0300 Subject: [PATCH 07/11] Parsing meta description --- find_posts.py | 20 +++++++++++++++++++- post.py | 3 ++- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/find_posts.py b/find_posts.py index cbeac05..0af88e9 100644 --- a/find_posts.py +++ b/find_posts.py @@ -44,6 +44,13 @@ def get_lang_from_htmltree(htmltree: BeautifulSoup) -> str: lang = html.get('lang') return DEFAULT_LANG if lang == None else lang +def get_description_from_htmltree(htmltree: BeautifulSoup) -> str: + metatag = htmltree.select_one('meta[name="description"]') + if metatag == None: + return None + description = metatag.get('content') + return description + def create_posts(base_dir): paths = find_post_paths(base_dir) for path in paths: @@ -58,9 +65,20 @@ def create_posts(base_dir): logging.warning(f"No element for body found in '{path}'") continue + description = get_description_from_htmltree(htmltree) + if description == None: + description = body + lang = get_lang_from_htmltree(htmltree) id = path.replace(base_dir, "").replace("/", "-") url = path.replace(base_dir, "") - yield Post(id, title, url, body, lang) + yield Post( + id=id, + title=title, + url=url, + body=body, + description=description, + lang=lang + ) diff --git a/post.py b/post.py index 196f844..3ad4c11 100644 --- a/post.py +++ b/post.py @@ -1,7 +1,8 @@ class Post: - def __init__(self, id, title, url, body, lang): + def __init__(self, id, title, url, body, description, lang): self.id = id self.title = title self.url = url self.body = body + self.description = description self.lang = lang \ No newline at end of file -- GitLab From be8828f525e0b7aec48c4bdbed0e5744c39f08ca Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 21 Jul 2021 11:55:30 -0300 Subject: [PATCH 08/11] Dockerfile added --- Dockerfile | 12 ++++++++++++ README.md | 5 +++-- app.py => src/app.py | 0 find_posts.py => src/find_posts.py | 0 indexer.py => src/indexer.py | 0 main.py => src/main.py | 0 post.py => src/post.py | 0 searcher.py => src/searcher.py | 0 8 files changed, 15 insertions(+), 2 deletions(-) create mode 100644 Dockerfile rename app.py => src/app.py (100%) rename find_posts.py => src/find_posts.py (100%) rename indexer.py => src/indexer.py (100%) rename main.py => src/main.py (100%) rename post.py => src/post.py (100%) rename searcher.py => src/searcher.py (100%) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2f29265 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.9.6-alpine3.13 AS py + +EXPOSE 5000 + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY src . + +CMD [ "flask", "run" ] \ No newline at end of file diff --git a/README.md b/README.md index af58e68..ff092bf 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,9 @@ docker run \ elasticsearch:7.13.3 ``` 1. Make sure you are at the virtual env (run `. venv/bin/activate`) -1. To index your content, run: `python3 main.py "PATH_TO_YOUR_CONTENT"` +1. To index your content, run: `python3 src/main.py "PATH_TO_YOUR_CONTENT"` 1. run `export FLASK_ENV=development` +1. run `export FLASK_APP=src/app` 1. run `flask run` @@ -38,7 +39,7 @@ elasticsearch:7.13.3 ### Indexing: - Make sure you have an ElasticSearch server running. If not local, change the config in `indexer.py` to reflect your location. - - Run the command `python main.py `, running without an argument will assume your compiled blog is located at `~/blog/_site`. + - Run the command `python src/main.py `, running without an argument will assume your compiled blog is located at `~/blog/_site`. - If the library cannot find your content correctly, modify `indexer.py` to point to the correct HTML elements for title, post content etc (assuming you have unique CSS classes for these). ### Searching: diff --git a/app.py b/src/app.py similarity index 100% rename from app.py rename to src/app.py diff --git a/find_posts.py b/src/find_posts.py similarity index 100% rename from find_posts.py rename to src/find_posts.py diff --git a/indexer.py b/src/indexer.py similarity index 100% rename from indexer.py rename to src/indexer.py diff --git a/main.py b/src/main.py similarity index 100% rename from main.py rename to src/main.py diff --git a/post.py b/src/post.py similarity index 100% rename from post.py rename to src/post.py diff --git a/searcher.py b/src/searcher.py similarity index 100% rename from searcher.py rename to src/searcher.py -- GitLab From 0cbbe0a7bb5d99f05ec2ba099b23b3c5fca33968 Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Wed, 21 Jul 2021 13:30:03 -0300 Subject: [PATCH 09/11] Gitlab CI added --- .gitlab-ci.yml | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) create mode 100644 .gitlab-ci.yml diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000..ece5b10 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,35 @@ +# When using dind, it's wise to use the overlayfs driver for +# improved performance. +variables: + DOCKER_DRIVER: overlay2 +stages: + - build + - deploy +default: + image: docker:20.10 + +# Build stage +.build:docker: + stage: build + tags: + - generic_privileged + services: + - docker:20.10-dind + before_script: + - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY + script: + - docker build . --pull -t "$CI_REGISTRY_IMAGE:$IMAGE_TAG" -f Dockerfile + - docker push "$CI_REGISTRY_IMAGE:$IMAGE_TAG" + +build:branch: + extends: .build:docker + rules: + - if: '$CI_COMMIT_TAG' + when: never + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + when: never + - if: '$CI_PIPELINE_SOURCE =~ /schedule|web|api|trigger/ && $IMAGE_TAG != $CI_COMMIT_REF_SLUG' + when: never + - when: on_success + variables: + IMAGE_TAG: $CI_COMMIT_REF_SLUG -- GitLab From ef5d28e56daa2cff742b9e6deb619c95e513776e Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Thu, 22 Jul 2021 15:51:10 -0300 Subject: [PATCH 10/11] Respect noindex in robots --- src/find_posts.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/src/find_posts.py b/src/find_posts.py index 0af88e9..a3cecde 100644 --- a/src/find_posts.py +++ b/src/find_posts.py @@ -51,11 +51,31 @@ def get_description_from_htmltree(htmltree: BeautifulSoup) -> str: description = metatag.get('content') return description +def should_crawl_page(htmltree: BeautifulSoup) -> bool: + + metatag = htmltree.select_one('meta[name="robots"]') + if metatag == None: + return True + + metatag_content = metatag.get('content') + if metatag_content == None: + return True + options = metatag_content.split(',') + + if 'noindex' in options: + return False + return True + def create_posts(base_dir): paths = find_post_paths(base_dir) for path in paths: htmltree = get_htmltree_from_file(path) + + should = should_crawl_page(htmltree) + if not should: + continue + title = get_title_from_htmltree(htmltree) if title == None: logging.warning(f"No element for title found in '{path}'") -- GitLab From b4ca670db00e71aece28b648c6d0715c1208a19f Mon Sep 17 00:00:00 2001 From: Israel Yago Pereira Date: Tue, 3 Aug 2021 14:43:39 -0300 Subject: [PATCH 11/11] Changed default allowed origin --- README.md | 2 +- src/app.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ff092bf..f5ddc2d 100644 --- a/README.md +++ b/README.md @@ -5,8 +5,8 @@ Full-text search for your Jekyll blog with ElasticSearch. ## Installation 1. Clone the project 1. run `python3 -m venv venv` to create the virtual env -1. run `pip install -r requirements.txt` to install all packages 1. run `. venv/bin/activate` to be able to use the python packages +1. run `pip install -r requirements.txt` to install all packages To be able to use the linting inside the virtual environment, it is recomended to use the python inside the virtual env folder. Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode diff --git a/src/app.py b/src/app.py index 1fa1a59..259c08d 100644 --- a/src/app.py +++ b/src/app.py @@ -20,5 +20,5 @@ def search(query): 'search_results': result, } resp = jsonify(data) - resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', 'localhost') + resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', '*') return resp \ No newline at end of file -- GitLab