diff --git a/.example.env b/.example.env new file mode 100644 index 0000000000000000000000000000000000000000..c86789f387ca2a1c1e2ea7ae851c7fba2caecebe --- /dev/null +++ b/.example.env @@ -0,0 +1,3 @@ +ALLOW_ORIGINS="*" +ES_HOST="localhost" +ES_PORT=9200 \ No newline at end of file diff --git a/.gitignore b/.gitignore index ba0430d26c996e7f078385407f959c96c271087c..8a23f93d7cf877b58d14f2b3b4999cb33c57e903 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,3 @@ -__pycache__/ \ No newline at end of file +__pycache__/ +.env +venv \ No newline at end of file diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml new file mode 100644 index 0000000000000000000000000000000000000000..ece5b10a82c7d853183496e4845d978c5816d869 --- /dev/null +++ b/.gitlab-ci.yml @@ -0,0 +1,35 @@ +# When using dind, it's wise to use the overlayfs driver for +# improved performance. +variables: + DOCKER_DRIVER: overlay2 +stages: + - build + - deploy +default: + image: docker:20.10 + +# Build stage +.build:docker: + stage: build + tags: + - generic_privileged + services: + - docker:20.10-dind + before_script: + - docker login -u "$CI_REGISTRY_USER" -p "$CI_REGISTRY_PASSWORD" $CI_REGISTRY + script: + - docker build . --pull -t "$CI_REGISTRY_IMAGE:$IMAGE_TAG" -f Dockerfile + - docker push "$CI_REGISTRY_IMAGE:$IMAGE_TAG" + +build:branch: + extends: .build:docker + rules: + - if: '$CI_COMMIT_TAG' + when: never + - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' + when: never + - if: '$CI_PIPELINE_SOURCE =~ /schedule|web|api|trigger/ && $IMAGE_TAG != $CI_COMMIT_REF_SLUG' + when: never + - when: on_success + variables: + IMAGE_TAG: $CI_COMMIT_REF_SLUG diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000000000000000000000000000000000..2f292652c91f5d2f968f47745016c760b774697a --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.9.6-alpine3.13 AS py + +EXPOSE 5000 + +WORKDIR /usr/src/app + +COPY requirements.txt ./ +RUN pip install --no-cache-dir -r requirements.txt + +COPY src . 
+ +CMD [ "flask", "run" ] \ No newline at end of file diff --git a/README.md b/README.md index 58d21f0dc6f2011edf8f3db63cfea570b3d6a615..f5ddc2d2a3511b9c6324a436e66a1a9ce7409817 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,32 @@ Full-text search for your Jekyll blog with ElasticSearch. +## Installation +1. Clone the project +1. run `python3 -m venv venv` to create the virtual env +1. run `. venv/bin/activate` to be able to use the python packages +1. run `pip install -r requirements.txt` to install all packages + +To be able to use the linting inside the virtual environment, it is recommended to use the Python interpreter inside the virtual env folder. Take a look [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for VS Code instructions + +## Development +1. You need a running copy of elasticsearch: +```bash +docker run \ +--name elasticsearch \ +--rm \ +-p 9200:9200 \ +-p 9300:9300 \ +-e "discovery.type=single-node" \ +elasticsearch:7.13.3 +``` +1. Make sure you are at the virtual env (run `. venv/bin/activate`) +1. To index your content, run: `python3 src/main.py "PATH_TO_YOUR_CONTENT"` +1. run `export FLASK_ENV=development` +1. run `export FLASK_APP=src/app` +1. run `flask run` + + ## Features - Parses the html from your Jekyll `_site` directory using BeautifulSoup to get more accurate content instead of using the raw Markdown. @@ -13,7 +39,7 @@ Full-text search for your Jekyll blog with ElasticSearch. ### Indexing: - Make sure you have an ElasticSearch server running. If not local, change the config in `indexer.py` to reflect your location. - - Run the command `python src/main.py "PATH_TO_YOUR_CONTENT"`, passing the directory that contains your compiled blog (the argument is required). 
- If the library cannot find your content correctly, modify `indexer.py` to point to the correct HTML elements for title, post content etc (assuming you have unique CSS classes for these). ### Searching: diff --git a/find_posts.py b/find_posts.py deleted file mode 100644 index 826f4470c8e2b722bf7ed8d196f36699101bb515..0000000000000000000000000000000000000000 --- a/find_posts.py +++ /dev/null @@ -1,37 +0,0 @@ -import glob -from bs4 import BeautifulSoup -from post import Post - -def find_post_paths(base_dir): - files = glob.glob(base_dir + "/20*/*/*/*.html") - files = [f.replace("\\", "/") for f in files] - return files - -def parse_post(path): - with open(path, encoding="utf8") as f: - contents = f.read() - - soup = BeautifulSoup(contents, 'html.parser') - title = soup.find('h1', { "class" : "post-title" }).text.strip() - - post_elem = soup.find("div", {"class": "post"}) - post_elem.find(attrs={"class": "post-title"}).decompose() - post_elem.find(attrs={"class": "post-date"}).decompose() - - paras = post_elem.find_all(text=True) - - body = " ".join(p.strip() for p in paras).replace(" ", " ").strip() - # remove special characters - - return (title, body) - - raise "Could not read file: " + path - - -def create_posts(base_dir): - paths = find_post_paths(base_dir) - for path in paths: - id = path.replace(base_dir, "").replace("/", "-") - url = path.replace(base_dir, "") - (title, body) = parse_post(path) - yield Post(id, title, url, body) diff --git a/post.py b/post.py deleted file mode 100644 index aaef2b534e7bdbf7a4d2d6dcef451c4c225fd9b9..0000000000000000000000000000000000000000 --- a/post.py +++ /dev/null @@ -1,6 +0,0 @@ -class Post: - def __init__(self, id, title, url, body): - self.id = id - self.title = title - self.url = url - self.body = body \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..265688d21e5a680bbbfc1903c3cf51c025277976 --- /dev/null +++ 
b/requirements.txt @@ -0,0 +1,12 @@ +beautifulsoup4==4.9.3 +certifi==2021.5.30 +click==8.0.1 +elasticsearch==7.13.3 +Flask==2.0.1 +itsdangerous==2.0.1 +Jinja2==3.0.1 +MarkupSafe==2.0.1 +python-dotenv==0.18.0 +soupsieve==2.2.1 +urllib3==1.26.6 +Werkzeug==2.0.1 diff --git a/src/app.py b/src/app.py new file mode 100644 index 0000000000000000000000000000000000000000..259c08dac0190dbe20bb9133e2a7e9eac11245bf --- /dev/null +++ b/src/app.py @@ -0,0 +1,24 @@ +from flask import Flask, jsonify +from searcher import search_query +from indexer import connect_elastic +from dotenv import load_dotenv +import os + +app = Flask(__name__) + +load_dotenv() + +es_host = os.getenv('ES_HOST', 'localhost') +es_port = os.getenv('ES_PORT', 9200) + +es = connect_elastic(es_host, es_port) + +@app.route('/search/') +def search(query): + result = search_query(es, query) + data = { + 'search_results': result, + } + resp = jsonify(data) + resp.headers['Access-Control-Allow-Origin'] = os.getenv('ALLOW_ORIGINS', '*') + return resp \ No newline at end of file diff --git a/src/find_posts.py b/src/find_posts.py new file mode 100644 index 0000000000000000000000000000000000000000..a3cecdea701cfda5f829ed8bec012727b36a3919 --- /dev/null +++ b/src/find_posts.py @@ -0,0 +1,104 @@ +import glob +from bs4 import BeautifulSoup +from post import Post +import logging + +DEFAULT_LANG = 'en' + +def find_post_paths(base_dir): + files = glob.glob(base_dir + "/**/*.html", recursive=True) + files = [f.replace("\\", "/") for f in files] + return files + +def get_title_from_htmltree(htmltree: BeautifulSoup): + title = htmltree.select_one('[data-elasticsearch-title]') + if title == None: + title = htmltree.find('h1', { "class" : "post-title" }) + if title == None: + return None + return title.text.strip() + +def get_body_from_htmltree(htmltree: BeautifulSoup): + post_elem = htmltree.select_one('[data-elasticsearch-body]') + + if post_elem == None: + post_elem = htmltree.find("div", {"class": "post"}) + if post_elem == 
None: + return None + post_elem.find(attrs={"class": "post-title"}).decompose() + post_elem.find(attrs={"class": "post-date"}).decompose() + + paras = post_elem.find_all(text=True) + + body = " ".join(p.strip() for p in paras).replace(" ", " ").strip() + + return body + +def get_htmltree_from_file(path: str) -> BeautifulSoup: + with open(path, encoding="utf8") as f: + contents = f.read() + return BeautifulSoup(contents, 'html.parser') + +def get_lang_from_htmltree(htmltree: BeautifulSoup) -> str: + html = htmltree.select_one('html') + lang = html.get('lang') + return DEFAULT_LANG if lang == None else lang + +def get_description_from_htmltree(htmltree: BeautifulSoup) -> str: + metatag = htmltree.select_one('meta[name="description"]') + if metatag == None: + return None + description = metatag.get('content') + return description + +def should_crawl_page(htmltree: BeautifulSoup) -> bool: + + metatag = htmltree.select_one('meta[name="robots"]') + if metatag == None: + return True + + metatag_content = metatag.get('content') + if metatag_content == None: + return True + options = metatag_content.split(',') + + if 'noindex' in options: + return False + return True + +def create_posts(base_dir): + paths = find_post_paths(base_dir) + for path in paths: + + htmltree = get_htmltree_from_file(path) + + should = should_crawl_page(htmltree) + if not should: + continue + + title = get_title_from_htmltree(htmltree) + if title == None: + logging.warning(f"No element for title found in '{path}'") + continue + body = get_body_from_htmltree(htmltree) + if body == None: + logging.warning(f"No element for body found in '{path}'") + continue + + description = get_description_from_htmltree(htmltree) + if description == None: + description = body + + lang = get_lang_from_htmltree(htmltree) + + id = path.replace(base_dir, "").replace("/", "-") + url = path.replace(base_dir, "") + + yield Post( + id=id, + title=title, + url=url, + body=body, + description=description, + lang=lang + ) diff 
--git a/indexer.py b/src/indexer.py similarity index 74% rename from indexer.py rename to src/indexer.py index 8fcd94c4522a70347d3039f97dd9b2b0c2b3c4d3..fe4501ed7a5476eb74e1d16737b37971400a2005 100644 --- a/indexer.py +++ b/src/indexer.py @@ -1,12 +1,10 @@ from elasticsearch import Elasticsearch -import os -import re index_name = "blog" doc_type = "post" -def connect_elastic(host="localhost", port=9200): - return Elasticsearch([{'host': 'localhost', 'port': 9200}]) +def connect_elastic(host='localhost', port=9200): + return Elasticsearch([{'host': host, 'port': port}]) def refresh_index(es): if es.indices.exists(index=index_name): @@ -18,7 +16,8 @@ def index_posts(es, posts): doc = { "title": post.title, "url": post.url, - "body": post.body + "body": post.body, + "lang": post.lang, } es.index(index=index_name, doc_type=doc_type, id=post.id, body=doc) diff --git a/main.py b/src/main.py similarity index 57% rename from main.py rename to src/main.py index 992bddfd4f8101f4c84d8fa5e0adcfa749beec6a..c5678000555bc13f8e07de5d3ce6cebc12ab518c 100644 --- a/main.py +++ b/src/main.py @@ -2,20 +2,27 @@ import sys from os.path import expanduser from find_posts import create_posts import indexer +from dotenv import load_dotenv +import os + +load_dotenv() if __name__ == "__main__": # provide blog base directory as arg - if len(sys.argv) > 1: - base_dir = str(sys.argv[1]) - else: - base_dir = expanduser("~") + "/blog/_site" + if len(sys.argv) != 2: + raise BaseException('You must pass the project folder to be crawled, and only it.') + + base_dir = str(sys.argv[1]) print("Finding posts in %s" % base_dir) posts = create_posts(base_dir) print("Posts created") - es = indexer.connect_elastic() + es_host = os.getenv('ES_HOST', 'localhost') + es_port = os.getenv('ES_PORT', 9200) + + es = indexer.connect_elastic(es_host, es_port) print("ElasticSearch connection established") indexer.refresh_index(es) diff --git a/src/post.py b/src/post.py new file mode 100644 index 
0000000000000000000000000000000000000000..3ad4c113517d5f3b27fdc6b9ffcb33ef647950e3 --- /dev/null +++ b/src/post.py @@ -0,0 +1,8 @@ +class Post: + def __init__(self, id, title, url, body, description, lang): + self.id = id + self.title = title + self.url = url + self.body = body + self.description = description + self.lang = lang \ No newline at end of file diff --git a/searcher.py b/src/searcher.py similarity index 56% rename from searcher.py rename to src/searcher.py index 9f4eb0aad02fdf717a33b707bc11ed409021e5fd..881b33ac73bbeee29fbdc79626e41c1f7fb719e4 100644 --- a/searcher.py +++ b/src/searcher.py @@ -1,10 +1,6 @@ -from elasticsearch import Elasticsearch +def search_query(es, user_query): -es = Elasticsearch([{'host': 'localhost', 'port': 9200}]) - -user_query = "python" - -query = { + query = { "query": { "multi_match": { "query": user_query, @@ -19,14 +15,25 @@ query = { "body" : {} } }, - "_source": ["title", "url"] -} + "_source": ["title", "url", "body", "lang"] + } + + res = es.search(index="blog", body=query) + results = [] + for h in res['hits']['hits']: + results.append(h['_source']) + return results -res = es.search(index="blog", body=query) -print("Found %d Hits:" % res['hits']['total']) +if __name__ == "__main__": + import os + from indexer import connect_elastic + from dotenv import load_dotenv + load_dotenv() + es_host = os.getenv('ES_HOST', 'localhost') + es_port = os.getenv('ES_PORT', 9200) -for hit in res['hits']['hits']: - print(hit["_source"]) + es = connect_elastic(es_host, es_port) + print(search_query(es, "map")) # POST /blog/post/_search # {