Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 81bf40fe authored by Israel Yago Pereira's avatar Israel Yago Pereira
Browse files

Checks for the codestyle build into the CI

parent 42d054ea
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -3,11 +3,21 @@
variables:
    DOCKER_DRIVER: overlay2
stages:
  - lint
  - build
  - deploy
default:
  image: docker:20.10

# Lint job: runs pylint on every branch before the build/deploy stages.
lint:branch:
  stage: lint
  tags:
    - generic_privileged
  # Python image (not the default docker:20.10) so pip/pylint are available.
  image: python:3.9
  script:
    - pip install -r requirements.txt
    # Same invocation as the suggested local pre-commit hook in the README.
    - pylint --load-plugins pylint_quotes ./src

# Build stage
.build:docker:
  stage: build
+8 −0
Original line number Diff line number Diff line
@@ -11,6 +11,14 @@ Full-text search for your Jekyll blog with ElasticSearch.

To be able to use the linting inside the virtual environment, it is recommended to use the python interpreter inside the virtual env folder. Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode

This project uses [pylint](https://pylint.org/) to help with the codestyle. If you wish to save some server resources, you can put the following shell script into the `.git/hooks/pre-commit` file (create one if it does not exist):
```shell
. venv/bin/activate
pylint --load-plugins pylint_quotes ./src
```

So every time you try to commit something, it will validate the linter locally beforehand.

## Development
1. You need a running copy of elasticsearch:
```bash
+10 −0
Original line number Diff line number Diff line
astroid==2.9.0
beautifulsoup4==4.9.3
certifi==2021.5.30
click==8.0.1
elasticsearch==7.13.3
Flask==2.0.1
folder-hash==1.0.0
isort==5.10.1
itsdangerous==2.0.1
Jinja2==3.0.1
lazy-object-proxy==1.6.0
MarkupSafe==2.0.1
mccabe==0.6.1
platformdirs==2.4.0
pylint==2.12.1
pylint-quotes==0.2.3
python-dotenv==0.18.0
soupsieve==2.2.1
toml==0.10.2
typing-extensions==4.0.1
urllib3==1.26.6
uWSGI==2.0.19.1
Werkzeug==2.0.1
wrapt==1.13.3
+21 −13
Original line number Diff line number Diff line
""" Handles connection with the outside world by REST api """
import gzip
import os
from flask import Flask, json, Response, make_response
from dotenv import load_dotenv
import searcher
import gzip
from indexer import connect_elastic
from dotenv import load_dotenv
import os

app = Flask(__name__)

# Load environment variables from a local .env file, if present.
load_dotenv()

# Elasticsearch connection settings; defaults suit local development.
es_host = os.getenv('ES_HOST', 'localhost')
# Default is a string so the type matches what os.getenv returns when
# ES_PORT is actually set in the environment.
es_port = os.getenv('ES_PORT', '9200')

es = connect_elastic(es_host, es_port)


@app.route('/<language>/search/<query>')
def search(language, query):
    """ Endpoint to search into all contents of the specific language """
    result = searcher.search_query(es, query, language)
    data = {
        'search_results': result,
    }
    try:
        resp = gzip_json_response(data)
    except BaseException: # pylint: disable=w0703
        # Best-effort error path: never expose the underlying failure,
        # but still honour the CORS configuration on the 500 response.
        resp = json.dumps({'error': 'Unable to process at the moment'})
        headers = {
            'Access-Control-Allow-Origin': os.getenv('ALLOW_ORIGINS', '*')
        }
        return Response(response=resp,
                        status=500,
                        content_type='application/json',
                        headers=headers)

    resp.headers['Access-Control-Allow-Origin'] = os.getenv(
        'ALLOW_ORIGINS', '*')
    return resp


@app.route('/<language>/autocomplete')
def autocomplete(language):
    """ Endpoint to return a list of autocomplete words """
    words = searcher.autocomplete(es, language)
    return gzip_json_response({'autocomplete': words})


def gzip_json_response(data: dict) -> Response:
    """ Converts a dictionary into a flask response with its content compressed (gzip) """
    # Serialize to JSON, then gzip-compress at level 5 (speed/size balance).
    content = gzip.compress(json.dumps(data).encode('utf8'), 5)
    response = make_response(content)
    response.headers['Content-length'] = len(content)
    # NOTE(review): the rest of this function is cut off in this diff view;
    # presumably it also sets 'Content-Encoding: gzip' and returns the
    # response -- confirm against the full file.
+54 −35
Original line number Diff line number Diff line
""" Parse the content from html """
import glob
import logging
from bs4 import BeautifulSoup
from post import Post
import logging

# Fallback language used when the <html> tag carries no lang attribute.
DEFAULT_LANG = 'en'


def find_post_paths(base_dir):
    """ Get all html files inside a folder """
    # Recursive glob picks up posts nested in subdirectories.
    files = glob.glob(base_dir + '/**/*.html', recursive=True)
    # Normalize Windows path separators so downstream ids/urls are stable.
    files = [f.replace('\\', '/') for f in files]
    return files


def get_title_from_htmltree(htmltree: BeautifulSoup):
    """ Grab the title from a set of possible places.

    Tries, in order: an explicit [data-elasticsearch-title] marker,
    an <h1 class="post-title">, then any <h1>. Returns the stripped
    title text, or None when no candidate element exists.
    """
    # Explicit opt-in marker takes precedence over theme conventions.
    title = htmltree.select_one('[data-elasticsearch-title]')
    if title is None:
        title = htmltree.find('h1', {'class': 'post-title'})
    if title is None:
        title = htmltree.find('h1')
    if title is None:
        return None
    return title.text.strip()


def get_subtitles_from_htmltree(htmltree: BeautifulSoup):
    """ Grab the subtitles from all h2,h3,h4,h5,h6 elements.

    Returns a list of stripped heading texts (possibly empty).
    """
    headings = htmltree.select('h2,h3,h4,h5,h6')
    return [heading.text.strip() for heading in headings]


def get_body_from_htmltree(htmltree: BeautifulSoup):
    """ Get the body content of the page.

    Prefers an element explicitly marked with [data-elasticsearch-body];
    falls back to <div class="post"> with its title/date removed.
    Returns the whitespace-normalized text, or None when neither exists.
    """
    post_elem = htmltree.select_one('[data-elasticsearch-body]')

    if post_elem is None:
        post_elem = htmltree.find('div', {'class': 'post'})
        if post_elem is None:
            return None
        # NOTE(review): these assume a post div always contains both
        # elements -- .find() returning None would raise AttributeError.
        post_elem.find(attrs={'class': 'post-title'}).decompose()
        post_elem.find(attrs={'class': 'post-date'}).decompose()

    paras = post_elem.find_all(text=True)

    body = ' '.join(p.strip() for p in paras).replace('  ', ' ').strip()

    return body


def get_htmltree_from_file(path: str) -> BeautifulSoup:
    """ Parse an html file into a html struct """
    # utf8 is forced so parsing does not depend on the platform locale.
    with open(path, encoding='utf8') as file:
        contents = file.read()
        return BeautifulSoup(contents, 'html.parser')


def get_lang_from_htmltree(htmltree: BeautifulSoup) -> str:
    """ Returns the language set in the html tag.

    Falls back to DEFAULT_LANG when the <html> tag has no lang attribute.
    """
    html = htmltree.select_one('html')
    lang = html.get('lang')
    return DEFAULT_LANG if lang is None else lang


def get_description_from_htmltree(htmltree: BeautifulSoup) -> str:
    """ Gather the description of the page from the meta tag description.

    Returns the meta description content, or None when the tag is absent.
    """
    metatag = htmltree.select_one('meta[name=\'description\']')
    if metatag is None:
        return None
    description = metatag.get('content')
    return description


def should_crawl_page(htmltree: BeautifulSoup) -> bool:
    """ Determines if the page should be crawled """

    # NOTE(review): this span is a diff view -- each pre-change line
    # (double-quoted selector, `== None`) is immediately followed by its
    # post-change replacement; only one of each pair exists in the real file.
    metatag = htmltree.select_one('meta[name="robots"]')
    if metatag == None:
    metatag = htmltree.select_one('meta[name=\'robots\']')
    if metatag is None:
        # No robots meta tag: crawl by default.
        return True

    metatag_content = metatag.get('content')
    if metatag_content is None:
    if metatag_content == None:
        return True
    options = metatag_content.split(',')

# NOTE(review): the middle of this function (how `options` is inspected,
# presumably for a noindex/none directive) is elided by the diff hunk below.
@@ -76,7 +93,9 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool:
        return False
    return True


def create_posts(base_dir):
    """ Returns a list posts crawled from the html content """
    paths = find_post_paths(base_dir)
    posts = []
    for path in paths:
@@ -88,27 +107,27 @@ def create_posts(base_dir):
            continue

        title = get_title_from_htmltree(htmltree)
        if title == None:
            logging.warning(f"No element for title found in '{path}'")
        if title is None:
            logging.warning('No element for title found in "%s"', path)
            continue
        body = get_body_from_htmltree(htmltree)
        if body == None:
            logging.warning(f"No element for body found in '{path}'")
        if body is None:
            logging.warning('No element for body found in "%s"', path)
            continue

        subtittles = get_subtitles_from_htmltree(htmltree)

        description = get_description_from_htmltree(htmltree)
        if description == None:
        if description is None:
            description = body

        lang = get_lang_from_htmltree(htmltree)

        id = path.replace(base_dir, "").replace("/", "-")
        url = path.replace(base_dir, "")
        page_id = path.replace(base_dir, '').replace('/', '-')
        url = path.replace(base_dir, '')

        posts.append(Post(
            id=id, 
            post_id=page_id,
            title=title,
            subtittles=subtittles,
            url=url,
Loading