Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 81bf40fe authored by Israel Yago Pereira's avatar Israel Yago Pereira
Browse files

Checks for the codestyle build into the CI

parent 42d054ea
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -3,11 +3,21 @@
variables:
    DOCKER_DRIVER: overlay2
stages:
  - lint
  - build
  - deploy
default:
  image: docker:20.10

# Lint job: runs pylint on every branch before the build/deploy stages.
lint:branch:
  stage: lint
  tags:
    - generic_privileged
  # Python image (not the default docker:20.10) so pip/pylint are available.
  image: python:3.9
  script:
    - pip install -r requirements.txt
    # Same invocation as the suggested local pre-commit hook in the README.
    - pylint --load-plugins pylint_quotes ./src

# Build stage
.build:docker:
  stage: build
+8 −0
Original line number Diff line number Diff line
@@ -11,6 +11,14 @@ Full-text search for your Jekyll blog with ElasticSearch.

To be able to use the linting inside the virtual environment, it is recommended to use the python interpreter inside the virtual env folder. Take a look at [here](https://code.visualstudio.com/docs/python/environments#_select-and-activate-an-environment) for vscode

This project uses [pylint](https://pylint.org/) to help with the codestyle. If you wish to save some server resources, you can put the following shell script into the `.git/hooks/pre-commit` file (create one if it does not exist):
```shell
. venv/bin/activate
pylint --load-plugins pylint_quotes ./src
```

So every time you try to commit something, it will validate the linter locally beforehand.

## Development
1. You need a running copy of elasticsearch:
```bash
+10 −0
Original line number Diff line number Diff line
astroid==2.9.0
beautifulsoup4==4.9.3
certifi==2021.5.30
click==8.0.1
elasticsearch==7.13.3
Flask==2.0.1
folder-hash==1.0.0
isort==5.10.1
itsdangerous==2.0.1
Jinja2==3.0.1
lazy-object-proxy==1.6.0
MarkupSafe==2.0.1
mccabe==0.6.1
platformdirs==2.4.0
pylint==2.12.1
pylint-quotes==0.2.3
python-dotenv==0.18.0
soupsieve==2.2.1
toml==0.10.2
typing-extensions==4.0.1
urllib3==1.26.6
uWSGI==2.0.19.1
Werkzeug==2.0.1
wrapt==1.13.3
+21 −13
Original line number Diff line number Diff line
""" Handles connection with the outside world by REST api """
import gzip
import os
from flask import Flask, json, Response, make_response
from dotenv import load_dotenv
import searcher
import gzip
from indexer import connect_elastic
from dotenv import load_dotenv
import os

app = Flask(__name__)

# Load environment variables from a local .env file, if present.
load_dotenv()

# Elasticsearch connection settings; defaults suit local development.
es_host = os.getenv('ES_HOST', 'localhost')
# Default is a string so the type matches what os.getenv returns when
# ES_PORT is actually set in the environment.
es_port = os.getenv('ES_PORT', '9200')

es = connect_elastic(es_host, es_port)


@app.route('/<language>/search/<query>')
def search(language, query):
    """ Endpoint to search into all contents of the specific language """
    result = searcher.search_query(es, query, language)
    data = {
        'search_results': result,
    }
    try:
        resp = gzip_json_response(data)
    except BaseException: # pylint: disable=w0703
        # Best-effort error path: never expose the underlying failure,
        # but still honour the CORS configuration on the 500 response.
        resp = json.dumps({'error': 'Unable to process at the moment'})
        headers = {
            'Access-Control-Allow-Origin': os.getenv('ALLOW_ORIGINS', '*')
        }
        return Response(response=resp,
                        status=500,
                        content_type='application/json',
                        headers=headers)

    resp.headers['Access-Control-Allow-Origin'] = os.getenv(
        'ALLOW_ORIGINS', '*')
    return resp


@app.route('/<language>/autocomplete')
def autocomplete(language):
    """ Endpoint to return a list of autocomplete words """
    words = searcher.autocomplete(es, language)
    return gzip_json_response({'autocomplete': words})


def gzip_json_response(data: dict) -> Response:
    """ Converts a dictionary into a flask response with its content compressed (gzip) """
    # Serialize to JSON, then gzip-compress at level 5 (speed/size balance).
    content = gzip.compress(json.dumps(data).encode('utf8'), 5)
    response = make_response(content)
    response.headers['Content-length'] = len(content)
    # NOTE(review): the rest of this function is cut off in this diff view;
    # presumably it also sets 'Content-Encoding: gzip' and returns the
    # response -- confirm against the full file.
+54 −35
Original line number Diff line number Diff line
""" Parse the content from html """
import glob
import logging
from bs4 import BeautifulSoup
from post import Post
import logging

# Fallback language used when the <html> tag carries no lang attribute.
DEFAULT_LANG = 'en'


def find_post_paths(base_dir):
    """ Get all html files inside a folder """
    # Recursive glob picks up posts nested in subdirectories.
    files = glob.glob(base_dir + '/**/*.html', recursive=True)
    # Normalize Windows path separators so downstream ids/urls are stable.
    files = [f.replace('\\', '/') for f in files]
    return files


def get_title_from_htmltree(htmltree: BeautifulSoup):
    """ Grab the title from a set of possible places.

    Tries, in order: an explicit [data-elasticsearch-title] marker,
    an <h1 class="post-title">, then any <h1>. Returns the stripped
    title text, or None when no candidate element exists.
    """
    # Explicit opt-in marker takes precedence over theme conventions.
    title = htmltree.select_one('[data-elasticsearch-title]')
    if title is None:
        title = htmltree.find('h1', {'class': 'post-title'})
    if title is None:
        title = htmltree.find('h1')
    if title is None:
        return None
    return title.text.strip()


def get_subtitles_from_htmltree(htmltree: BeautifulSoup):
    """ Grab the subtitles from all h2,h3,h4,h5,h6 elements.

    Returns a list of stripped heading texts (possibly empty).
    """
    headings = htmltree.select('h2,h3,h4,h5,h6')
    return [heading.text.strip() for heading in headings]


def get_body_from_htmltree(htmltree: BeautifulSoup):
    """ Get the body content of the page.

    Prefers an element explicitly marked with [data-elasticsearch-body];
    falls back to <div class="post"> with its title/date removed.
    Returns the whitespace-normalized text, or None when neither exists.
    """
    post_elem = htmltree.select_one('[data-elasticsearch-body]')

    if post_elem is None:
        post_elem = htmltree.find('div', {'class': 'post'})
        if post_elem is None:
            return None
        # NOTE(review): these assume a post div always contains both
        # elements -- .find() returning None would raise AttributeError.
        post_elem.find(attrs={'class': 'post-title'}).decompose()
        post_elem.find(attrs={'class': 'post-date'}).decompose()

    paras = post_elem.find_all(text=True)

    body = ' '.join(p.strip() for p in paras).replace('  ', ' ').strip()

    return body


def get_htmltree_from_file(path: str) -> BeautifulSoup:
    """ Parse an html file into a html struct """
    # utf8 is forced so parsing does not depend on the platform locale.
    with open(path, encoding='utf8') as file:
        contents = file.read()
        return BeautifulSoup(contents, 'html.parser')


def get_lang_from_htmltree(htmltree: BeautifulSoup) -> str:
    """ Returns the language set in the html tag.

    Falls back to DEFAULT_LANG when the <html> tag has no lang attribute.
    """
    html = htmltree.select_one('html')
    lang = html.get('lang')
    return DEFAULT_LANG if lang is None else lang


def get_description_from_htmltree(htmltree: BeautifulSoup) -> str:
    """ Gather the description of the page from the meta tag description.

    Returns the meta description content, or None when the tag is absent.
    """
    metatag = htmltree.select_one('meta[name=\'description\']')
    if metatag is None:
        return None
    description = metatag.get('content')
    return description


def should_crawl_page(htmltree: BeautifulSoup) -> bool:
    """ Determines if the page should be crawled """

    # NOTE(review): this span is a diff view -- each pre-change line
    # (double-quoted selector, `== None`) is immediately followed by its
    # post-change replacement; only one of each pair exists in the real file.
    metatag = htmltree.select_one('meta[name="robots"]')
    if metatag == None:
    metatag = htmltree.select_one('meta[name=\'robots\']')
    if metatag is None:
        # No robots meta tag: crawl by default.
        return True

    metatag_content = metatag.get('content')
    if metatag_content is None:
    if metatag_content == None:
        return True
    options = metatag_content.split(',')

# NOTE(review): the middle of this function (how `options` is inspected,
# presumably for a noindex/none directive) is elided by the diff hunk below.
@@ -76,7 +93,9 @@ def should_crawl_page(htmltree: BeautifulSoup) -> bool:
        return False
    return True


def create_posts(base_dir):
    """ Returns a list posts crawled from the html content """
    paths = find_post_paths(base_dir)
    posts = []
    for path in paths:
@@ -88,27 +107,27 @@ def create_posts(base_dir):
            continue

        title = get_title_from_htmltree(htmltree)
        if title == None:
            logging.warning(f"No element for title found in '{path}'")
        if title is None:
            logging.warning('No element for title found in "%s"', path)
            continue
        body = get_body_from_htmltree(htmltree)
        if body == None:
            logging.warning(f"No element for body found in '{path}'")
        if body is None:
            logging.warning('No element for body found in "%s"', path)
            continue

        subtittles = get_subtitles_from_htmltree(htmltree)

        description = get_description_from_htmltree(htmltree)
        if description == None:
        if description is None:
            description = body

        lang = get_lang_from_htmltree(htmltree)

        id = path.replace(base_dir, "").replace("/", "-")
        url = path.replace(base_dir, "")
        page_id = path.replace(base_dir, '').replace('/', '-')
        url = path.replace(base_dir, '')

        posts.append(Post(
            id=id, 
            post_id=page_id,
            title=title,
            subtittles=subtittles,
            url=url,
Loading