diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000000000000000000000000000000000000..2b29f27645f8758ab5180164612d9f28154dee3a --- /dev/null +++ b/.dockerignore @@ -0,0 +1 @@ +tests diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 47b51fadf3f5944a6669f808ccec0a3dff4977a1..14426c6a5441143684d34f0a29b2e26be483c740 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -1,12 +1,20 @@ image: $CI_REGISTRY_IMAGE/env:latest stages: + - check - build - test - report - deploy -front-end: +python: + stage: check + before_script: + - ./manage.sh update_dev_packages + script: + - ./manage.sh pep8_check + +build:web: stage: build before_script: - ./manage.sh npm_packages @@ -16,14 +24,18 @@ front-end: - ./manage.sh styles - ./manage.sh grunt_build -coding-rules: +build:docker: stage: build before_script: - - ./manage.sh update_dev_packages + - docker login -u gitlab-ci-token -p $CI_JOB_TOKEN $CI_REGISTRY script: - - ./manage.sh pep8_check + - docker build -t $CI_REGISTRY_IMAGE:${CI_COMMIT_TAG:-latest} . + - docker push $CI_REGISTRY_IMAGE:${CI_COMMIT_TAG:-latest} + only: + - master + - tags -unit-test: +test:unit: stage: test before_script: - ./manage.sh update_dev_packages @@ -34,7 +46,7 @@ unit-test: - coverage expire_in: 1 hour -functional-test: +test:functional: stage: test image: docker:stable services: @@ -57,31 +69,47 @@ coverage: script: - ./manage.sh coverage dependencies: - - unit-test - - functional-test + - test:unit + - test:functional coverage: '/TOTAL.*\s+(\d+%)$/' -deploy-test: +.deploy:template: image: docker:stable stage: deploy only: - branches when: manual - variables: - PUBLISH_USER: root - PUBLISH_URL: spot.test.ecloud.global - GIT_STRATEGY: none - SPOT_HOSTNAME: spot.test.ecloud.global dependencies: [] + variables: + DEPLOY_FOLDER: /mnt/data before_script: - 'which ssh-agent || ( apk --update add openssh-client )' - eval $(ssh-agent -s) - - echo "$SSH_PRIVATE_KEY_TEST" | tr -d '\r' | ssh-add - > /dev/null + - echo "$SSH_PRIVATE_KEY" | tr -d '\r' | ssh-add - > /dev/null - mkdir -p ~/.ssh - chmod 700 ~/.ssh - echo "$SSH_KNOWN_HOSTS" > ~/.ssh/known_hosts - chmod 644 ~/.ssh/known_hosts script: - - ssh -2 $PUBLISH_USER@$PUBLISH_URL 'if [ ! "$(docker ps -q -f name=proxy)" ] ; then docker run -d -p 80:80 --net my-network my-spot_default -v /var/run/docker.sock:/tmp/docker.sock:ro --restart unless-stopped --name proxy jwilder/nginx-proxy ; elif [ ! "$(docker ps -q -f name=proxy -f status=exited)" ] ; then docker start proxy ; fi' - - ssh -2 $PUBLISH_USER@$PUBLISH_URL "cd /root/my-spot/ && git fetch && git checkout $CI_COMMIT_SHA" - - ssh -2 $PUBLISH_USER@$PUBLISH_URL "SPOT_HOSTNAME=$SPOT_HOSTNAME && export SPOT_HOSTNAME && cd /root/my-spot/ && docker-compose pull && docker-compose up -d --build --force-recreate" + - ssh -2 $PUBLISH_USER@$PUBLISH_URL "mkdir -p ${DEPLOY_FOLDER} && cd ${DEPLOY_FOLDER} && if [ ! -d ${CI_PROJECT_NAME} ] ; then git clone ${CI_PROJECT_URL}.git ; fi && cd ${CI_PROJECT_NAME} && git fetch && git checkout $CI_COMMIT_SHA" + - ssh -2 $PUBLISH_USER@$PUBLISH_URL "SPOT_HOSTNAME=$SPOT_HOSTNAME && export SPOT_HOSTNAME && cd ${DEPLOY_FOLDER}/${CI_PROJECT_NAME} && docker-compose pull && docker-compose up -d --build --force-recreate" + +test: + extends: .deploy:template + variables: + PUBLISH_USER: root + PUBLISH_URL: spot.test.ecloud.global + GIT_STRATEGY: none + SPOT_HOSTNAME: spot.test.ecloud.global + SSH_PRIVATE_KEY: ${SSH_PRIVATE_KEY_TEST} + +prod: + extends: .deploy:template + only: + - master + variables: + PUBLISH_USER: root + PUBLISH_URL: spot.ecloud.global + GIT_STRATEGY: none + SPOT_HOSTNAME: spot.ecloud.global + SSH_PRIVATE_KEY: ${SSH_PRIVATE_KEY_PROD} diff --git a/Dockerfile b/Dockerfile index 70fb0323bb14b67f157d02f6017bc08a6cefaf23..40ac3eb320098673275e7ee864731e7f8cd8090d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -4,9 +4,7 @@ RUN apk add \ git \ build-base \ libxml2-dev \ - libxslt-dev \ - libffi-dev \ - openssl-dev + libxslt-dev # Only to use the docker cache and optimize the build time WORKDIR /src @@ -24,7 +22,6 @@ LABEL description="A privacy-respecting, hackable metasearch engine." RUN apk add \ ca-certificates \ libxslt \ - openssl \ && pip install coverage COPY --from=builder /install/ /usr/local/ diff --git a/docker-compose.yml b/docker-compose.yml index ad66e647e02431d761979aabdb31ccfe70f0a23a..a3a0da13945f6dc8c6b30fd1e522ee13a2b09416 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -2,11 +2,39 @@ version: '3.6' services: redis: + restart: unless-stopped image: redis:5-alpine spot: build: . + restart: unless-stopped environment: SEARX_REDIS_HOST: redis - VIRTUAL_HOST: ${SPOT_HOSTNAME} + VIRTUAL_HOST: ${SPOT_HOSTNAME:-spot} + LETSENCRYPT_HOST: ${SPOT_HOSTNAME:-spot} SEARX_LOGGER: INFO + + proxy: + image: jwilder/nginx-proxy:alpine + restart: unless-stopped + container_name: proxy + volumes: + - /mnt/data/html:/usr/share/nginx/html + - /mnt/data/vhosts:/etc/nginx/vhost.d + - /mnt/data/certs:/etc/nginx/certs:ro + - /var/run/docker.sock:/tmp/docker.sock:ro + restart: unless-stopped + ports: + - "80:80" + - "443:443" + + cert: + image: jrcs/letsencrypt-nginx-proxy-companion + restart: unless-stopped + environment: + NGINX_PROXY_CONTAINER: proxy + volumes: + - /mnt/data/html:/usr/share/nginx/html + - /mnt/data/vhosts:/etc/nginx/vhost.d + - /mnt/data/certs:/etc/nginx/certs + - /var/run/docker.sock:/var/run/docker.sock:ro diff --git a/manage.sh b/manage.sh index 47da7b36e7b850dba5c655678340c74d61f5b353..1d21b5886557e9c0a316fc109901662978091d63 100755 --- a/manage.sh +++ b/manage.sh @@ -38,6 +38,7 @@ pep8_check() { # W503 line break before binary operator # E722 do not use bare 'except' pycodestyle --exclude=searx/static --max-line-length=120 --ignore "E402,W503,E722" "$SEARX_DIR" "$BASE_DIR/tests" + flake8 --ignore=E722 $SEARX_DIR/*.py } unit_tests() { diff --git a/requirements-dev.txt b/requirements-dev.txt index 1de575509df51a7ab65090584ed94374bb422a9a..695f9e18333ea12f236aba72309063e2f51b121b 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -1,6 +1,7 @@ babel==2.3.4 mock==2.0.0 -pycodestyle==2.4.0 +pycodestyle==2.5.0 +flake8==3.7.7 mockredispy==2.9.3 pytest==4.1.0 pytest-cov==2.6.1 diff --git a/requirements.txt b/requirements.txt index 546b8b7611320ca1a0e08d103626f914f22fd0c7..0dd25e0ead16aac71a498ee1d2c9159fc6419cfd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,9 @@ -redis==2.10.6 -certifi==2017.11.5 flask==1.0.2 -flask-babel==0.11.2 -lxml==4.2.3 -idna==2.7 -pygments==2.1.3 -pyopenssl==18.0.0 -python-dateutil==2.7.3 -pyyaml==3.13 -requests[socks]==2.19.1 +jinja2==2.10 +flask-babel==0.12.2 +lxml==4.3.3 +pygments==2.3.1 +python-dateutil==2.8.0 +pyyaml==5.1 +requests[socks]==2.21.0 +redis==3.2.1 diff --git a/searx/__init__.py b/searx/__init__.py index 9dc594431bf21f2c50e8d98029703ebf9bdc3e20..5bddf92189ecd0f1b7a2f70941ddda54408762b9 100644 --- a/searx/__init__.py +++ b/searx/__init__.py @@ -15,18 +15,11 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. (C) 2013- by Adam Tauber, ''' -import certifi import logging from os import environ from os.path import realpath, dirname, join, abspath, isfile -from io import open -from ssl import OPENSSL_VERSION_INFO, OPENSSL_VERSION -try: - from yaml import load -except ImportError: - from sys import exit, stderr - stderr.write('[E] install pyyaml\n') - exit(2) + +import yaml searx_dir = abspath(dirname(__file__)) engine_dir = dirname(realpath(__file__)) @@ -53,7 +46,11 @@ if not settings_path: # load settings with open(settings_path, 'r', encoding='utf-8') as settings_yaml: - settings = load(settings_yaml) + # XXX: docker-compose does not support yet yaml >= 5 + if int(yaml.__version__.split('.')[0]) >= 5: + settings = yaml.load(settings_yaml, Loader=yaml.FullLoader) + else: + settings = yaml.load(settings_yaml) ''' enable debug if @@ -73,12 +70,6 @@ logging.basicConfig(level=getattr(logging, searx_loglevel)) logger = logging.getLogger('searx') logger.debug('read configuration from %s', settings_path) -# Workaround for openssl versions <1.0.2 -# https://github.com/certifi/python-certifi/issues/26 -if OPENSSL_VERSION_INFO[0:3] < (1, 0, 2): - if hasattr(certifi, 'old_where'): - environ['REQUESTS_CA_BUNDLE'] = certifi.old_where() - logger.warning('You are using an old openssl version({0}), please upgrade above 1.0.2!'.format(OPENSSL_VERSION)) logger.info('Initialisation done') @@ -89,6 +80,7 @@ if 'BASE_URL' in environ: if 'IMAGE_PROXY' in environ: settings['server']['image_proxy'] = environ['IMAGE_PROXY'] if 'SEARX_REDIS_HOST' in environ: + settings['redis']['enable'] = True settings['redis']['host'] = environ['SEARX_REDIS_HOST'] if 'HTTP_PROXY_URL' in environ: settings['proxies']['http'] = environ['HTTP_PROXY_URL'] diff --git a/searx/engines/__init__.py b/searx/engines/__init__.py index f32b57202352f3ac9d3ac5df25080768a2c03452..953fa8bf0360fa19fc07ba459751d4663e707b39 100644 --- a/searx/engines/__init__.py +++ b/searx/engines/__init__.py @@ -18,12 +18,11 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >. import sys import threading -from os.path import realpath, dirname -from io import open +import json +from pathlib import Path from babel.localedata import locale_identifiers from flask_babel import gettext from operator import itemgetter -from json import loads from requests import get from searx import settings from searx import logger @@ -32,13 +31,14 @@ from searx.utils import load_module, match_language logger = logger.getChild('engines') -engine_dir = dirname(realpath(__file__)) +engine_dir = Path(__file__).parent engines = {} categories = {'general': []} -languages = loads(open(engine_dir + '/../data/engines_languages.json', 'r', encoding='utf-8').read()) +with open(engine_dir.parent / "data" / "engines_languages.json", encoding='utf-8') as fd: + languages = json.load(fd) babel_langs = [lang_parts[0] + '-' + lang_parts[-1] if len(lang_parts) > 1 else lang_parts[0] for lang_parts in (lang_code.split('_') for lang_code in locale_identifiers())] diff --git a/searx/engines/apkmirror.py b/searx/engines/apkmirror.py new file mode 100644 index 0000000000000000000000000000000000000000..f2ee12b29269a6dbae125266568ad94a4aed5ef1 --- /dev/null +++ b/searx/engines/apkmirror.py @@ -0,0 +1,61 @@ +""" + APK Mirror + + @website https://www.apkmirror.com + + @using-api no + @results HTML + @stable no (HTML can change) + @parse url, title, thumbnail_src +""" + +from lxml import html +from searx.engines.xpath import extract_text +from searx.url_utils import urlencode + +# engine dependent config +categories = ['it'] +paging = True + +# I am not 100% certain about this, as apkmirror appears to be a wordpress site, +# which might support time_range searching. If you want to implement it, go ahead. +time_range_support = False + +# search-url +base_url = 'https://www.apkmirror.com' +search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}' + + +# do search-request +def request(query, params): + + params['url'] = search_url.format(pageno=params['pageno'], + query=urlencode({'s': query})) + return params + + +# get response from search-request +def response(resp): + results = [] + + dom = html.fromstring(resp.text) + + # parse results + for result in dom.xpath('.//div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'): + + link = result.xpath('.//h5/a')[0] + url = base_url + link.attrib.get('href') + '#downloads' + title = extract_text(link) + thumbnail_src = base_url + result.xpath('.//img')[0].attrib.get('src').replace('&w=32&h=32', '&w=64&h=64') + + res = { + 'url': url, + 'title': title, + 'thumbnail_src': thumbnail_src + } + + # append result + results.append(res) + + # return results + return results diff --git a/searx/engines/arxiv.py b/searx/engines/arxiv.py index 5ef84f0c1df2e26146d42855054024714fae83a7..84e844fbbcd5e8fd5a277a9a5185ceb17444bd89 100644 --- a/searx/engines/arxiv.py +++ b/searx/engines/arxiv.py @@ -61,7 +61,7 @@ def response(resp): content = content_string.format(doi_content="", abstract_content=abstract) if len(content) > 300: - content = content[0:300] + "..." + content = content[0:300] + "..." # TODO: center snippet on query term publishedDate = datetime.strptime(entry.xpath('.//published')[0].text, '%Y-%m-%dT%H:%M:%SZ') diff --git a/searx/engines/currency_convert.py b/searx/engines/currency_convert.py index 9424d7d5ece8ad60b6f262baf6b0485ce521a00a..2a2a3c35d80d60b8a9c28ab56a7f50529d4096c2 100644 --- a/searx/engines/currency_convert.py +++ b/searx/engines/currency_convert.py @@ -1,10 +1,9 @@ import json import re -import os import sys import unicodedata -from io import open +from pathlib import Path from datetime import datetime @@ -88,10 +87,10 @@ def response(resp): def load(): global db - current_dir = os.path.dirname(os.path.realpath(__file__)) - json_data = open(current_dir + "/../data/currencies.json", 'r', encoding='utf-8').read() - - db = json.loads(json_data) + with open( + Path(__file__).parent.parent / "data" / "currencies.json", encoding='utf-8' + ) as fd: + db = json.load(fd) load() diff --git a/searx/engines/duckduckgo_images.py b/searx/engines/duckduckgo_images.py index 8c46ec92d4dff9311f56abd53341143f77d2657e..89924b71c7de4ed13d43885ab284aa9c2aa4daad 100644 --- a/searx/engines/duckduckgo_images.py +++ b/searx/engines/duckduckgo_images.py @@ -35,9 +35,12 @@ site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images' # run query in site to get vqd number needed for requesting images # TODO: find a way to get this number without an extra request (is it a hash of the query?) -def get_vqd(query): - res = get(site_url.format(query=urlencode({'q': query}))) +def get_vqd(query, headers): + query_url = site_url.format(query=urlencode({'q': query})) + res = get(query_url, headers=headers) content = res.text + if content.find('vqd=\'') == -1: + raise Exception('Request failed') vqd = content[content.find('vqd=\'') + 5:] vqd = vqd[:vqd.find('\'')] return vqd @@ -47,7 +50,7 @@ def get_vqd(query): def request(query, params): # to avoid running actual external requests when testing if 'is_test' not in params: - vqd = get_vqd(query) + vqd = get_vqd(query, params['headers']) else: vqd = '12345' @@ -74,7 +77,7 @@ def response(resp): try: res_json = loads(content) except: - return [] + raise Exception('Cannot parse results') # parse results for result in res_json['results']: diff --git a/searx/engines/google_images.py b/searx/engines/google_images.py index 6a32500fb228d03ecadba8cfcb70eae0d0657f4b..66a543e57ffd70d102f0a12dc89ba9df8cd9b653 100644 --- a/searx/engines/google_images.py +++ b/searx/engines/google_images.py @@ -11,7 +11,6 @@ """ from datetime import date, timedelta -from json import loads from lxml import html from searx.url_utils import urlencode, urlparse, parse_qs @@ -39,7 +38,6 @@ time_range_dict = {'day': 'd', # do search-request def request(query, params): search_options = { - 'ijn': params['pageno'] - 1, 'start': (params['pageno'] - 1) * number_of_results } @@ -53,7 +51,7 @@ def request(query, params): search_options['tbs'] = time_range_custom_attr.format(start=start, end=end) if safesearch and params['safesearch']: - search_options['safe'] = 'on' + search_options['safe'] = 'active' params['url'] = search_url.format(query=urlencode({'q': query}), search_options=urlencode(search_options)) @@ -63,24 +61,30 @@ def request(query, params): # get response from search-request def response(resp): + dom = html.fromstring(resp.text) + results = [] + for element in dom.xpath('//div[@id="search"] //td'): + link = element.xpath('./a')[0] - dom = html.fromstring(resp.text) + google_url = urlparse(link.xpath('.//@href')[0]) + query = parse_qs(google_url.query) + source_url = next(iter(query.get('q', [])), None) - # parse results - for img in dom.xpath('//a'): - r = { - 'title': ' '.join(img.xpath('.//div[class="rg_ilmbg"]//text()')), + title_parts = element.xpath('./cite//following-sibling::*/text()') + title_parts.extend(element.xpath('./cite//following-sibling::text()')[:-1]) + + result = { + 'title': ''.join(title_parts), 'content': '', 'template': 'images.html', + 'url': source_url, + 'img_src': source_url, + 'thumbnail_src': next(iter(link.xpath('.//img //@src')), None) } - url = urlparse(img.xpath('.//@href')[0]) - query = parse_qs(url.query) - r['url'] = query['imgrefurl'][0] - r['img_src'] = query['imgurl'][0] - r['thumbnail_src'] = r['img_src'] - # append result - results.append(r) - - # return results + + if not source_url or not result['thumbnail_src']: + continue + + results.append(result) return results diff --git a/searx/engines/json_engine.py b/searx/engines/json_engine.py index 550947ab6818587e15b77259f06c348c3f2baeca..7a48bc8cee536229822d9fc1ecf77a75912da2e2 100644 --- a/searx/engines/json_engine.py +++ b/searx/engines/json_engine.py @@ -1,4 +1,4 @@ -from collections import Iterable +from collections.abc import Iterable from json import loads from searx.url_utils import urlencode from searx.utils import to_string diff --git a/searx/engines/pubmed.py b/searx/engines/pubmed.py index 055f092269d64adcfb5d3f004753bccbbaf8c176..82a37a564e7efca26fa1c50066215f467e40f506 100644 --- a/searx/engines/pubmed.py +++ b/searx/engines/pubmed.py @@ -81,7 +81,7 @@ def response(resp): pass if len(content) > 300: - content = content[0:300] + "..." + content = content[0:300] + "..." # TODO: center snippet on query term res_dict = {'url': url, diff --git a/searx/engines/wikidata.py b/searx/engines/wikidata.py index 047f1ba77c0c82d52826b7370576f05ea5f4fd69..81cf262c50c5bd775f113fffa64e606f5d1100a6 100644 --- a/searx/engines/wikidata.py +++ b/searx/engines/wikidata.py @@ -374,7 +374,7 @@ def add_url(urls, result, property_id=None, default_label=None, url_prefix=None, # wiki links don't have property in wikidata page if link_type and 'wiki' in link_type: - links.append(get_wikilink(result, link_type)) + links.append(get_wikilink(result, link_type)) else: dom_element = result.xpath(property_xpath.replace('{propertyid}', property_id)) if dom_element: diff --git a/searx/engines/youtube_noapi.py b/searx/engines/youtube_noapi.py index 9f01841f66000245329467ec821c6787220193d8..3bf25932b24476785e6e6547fa560e900feb6615 100644 --- a/searx/engines/youtube_noapi.py +++ b/searx/engines/youtube_noapi.py @@ -8,7 +8,8 @@ # @stable no # @parse url, title, content, publishedDate, thumbnail, embedded -from lxml import html +from functools import reduce +from json import loads from searx.engines.xpath import extract_text from searx.utils import list_get from searx.url_utils import quote_plus @@ -34,20 +35,6 @@ embedded_url = '