...
 
Commits (3)
@@ -4,7 +4,9 @@ RUN apk add \
git \
build-base \
libxml2-dev \
libxslt-dev
libxslt-dev \
libffi-dev \
hiredis
# Only to use the docker cache and optimize the build time
WORKDIR /src
......
@@ -6,7 +6,7 @@ engine <https://en.wikipedia.org/wiki/Metasearch_engine>`__.
Spot was forked from searx: read `documentation <https://asciimoo.github.io/searx>`__ and the `wiki <https://github.com/asciimoo/searx/wiki>`__ for more information.
Spot is based on Python3.7+
Spot is based on Python3.7+ and asyncio.
Getting Started
~~~~~~~~~~~~~~~
......
# add tests
# static page only on dev mode
# remove usage of requests
# remove last use of threading
flask==1.0.2
jinja2==2.10
flask-babel==0.12.2
lxml==4.3.3
pygments==2.3.1
python-dateutil==2.8.0
pyyaml==5.1
requests[socks]==2.21.0
redis==3.2.1
aioredis==1.2.0
aiohttp==3.5.4
cchardet==2.1.4
aiodns==2.0.0
aiohttp_jinja2==1.1.0
Babel==2.6.0
aiohttpbabel==0.0.7
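For orientation, a minimal, self-contained sketch of how the new aiohttp stack pinned above (aiohttp + aiohttp_jinja2) fits together; the route, template name, directory, and port are illustrative only, not Spot's actual webapp code:

import jinja2
import aiohttp_jinja2
from aiohttp import web


@aiohttp_jinja2.template('index.html')          # template name is illustrative
async def index(request):
    # whatever this handler returns becomes the Jinja2 template context
    return {'q': request.query.get('q', '')}


def make_app():
    app = web.Application()
    aiohttp_jinja2.setup(app, loader=jinja2.FileSystemLoader('templates'))
    app.router.add_get('/', index)
    return app


if __name__ == '__main__':
    web.run_app(make_app(), port=8888)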
@@ -3,7 +3,7 @@ import random
import string
import sys
import uuid
from flask_babel import gettext
from gettext import gettext
# required answerer attribute
# specifies which search query keywords triggers this answerer
......
from functools import reduce
from operator import mul
from flask_babel import gettext
from gettext import gettext
keywords = ('min',
'max',
......
@@ -15,7 +15,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2013- by Adam Tauber, <asciimoo@gmail.com>
'''
import requests
from lxml import etree
from json import loads
from searx import settings
@@ -23,15 +23,13 @@ from searx.languages import language_codes
from searx.engines import (
categories, engines, engine_shortcuts
)
from searx.poolrequests import get as http_get
from searx.url_utils import urlencode
def get(*args, **kwargs):
if 'timeout' not in kwargs:
kwargs['timeout'] = settings['outgoing']['request_timeout']
return http_get(*args, **kwargs)
return requests.get(*args, **kwargs)
def searx_bang(full_query):
......
@@ -21,7 +21,7 @@ import threading
import json
from pathlib import Path
from babel.localedata import locale_identifiers
from flask_babel import gettext
from gettext import gettext
from operator import itemgetter
from requests import get
from searx import settings
......
@@ -16,7 +16,6 @@
from lxml.html import fromstring
from json import loads
from searx.engines.xpath import extract_text
from searx.poolrequests import get
from searx.url_utils import urlencode
from searx.utils import match_language
......
@@ -12,14 +12,13 @@
@todo avoid extra request
"""
import requests
from json import loads
from searx.engines.xpath import extract_text
from searx.engines.duckduckgo import (
_fetch_supported_languages, supported_languages_url,
get_region_code, language_aliases
)
from searx.poolrequests import get
from searx.url_utils import urlencode
# engine dependent config
@@ -36,8 +35,7 @@ site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'
# run query in site to get vqd number needed for requesting images
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query, headers):
query_url = site_url.format(query=urlencode({'q': query}))
res = get(query_url, headers=headers)
res = requests.get(site_url.format(query=urlencode({'q': query})), headers=headers)
content = res.text
if content.find('vqd=\'') == -1:
raise Exception('Request failed')
......
@@ -9,7 +9,7 @@
# @parse url, title, content, suggestion
import re
from flask_babel import gettext
from gettext import gettext
from lxml import html, etree
from searx.engines.xpath import extract_text, extract_url
from searx import logger
......
@@ -11,7 +11,7 @@
"""
from json import loads
from flask_babel import gettext
from gettext import gettext
categories = ['science']
......
@@ -10,12 +10,11 @@
@parse url, title, publishedDate, content
More info on api: https://www.ncbi.nlm.nih.gov/books/NBK25501/
"""
from flask_babel import gettext
import requests
from gettext import gettext
from lxml import etree
from datetime import datetime
from searx.url_utils import urlencode
from searx.poolrequests import get
categories = ['science']
@@ -59,7 +58,7 @@ def response(resp):
retrieve_url_encoded = pubmed_retrieve_api_url.format(**retrieve_notice_args)
search_results_xml = get(retrieve_url_encoded).content
search_results_xml = requests.get(retrieve_url_encoded).content
search_results = etree.XML(search_results_xml).xpath('//PubmedArticleSet/PubmedArticle/MedlineCitation')
for entry in search_results:
......
@@ -11,11 +11,11 @@
"""
import re
import requests
from json import loads
from lxml import html
from dateutil import parser
from searx import logger
from searx.poolrequests import get as http_get
from searx.url_utils import quote_plus, urlencode
from io import StringIO
@@ -41,7 +41,7 @@ guest_client_id = ''
def get_client_id():
response = http_get("https://soundcloud.com")
response = requests.get("https://soundcloud.com")
if response.ok:
tree = html.fromstring(response.content)
@@ -51,7 +51,7 @@ def get_client_id():
# extracts valid app_js urls from soundcloud.com content
for app_js_url in app_js_urls:
# gets app_js and searches for the clientid
response = http_get(app_js_url)
response = requests.get(app_js_url)
if response.ok:
cids = cid_re.search(response.text)
if cids is not None and len(cids.groups()):
......
@@ -11,8 +11,8 @@
@parse url, infobox
"""
import requests
from searx import logger
from searx.poolrequests import get
from searx.engines.xpath import extract_text
from searx.engines.wikipedia import _fetch_supported_languages, supported_languages_url
from searx.url_utils import urlencode
@@ -77,7 +77,7 @@ def response(resp):
for search_result in search_results[:result_count]:
wikidata_id = search_result.split('/')[-1]
url = url_detail.format(query=urlencode({'page': wikidata_id, 'uselang': language}))
htmlresponse = get(url)
htmlresponse = requests.get(url)
jsonresponse = loads(htmlresponse.text)
results += getDetail(jsonresponse, wikidata_id, language, resp.search_params['language'])
......
@@ -11,7 +11,7 @@
from json import loads
from time import time
from searx.poolrequests import get as http_get
import requests
from searx.url_utils import urlencode
# search-url
@@ -47,7 +47,10 @@ image_pods = {'VisualRepresentation',
def obtain_token():
update_time = time() - (time() % 3600)
try:
token_response = http_get('https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999', timeout=2.0)
token_response = requests.get(
'https://www.wolframalpha.com/input/api/v1/code?ts=9999999999999999999',
timeout=2.0
)
token['value'] = loads(token_response.text)['code']
token['last_updated'] = update_time
except:
......
@@ -21,7 +21,7 @@ from lxml import etree
from os import listdir, environ
from os.path import isfile, isdir, join
from searx.plugins import logger
from flask_babel import gettext
from gettext import gettext
from searx import searx_dir
from searx.url_utils import urlparse
......
from flask_babel import gettext
from gettext import gettext
name = gettext('Infinite scroll')
description = gettext('Automatically load next page when scrolling to bottom of current page')
......
from flask_babel import gettext
from gettext import gettext
import re
from searx.url_utils import urlparse, parse_qsl
from searx import settings
......
@@ -14,7 +14,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2016 by Adam Tauber, <asciimoo@gmail.com>
'''
from flask_babel import gettext
from gettext import gettext
name = gettext('Open result links on new browser tabs')
description = gettext('Results are opened in the same window by default. '
'This plugin overwrites the default behaviour to open links on new tabs/windows. '
......
@@ -14,7 +14,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2015 by Adam Tauber, <asciimoo@gmail.com>
'''
from flask_babel import gettext
from gettext import gettext
name = gettext('Search on category select')
description = gettext('Perform search immediately if a category selected. '
'Disable to select multiple categories. (JavaScript required)')
......
@@ -14,7 +14,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2015 by Adam Tauber, <asciimoo@gmail.com>
'''
from flask_babel import gettext
from gettext import gettext
import re
name = "Self Informations"
description = gettext('Displays your IP if the query is "ip" and your user agent if the query contains "user agent".')
......
@@ -15,7 +15,7 @@ along with searx. If not, see < http://www.gnu.org/licenses/ >.
(C) 2015 by Adam Tauber, <asciimoo@gmail.com>
'''
from flask_babel import gettext
from gettext import gettext
import re
from searx.url_utils import urlunparse
......
from flask_babel import gettext
from gettext import gettext
name = gettext('Vim-like hotkeys')
description = gettext('Navigate search results with Vim-like hotkeys '
......
import requests
from itertools import cycle
from threading import RLock, local
from searx import settings
from time import time
class HTTPAdapterWithConnParams(requests.adapters.HTTPAdapter):
def __init__(self, pool_connections=requests.adapters.DEFAULT_POOLSIZE,
pool_maxsize=requests.adapters.DEFAULT_POOLSIZE,
max_retries=requests.adapters.DEFAULT_RETRIES,
pool_block=requests.adapters.DEFAULT_POOLBLOCK,
**conn_params):
if max_retries == requests.adapters.DEFAULT_RETRIES:
self.max_retries = requests.adapters.Retry(0, read=False)
else:
self.max_retries = requests.adapters.Retry.from_int(max_retries)
self.config = {}
self.proxy_manager = {}
super(requests.adapters.HTTPAdapter, self).__init__()
self._pool_connections = pool_connections
self._pool_maxsize = pool_maxsize
self._pool_block = pool_block
self._conn_params = conn_params
self.init_poolmanager(pool_connections, pool_maxsize, block=pool_block, **conn_params)
def __setstate__(self, state):
# Can't handle by adding 'proxy_manager' to self.__attrs__ because
# self.poolmanager uses a lambda function, which isn't pickleable.
self.proxy_manager = {}
self.config = {}
for attr, value in state.items():
setattr(self, attr, value)
self.init_poolmanager(self._pool_connections, self._pool_maxsize,
block=self._pool_block, **self._conn_params)
threadLocal = local()
connect = settings['outgoing'].get('pool_connections', 100) # Magic number kept from previous code
maxsize = settings['outgoing'].get('pool_maxsize', requests.adapters.DEFAULT_POOLSIZE) # Picked from constructor
if settings['outgoing'].get('source_ips'):
http_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize,
source_address=(source_ip, 0))
for source_ip in settings['outgoing']['source_ips'])
https_adapters = cycle(HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize,
source_address=(source_ip, 0))
for source_ip in settings['outgoing']['source_ips'])
else:
http_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), ))
https_adapters = cycle((HTTPAdapterWithConnParams(pool_connections=connect, pool_maxsize=maxsize), ))
class SessionSinglePool(requests.Session):
def __init__(self):
super(SessionSinglePool, self).__init__()
# reuse the same adapters
with RLock():
self.adapters.clear()
self.mount('https://', next(https_adapters))
self.mount('http://', next(http_adapters))
def close(self):
"""Call super, but clear adapters since there are managed globaly"""
self.adapters.clear()
super(SessionSinglePool, self).close()
def set_timeout_for_thread(timeout, start_time=None):
threadLocal.timeout = timeout
threadLocal.start_time = start_time
def reset_time_for_thread():
threadLocal.total_time = 0
def get_time_for_thread():
return threadLocal.total_time
def request(method, url, **kwargs):
"""same as requests/requests/api.py request(...)"""
time_before_request = time()
# session start
session = SessionSinglePool()
# proxies
kwargs['proxies'] = settings['outgoing'].get('proxies') or None
# timeout
if 'timeout' in kwargs:
timeout = kwargs['timeout']
else:
timeout = getattr(threadLocal, 'timeout', None)
if timeout is not None:
kwargs['timeout'] = timeout
# do request
response = session.request(method=method, url=url, **kwargs)
time_after_request = time()
# is there a timeout for this engine?
if timeout is not None:
timeout_overhead = 0.2 # seconds
# start_time = when the user request started
start_time = getattr(threadLocal, 'start_time', time_before_request)
search_duration = time_after_request - start_time
if search_duration > timeout + timeout_overhead:
raise requests.exceptions.Timeout(response=response)
# session end
session.close()
if hasattr(threadLocal, 'total_time'):
threadLocal.total_time += time_after_request - time_before_request
return response
def get(url, **kwargs):
kwargs.setdefault('allow_redirects', True)
return request('get', url, **kwargs)
def options(url, **kwargs):
kwargs.setdefault('allow_redirects', True)
return request('options', url, **kwargs)
def head(url, **kwargs):
kwargs.setdefault('allow_redirects', False)
return request('head', url, **kwargs)
def post(url, data=None, **kwargs):
return request('post', url, data=data, **kwargs)
def put(url, data=None, **kwargs):
return request('put', url, data=data, **kwargs)
def patch(url, data=None, **kwargs):
return request('patch', url, data=data, **kwargs)
def delete(url, **kwargs):
return request('delete', url, **kwargs)
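For context, a minimal sketch of how the helpers above are meant to be driven from a worker thread; the URL and timeout values are illustrative, and a configured searx settings.yml (for the 'outgoing' section) is assumed:

from time import time
from searx.poolrequests import (
    reset_time_for_thread, set_timeout_for_thread, get_time_for_thread, get
)

reset_time_for_thread()                          # zero this thread's total_time counter
set_timeout_for_thread(3.0, start_time=time())   # per-engine timeout, measured from now
response = get('https://example.com/')           # goes through the shared pooled session
print(response.status_code, get_time_for_thread())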
......
import threading
import redis
import pickle
import time
import asyncio
import aioredis
from searx import settings
from searx.query import SearchQuery
@@ -24,8 +26,7 @@ class CacheInterface:
class RedisCache(CacheInterface):
def __init__(self):
self.pool = redis.ConnectionPool(host=settings['redis']['host'])
self.running = threading.Event()
self.pool = None
def make_key(self, q):
if q.time_range is None:
@@ -41,43 +42,41 @@ class RedisCache(CacheInterface):
q.time_range,
)
def _get_connection(self):
return redis.Redis(connection_pool=self.pool)
async def _get_connection(self):
if not self.pool:
host = settings["redis"]["host"]
self.pool = await aioredis.create_redis_pool(
f"redis://{host}", minsize=5, maxsize=10
)
return self.pool
def read(self, q):
conn = self._get_connection()
async def read(self, q):
redis = await self._get_connection()
key = self.make_key(q)
response = conn.get(key)
response = await redis.get(key)
if not response:
return None
return pickle.loads(response)
def _save(self, d):
conn = self._get_connection()
async def save(self, d):
redis = await self._get_connection()
key = self.make_key(d)
history = conn.incr("SEARCH_HISTORY_INDEX")
conn.zadd("SEARCH_HISTORY_KEYS", {key: history})
conn.set(key, pickle.dumps(d, protocol=4))
history = await redis.incr("SEARCH_HISTORY_INDEX")
await redis.zadd("SEARCH_HISTORY_KEYS", history, key)
await redis.set(key, pickle.dumps(d, protocol=4))
def save(self, d):
threading.Thread(
target=self._save,
args=(d,),
name='save_search_' + str(d)
).start()
def get_twenty_queries(self, x):
async def get_twenty_queries(self, x):
result = []
conn = self._get_connection()
keys = conn.zrange('SEARCH_HISTORY_KEYS', int(x), int(x) + 20)
redis = await self._get_connection()
keys = await redis.zrange('SEARCH_HISTORY_KEYS', int(x), int(x) + 20)
if not keys:
return result
pipe = conn.pipeline()
pipe = redis.pipeline()
for key in keys:
pipe.get(key)
output = pipe.execute()
output = await pipe.execute()
for row in output:
row = pickle.loads(row)
result.append(
@@ -94,10 +93,10 @@ class RedisCache(CacheInterface):
return result
def update(self, d):
conn = self._get_connection()
async def update(self, d):
redis = await self._get_connection()
key = self.make_key(d)
current = self.read(d)
current = await self.read(d)
current.results = d.results
current.paging = d.paging
current.results_number = d.results_number
......@@ -106,4 +105,28 @@ class RedisCache(CacheInterface):
current.infoboxes = d.infoboxes
current.suggestions = d.suggestions
current.unresponsive_engines = d.unresponsive_engines
conn.set(key, pickle.dumps(current, protocol=4))
await redis.set(key, pickle.dumps(current, protocol=4))
async def wait_updating(self, start_time):
wait = settings["redis"]["upgrade_history"] - int(time.time() - start_time)
if wait > 0:
await asyncio.sleep(wait)
async def update_results(self, search_instance):
start_time = time.time()
x = 0
try:
while True:
queries = await self.get_twenty_queries(x)
for query in queries:
result_container = await search_instance.search(query)
searchData = search_instance.create_search_data(query, result_container)
await self.update(searchData)
x += 20
if len(queries) < 20:
x = 0
await self.wait_updating(start_time)
start_time = time.time()
except asyncio.CancelledError:
pass
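A minimal sketch of how the new coroutine API could be wired into an aiohttp application, assuming a search_instance object exposing the async search() and create_search_data() used above; the handler names and app keys are illustrative:

import asyncio
from aiohttp import web

# RedisCache is the class defined above; its import path is omitted here.

async def start_cache_updater(app):
    cache = RedisCache()
    # run the endless refresh loop next to the request handlers
    app['cache_updater'] = asyncio.ensure_future(
        cache.update_results(app['search_instance'])
    )

async def stop_cache_updater(app):
    app['cache_updater'].cancel()    # raises CancelledError inside update_results()
    await app['cache_updater']       # the loop swallows it and exits cleanly

def setup(app: web.Application):
    app.on_startup.append(start_cache_updater)
    app.on_cleanup.append(stop_cache_updater)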
......
# -*- coding: utf-8 -*-
import json
from mock import Mock, patch
from mockredis import mock_strict_redis_client
from searx import webapp
from unittest import TestCase
from searx.search import Search
from searx.url_utils import ParseResult
class ViewsTestCase(TestCase):
def setUp(self):
webapp.app.config['TESTING'] = True # to get better error messages
self.app = webapp.app.test_client()
# set some defaults
self.test_results = [
{
'content': 'first test content',
'title': 'First Test',
'category': 'general',
'url': 'http://first.test.xyz',
'engines': ['youtube', 'startpage'],
'engine': 'startpage',
'parsed_url': ParseResult(scheme='http', netloc='first.test.xyz', path='/', params='', query='', fragment=''), # noqa
}, {
'content': 'second test content',
'category': 'general',
'title': 'Second Test',
'url': 'http://second.test.xyz',
'engines': ['youtube', 'startpage'],
'engine': 'youtube',
'parsed_url': ParseResult(scheme='http', netloc='second.test.xyz', path='/', params='', query='', fragment=''), # noqa
},
]
def search_mock(*args):
return Mock(get_ordered_results=lambda: self.test_results,
answers=set(),
corrections=set(),
suggestions=set(),
infoboxes=[],
unresponsive_engines=set(),
results=self.test_results,
results_number=lambda: 3,
results_length=lambda: len(self.test_results))
Search.search = search_mock
def get_current_theme_name_mock(override=None):
if override:
return override
return 'legacy'
webapp.get_current_theme_name = get_current_theme_name_mock
self.maxDiff = None # to see full diffs
def test_index_empty(self):
result = self.app.post('/')
self.assertEqual(result.status_code, 200)
self.assertIn(b'<div class="title"><h1>searx</h1></div>', result.data)
@patch('redis.Redis', mock_strict_redis_client)
def test_index_html(self):
result = self.app.post('/', data={'q': 'test'})
self.assertIn(
b'<h3 class="result_title"><img width="14" height="14" class="favicon" src="/static/themes/legacy/img/icons/icon_youtube.ico" alt="youtube" /><a href="http://second.test.xyz" rel="noreferrer">Second <span class="highlight">Test</span></a></h3>', # noqa
result.data
)
self.assertIn(
b'<p class="content">first <span class="highlight">test</span> content<br class="last"/></p>', # noqa
result.data
)
def test_about(self):
result = self.app.get('/about')
self.assertEqual(result.status_code, 200)
self.assertIn(b'<h1>About <a href="/">searx</a></h1>', result.data)
def test_preferences(self):
result = self.app.get('/preferences')
self.assertEqual(result.status_code, 200)
self.assertIn(
b'<form method="post" action="/preferences" id="search_form">',
result.data
)
self.assertIn(
b'<legend>Default categories</legend>',
result.data
)
self.assertIn(
b'<legend>Interface language</legend>',
result.data
)
def test_robots_txt(self):
result = self.app.get('/robots.txt')
self.assertEqual(result.status_code, 200)
self.assertIn(b'Allow: /', result.data)
def test_opensearch_xml(self):
result = self.app.get('/opensearch.xml')
self.assertEqual(result.status_code, 200)
self.assertIn(b'<Description>a privacy-respecting, hackable metasearch engine</Description>', result.data)
def test_favicon(self):
result = self.app.get('/favicon.ico')
self.assertEqual(result.status_code, 200)