diff --git a/requirements.txt b/requirements.txt
index 4b7d390a9aaa2ff04265dc5890b5dd8beec1069e..3eb166c85c908df13713ec2f9b20e3f18c656e51 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -7,11 +7,11 @@ lxml==4.6.3
 pygments==2.8.0
 python-dateutil==2.8.2
 pyyaml==5.4.1
-httpx[http2]==0.19.0
+httpx[http2]==0.21.2
 Brotli==1.0.9
 uvloop==0.16.0; python_version >= '3.7'
 uvloop==0.14.0; python_version < '3.7'
-httpx-socks[asyncio]==0.4.1
+httpx-socks[asyncio]==0.7.2
 langdetect==1.0.9
 setproctitle==1.2.2
 redis==3.4.1
diff --git a/searx/network/__init__.py b/searx/network/__init__.py
index 2f51a9087addfdb3038840a61445738458f9e62a..1e851af7f5525dcd32b05ede37affe1dcd76bd8e 100644
--- a/searx/network/__init__.py
+++ b/searx/network/__init__.py
@@ -126,17 +126,17 @@ def request(method, url, **kwargs):
 
 if "redis_host" not in settings["server"]:
     def get(url, **kwargs):
-        kwargs.setdefault('allow_redirects', True)
+        kwargs.setdefault('follow_redirects', True)
         return request('get', url, **kwargs)
 
     def options(url, **kwargs):
-        kwargs.setdefault('allow_redirects', True)
+        kwargs.setdefault('follow_redirects', True)
         return request('options', url, **kwargs)
 
     def head(url, **kwargs):
-        kwargs.setdefault('allow_redirects', False)
+        kwargs.setdefault('follow_redirects', False)
         return request('head', url, **kwargs)
@@ -172,7 +172,7 @@ if "redis_host" in settings["server"]:
 
 async def stream_chunk_to_queue(network, q, method, url, **kwargs):
     try:
-        async with network.stream(method, url, **kwargs) as response:
+        async with await network.stream(method, url, **kwargs) as response:
             q.put(response)
             async for chunk in response.aiter_bytes(65536):
                 if len(chunk) > 0:
diff --git a/searx/network/client.py b/searx/network/client.py
index 47b2a9819581f3178cb8bf84fa0c48487d6143fe..175b243337eb354189d4dc95827c94061e8e3100 100644
--- a/searx/network/client.py
+++ b/searx/network/client.py
@@ -1,19 +1,14 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=missing-module-docstring, global-statement
 
 import asyncio
 import logging
 import threading
 
-import httpcore
 import httpx
 from httpx_socks import AsyncProxyTransport
-from python_socks import (
-    parse_proxy_url,
-    ProxyConnectionError,
-    ProxyTimeoutError,
-    ProxyError
-)
-import python_socks._errors
+from python_socks import parse_proxy_url, ProxyConnectionError, ProxyTimeoutError, ProxyError
 
 from searx import logger
@@ -26,29 +21,15 @@ else:
     uvloop.install()
 
 
-logger = logger.getChild('searx.http.client')
+logger = logger.getChild('searx.network.client')
 
 LOOP = None
 SSLCONTEXTS = {}
 TRANSPORT_KWARGS = {
-    'backend': 'asyncio',
     'trust_env': False,
 }
 
 
-async def close_connections_for_url(connection_pool: httpcore.AsyncConnectionPool, url: httpcore._utils.URL):
-    origin = httpcore._utils.url_to_origin(url)
-    logger.debug('Drop connections for %r', origin)
-    connections_to_close = connection_pool._connections_for_origin(origin)
-    for connection in connections_to_close:
-        await connection_pool._remove_from_pool(connection)
-        try:
-            await connection.aclose()
-        except httpx.NetworkError as e:
-            logger.warning('Error closing an existing connection', exc_info=e)
-
-
 def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False):
-    global SSLCONTEXTS
     key = (proxy_url, cert, verify, trust_env, http2)
     if key not in SSLCONTEXTS:
         SSLCONTEXTS[key] = httpx.create_ssl_context(cert, verify, trust_env, http2)
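
Reviewer note: the `allow_redirects` to `follow_redirects` renames above track the keyword rename in httpx 0.20+ (encode/httpx#1808); the new client rejects the old keyword with a TypeError, and redirects are no longer followed by default. A minimal sketch of the new spelling outside searx (the URL is only an example):

    import asyncio

    import httpx

    async def main():
        async with httpx.AsyncClient() as client:
            # httpx >= 0.20: the keyword is follow_redirects; passing
            # allow_redirects= now raises TypeError instead of working
            response = await client.get('https://example.org/', follow_redirects=True)
            print(response.status_code, response.url)

    asyncio.run(main())
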
@@ -58,74 +39,25 @@ def get_sslcontexts(proxy_url=None, cert=None, verify=True, trust_env=True, http2=False):
 class AsyncHTTPTransportNoHttp(httpx.AsyncHTTPTransport):
     """Block HTTP request"""
 
-    async def handle_async_request(self, method, url, headers=None, stream=None, extensions=None):
-        raise httpx.UnsupportedProtocol("HTTP protocol is disabled")
+    async def handle_async_request(self, request):
+        raise httpx.UnsupportedProtocol('HTTP protocol is disabled')
 
 
 class AsyncProxyTransportFixed(AsyncProxyTransport):
     """Fix httpx_socks.AsyncProxyTransport
 
-    Map python_socks exceptions to httpx.ProxyError
-
-    Map socket.gaierror to httpx.ConnectError
-
-    Note: keepalive_expiry is ignored, AsyncProxyTransport should call:
-    * self._keepalive_sweep()
-    * self._response_closed(self, connection)
-
-    Note: AsyncProxyTransport inherit from AsyncConnectionPool
+    Map python_socks exceptions to httpx.ProxyError exceptions
     """
 
-    async def handle_async_request(self, method, url, headers=None, stream=None, extensions=None):
-        retry = 2
-        while retry > 0:
-            retry -= 1
-            try:
-                return await super().handle_async_request(method, url, headers, stream, extensions)
-            except (ProxyConnectionError, ProxyTimeoutError, ProxyError) as e:
-                raise httpx.ProxyError(e)
-            except OSError as e:
-                # socket.gaierror when DNS resolution fails
-                raise httpx.NetworkError(e)
-            except httpx.RemoteProtocolError as e:
-                # in case of httpx.RemoteProtocolError: Server disconnected
-                await close_connections_for_url(self, url)
-                logger.warning('httpx.RemoteProtocolError: retry', exc_info=e)
-                # retry
-            except (httpx.NetworkError, httpx.ProtocolError) as e:
-                # httpx.WriteError on HTTP/2 connection leaves a new opened stream
-                # then each new request creates a new stream and raise the same WriteError
-                await close_connections_for_url(self, url)
-                raise e
-
-
-class AsyncHTTPTransportFixed(httpx.AsyncHTTPTransport):
-    """Fix httpx.AsyncHTTPTransport"""
-
-    async def handle_async_request(self, method, url, headers=None, stream=None, extensions=None):
-        retry = 2
-        while retry > 0:
-            retry -= 1
-            try:
-                return await super().handle_async_request(method, url, headers, stream, extensions)
-            except OSError as e:
-                # socket.gaierror when DNS resolution fails
-                raise httpx.ConnectError(e)
-            except httpx.CloseError as e:
-                # httpx.CloseError: [Errno 104] Connection reset by peer
-                # raised by _keepalive_sweep()
-                # from https://github.com/encode/httpcore/blob/4b662b5c42378a61e54d673b4c949420102379f5/httpcore/_backends/asyncio.py#L198  # noqa
-                await close_connections_for_url(self._pool, url)
-                logger.warning('httpx.CloseError: retry', exc_info=e)
-                # retry
-            except httpx.RemoteProtocolError as e:
-                # in case of httpx.RemoteProtocolError: Server disconnected
-                await close_connections_for_url(self._pool, url)
-                logger.warning('httpx.RemoteProtocolError: retry', exc_info=e)
-                # retry
-            except (httpx.ProtocolError, httpx.NetworkError) as e:
-                await close_connections_for_url(self._pool, url)
-                raise e
+    async def handle_async_request(self, request):
+        try:
+            return await super().handle_async_request(request)
+        except ProxyConnectionError as e:
+            raise httpx.ProxyError("ProxyConnectionError: " + e.strerror, request=request) from e
+        except ProxyTimeoutError as e:
+            raise httpx.ProxyError("ProxyTimeoutError: " + e.args[0], request=request) from e
+        except ProxyError as e:
+            raise httpx.ProxyError("ProxyError: " + e.args[0], request=request) from e
 
 
 def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries):
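
Reviewer note: the rewritten transport drops the old retry and connection-dropping workarounds (disconnect handling now lives in `Network.call_client`, see the network.py hunks below) and only translates exceptions. The pattern is plain exception chaining; a standalone sketch, with `send_via_proxy` as a hypothetical stand-in for the transport call:

    import httpx
    from python_socks import ProxyConnectionError

    async def send_via_proxy(transport, request):
        try:
            return await transport.handle_async_request(request)
        except ProxyConnectionError as e:
            # 'raise ... from e' keeps the python_socks failure as __cause__ in
            # tracebacks, while callers only have to catch httpx exceptions
            raise httpx.ProxyError(f'ProxyConnectionError: {e}', request=request) from e
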
@@ -141,56 +73,65 @@ def get_transport_for_socks_proxy(verify, http2, local_address, proxy_url, limit, retries):
     proxy_type, proxy_host, proxy_port, proxy_username, proxy_password = parse_proxy_url(proxy_url)
     verify = get_sslcontexts(proxy_url, None, True, False, http2) if verify is True else verify
-    return AsyncProxyTransportFixed(proxy_type=proxy_type, proxy_host=proxy_host, proxy_port=proxy_port,
-                                    username=proxy_username, password=proxy_password,
-                                    rdns=rdns,
-                                    loop=get_loop(),
-                                    verify=verify,
-                                    http2=http2,
-                                    local_address=local_address,
-                                    max_connections=limit.max_connections,
-                                    max_keepalive_connections=limit.max_keepalive_connections,
-                                    keepalive_expiry=limit.keepalive_expiry,
-                                    retries=retries,
-                                    **TRANSPORT_KWARGS)
+    return AsyncProxyTransportFixed(
+        proxy_type=proxy_type,
+        proxy_host=proxy_host,
+        proxy_port=proxy_port,
+        username=proxy_username,
+        password=proxy_password,
+        rdns=rdns,
+        loop=get_loop(),
+        verify=verify,
+        http2=http2,
+        local_address=local_address,
+        limits=limit,
+        retries=retries,
+        **TRANSPORT_KWARGS,
+    )
 
 
 def get_transport(verify, http2, local_address, proxy_url, limit, retries):
     verify = get_sslcontexts(None, None, True, False, http2) if verify is True else verify
-    return AsyncHTTPTransportFixed(verify=verify,
-                                   http2=http2,
-                                   local_address=local_address,
-                                   proxy=httpx._config.Proxy(proxy_url) if proxy_url else None,
-                                   limits=limit,
-                                   retries=retries,
-                                   **TRANSPORT_KWARGS)
-
-
-def iter_proxies(proxies):
-    # https://www.python-httpx.org/compatibility/#proxy-keys
-    if isinstance(proxies, str):
-        yield 'all://', proxies
-    elif isinstance(proxies, dict):
-        for pattern, proxy_url in proxies.items():
-            yield pattern, proxy_url
-
-
-def new_client(enable_http, verify, enable_http2,
-               max_connections, max_keepalive_connections, keepalive_expiry,
-               proxies, local_address, retries, max_redirects):
-    limit = httpx.Limits(max_connections=max_connections,
-                         max_keepalive_connections=max_keepalive_connections,
-                         keepalive_expiry=keepalive_expiry)
+    return httpx.AsyncHTTPTransport(
+        # pylint: disable=protected-access
+        verify=verify,
+        http2=http2,
+        limits=limit,
+        proxy=httpx._config.Proxy(proxy_url) if proxy_url else None,
+        local_address=local_address,
+        retries=retries,
+        **TRANSPORT_KWARGS,
+    )
+
+
+def new_client(
+    # pylint: disable=too-many-arguments
+    enable_http,
+    verify,
+    enable_http2,
+    max_connections,
+    max_keepalive_connections,
+    keepalive_expiry,
+    proxies,
+    local_address,
+    retries,
+    max_redirects,
+    hook_log_response,
+):
+    limit = httpx.Limits(
+        max_connections=max_connections,
+        max_keepalive_connections=max_keepalive_connections,
+        keepalive_expiry=keepalive_expiry,
+    )
     # See https://www.python-httpx.org/advanced/#routing
     mounts = {}
-    for pattern, proxy_url in iter_proxies(proxies):
-        if not enable_http and (pattern == 'http' or pattern.startswith('http://')):
+    for pattern, proxy_url in proxies.items():
+        if not enable_http and pattern.startswith('http://'):
             continue
-        if proxy_url.startswith('socks4://') \
-           or proxy_url.startswith('socks5://') \
-           or proxy_url.startswith('socks5h://'):
-            mounts[pattern] = get_transport_for_socks_proxy(verify, enable_http2, local_address, proxy_url, limit,
-                                                            retries)
+        if proxy_url.startswith('socks4://') or proxy_url.startswith('socks5://') or proxy_url.startswith('socks5h://'):
+            mounts[pattern] = get_transport_for_socks_proxy(
+                verify, enable_http2, local_address, proxy_url, limit, retries
+            )
         else:
            mounts[pattern] = get_transport(verify, enable_http2, local_address, proxy_url, limit, retries)
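
Reviewer note: `new_client()` now expects `proxies` as a dict of mount patterns; the old string handling of the deleted module-level `iter_proxies()` lives on in `Network.iter_proxies`/`get_proxy_cycles` (see the tests below, where `Network(proxies='http://localhost:1337')` yields an `'all://'` pattern). How httpx resolves such patterns, reduced to a sketch where the most specific mount wins:

    import httpx
    from searx.network.client import AsyncHTTPTransportNoHttp

    client = httpx.AsyncClient(
        mounts={
            # 'http://' is more specific than 'all://', so plain-HTTP requests
            # hit the blocking transport while everything else gets HTTP/2
            'http://': AsyncHTTPTransportNoHttp(),
            'all://': httpx.AsyncHTTPTransport(http2=True),
        }
    )
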
@@ -198,17 +139,26 @@ def new_client(enable_http, verify, enable_http2,
         mounts['http://'] = AsyncHTTPTransportNoHttp()
 
     transport = get_transport(verify, enable_http2, local_address, None, limit, retries)
-    return httpx.AsyncClient(transport=transport, mounts=mounts, max_redirects=max_redirects)
+
+    event_hooks = None
+    if hook_log_response:
+        event_hooks = {'response': [hook_log_response]}
+
+    return httpx.AsyncClient(
+        transport=transport,
+        mounts=mounts,
+        max_redirects=max_redirects,
+        event_hooks=event_hooks,
+    )
 
 
 def get_loop():
-    global LOOP
     return LOOP
 
 
 def init():
     # log
-    for logger_name in ('hpack.hpack', 'hpack.table'):
+    for logger_name in ('hpack.hpack', 'hpack.table', 'httpx._client'):
         logging.getLogger(logger_name).setLevel(logging.WARNING)
 
     # loop
@@ -217,12 +167,12 @@ def init():
         LOOP = asyncio.new_event_loop()
         LOOP.run_forever()
 
-    th = threading.Thread(
+    thread = threading.Thread(
        target=loop_thread,
        name='asyncio_loop',
        daemon=True,
    )
-    th.start()
+    thread.start()
 
 
 init()
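
Reviewer note: `new_client()` wires the optional response hook into `httpx.AsyncClient(event_hooks=...)`. For anyone who has not used httpx event hooks: async response hooks run once the headers arrive, before the caller sees the `Response`. A self-contained sketch (the printed fields mirror what `Network.log_response` logs):

    import asyncio

    import httpx

    async def log_response(response: httpx.Response):
        # stand-in for Network.log_response and its per-network logger
        print(response.request.method, response.request.url, response.status_code)

    async def main():
        async with httpx.AsyncClient(event_hooks={'response': [log_response]}) as client:
            await client.get('https://example.org/')

    asyncio.run(main())
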
diff --git a/searx/network/network.py b/searx/network/network.py
index 25b3c9863ffdccbeede61baedfa4c881225bf87b..2ab9127c6f2b3a37d143877702f64f550e324cf3 100644
--- a/searx/network/network.py
+++ b/searx/network/network.py
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: AGPL-3.0-or-later
+# lint: pylint
+# pylint: disable=global-statement
+# pylint: disable=missing-module-docstring, missing-class-docstring
 
 import atexit
 import asyncio
@@ -7,9 +10,11 @@ from itertools import cycle
 
 import httpx
 
-from .client import new_client, get_loop
+from searx import logger, searx_debug
+from .client import new_client, get_loop, AsyncHTTPTransportNoHttp
 
+logger = logger.getChild('network')
 
 DEFAULT_NAME = '__DEFAULT__'
 NETWORKS = {}
 # requests compatibility when reading proxy settings from settings.yml
@@ -26,31 +31,50 @@ PROXY_PATTERN_MAPPING = {
     'socks5h:': 'socks5h://',
 }
 
-ADDRESS_MAPPING = {
-    'ipv4': '0.0.0.0',
-    'ipv6': '::'
-}
+ADDRESS_MAPPING = {'ipv4': '0.0.0.0', 'ipv6': '::'}
 
 
 class Network:
 
-    __slots__ = ('enable_http', 'verify', 'enable_http2',
-                 'max_connections', 'max_keepalive_connections', 'keepalive_expiry',
-                 'local_addresses', 'proxies', 'max_redirects', 'retries', 'retry_on_http_error',
-                 '_local_addresses_cycle', '_proxies_cycle', '_clients')
-
-    def __init__(self,
-                 enable_http=True,
-                 verify=True,
-                 enable_http2=False,
-                 max_connections=None,
-                 max_keepalive_connections=None,
-                 keepalive_expiry=None,
-                 proxies=None,
-                 local_addresses=None,
-                 retries=0,
-                 retry_on_http_error=None,
-                 max_redirects=30):
+    __slots__ = (
+        'enable_http',
+        'verify',
+        'enable_http2',
+        'max_connections',
+        'max_keepalive_connections',
+        'keepalive_expiry',
+        'local_addresses',
+        'proxies',
+        'using_tor_proxy',
+        'max_redirects',
+        'retries',
+        'retry_on_http_error',
+        '_local_addresses_cycle',
+        '_proxies_cycle',
+        '_clients',
+        '_logger',
+    )
+
+    _TOR_CHECK_RESULT = {}
+
+    def __init__(
+        # pylint: disable=too-many-arguments
+        self,
+        enable_http=True,
+        verify=True,
+        enable_http2=False,
+        max_connections=None,
+        max_keepalive_connections=None,
+        keepalive_expiry=None,
+        proxies=None,
+        using_tor_proxy=False,
+        local_addresses=None,
+        retries=0,
+        retry_on_http_error=None,
+        max_redirects=30,
+        logger_name=None,
+    ):
+
         self.enable_http = enable_http
         self.verify = verify
         self.enable_http2 = enable_http2
@@ -58,6 +82,7 @@ class Network:
         self.max_keepalive_connections = max_keepalive_connections
         self.keepalive_expiry = keepalive_expiry
         self.proxies = proxies
+        self.using_tor_proxy = using_tor_proxy
         self.local_addresses = local_addresses
         self.retries = retries
         self.retry_on_http_error = retry_on_http_error
@@ -65,6 +90,7 @@ class Network:
         self._local_addresses_cycle = self.get_ipaddress_cycle()
         self._proxies_cycle = self.get_proxy_cycles()
         self._clients = {}
+        self._logger = logger.getChild(logger_name) if logger_name else logger
         self.check_parameters()
 
     def check_parameters(self):
@@ -81,7 +107,7 @@ class Network:
         local_addresses = self.local_addresses
         if not local_addresses:
             return
-        elif isinstance(local_addresses, str):
+        if isinstance(local_addresses, str):
             local_addresses = [local_addresses]
         for address in local_addresses:
             yield address
@@ -119,25 +145,62 @@ class Network:
         for pattern, proxy_urls in self.iter_proxies():
             proxy_settings[pattern] = cycle(proxy_urls)
         while True:
+            # pylint: disable=stop-iteration-return
             yield tuple((pattern, next(proxy_url_cycle)) for pattern, proxy_url_cycle in proxy_settings.items())
 
-    def get_client(self, verify=None, max_redirects=None):
+    async def log_response(self, response: httpx.Response):
+        request = response.request
+        status = f"{response.status_code} {response.reason_phrase}"
+        response_line = f"{response.http_version} {status}"
+        content_type = response.headers.get("Content-Type")
+        content_type = f' ({content_type})' if content_type else ''
+        self._logger.debug(f'HTTP Request: {request.method} {request.url} "{response_line}"{content_type}')
+
+    @staticmethod
+    async def check_tor_proxy(client: httpx.AsyncClient, proxies) -> bool:
+        if proxies in Network._TOR_CHECK_RESULT:
+            return Network._TOR_CHECK_RESULT[proxies]
+
+        result = True
+        # ignore client._transport because it is not used with all://
+        for transport in client._mounts.values():  # pylint: disable=protected-access
+            if isinstance(transport, AsyncHTTPTransportNoHttp):
+                continue
+            if not getattr(transport, '_rdns', False):
+                result = False
+                break
+        else:
+            response = await client.get('https://check.torproject.org/api/ip')
+            if not response.json()['IsTor']:
+                result = False
+        Network._TOR_CHECK_RESULT[proxies] = result
+        return result
+
+    async def get_client(self, verify=None, max_redirects=None):
         verify = self.verify if verify is None else verify
         max_redirects = self.max_redirects if max_redirects is None else max_redirects
         local_address = next(self._local_addresses_cycle)
         proxies = next(self._proxies_cycle)  # is a tuple so it can be part of the key
         key = (verify, max_redirects, local_address, proxies)
+        hook_log_response = self.log_response if searx_debug else None
         if key not in self._clients or self._clients[key].is_closed:
-            self._clients[key] = new_client(self.enable_http,
-                                            verify,
-                                            self.enable_http2,
-                                            self.max_connections,
-                                            self.max_keepalive_connections,
-                                            self.keepalive_expiry,
-                                            dict(proxies),
-                                            local_address,
-                                            0,
-                                            max_redirects)
+            client = new_client(
+                self.enable_http,
+                verify,
+                self.enable_http2,
+                self.max_connections,
+                self.max_keepalive_connections,
+                self.keepalive_expiry,
+                dict(proxies),
+                local_address,
+                0,
+                max_redirects,
+                hook_log_response,
+            )
+            if self.using_tor_proxy and not await self.check_tor_proxy(client, proxies):
+                await client.aclose()
+                raise httpx.ProxyError('Network configuration problem: not using Tor')
+            self._clients[key] = client
         return self._clients[key]
 
     async def aclose(self):
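
Reviewer note: `check_tor_proxy()` first requires every mounted SOCKS transport to resolve DNS on the proxy side (`_rdns`), and only then confirms the exit node with the Tor Project's check API; results are memoized per proxy tuple in `_TOR_CHECK_RESULT`. (The `hook_log_response` line above assumes the `searx_debug` flag imported at the top of the file; without it the imported flag and `log_response` would be dead code.) The external check in isolation, assuming the response shape documented by check.torproject.org:

    import asyncio

    import httpx

    async def is_tor() -> bool:
        async with httpx.AsyncClient() as client:
            # response shape: {"IsTor": true/false, "IP": "..."}
            response = await client.get('https://check.torproject.org/api/ip')
            return bool(response.json().get('IsTor'))

    # prints False unless this process is actually routed through Tor
    print(asyncio.run(is_tor()))
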
@@ -146,98 +209,124 @@ class Network:
                 await client.aclose()
             except httpx.HTTPError:
                 pass
+        await asyncio.gather(*[close_client(client) for client in self._clients.values()], return_exceptions=False)
 
     @staticmethod
-    def get_kwargs_clients(kwargs):
+    def extract_kwargs_clients(kwargs):
         kwargs_clients = {}
         if 'verify' in kwargs:
             kwargs_clients['verify'] = kwargs.pop('verify')
         if 'max_redirects' in kwargs:
             kwargs_clients['max_redirects'] = kwargs.pop('max_redirects')
+        if 'allow_redirects' in kwargs:
+            # see https://github.com/encode/httpx/pull/1808
+            kwargs['follow_redirects'] = kwargs.pop('allow_redirects')
         return kwargs_clients
 
-    def is_valid_respones(self, response):
-        if (self.retry_on_http_error is True and 400 <= response.status_code <= 599) \
-           or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error) \
-           or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error):
+    def is_valid_response(self, response):
+        # pylint: disable=too-many-boolean-expressions
+        if (
+            (self.retry_on_http_error is True and 400 <= response.status_code <= 599)
+            or (isinstance(self.retry_on_http_error, list) and response.status_code in self.retry_on_http_error)
+            or (isinstance(self.retry_on_http_error, int) and response.status_code == self.retry_on_http_error)
+        ):
             return False
         return True
 
-    async def request(self, method, url, **kwargs):
+    async def call_client(self, stream, method, url, **kwargs):
         retries = self.retries
+        was_disconnected = False
+        kwargs_clients = Network.extract_kwargs_clients(kwargs)
         while retries >= 0:  # pragma: no cover
-            kwargs_clients = Network.get_kwargs_clients(kwargs)
-            client = self.get_client(**kwargs_clients)
+            client = await self.get_client(**kwargs_clients)
             try:
-                response = await client.request(method, url, **kwargs)
-                if self.is_valid_respones(response) or retries <= 0:
+                if stream:
+                    response = client.stream(method, url, **kwargs)
+                else:
+                    response = await client.request(method, url, **kwargs)
+                if self.is_valid_response(response) or retries <= 0:
                     return response
-            except (httpx.RequestError, httpx.HTTPStatusError) as e:
+            except httpx.RemoteProtocolError as e:
+                if not was_disconnected:
+                    # the server has closed the connection:
+                    # try again without decreasing the retries variable & with a new HTTP client
+                    was_disconnected = True
+                    await client.aclose()
+                    self._logger.warning('httpx.RemoteProtocolError: the server has disconnected, retrying')
+                    continue
                 if retries <= 0:
                     raise e
-            retries -= 1
-
-    def stream(self, method, url, **kwargs):
-        retries = self.retries
-        while retries >= 0:  # pragma: no cover
-            kwargs_clients = Network.get_kwargs_clients(kwargs)
-            client = self.get_client(**kwargs_clients)
-            try:
-                response = client.stream(method, url, **kwargs)
-                if self.is_valid_respones(response) or retries <= 0:
-                    return response
             except (httpx.RequestError, httpx.HTTPStatusError) as e:
                 if retries <= 0:
                     raise e
             retries -= 1
 
+    async def request(self, method, url, **kwargs):
+        return await self.call_client(False, method, url, **kwargs)
+
+    async def stream(self, method, url, **kwargs):
+        return await self.call_client(True, method, url, **kwargs)
+
     @classmethod
     async def aclose_all(cls):
         await asyncio.gather(*[network.aclose() for network in NETWORKS.values()], return_exceptions=False)
 
 
 def get_network(name=None):
-    global NETWORKS
-    return NETWORKS[name or DEFAULT_NAME]
+    return NETWORKS.get(name or DEFAULT_NAME)
+
+
+def check_network_configuration():
+    async def check():
+        exception_count = 0
+        for network in NETWORKS.values():
+            if network.using_tor_proxy:
+                try:
+                    await network.get_client()
+                except Exception:  # pylint: disable=broad-except
+                    network._logger.exception('Error')  # pylint: disable=protected-access
+                    exception_count += 1
+        return exception_count
+
+    future = asyncio.run_coroutine_threadsafe(check(), get_loop())
+    exception_count = future.result()
+    if exception_count > 0:
+        raise RuntimeError("Invalid network configuration")
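
Reviewer note: `check_network_configuration()` is called from synchronous start-up code, while every client lives on the dedicated asyncio loop started in `client.py`; `asyncio.run_coroutine_threadsafe` bridges the two, and `future.result()` blocks and re-raises in the caller. The same bridge pattern, self-contained:

    import asyncio
    import threading

    LOOP = asyncio.new_event_loop()
    threading.Thread(target=LOOP.run_forever, name='asyncio_loop', daemon=True).start()

    async def check():
        return 0  # stand-in for the per-network Tor checks

    # submit to the loop thread and block the calling (sync) thread on the result;
    # an exception raised inside check() propagates out of future.result()
    future = asyncio.run_coroutine_threadsafe(check(), LOOP)
    print(future.result())
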
raise RuntimeError("Invalid network configuration") def initialize(settings_engines=None, settings_outgoing=None): + # pylint: disable=import-outside-toplevel) from searx.engines import engines from searx import settings - global NETWORKS + # pylint: enable=import-outside-toplevel) - settings_engines = settings_engines or settings.get('engines') - settings_outgoing = settings_outgoing or settings.get('outgoing') + settings_engines = settings_engines or settings['engines'] + settings_outgoing = settings_outgoing or settings['outgoing'] - # default parameters for AsyncHTTPTransport - # see https://github.com/encode/httpx/blob/e05a5372eb6172287458b37447c30f650047e1b8/httpx/_transports/default.py#L108-L121 # noqa default_params = { 'enable_http': False, 'verify': True, - 'enable_http2': settings_outgoing.get('enable_http2', True), - # Magic number kept from previous code - 'max_connections': settings_outgoing.get('pool_connections', 100), - # Picked from constructor - 'max_keepalive_connections': settings_outgoing.get('pool_maxsize', 10), - # - 'keepalive_expiry': settings_outgoing.get('keepalive_expiry', 5.0), - 'local_addresses': settings_outgoing.get('source_ips'), - 'proxies': settings_outgoing.get('proxies'), - # default maximum redirect - # from https://github.com/psf/requests/blob/8c211a96cdbe9fe320d63d9e1ae15c5c07e179f8/requests/models.py#L55 - 'max_redirects': settings_outgoing.get('max_redirects', 30), - # - 'retries': settings_outgoing.get('retries', 3), + 'enable_http2': settings_outgoing['enable_http2'], + 'max_connections': settings_outgoing['pool_connections'], + 'max_keepalive_connections': settings_outgoing['pool_maxsize'], + 'keepalive_expiry': settings_outgoing['keepalive_expiry'], + 'local_addresses': settings_outgoing['source_ips'], + 'using_tor_proxy': settings_outgoing['using_tor_proxy'], + 'proxies': settings_outgoing['proxies'], + 'max_redirects': settings_outgoing['max_redirects'], + 'retries': settings_outgoing['retries'], 'retry_on_http_error': None, } - def new_network(params): + def new_network(params, logger_name=None): nonlocal default_params result = {} result.update(default_params) result.update(params) + if logger_name: + result['logger_name'] = logger_name return Network(**result) def iter_networks(): @@ -253,13 +342,13 @@ def initialize(settings_engines=None, settings_outgoing=None): if NETWORKS: done() NETWORKS.clear() - NETWORKS[DEFAULT_NAME] = new_network({}) - NETWORKS['ipv4'] = new_network({'local_addresses': '0.0.0.0'}) - NETWORKS['ipv6'] = new_network({'local_addresses': '::'}) + NETWORKS[DEFAULT_NAME] = new_network({}, logger_name='default') + NETWORKS['ipv4'] = new_network({'local_addresses': '0.0.0.0'}, logger_name='ipv4') + NETWORKS['ipv6'] = new_network({'local_addresses': '::'}, logger_name='ipv6') # define networks from outgoing.networks - for network_name, network in settings_outgoing.get('networks', {}).items(): - NETWORKS[network_name] = new_network(network) + for network_name, network in settings_outgoing['networks'].items(): + NETWORKS[network_name] = new_network(network, logger_name=network_name) # define networks from engines.[i].network (except references) for engine_name, engine, network in iter_networks(): @@ -270,15 +359,23 @@ def initialize(settings_engines=None, settings_outgoing=None): network[attribute_name] = getattr(engine, attribute_name) else: network[attribute_name] = attribute_value - NETWORKS[engine_name] = new_network(network) + NETWORKS[engine_name] = new_network(network, logger_name=engine_name) elif 
@@ -270,15 +359,23 @@ def initialize(settings_engines=None, settings_outgoing=None):
                     network[attribute_name] = getattr(engine, attribute_name)
                 else:
                     network[attribute_name] = attribute_value
-            NETWORKS[engine_name] = new_network(network)
+            NETWORKS[engine_name] = new_network(network, logger_name=engine_name)
         elif isinstance(network, dict):
-            NETWORKS[engine_name] = new_network(network)
+            NETWORKS[engine_name] = new_network(network, logger_name=engine_name)
 
     # define networks from engines.[i].network (references)
     for engine_name, engine, network in iter_networks():
         if isinstance(network, str):
             NETWORKS[engine_name] = NETWORKS[network]
 
+    # the /image_proxy endpoint has a dedicated network.
+    # same parameters as the default network, but HTTP/2 is disabled.
+    # It decreases the CPU load average, and the total time is more or less the same.
+    if 'image_proxy' not in NETWORKS:
+        image_proxy_params = default_params.copy()
+        image_proxy_params['enable_http2'] = False
+        NETWORKS['image_proxy'] = new_network(image_proxy_params, logger_name='image_proxy')
+
 
 @atexit.register
 def done():
diff --git a/searx/settings.yml b/searx/settings.yml
index 92be3fbeb1af5018e0af33419adc2886456fe0e8..a869bba4fd5950025079ebebe41b44944674b1e9 100644
--- a/searx/settings.yml
+++ b/searx/settings.yml
@@ -75,6 +75,13 @@ outgoing: # communication with search engines
   pool_connections : 100 # The maximum number of concurrent connections that may be established.
   pool_maxsize : 20 # Allow the connection pool to maintain keep-alive connections below this point.
   enable_http2: True # See https://www.python-httpx.org/http2/
+  keepalive_expiry: 5.0
+  source_ips: []
+  using_tor_proxy: False
+  proxies: {}
+  max_redirects: 30
+  retries: 0
+  networks: {}
 # uncomment below section if you want to use a proxy
 # see https://2.python-requests.org/en/latest/user/advanced/#proxies
 # SOCKS proxies are also supported: see https://2.python-requests.org/en/latest/user/advanced/#socks
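
Reviewer note: the webapp changes below replace the hand-rolled `h != new_hmac(...)` comparison with `is_hmac_of()` (added in `searx/webutils.py` further down). The point of the helper is the constant-time comparison; a sketch mirroring the new implementation:

    import hashlib
    import hmac

    def new_hmac(secret_key: str, data: bytes) -> str:
        return hmac.new(secret_key.encode(), data, hashlib.sha256).hexdigest()

    def is_hmac_of(secret_key: str, data: bytes, mac: str) -> bool:
        expected = new_hmac(secret_key, data)
        # compare_digest runs in time independent of where the strings differ,
        # so the check does not leak how much of an attacker's mac was correct
        return len(expected) == len(mac) and hmac.compare_digest(expected, mac)
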
diff --git a/searx/webapp.py b/searx/webapp.py
index 9d3799191d26d993509122c73c120d34b8c6e27f..7a5503886f7c4b070dd21355efb60b0893930fbe 100755
--- a/searx/webapp.py
+++ b/searx/webapp.py
@@ -79,7 +79,7 @@ from searx.engines import (
 )
 from searx.webutils import (
     UnicodeWriter, get_themes, highlight_content, get_resources_directory,
-    get_static_files, get_result_templates, get_themes_folder_name,
+    get_static_files, get_result_templates, get_themes_folder_name, is_hmac_of,
     prettify_url, new_hmac, is_flask_run_cmdline
 )
 from searx.webadapter import get_search_query_from_webapp, get_selected_categories
@@ -94,7 +94,7 @@ from searx.plugins import plugins
 from searx.plugins.oa_doi_rewrite import get_doi_resolver
 from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
 from searx.answerers import answerers
-from searx.network import stream as http_stream
+from searx.network import set_context_network_name, stream as http_stream
 from searx.answerers import ask
 from searx.metrology.error_recorder import errors_per_engines
 from searx.settings_loader import get_default_settings_path
@@ -936,61 +936,71 @@ def _is_selected_language_supported(engine, preferences):
 
 @app.route('/image_proxy', methods=['GET'])
 def image_proxy():
-    url = request.args.get('url')
+    # pylint: disable=too-many-return-statements, too-many-branches
 
+    url = request.args.get('url')
     if not url:
         return '', 400
 
-    h = new_hmac(settings['server']['secret_key'], url.encode())
-
-    if h != request.args.get('h'):
+    if not is_hmac_of(settings['server']['secret_key'], url.encode(), request.args.get('h', '')):
         return '', 400
 
     maximum_size = 5 * 1024 * 1024
-
+    forward_resp = False
+    resp = None
     try:
-        headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
-        headers['User-Agent'] = gen_useragent()
-        stream = http_stream(
-            method='GET',
-            url=url,
-            headers=headers,
-            timeout=settings['outgoing']['request_timeout'],
-            allow_redirects=True,
-            max_redirects=20)
-
-        resp = next(stream)
+        request_headers = {
+            'User-Agent': gen_useragent(),
+            'Accept': 'image/webp,*/*',
+            'Accept-Encoding': 'gzip, deflate',
+            'Sec-GPC': '1',
+            'DNT': '1',
+        }
+        set_context_network_name('image_proxy')
+        resp, stream = http_stream(method='GET', url=url, headers=request_headers)
+
         content_length = resp.headers.get('Content-Length')
         if content_length and content_length.isdigit() and int(content_length) > maximum_size:
             return 'Max size', 400
 
         if resp.status_code == 304:
             return '', resp.status_code
 
         if resp.status_code != 200:
-            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
+            logger.debug('image-proxy: wrong response code: %i', resp.status_code)
             if resp.status_code >= 400:
                 return '', resp.status_code
             return '', 400
 
-        if not resp.headers.get('content-type', '').startswith('image/'):
-            logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
+        if not resp.headers.get('Content-Type', '').startswith('image/'):
+            logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
             return '', 400
 
-        headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
-
-        total_length = 0
-
-        def forward_chunk():
-            nonlocal total_length
-            for chunk in stream:
-                total_length += len(chunk)
-                if total_length > maximum_size:
-                    break
-                yield chunk
+        forward_resp = True
+    except httpx.HTTPError:
+        logger.exception('HTTP error')
+        return '', 400
+    finally:
+        if resp and not forward_resp:
+            # the code is about to return an HTTP 400 error to the browser
+            # we make sure to close the response between searxng and the HTTP server
+            try:
+                resp.close()
+            except httpx.HTTPError:
+                logger.exception('HTTP error on closing')
+
+    def close_stream():
+        nonlocal resp, stream
+        try:
+            resp.close()
+            del resp
+            del stream
+        except httpx.HTTPError as e:
+            logger.debug('Exception while closing response: %s', e)
 
-        return Response(forward_chunk(), mimetype=resp.headers['Content-Type'], headers=headers)
+    try:
+        headers = dict_subset(resp.headers, {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'})
+        response = Response(stream, mimetype=resp.headers['Content-Type'], headers=headers, direct_passthrough=True)
+        response.call_on_close(close_stream)
+        return response
     except httpx.HTTPError:
+        close_stream()
         return '', 400
diff --git a/searx/webutils.py b/searx/webutils.py
index 273fd9ca6a287bc123a6ad42bf7619eb00b66c94..0cf814909f8fc052cedeacd07c11578d7ca82a76 100644
--- a/searx/webutils.py
+++ b/searx/webutils.py
@@ -99,14 +99,12 @@ def get_result_templates(templates_path):
 
 
 def new_hmac(secret_key, url):
-    try:
-        secret_key_bytes = bytes(secret_key, 'utf-8')
-    except TypeError as err:
-        if isinstance(secret_key, bytes):
-            secret_key_bytes = secret_key
-        else:
-            raise err
-    return hmac.new(secret_key_bytes, url, hashlib.sha256).hexdigest()
+    return hmac.new(secret_key.encode(), url, hashlib.sha256).hexdigest()
+
+
+def is_hmac_of(secret_key, value, hmac_to_check):
+    hmac_of_value = new_hmac(secret_key, value)
+    return len(hmac_of_value) == len(hmac_to_check) and hmac.compare_digest(hmac_of_value, hmac_to_check)
 
 
 def prettify_url(url, max_length=74):
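
Reviewer note: `image_proxy()` now hands the upstream byte iterator straight to Werkzeug (`direct_passthrough=True` skips response buffering) and defers all cleanup to `call_on_close`. The skeleton of that forwarding pattern, with a plain sync httpx client and no error handling (`forward` is a hypothetical helper, not searx code):

    import httpx
    from flask import Response

    def forward(url):
        client = httpx.Client()
        upstream = client.send(client.build_request('GET', url), stream=True)

        def cleanup():
            # runs once the browser has received (or aborted) the body
            upstream.close()
            client.close()

        response = Response(
            upstream.iter_bytes(),
            mimetype=upstream.headers['Content-Type'],
            direct_passthrough=True,
        )
        response.call_on_close(cleanup)
        return response
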
diff --git a/tests/unit/network/test_network.py b/tests/unit/network/test_network.py
index e8d33240a2710c1ac2ed07c999850015d1a4d9dc..c0f0ab97517ad5c7d23a073ee2295d4446f69c8b 100644
--- a/tests/unit/network/test_network.py
+++ b/tests/unit/network/test_network.py
@@ -9,7 +9,6 @@ from searx.testing import SearxTestCase
 
 
 class TestNetwork(SearxTestCase):
-
     def setUp(self):
         initialize()
 
@@ -51,23 +50,23 @@ class TestNetwork(SearxTestCase):
         network = Network(proxies='http://localhost:1337')
         self.assertEqual(next(network._proxies_cycle), (('all://', 'http://localhost:1337'),))
 
-        network = Network(proxies={
-            'https': 'http://localhost:1337',
-            'http': 'http://localhost:1338'
-        })
-        self.assertEqual(next(network._proxies_cycle),
-                         (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')))
-        self.assertEqual(next(network._proxies_cycle),
-                         (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')))
-
-        network = Network(proxies={
-            'https': ['http://localhost:1337', 'http://localhost:1339'],
-            'http': 'http://localhost:1338'
-        })
-        self.assertEqual(next(network._proxies_cycle),
-                         (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338')))
-        self.assertEqual(next(network._proxies_cycle),
-                         (('https://', 'http://localhost:1339'), ('http://', 'http://localhost:1338')))
+        network = Network(proxies={'https': 'http://localhost:1337', 'http': 'http://localhost:1338'})
+        self.assertEqual(
+            next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))
+        )
+        self.assertEqual(
+            next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))
+        )
+
+        network = Network(
+            proxies={'https': ['http://localhost:1337', 'http://localhost:1339'], 'http': 'http://localhost:1338'}
+        )
+        self.assertEqual(
+            next(network._proxies_cycle), (('https://', 'http://localhost:1337'), ('http://', 'http://localhost:1338'))
+        )
+        self.assertEqual(
+            next(network._proxies_cycle), (('https://', 'http://localhost:1339'), ('http://', 'http://localhost:1338'))
+        )
 
         with self.assertRaises(ValueError):
             Network(proxies=1)
@@ -77,25 +76,27 @@ class TestNetwork(SearxTestCase):
             'verify': True,
             'max_redirects': 5,
             'timeout': 2,
+            'allow_redirects': True,
         }
-        kwargs_client = Network.get_kwargs_clients(kwargs)
+        kwargs_client = Network.extract_kwargs_clients(kwargs)
 
         self.assertEqual(len(kwargs_client), 2)
-        self.assertEqual(len(kwargs), 1)
+        self.assertEqual(len(kwargs), 2)
 
         self.assertEqual(kwargs['timeout'], 2)
+        self.assertEqual(kwargs['follow_redirects'], True)
 
         self.assertTrue(kwargs_client['verify'])
         self.assertEqual(kwargs_client['max_redirects'], 5)
 
     async def test_get_client(self):
         network = Network(verify=True)
-        client1 = network.get_client()
-        client2 = network.get_client(verify=True)
-        client3 = network.get_client(max_redirects=10)
-        client4 = network.get_client(verify=True)
-        client5 = network.get_client(verify=False)
-        client6 = network.get_client(max_redirects=10)
+        client1 = await network.get_client()
+        client2 = await network.get_client(verify=True)
+        client3 = await network.get_client(max_redirects=10)
+        client4 = await network.get_client(verify=True)
+        client5 = await network.get_client(verify=False)
+        client6 = await network.get_client(max_redirects=10)
 
         self.assertEqual(client1, client2)
         self.assertEqual(client1, client4)
@@ -107,7 +108,7 @@ class TestNetwork(SearxTestCase):
 
     async def test_aclose(self):
         network = Network(verify=True)
-        network.get_client()
+        await network.get_client()
         await network.aclose()
 
     async def test_request(self):
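
Reviewer note: the `+ return get_response` / `+ return stream` lines below fix the test fixtures: the factory classmethods built a fail-once closure but fell off the end, so they returned None and `patch.object(...)` patched the client method with None. The closure pattern in isolation (`fail_once_then` is a made-up name):

    def fail_once_then(value):
        first = True

        async def fake_request(*args, **kwargs):
            nonlocal first
            if first:
                first = False
                raise RuntimeError('transient failure')  # first call fails
            return value                                  # retried calls succeed

        return fake_request  # this return is what the old fixtures forgot
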
@@ -134,6 +135,7 @@ class TestNetworkRequestRetries(SearxTestCase):
                 first = False
                 return httpx.Response(status_code=403, text=TestNetworkRequestRetries.TEXT)
             return httpx.Response(status_code=200, text=TestNetworkRequestRetries.TEXT)
+        return get_response
 
     async def test_retries_ok(self):
 
@@ -206,12 +208,13 @@ class TestNetworkStreamRetries(SearxTestCase):
                 first = False
                 raise httpx.RequestError('fake exception', request=None)
             return httpx.Response(status_code=200, text=TestNetworkStreamRetries.TEXT)
+        return stream
 
     async def test_retries_ok(self):
         with patch.object(httpx.AsyncClient, 'stream', new=TestNetworkStreamRetries.get_response_exception_then_200()):
             network = Network(enable_http=True, retries=1, retry_on_http_error=403)
-            response = network.stream('GET', 'https://example.com/')
+            response = await network.stream('GET', 'https://example.com/')
             self.assertEqual(response.text, TestNetworkStreamRetries.TEXT)
             await network.aclose()
 
@@ -219,7 +222,7 @@ class TestNetworkStreamRetries(SearxTestCase):
         with patch.object(httpx.AsyncClient, 'stream', new=TestNetworkStreamRetries.get_response_exception_then_200()):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
             with self.assertRaises(httpx.RequestError):
-                network.stream('GET', 'https://example.com/')
+                await network.stream('GET', 'https://example.com/')
             await network.aclose()
 
     async def test_retries_exception(self):
@@ -234,6 +237,6 @@ class TestNetworkStreamRetries(SearxTestCase):
 
         with patch.object(httpx.AsyncClient, 'stream', new=stream):
             network = Network(enable_http=True, retries=0, retry_on_http_error=403)
-            response = network.stream('GET', 'https://example.com/')
+            response = await network.stream('GET', 'https://example.com/')
             self.assertEqual(response.status_code, 403)
             await network.aclose()
diff --git a/tests/unit/test_webutils.py b/tests/unit/test_webutils.py
index 023374b04843884ecf3aacd5949e43b2a874a1d5..65b69d94e8b5bffab5c64d790c84d0461ca1f111 100644
--- a/tests/unit/test_webutils.py
+++ b/tests/unit/test_webutils.py
@@ -74,14 +74,13 @@ class TestUnicodeWriter(SearxTestCase):
 
 
 class TestNewHmac(SearxTestCase):
-
     def test_bytes(self):
-        for secret_key in ['secret', b'secret', 1]:
-            if secret_key == 1:
-                with self.assertRaises(TypeError):
-                    webutils.new_hmac(secret_key, b'http://example.com')
-                continue
-            res = webutils.new_hmac(secret_key, b'http://example.com')
-            self.assertEqual(
-                res,
-                '23e2baa2404012a5cc8e4a18b4aabf0dde4cb9b56f679ddc0fd6d7c24339d819')
+        data = b'http://example.com'
+        with self.assertRaises(AttributeError):
+            webutils.new_hmac(b'secret', data)
+
+        with self.assertRaises(AttributeError):
+            webutils.new_hmac(1, data)
+
+        res = webutils.new_hmac('secret', data)
+        self.assertEqual(res, '23e2baa2404012a5cc8e4a18b4aabf0dde4cb9b56f679ddc0fd6d7c24339d819')