Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Unverified Commit ea38fea7 authored by Noémi Ványi's avatar Noémi Ványi Committed by GitHub
Browse files

Pick image_proxy changes from searxng (#2965)

* [mod] /image_proxy: don't decompress images

* [fix] image_proxy: always close the httpx response

previously, when the content type was not an image or some other error
occurred, the httpx response was not closed

* [mod] /image_proxy: use HTTP/1 instead of HTTP/2

httpx: HTTP/2 is slow when a lot of data is downloaded.
https://github.com/dalf/pyhttp-benchmark



also, the usage of HTTP/1 decreases the load average

* [mod] searx.utils.dict_subset: rewrite with comprehension

Co-authored-by: default avatarAlexandre Flament <alex@al-f.net>
parent ad7e00ad
Loading
Loading
Loading
Loading
+22 −1
Original line number Diff line number Diff line
@@ -5,6 +5,7 @@ import threading
import concurrent.futures
from time import time
from queue import SimpleQueue
from types import MethodType

import httpx
import h2.exceptions
@@ -134,15 +135,27 @@ async def stream_chunk_to_queue(network, q, method, url, **kwargs):
    try:
        async with await network.stream(method, url, **kwargs) as response:
            q.put(response)
            # aiter_raw: access the raw bytes on the response without applying any HTTP content decoding
            # https://www.python-httpx.org/quickstart/#streaming-responses
            async for chunk in response.aiter_bytes(65536):
                if len(chunk) > 0:
                    q.put(chunk)
    except httpx.ResponseClosed as e:
        # the response was closed
        pass
    except (httpx.HTTPError, OSError, h2.exceptions.ProtocolError) as e:
        q.put(e)
    finally:
        q.put(None)


def _close_response_method(self):
    """Synchronous ``close()`` substitute bound onto an httpx response.

    Schedules the response's async ``aclose()`` coroutine on the
    background event loop so a non-async caller can close it.  The
    future returned by ``run_coroutine_threadsafe`` is intentionally
    ignored (fire-and-forget).
    """
    target_loop = get_loop()
    asyncio.run_coroutine_threadsafe(self.aclose(), target_loop)


def stream(method, url, **kwargs):
    """Replace httpx.stream.

@@ -158,10 +171,18 @@ def stream(method, url, **kwargs):
    q = SimpleQueue()
    future = asyncio.run_coroutine_threadsafe(stream_chunk_to_queue(get_network(), q, method, url, **kwargs),
                                              get_loop())
    # yield response
    response = q.get()
    if isinstance(response, Exception):
        raise response
    response.close = MethodType(_close_response_method, response)
    yield response

    # yield chunks
    chunk_or_exception = q.get()
    while chunk_or_exception is not None:
        if isinstance(chunk_or_exception, Exception):
            raise chunk_or_exception
        yield chunk_or_exception
        chunk_or_exception = q.get()
    return future.result()
    future.result()
+8 −0
Original line number Diff line number Diff line
@@ -326,6 +326,14 @@ def initialize(settings_engines=None, settings_outgoing=None):
        if isinstance(network, str):
            NETWORKS[engine_name] = NETWORKS[network]

    # the /image_proxy endpoint has a dedicated network.
    # same parameters as the default network, but HTTP/2 is disabled.
    # It decreases the CPU load average, and the total time is more or less the same
    if 'image_proxy' not in NETWORKS:
        image_proxy_params = default_params.copy()
        image_proxy_params['enable_http2'] = False
        NETWORKS['image_proxy'] = new_network(image_proxy_params)


@atexit.register
def done():
+1 −5
Original line number Diff line number Diff line
@@ -272,11 +272,7 @@ def dict_subset(d, properties):
        >>> dict_subset({'A': 'a', 'B': 'b', 'C': 'c'}, ['A', 'D'])
        {'A': 'a'}
    """
    result = {}
    for k in properties:
        if k in d:
            result[k] = d[k]
    return result
    return {k: d[k] for k in properties if k in d}


def get_torrent_size(filesize, filesize_multiplier):
+41 −13
Original line number Diff line number Diff line
@@ -94,7 +94,7 @@ from searx.plugins import plugins
from searx.plugins.oa_doi_rewrite import get_doi_resolver
from searx.preferences import Preferences, ValidationException, LANGUAGE_CODES
from searx.answerers import answerers
from searx.network import stream as http_stream
from searx.network import stream as http_stream, set_context_network_name
from searx.answerers import ask
from searx.metrology.error_recorder import errors_per_engines
from searx.settings_loader import get_default_settings_path
@@ -921,6 +921,8 @@ def _is_selected_language_supported(engine, preferences):

@app.route('/image_proxy', methods=['GET'])
def image_proxy():
    # pylint: disable=too-many-return-statements, too-many-branches

    url = request.args.get('url')

    if not url:
@@ -932,14 +934,21 @@ def image_proxy():
        return '', 400

    maximum_size = 5 * 1024 * 1024

    forward_resp = False
    resp = None
    try:
        headers = dict_subset(request.headers, {'If-Modified-Since', 'If-None-Match'})
        headers['User-Agent'] = gen_useragent()
        request_headers = {
            'User-Agent': gen_useragent(),
            'Accept': 'image/webp,*/*',
            'Accept-Encoding': 'gzip, deflate',
            'Sec-GPC': '1',
            'DNT': '1',
        }
        set_context_network_name('image_proxy')
        stream = http_stream(
            method='GET',
            url=url,
            headers=headers,
            headers=request_headers,
            timeout=settings['outgoing']['request_timeout'],
            follow_redirects=True,
            max_redirects=20)
@@ -949,25 +958,37 @@ def image_proxy():
        if content_length and content_length.isdigit() and int(content_length) > maximum_size:
            return 'Max size', 400

        if resp.status_code == 304:
            return '', resp.status_code

        if resp.status_code != 200:
            logger.debug('image-proxy: wrong response code: {0}'.format(resp.status_code))
            if resp.status_code >= 400:
                return '', resp.status_code
            return '', 400

        if not resp.headers.get('content-type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: {0}'.format(resp.headers.get('content-type')))
        if not resp.headers.get('Content-Type', '').startswith('image/'):
            logger.debug('image-proxy: wrong content-type: %s', resp.headers.get('Content-Type', ''))
            return '', 400

        headers = dict_subset(resp.headers, {'Content-Length', 'Length', 'Date', 'Last-Modified', 'Expires', 'Etag'})
        forward_resp = True
    except httpx.HTTPError:
        logger.exception('HTTP error')
        return '', 400
    finally:
        if resp and not forward_resp:
            # the code is about to return an HTTP 400 error to the browser
            # we make sure to close the response between searxng and the HTTP server
            try:
                resp.close()
            except httpx.HTTPError:
                logger.exception('HTTP error on closing')

        total_length = 0
    try:
        headers = dict_subset(
            resp.headers,
            {'Content-Type', 'Content-Encoding', 'Content-Length', 'Length'}
        )

        def forward_chunk():
            nonlocal total_length
            total_length = 0
            for chunk in stream:
                total_length += len(chunk)
                if total_length > maximum_size:
@@ -1148,6 +1169,13 @@ def run():
    )


def patch_application(app):
    """Apply deployment patches to the Flask/WSGI application.

    * configure werkzeug's request handler to answer with the HTTP
      protocol version taken from the server settings
    * wrap the WSGI callable so the app works behind a reverse proxy
      and under a non-root URL prefix
    """
    http_version = settings['server']['http_protocol_version']
    # serve pages with the configured HTTP protocol version
    WSGIRequestHandler.protocol_version = "HTTP/{}".format(http_version)
    # patch app to handle non root url-s behind proxy & wsgi
    app.wsgi_app = ReverseProxyPathFix(ProxyFix(app.wsgi_app))


class ReverseProxyPathFix:
    '''Wrap the application in this middleware and configure the
    front-end server to add these headers, to let you quietly bind