Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit d07c006a authored by Alexandre Flament's avatar Alexandre Flament
Browse files

Replace chompjs with pure Python code

The new implementation is good enough for the current usage (brave)
parent 8e45ac42
Loading
Loading
Loading
Loading
+0 −1
Original line number Original line Diff line number Diff line
@@ -17,4 +17,3 @@ markdown-it-py==3.0.0
typing_extensions==4.7.1
typing_extensions==4.7.1
fasttext-predict==0.9.2.1
fasttext-predict==0.9.2.1
pytomlpp==1.0.13
pytomlpp==1.0.13
chompjs==1.2.2
 No newline at end of file
+2 −2
Original line number Original line Diff line number Diff line
@@ -104,7 +104,6 @@ from urllib.parse import (
    parse_qs,
    parse_qs,
)
)


import chompjs
from lxml import html
from lxml import html


from searx import locales
from searx import locales
@@ -112,6 +111,7 @@ from searx.utils import (
    extract_text,
    extract_text,
    eval_xpath_list,
    eval_xpath_list,
    eval_xpath_getindex,
    eval_xpath_getindex,
    js_variable_to_python,
)
)
from searx.enginelib.traits import EngineTraits
from searx.enginelib.traits import EngineTraits


@@ -215,7 +215,7 @@ def response(resp):
            datastr = line.replace("const data = ", "").strip()[:-1]
            datastr = line.replace("const data = ", "").strip()[:-1]
            break
            break


    json_data = chompjs.parse_js_object(datastr)
    json_data = js_variable_to_python(datastr)
    json_resp = json_data[1]['data']['body']['response']
    json_resp = json_data[1]['data']['body']['response']


    if brave_category == 'news':
    if brave_category == 'news':
+73 −0
Original line number Original line Diff line number Diff line
@@ -7,6 +7,7 @@
import re
import re
import importlib
import importlib
import importlib.util
import importlib.util
import json
import types
import types


from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
from typing import Optional, Union, Any, Set, List, Dict, MutableMapping, Tuple, Callable
@@ -37,6 +38,9 @@ _BLOCKED_TAGS = ('script', 'style')
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE4_RE = re.compile(r'%u([0-9a-fA-F]{4})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)
_ECMA_UNESCAPE2_RE = re.compile(r'%([0-9a-fA-F]{2})', re.UNICODE)


# Matches an unquoted object key: one of `{`, a whitespace character or `,`
# (group 1), then a run of word characters (group 2) immediately followed by a
# colon (group 3).  js_variable_to_python uses it to wrap the key in double quotes.
_JS_QUOTE_KEYS_RE = re.compile(r'([\{\s,])(\w+)(:)')
# Matches the JavaScript expressions `void <digits>` and `void(<digits>)`.
# js_variable_to_python replaces them with the JSON literal `null`.
_JS_VOID_RE = re.compile(r'void\s+[0-9]+|void\s*\([0-9]+\)')

_STORAGE_UNIT_VALUE: Dict[str, int] = {
_STORAGE_UNIT_VALUE: Dict[str, int] = {
    'TB': 1024 * 1024 * 1024 * 1024,
    'TB': 1024 * 1024 * 1024 * 1024,
    'GB': 1024 * 1024 * 1024,
    'GB': 1024 * 1024 * 1024,
@@ -645,3 +649,72 @@ def detect_language(text: str, threshold: float = 0.3, only_search_languages: bo
            return None
            return None
        return language
        return language
    return None
    return None


def js_variable_to_python(js_variable):
    """Convert a JavaScript literal into JSON and return the loaded value.

    The text is rewritten into valid JSON and then parsed with ``json.loads``:

    - single-quoted strings become double-quoted strings,
    - unquoted object keys are wrapped in double quotes,
    - ``void 0`` / ``void(0)`` outside of strings become ``null``.

    It does not deal with all cases (for example ``\\x41`` escapes, comments or
    trailing commas), but it is good enough for now.
    chompjs has a better implementation.

    :param js_variable: source text of the JavaScript value (a ``str``).
    :return: the Python value produced by ``json.loads``.
    :raises json.JSONDecodeError: when the rewritten text is still not valid JSON.
    """
    # cut the string:
    # r"""{ a:"f\"irst", c:'sec"ond'}"""
    # becomes
    # ['{ a:', '"', 'f\\', '"', 'irst', '"', ', c:', "'", 'sec', '"', 'ond', "'", '}']
    # Because the pattern has a capturing group, the result always alternates
    # text (even index) and quote character (odd index).
    tokens = re.split(r'(["\'])', js_variable)
    converted = []
    # when in_string is not None, it contains the character that has opened the string
    # either simple quote or double quote
    in_string = None
    for index, token in enumerate(tokens):
        if index % 2 == 0:
            # plain text, either inside or outside a JS string
            if in_string:
                # replace the colon by a temporary character so the key quoting
                # regex doesn't have to deal with colons inside the JS strings
                converted.append(token.replace(':', chr(1)))
            else:
                # replace void 0 by null
                # https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/void
                # we are sure there is no string in token
                converted.append(_JS_VOID_RE.sub("null", token))
            continue

        if in_string is None:
            # start of a new string: JSON only supports the double quote
            converted.append('"')
            in_string = token
            continue

        # A quote character met inside a string.  It is escaped only when it is
        # preceded by an odd number of backslashes: an even run is made of
        # escaped backslashes and leaves the quote untouched.
        text_before = tokens[index - 1]
        backslashes = len(text_before) - len(text_before.rstrip('\\'))
        escaped = backslashes % 2 == 1

        if token == in_string and not escaped:
            # the current quote closes the string
            converted.append('"')
            in_string = None
        elif token == '"':
            # a double quote inside the string must be escaped in JSON;
            # do not add a second backslash when the JS source already has one
            converted.append('"' if escaped else r'\"')
        else:
            # a simple quote inside the string: JSON has no \' escape,
            # so drop the backslash when there is one
            if escaped:
                converted[-1] = converted[-1][:-1]
            converted.append("'")

    # join the string
    s = ''.join(converted)
    # add quotes around the keys
    # { a: 12 }
    # becomes
    # { "a": 12 }
    s = _JS_QUOTE_KEYS_RE.sub(r'\1"\2"\3', s)
    # replace the surrogate character by colon
    s = s.replace(chr(1), ':')
    # load the JSON and return the result
    return json.loads(s)