Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e9afc4f8 authored by Markus Heiser's avatar Markus Heiser
Browse files

[mod] Startpage: reverse engineered & upgraded to data_type: traits_v1



One reason for the frequently seen CAPTCHA on Startpage requests is the
incomplete requests SearXNG sends to startpage.com: this patch is a completely
new implementation of the ``request()`` function, reverse engineered from
Startpage's search form.  The new implementation:

- uses traits of data_type: traits_v1 and drops the deprecated data_type: supported_languages
- adds time-range support
- adds safe-search support
- fixes searxng/searxng/issues 1884
- fixes searxng/searxng/issues 1081 --> improvements to avoid CAPTCHA

In preparation for more categories (News, Images, Videos ..) from Startpage, the
variable ``startpage_categ`` was set up.  The default value is ``web`` and other
categories from Startpage are not yet implemented.

Signed-off-by: default avatarMarkus Heiser <markus.heiser@darmarit.de>
parent 858aa3e6
Loading
Loading
Loading
Loading
+1 −6
Original line number Diff line number Diff line
@@ -10,9 +10,4 @@ Startpage engines
   :backlinks: entry

.. automodule:: searx.engines.startpage

Functions
=========

.. autofunction:: searx.engines.startpage.fetch_traits
.. autofunction:: searx.engines.startpage.get_sc_code
   :members:
+3 −3
Original line number Diff line number Diff line
@@ -109,9 +109,9 @@ def seznam(query, _lang):
    ]


def startpage(query, lang):
    # startpage autocompleter
    lui = engines['startpage'].supported_languages.get(lang, 'english')  # vintage / deprecated
def startpage(query, sxng_locale):
    """Autocomplete from Startpage. Supports Startpage's languages"""
    lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
    url = 'https://startpage.com/suggestions?{query}'
    resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
    data = resp.json()
+2 −252
Original line number Diff line number Diff line
@@ -3078,7 +3078,7 @@
  "startpage": {
    "all_locale": null,
    "custom": {},
    "data_type": "supported_languages",
    "data_type": "traits_v1",
    "languages": {
      "af": "afrikaans",
      "am": "amharic",
@@ -3213,257 +3213,7 @@
      "zh-HK": "zh-TW_HK",
      "zh-TW": "zh-TW_TW"
    },
    "supported_languages": {
      "af": {
        "alias": "afrikaans"
      },
      "am": {
        "alias": "amharic"
      },
      "ar": {
        "alias": "arabic"
      },
      "az": {
        "alias": "azerbaijani"
      },
      "be": {
        "alias": "belarusian"
      },
      "bg": {
        "alias": "bulgarian"
      },
      "bn": {
        "alias": "bengali"
      },
      "bs": {
        "alias": "bosnian"
      },
      "ca": {
        "alias": "catalan"
      },
      "cs": {
        "alias": "czech"
      },
      "cy": {
        "alias": "welsh"
      },
      "da": {
        "alias": "dansk"
      },
      "de": {
        "alias": "deutsch"
      },
      "el": {
        "alias": "greek"
      },
      "en": {
        "alias": "english"
      },
      "en-GB": {
        "alias": "english_uk"
      },
      "eo": {
        "alias": "esperanto"
      },
      "es": {
        "alias": "espanol"
      },
      "et": {
        "alias": "estonian"
      },
      "eu": {
        "alias": "basque"
      },
      "fa": {
        "alias": "persian"
      },
      "fi": {
        "alias": "suomi"
      },
      "fo": {
        "alias": "faroese"
      },
      "fr": {
        "alias": "francais"
      },
      "fy": {
        "alias": "frisian"
      },
      "ga": {
        "alias": "irish"
      },
      "gd": {
        "alias": "gaelic"
      },
      "gl": {
        "alias": "galician"
      },
      "gu": {
        "alias": "gujarati"
      },
      "he": {
        "alias": "hebrew"
      },
      "hi": {
        "alias": "hindi"
      },
      "hr": {
        "alias": "croatian"
      },
      "hu": {
        "alias": "hungarian"
      },
      "ia": {
        "alias": "interlingua"
      },
      "id": {
        "alias": "indonesian"
      },
      "is": {
        "alias": "icelandic"
      },
      "it": {
        "alias": "italiano"
      },
      "ja": {
        "alias": "nihongo"
      },
      "jv": {
        "alias": "javanese"
      },
      "ka": {
        "alias": "georgian"
      },
      "kn": {
        "alias": "kannada"
      },
      "ko": {
        "alias": "hangul"
      },
      "la": {
        "alias": "latin"
      },
      "lt": {
        "alias": "lithuanian"
      },
      "lv": {
        "alias": "latvian"
      },
      "mai": {
        "alias": "bihari"
      },
      "mk": {
        "alias": "macedonian"
      },
      "ml": {
        "alias": "malayalam"
      },
      "mr": {
        "alias": "marathi"
      },
      "ms": {
        "alias": "malay"
      },
      "mt": {
        "alias": "maltese"
      },
      "ne": {
        "alias": "nepali"
      },
      "nl": {
        "alias": "nederlands"
      },
      "no": {
        "alias": "norsk"
      },
      "oc": {
        "alias": "occitan"
      },
      "pa": {
        "alias": "punjabi"
      },
      "pl": {
        "alias": "polski"
      },
      "pt": {
        "alias": "portugues"
      },
      "ro": {
        "alias": "romanian"
      },
      "ru": {
        "alias": "russian"
      },
      "si": {
        "alias": "sinhalese"
      },
      "sk": {
        "alias": "slovak"
      },
      "sl": {
        "alias": "slovenian"
      },
      "sq": {
        "alias": "albanian"
      },
      "sr": {
        "alias": "serbian"
      },
      "su": {
        "alias": "sudanese"
      },
      "sv": {
        "alias": "svenska"
      },
      "sw": {
        "alias": "swahili"
      },
      "ta": {
        "alias": "tamil"
      },
      "te": {
        "alias": "telugu"
      },
      "th": {
        "alias": "thai"
      },
      "ti": {
        "alias": "tigrinya"
      },
      "tl": {
        "alias": "tagalog"
      },
      "tr": {
        "alias": "turkce"
      },
      "uk": {
        "alias": "ukrainian"
      },
      "ur": {
        "alias": "urdu"
      },
      "uz": {
        "alias": "uzbek"
      },
      "vi": {
        "alias": "vietnamese"
      },
      "xh": {
        "alias": "xhosa"
      },
      "zh": {
        "alias": "jiantizhongwen"
      },
      "zh-HK": {
        "alias": "fantizhengwen"
      },
      "zh-TW": {
        "alias": "fantizhengwen"
      },
      "zu": {
        "alias": "zulu"
      }
    }
    "supported_languages": {}
  },
  "wikidata": {
    "all_locale": null,
+213 −150
Original line number Diff line number Diff line
@@ -50,38 +50,58 @@ W3C recommends subtag over macrolanguage [2]_.
Startpage languages
===================

The displayed name in Startpage's settings page depend on the location of the IP
when the 'Accept-Language' HTTP header is unset (in the language update script
we use "en-US,en;q=0.5" to get uniform names independent from the IP).
:py:obj:`send_accept_language_header`:
  The displayed name in Startpage's settings page depends on the location of
  the IP when the ``Accept-Language`` HTTP header is unset.  In
  :py:obj:`fetch_traits` we use::

Each option has a displayed name and a value, either of which may represent the
language name in the native script, the language name in English, an English
transliteration of the native name, the English name of the writing script used
by the language, or occasionally something else entirely.
    'Accept-Language': "en-US,en;q=0.5",
    ..

  to get uniform names independent from the IP.

.. _startpage categories:

Startpage categories
====================

Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in  settings.yml::

  - name: startpage
    engine: startpage
    startpage_categ: web
    ...

.. hint::

   The default category is ``web`` .. and other categories than ``web`` are not
   yet implemented.

"""

from typing import TYPE_CHECKING
from collections import OrderedDict
import re
from time import time

from urllib.parse import urlencode
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta

from dateutil import parser
from lxml import html
from babel import Locale
from babel.localedata import locale_identifiers
import dateutil.parser
import lxml.html
import babel

from searx import network
from searx.utils import extract_text, eval_xpath, match_language
from searx.exceptions import (
    SearxEngineResponseException,
    SearxEngineCaptchaException,
)

from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.exceptions import SearxEngineCaptchaException
from searx.locales import region_tag
from searx.enginelib.traits import EngineTraits

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

# about
@@ -94,18 +114,28 @@ about = {
    "results": 'HTML',
}

startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess the user's language and territory from the HTTP
``Accept-Language`` header.  Optionally the user can select a search language
(which can differ from the UI language) and a region filter.
"""

# engine dependent config
categories = ['general', 'web']
# there is a mechanism to block "bot" search
# (probably the parameter qid), require
# storing of qid's between mulitble search-calls

paging = True
supported_languages_url = 'https://www.startpage.com/do/settings'
time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}

# search-url
base_url = 'https://startpage.com/'
search_url = base_url + 'sp/search?'
base_url = 'https://www.startpage.com'
search_url = base_url + '/sp/search'

# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
@@ -113,92 +143,193 @@ search_url = base_url + 'sp/search?'
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's original search form

.. code:: html

    <form action="/sp/search" method="post">
      <input type="text" name="query"  value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""

# timestamp of the last fetch of 'sc' code
sc_code_ts = 0
sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""


def raise_captcha(resp):
def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).

    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
        raise SearxEngineCaptchaException()
    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
    <search_form_xpath>`.  Without this argument Startpage considers the request
    is from a bot.  We do not know what is encoded in the value of the ``sc``
    argument, but it seems to be a kind of a *time-stamp*.

    Startpage's search form generates a new sc-code on each request.  This
    function scrapes a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.

def get_sc_code(headers):
    """Get an actual ``sc`` argument from Startpage's home page.
    """

    Startpage puts a ``sc`` argument on every link.  Without this argument
    Startpage considers the request is from a bot.  We do not know what is
    encoded in the value of the ``sc`` argument, but it seems to be a kind of a
    *time-stamp*.  This *time-stamp* is valid for a few hours.
    global sc_code_ts, sc_code  # pylint: disable=global-statement

    This function scrap a new *time-stamp* from startpage's home page every hour
    (3000 sec).
    if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
        logger.debug("get_sc_code: reuse '%s'", sc_code)
        return sc_code

    """
    headers = {**params['headers']}
    headers['Origin'] = base_url
    headers['Referer'] = base_url + '/'
    # headers['Connection'] = 'keep-alive'
    # headers['Accept-Encoding'] = 'gzip, deflate, br'
    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang

    global sc_code_ts, sc_code  # pylint: disable=global-statement
    get_sc_url = base_url + '/?sc=%s' % (sc_code)
    logger.debug("query new sc time-stamp ... %s", get_sc_url)
    logger.debug("headers: %s", headers)
    resp = network.get(get_sc_url, headers=headers)

    if time() > (sc_code_ts + 3000):
        logger.debug("query new sc time-stamp ...")
    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21

        resp = network.get(base_url, headers=headers)
        raise_captcha(resp)
        dom = html.fromstring(resp.text)
    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
        raise SearxEngineCaptchaException(
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)

    try:
            # <input type="hidden" name="sc" value="...">
            sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
            # suspend startpage API --> https://github.com/searxng/searxng/pull/695
            raise SearxEngineResponseException(
                suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
        raise SearxEngineCaptchaException(
            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
        ) from exc

    sc_code_ts = time()
        logger.debug("new value is: %s", sc_code)

    logger.debug("get_sc_code: new value is: %s", sc_code)
    return sc_code


# do search-request
def request(query, params):
    """Assemble a Startpage request.

    To avoid CAPTCHA we need to send a well formed HTTP POST request with a
    cookie.  We need to form a request that is identical to the request built by
    Startpage's search form:

    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally the arguments from Startpage's search form need to be set in
    the HTTP POST data / compare ``<input>`` elements:
    :py:obj:`search_form_xpath`.

    Only the ``web`` category is dispatched (to ``_request_cat_web``).  For any
    other value of ``startpage_categ`` an error is logged and ``params`` is
    returned unmodified.

    :param query: the search term
    :param params: the request parameters to complete
    :return: the ``params`` object
    """
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    # The placeholder has to be '%s': a bare '%' is not a valid conversion and
    # makes the formatting of the log record fail.
    logger.error("Startpages's category '%s' is not yet implemented.", startpage_categ)
    return params


def _request_cat_web(query, params):
    """Assemble the HTTP POST request for Startpage's ``web`` category.

    The search arguments are sent as POST data, the user preferences (safe
    search, language, region, ..) are sent in the ``preferences`` cookie.

    :param query: the search term
    :param params: request parameters; ``searxng_locale``, ``time_range``,
        ``pageno``, ``safesearch``, ``cookies`` and ``headers`` are read,
        ``data``, ``method``, ``url``, the ``preferences`` cookie and the
        ``Origin`` / ``Referer`` headers are set
    :return: the completed ``params`` object
    """
    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
        'query': query,
        'cat': 'web',
        't': 'device',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers
        'with_date': time_range_dict.get(params['time_range'], ''),
    }

    # set language if specified
    if engine_language:
        args['language'] = engine_language
        args['lui'] = engine_language

    args['abp'] = '1'
    # the page argument is only sent for follow-up pages
    if params['pageno'] > 1:
        args['page'] = params['pageno']

    # build cookie
    lang_homepage = 'en'
    cookie = OrderedDict()
    cookie['date_time'] = 'world'
    cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
    cookie['disable_open_in_new_window'] = '0'
    cookie['enable_post_method'] = '1'  # hint: POST
    cookie['enable_proxy_safety_suggest'] = '1'
    cookie['enable_stay_control'] = '1'
    cookie['instant_answers'] = '1'
    cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
    cookie['num_of_results'] = '10'
    cookie['suggestions'] = '1'
    cookie['wt_unit'] = 'celsius'

    if engine_language:
        cookie['language'] = engine_language
        cookie['language_ui'] = engine_language

    if engine_region:
        cookie['search_results_region'] = engine_region

    # key/value pairs are joined by 'EEE', the items by 'N1N'
    params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    # POST request: the arguments travel in the body, the URL carries no query
    # string
    logger.debug("data: %s", args)
    params['data'] = args
    params['method'] = 'POST'
    params['url'] = search_url
    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'
    # is the Accept header needed?
    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'

    return params


# get response from search-request
def response(resp):
    """Parse Startpage's HTML response.

    The response text is parsed into a DOM once.  Only the ``web`` category is
    dispatched (to ``_response_cat_web``); for any other value of
    ``startpage_categ`` an error is logged and an empty list is returned.

    :param resp: the HTTP response, its ``text`` attribute is parsed as HTML
    :return: list of results
    """
    dom = lxml.html.fromstring(resp.text)

    if startpage_categ == 'web':
        return _response_cat_web(dom)

    # The placeholder has to be '%s': a bare '%' is not a valid conversion and
    # makes the formatting of the log record fail.
    logger.error("Startpages's category '%s' is not yet implemented.", startpage_categ)
    return []


def _response_cat_web(dom):
    results = []

    # parse results
    for result in eval_xpath(dom, results_xpath):
@@ -233,7 +364,7 @@ def response(resp):
            content = content[date_pos:]

            try:
                published_date = parser.parse(date_string, dayfirst=True)
                published_date = dateutil.parser.parse(date_string, dayfirst=True)
            except ValueError:
                pass

@@ -259,78 +390,10 @@ def response(resp):
    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Map the options of Startpage's language selector to language codes.

    Startpage's language selector is a mess: each option has a displayed name
    and a value, either of which may represent the language name in the native
    script, the language name in English, an English transliteration of the
    native name, the English name of the writing script used by the language,
    or occasionally something else entirely.

    :param resp: HTTP response of Startpage's settings page
    :return: dict mapping a language code to ``{'alias': <option value>}``
    """
    # These cases are so special that they need to be hard-coded, a couple of
    # them are misspellings.
    name_to_code = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su',
    }

    # English name of every language known by babel
    # pylint: disable=protected-access
    for code, english_name in Locale('en')._data['languages'].items():
        name_to_code[english_name.lower()] = code

    # native name of every language known by babel (identifiers containing an
    # underscore are skipped)
    for code in locale_identifiers():
        if '_' in code:
            continue
        native = Locale(code).get_language_name().lower()
        # the native name exactly as it is
        name_to_code[native] = code

        # "normalized" name (i.e. français becomes francais and español becomes
        # espanol); added only if the result is ASCII, otherwise the
        # "normalization" didn't work
        stripped = ''.join(char for char in normalize('NFKD', native) if not combining(char))
        if len(stripped.encode()) == len(stripped):
            name_to_code[stripped] = code

    dom = html.fromstring(resp.text)
    options = [
        (opt.get('value'), extract_text(opt).lower())
        for opt in dom.xpath('//form[@name="settings"]//select[@name="language"]/option')
    ]

    languages = {}
    for value, label in options:
        # the option's value takes precedence over its displayed text
        found = name_to_code.get(value) or name_to_code.get(label)
        if isinstance(found, list):
            for code in found:
                languages[code] = {'alias': value}
        elif isinstance(found, str):
            languages[found] = {'alias': value}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(value, label))

    return languages


def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    # pylint: disable=import-outside-toplevel, too-many-locals, too-many-branches
    # pylint: disable=too-many-statements

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.utils import gen_useragent
    from searx.locales import region_tag
    # pylint: disable=too-many-branches

    headers = {
        'User-Agent': gen_useragent(),
@@ -341,7 +404,7 @@ def fetch_traits(engine_traits: EngineTraits):
    if not resp.ok:
        print("ERROR: response from Startpage is not OK.")

    dom = html.fromstring(resp.text)
    dom = lxml.html.fromstring(resp.text)

    # regions