Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit e9afc4f8 authored by Markus Heiser
Browse files

[mod] Startpage: reverse engineered & upgraded to data_type: traits_v1



One reason for the frequently seen CAPTCHAs on Startpage requests is the
incomplete requests SearXNG sends to startpage.com: this patch is a completely new
implementation of the ``request()`` function, reverse engineered from
Startpage's search form.  The new implementation:

- uses traits of data_type: traits_v1 and drops the deprecated data_type: supported_languages
- adds time-range support
- adds safe-search support
- fixes searxng/searxng/issues 1884
- fixes searxng/searxng/issues 1081 --> improvements to avoid CAPTCHA

In preparation for more categories (News, Images, Videos ..) from Startpage, the
variable ``startpage_categ`` was set up.  The default value is ``web`` and other
categories from Startpage are not yet implemented.

Signed-off-by: Markus Heiser <markus.heiser@darmarit.de>
parent 858aa3e6
Loading
Loading
Loading
Loading
+1 −6
Original line number Original line Diff line number Diff line
@@ -10,9 +10,4 @@ Startpage engines
   :backlinks: entry
   :backlinks: entry


.. automodule:: searx.engines.startpage
.. automodule:: searx.engines.startpage

   :members:
Functions
=========

.. autofunction:: searx.engines.startpage.fetch_traits
.. autofunction:: searx.engines.startpage.get_sc_code
+3 −3
Original line number Original line Diff line number Diff line
@@ -109,9 +109,9 @@ def seznam(query, _lang):
    ]
    ]




def startpage(query, lang):
def startpage(query, sxng_locale):
    # startpage autocompleter
    """Autocomplete from Startpage. Supports Startpage's languages"""
    lui = engines['startpage'].supported_languages.get(lang, 'english')  # vintage / deprecated
    lui = engines['startpage'].traits.get_language(sxng_locale, 'english')
    url = 'https://startpage.com/suggestions?{query}'
    url = 'https://startpage.com/suggestions?{query}'
    resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
    resp = get(url.format(query=urlencode({'q': query, 'segment': 'startpage.udog', 'lui': lui})))
    data = resp.json()
    data = resp.json()
+2 −252
Original line number Original line Diff line number Diff line
@@ -3078,7 +3078,7 @@
  "startpage": {
  "startpage": {
    "all_locale": null,
    "all_locale": null,
    "custom": {},
    "custom": {},
    "data_type": "supported_languages",
    "data_type": "traits_v1",
    "languages": {
    "languages": {
      "af": "afrikaans",
      "af": "afrikaans",
      "am": "amharic",
      "am": "amharic",
@@ -3213,257 +3213,7 @@
      "zh-HK": "zh-TW_HK",
      "zh-HK": "zh-TW_HK",
      "zh-TW": "zh-TW_TW"
      "zh-TW": "zh-TW_TW"
    },
    },
    "supported_languages": {
    "supported_languages": {}
      "af": {
        "alias": "afrikaans"
      },
      "am": {
        "alias": "amharic"
      },
      "ar": {
        "alias": "arabic"
      },
      "az": {
        "alias": "azerbaijani"
      },
      "be": {
        "alias": "belarusian"
      },
      "bg": {
        "alias": "bulgarian"
      },
      "bn": {
        "alias": "bengali"
      },
      "bs": {
        "alias": "bosnian"
      },
      "ca": {
        "alias": "catalan"
      },
      "cs": {
        "alias": "czech"
      },
      "cy": {
        "alias": "welsh"
      },
      "da": {
        "alias": "dansk"
      },
      "de": {
        "alias": "deutsch"
      },
      "el": {
        "alias": "greek"
      },
      "en": {
        "alias": "english"
      },
      "en-GB": {
        "alias": "english_uk"
      },
      "eo": {
        "alias": "esperanto"
      },
      "es": {
        "alias": "espanol"
      },
      "et": {
        "alias": "estonian"
      },
      "eu": {
        "alias": "basque"
      },
      "fa": {
        "alias": "persian"
      },
      "fi": {
        "alias": "suomi"
      },
      "fo": {
        "alias": "faroese"
      },
      "fr": {
        "alias": "francais"
      },
      "fy": {
        "alias": "frisian"
      },
      "ga": {
        "alias": "irish"
      },
      "gd": {
        "alias": "gaelic"
      },
      "gl": {
        "alias": "galician"
      },
      "gu": {
        "alias": "gujarati"
      },
      "he": {
        "alias": "hebrew"
      },
      "hi": {
        "alias": "hindi"
      },
      "hr": {
        "alias": "croatian"
      },
      "hu": {
        "alias": "hungarian"
      },
      "ia": {
        "alias": "interlingua"
      },
      "id": {
        "alias": "indonesian"
      },
      "is": {
        "alias": "icelandic"
      },
      "it": {
        "alias": "italiano"
      },
      "ja": {
        "alias": "nihongo"
      },
      "jv": {
        "alias": "javanese"
      },
      "ka": {
        "alias": "georgian"
      },
      "kn": {
        "alias": "kannada"
      },
      "ko": {
        "alias": "hangul"
      },
      "la": {
        "alias": "latin"
      },
      "lt": {
        "alias": "lithuanian"
      },
      "lv": {
        "alias": "latvian"
      },
      "mai": {
        "alias": "bihari"
      },
      "mk": {
        "alias": "macedonian"
      },
      "ml": {
        "alias": "malayalam"
      },
      "mr": {
        "alias": "marathi"
      },
      "ms": {
        "alias": "malay"
      },
      "mt": {
        "alias": "maltese"
      },
      "ne": {
        "alias": "nepali"
      },
      "nl": {
        "alias": "nederlands"
      },
      "no": {
        "alias": "norsk"
      },
      "oc": {
        "alias": "occitan"
      },
      "pa": {
        "alias": "punjabi"
      },
      "pl": {
        "alias": "polski"
      },
      "pt": {
        "alias": "portugues"
      },
      "ro": {
        "alias": "romanian"
      },
      "ru": {
        "alias": "russian"
      },
      "si": {
        "alias": "sinhalese"
      },
      "sk": {
        "alias": "slovak"
      },
      "sl": {
        "alias": "slovenian"
      },
      "sq": {
        "alias": "albanian"
      },
      "sr": {
        "alias": "serbian"
      },
      "su": {
        "alias": "sudanese"
      },
      "sv": {
        "alias": "svenska"
      },
      "sw": {
        "alias": "swahili"
      },
      "ta": {
        "alias": "tamil"
      },
      "te": {
        "alias": "telugu"
      },
      "th": {
        "alias": "thai"
      },
      "ti": {
        "alias": "tigrinya"
      },
      "tl": {
        "alias": "tagalog"
      },
      "tr": {
        "alias": "turkce"
      },
      "uk": {
        "alias": "ukrainian"
      },
      "ur": {
        "alias": "urdu"
      },
      "uz": {
        "alias": "uzbek"
      },
      "vi": {
        "alias": "vietnamese"
      },
      "xh": {
        "alias": "xhosa"
      },
      "zh": {
        "alias": "jiantizhongwen"
      },
      "zh-HK": {
        "alias": "fantizhengwen"
      },
      "zh-TW": {
        "alias": "fantizhengwen"
      },
      "zu": {
        "alias": "zulu"
      }
    }
  },
  },
  "wikidata": {
  "wikidata": {
    "all_locale": null,
    "all_locale": null,
+213 −150
Original line number Original line Diff line number Diff line
@@ -50,38 +50,58 @@ W3C recommends subtag over macrolanguage [2]_.
Startpage languages
Startpage languages
===================
===================


The displayed name in Startpage's settings page depend on the location of the IP
:py:obj:`send_accept_language_header`:
when the 'Accept-Language' HTTP header is unset (in the language update script
  The displayed name in Startpage's settings page depend on the location of the
we use "en-US,en;q=0.5" to get uniform names independent from the IP).
  IP when ``Accept-Language`` HTTP header is unset.  In :py:obj:`fetch_traits`
  we use::


Each option has a displayed name and a value, either of which may represent the
    'Accept-Language': "en-US,en;q=0.5",
language name in the native script, the language name in English, an English
    ..
transliteration of the native name, the English name of the writing script used

by the language, or occasionally something else entirely.
  to get uniform names independent from the IP).

.. _startpage categories:

Startpage categories
====================

Startpage's category (for Web-search, News, Videos, ..) is set by
:py:obj:`startpage_categ` in  settings.yml::

  - name: startpage
    engine: startpage
    startpage_categ: web
    ...

.. hint::

   The default category is ``web`` .. and other categories than ``web`` are not
   yet implemented.


"""
"""


from typing import TYPE_CHECKING
from collections import OrderedDict
import re
import re
from time import time

from urllib.parse import urlencode
from unicodedata import normalize, combining
from unicodedata import normalize, combining
from time import time
from datetime import datetime, timedelta
from datetime import datetime, timedelta


from dateutil import parser
import dateutil.parser
from lxml import html
import lxml.html
from babel import Locale
import babel
from babel.localedata import locale_identifiers


from searx import network
from searx import network
from searx.utils import extract_text, eval_xpath, match_language
from searx.utils import extract_text, eval_xpath, gen_useragent
from searx.exceptions import (
from searx.exceptions import SearxEngineCaptchaException
    SearxEngineResponseException,
from searx.locales import region_tag
    SearxEngineCaptchaException,
)

from searx.enginelib.traits import EngineTraits
from searx.enginelib.traits import EngineTraits


if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits
traits: EngineTraits


# about
# about
@@ -94,18 +114,28 @@ about = {
    "results": 'HTML',
    "results": 'HTML',
}
}


startpage_categ = 'web'
"""Startpage's category, visit :ref:`startpage categories`.
"""

send_accept_language_header = True
"""Startpage tries to guess user's language and territory from the HTTP
``Accept-Language``.  Optional the user can select a search-language (can be
different to the UI language) and a region filter.
"""

# engine dependent config
# engine dependent config
categories = ['general', 'web']
categories = ['general', 'web']
# there is a mechanism to block "bot" search
# (probably the parameter qid), require
# storing of qid's between mulitble search-calls

paging = True
paging = True
supported_languages_url = 'https://www.startpage.com/do/settings'
time_range_support = True
safesearch = True

time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
safesearch_dict = {0: '0', 1: '1', 2: '1'}


# search-url
# search-url
base_url = 'https://startpage.com/'
base_url = 'https://www.startpage.com'
search_url = base_url + 'sp/search?'
search_url = base_url + '/sp/search'


# specific xpath variables
# specific xpath variables
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
# ads xpath //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
@@ -113,92 +143,193 @@ search_url = base_url + 'sp/search?'
results_xpath = '//div[@class="w-gl__result__main"]'
results_xpath = '//div[@class="w-gl__result__main"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
link_xpath = './/a[@class="w-gl__result-title result-link"]'
content_xpath = './/p[@class="w-gl__description"]'
content_xpath = './/p[@class="w-gl__description"]'
search_form_xpath = '//form[@id="search"]'
"""XPath of Startpage's origin search form

.. code: html

    <form action="/sp/search" method="post">
      <input type="text" name="query"  value="" ..>
      <input type="hidden" name="t" value="device">
      <input type="hidden" name="lui" value="english">
      <input type="hidden" name="sc" value="Q7Mt5TRqowKB00">
      <input type="hidden" name="cat" value="web">
      <input type="hidden" class="abp" id="abp-input" name="abp" value="1">
    </form>
"""


# timestamp of the last fetch of 'sc' code
# timestamp of the last fetch of 'sc' code
sc_code_ts = 0
sc_code_ts = 0
sc_code = ''
sc_code = ''
sc_code_cache_sec = 30
"""Time in seconds the sc-code is cached in memory :py:obj:`get_sc_code`."""




def raise_captcha(resp):
def get_sc_code(searxng_locale, params):
    """Get an actual ``sc`` argument from Startpage's search form (HTML page).


    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
    Startpage puts a ``sc`` argument on every HTML :py:obj:`search form
        raise SearxEngineCaptchaException()
    <search_form_xpath>`.  Without this argument Startpage considers the request
    is from a bot.  We do not know what is encoded in the value of the ``sc``
    argument, but it seems to be a kind of a *time-stamp*.


    Startpage's search form generates a new sc-code on each request.  This
    function scrap a new sc-code from Startpage's home page every
    :py:obj:`sc_code_cache_sec` seconds.


def get_sc_code(headers):
    """
    """Get an actual ``sc`` argument from Startpage's home page.


    Startpage puts a ``sc`` argument on every link.  Without this argument
    global sc_code_ts, sc_code  # pylint: disable=global-statement
    Startpage considers the request is from a bot.  We do not know what is
    encoded in the value of the ``sc`` argument, but it seems to be a kind of a
    *time-stamp*.  This *time-stamp* is valid for a few hours.


    This function scrap a new *time-stamp* from startpage's home page every hour
    if sc_code and (time() < (sc_code_ts + sc_code_cache_sec)):
    (3000 sec).
        logger.debug("get_sc_code: reuse '%s'", sc_code)
        return sc_code


    """
    headers = {**params['headers']}
    headers['Origin'] = base_url
    headers['Referer'] = base_url + '/'
    # headers['Connection'] = 'keep-alive'
    # headers['Accept-Encoding'] = 'gzip, deflate, br'
    # headers['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8'
    # headers['User-Agent'] = 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:105.0) Gecko/20100101 Firefox/105.0'

    # add Accept-Language header
    if searxng_locale == 'all':
        searxng_locale = 'en-US'
    locale = babel.Locale.parse(searxng_locale, sep='-')

    if send_accept_language_header:
        ac_lang = locale.language
        if locale.territory:
            ac_lang = "%s-%s,%s;q=0.9,*;q=0.5" % (
                locale.language,
                locale.territory,
                locale.language,
            )
        headers['Accept-Language'] = ac_lang


    global sc_code_ts, sc_code  # pylint: disable=global-statement
    get_sc_url = base_url + '/?sc=%s' % (sc_code)
    logger.debug("query new sc time-stamp ... %s", get_sc_url)
    logger.debug("headers: %s", headers)
    resp = network.get(get_sc_url, headers=headers)


    if time() > (sc_code_ts + 3000):
    # ?? x = network.get('https://www.startpage.com/sp/cdn/images/filter-chevron.svg', headers=headers)
        logger.debug("query new sc time-stamp ...")
    # ?? https://www.startpage.com/sp/cdn/images/filter-chevron.svg
    # ?? ping-back URL: https://www.startpage.com/sp/pb?sc=TLsB0oITjZ8F21


        resp = network.get(base_url, headers=headers)
    if str(resp.url).startswith('https://www.startpage.com/sp/captcha'):
        raise_captcha(resp)
        raise SearxEngineCaptchaException(
        dom = html.fromstring(resp.text)
            message="get_sc_code: got redirected to https://www.startpage.com/sp/captcha",
        )

    dom = lxml.html.fromstring(resp.text)


    try:
    try:
            # <input type="hidden" name="sc" value="...">
        sc_code = eval_xpath(dom, search_form_xpath + '//input[@name="sc"]/@value')[0]
            sc_code = eval_xpath(dom, '//input[@name="sc"]/@value')[0]
    except IndexError as exc:
    except IndexError as exc:
            # suspend startpage API --> https://github.com/searxng/searxng/pull/695
        logger.debug("suspend startpage API --> https://github.com/searxng/searxng/pull/695")
            raise SearxEngineResponseException(
        raise SearxEngineCaptchaException(
                suspended_time=7 * 24 * 3600, message="PR-695: query new sc time-stamp failed!"
            message="get_sc_code: [PR-695] query new sc time-stamp failed! (%s)" % resp.url,
        ) from exc
        ) from exc


    sc_code_ts = time()
    sc_code_ts = time()
        logger.debug("new value is: %s", sc_code)
    logger.debug("get_sc_code: new value is: %s", sc_code)

    return sc_code
    return sc_code




# do search-request
def request(query, params):
def request(query, params):
    """Assemble a Startpage request.


    # pylint: disable=line-too-long
    To avoid CAPTCHA we need to send a well formed HTTP POST request with a
    # The format string from Startpage's FFox add-on [1]::
    cookie.  We need to form a request that is identical to the request build by
    #
    Startpage's search form:
    #     https://www.startpage.com/do/dsearch?query={searchTerms}&cat=web&pl=ext-ff&language=__MSG_extensionUrlLanguage__&extVersion=1.3.0
    #
    # [1] https://addons.mozilla.org/en-US/firefox/addon/startpage-private-search/


    - in the cookie the **region** is selected
    - in the HTTP POST data the **language** is selected

    Additionally the arguments form Startpage's search form needs to be set in
    HTML POST data / compare ``<input>`` elements: :py:obj:`search_form_xpath`.
    """
    if startpage_categ == 'web':
        return _request_cat_web(query, params)

    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
    return params


def _request_cat_web(query, params):

    engine_region = traits.get_region(params['searxng_locale'], 'en-US')
    engine_language = traits.get_language(params['searxng_locale'], 'en')

    # build arguments
    args = {
    args = {
        'query': query,
        'query': query,
        'page': params['pageno'],
        'cat': 'web',
        'cat': 'web',
        # 'pl': 'ext-ff',
        't': 'device',
        # 'extVersion': '1.3.0',
        'sc': get_sc_code(params['searxng_locale'], params),  # hint: this func needs HTTP headers,
        # 'abp': "-1",
        'with_date': time_range_dict.get(params['time_range'], ''),
        'sc': get_sc_code(params['headers']),
    }
    }


    # set language if specified
    if engine_language:
    if params['language'] != 'all':
        args['language'] = engine_language
        lang_code = match_language(params['language'], supported_languages, fallback=None)
        args['lui'] = engine_language
        if lang_code:

            language_name = supported_languages[lang_code]['alias']
    args['abp'] = '1'
            args['language'] = language_name
    if params['pageno'] > 1:
            args['lui'] = language_name
        args['page'] = params['pageno']

    # build cookie
    lang_homepage = 'en'
    cookie = OrderedDict()
    cookie['date_time'] = 'world'
    cookie['disable_family_filter'] = safesearch_dict[params['safesearch']]
    cookie['disable_open_in_new_window'] = '0'
    cookie['enable_post_method'] = '1'  # hint: POST
    cookie['enable_proxy_safety_suggest'] = '1'
    cookie['enable_stay_control'] = '1'
    cookie['instant_answers'] = '1'
    cookie['lang_homepage'] = 's/device/%s/' % lang_homepage
    cookie['num_of_results'] = '10'
    cookie['suggestions'] = '1'
    cookie['wt_unit'] = 'celsius'

    if engine_language:
        cookie['language'] = engine_language
        cookie['language_ui'] = engine_language

    if engine_region:
        cookie['search_results_region'] = engine_region

    params['cookies']['preferences'] = 'N1N'.join(["%sEEE%s" % x for x in cookie.items()])
    logger.debug('cookie preferences: %s', params['cookies']['preferences'])

    # POST request
    logger.debug("data: %s", args)
    params['data'] = args
    params['method'] = 'POST'
    params['url'] = search_url
    params['headers']['Origin'] = base_url
    params['headers']['Referer'] = base_url + '/'
    # is the Accept header needed?
    # params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'


    params['url'] = search_url + urlencode(args)
    return params
    return params




# get response from search-request
# get response from search-request
def response(resp):
def response(resp):
    results = []
    dom = lxml.html.fromstring(resp.text)


    dom = html.fromstring(resp.text)
    if startpage_categ == 'web':
        return _response_cat_web(dom)

    logger.error("Startpages's category '%' is not yet implemented.", startpage_categ)
    return []


def _response_cat_web(dom):
    results = []


    # parse results
    # parse results
    for result in eval_xpath(dom, results_xpath):
    for result in eval_xpath(dom, results_xpath):
@@ -233,7 +364,7 @@ def response(resp):
            content = content[date_pos:]
            content = content[date_pos:]


            try:
            try:
                published_date = parser.parse(date_string, dayfirst=True)
                published_date = dateutil.parser.parse(date_string, dayfirst=True)
            except ValueError:
            except ValueError:
                pass
                pass


@@ -259,78 +390,10 @@ def response(resp):
    return results
    return results




# get supported languages from their site
def _fetch_supported_languages(resp):
    # startpage's language selector is a mess each option has a displayed name
    # and a value, either of which may represent the language name in the native
    # script, the language name in English, an English transliteration of the
    # native name, the English name of the writing script used by the language,
    # or occasionally something else entirely.

    # this cases are so special they need to be hardcoded, a couple of them are misspellings
    language_names = {
        'english_uk': 'en-GB',
        'fantizhengwen': ['zh-TW', 'zh-HK'],
        'hangul': 'ko',
        'malayam': 'ml',
        'norsk': 'nb',
        'sinhalese': 'si',
        'sudanese': 'su',
    }

    # get the English name of every language known by babel
    language_names.update(
        {
            # fmt: off
            name.lower(): lang_code
            # pylint: disable=protected-access
            for lang_code, name in Locale('en')._data['languages'].items()
            # fmt: on
        }
    )

    # get the native name of every language known by babel
    for lang_code in filter(lambda lang_code: lang_code.find('_') == -1, locale_identifiers()):
        native_name = Locale(lang_code).get_language_name().lower()
        # add native name exactly as it is
        language_names[native_name] = lang_code

        # add "normalized" language name (i.e. français becomes francais and español becomes espanol)
        unaccented_name = ''.join(filter(lambda c: not combining(c), normalize('NFKD', native_name)))
        if len(unaccented_name) == len(unaccented_name.encode()):
            # add only if result is ascii (otherwise "normalization" didn't work)
            language_names[unaccented_name] = lang_code

    dom = html.fromstring(resp.text)
    sp_lang_names = []
    for option in dom.xpath('//form[@name="settings"]//select[@name="language"]/option'):
        sp_lang_names.append((option.get('value'), extract_text(option).lower()))

    supported_languages = {}
    for sp_option_value, sp_option_text in sp_lang_names:
        lang_code = language_names.get(sp_option_value) or language_names.get(sp_option_text)
        if isinstance(lang_code, str):
            supported_languages[lang_code] = {'alias': sp_option_value}
        elif isinstance(lang_code, list):
            for _lc in lang_code:
                supported_languages[_lc] = {'alias': sp_option_value}
        else:
            print('Unknown language option in Startpage: {} ({})'.format(sp_option_value, sp_option_text))

    return supported_languages


def fetch_traits(engine_traits: EngineTraits):
def fetch_traits(engine_traits: EngineTraits):
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    """Fetch :ref:`languages <startpage languages>` and :ref:`regions <startpage
    regions>` from Startpage."""
    regions>` from Startpage."""
    # pylint: disable=import-outside-toplevel, too-many-locals, too-many-branches
    # pylint: disable=too-many-branches
    # pylint: disable=too-many-statements

    engine_traits.data_type = 'supported_languages'  # deprecated

    import babel
    from searx.utils import gen_useragent
    from searx.locales import region_tag


    headers = {
    headers = {
        'User-Agent': gen_useragent(),
        'User-Agent': gen_useragent(),
@@ -341,7 +404,7 @@ def fetch_traits(engine_traits: EngineTraits):
    if not resp.ok:
    if not resp.ok:
        print("ERROR: response from Startpage is not OK.")
        print("ERROR: response from Startpage is not OK.")


    dom = html.fromstring(resp.text)
    dom = lxml.html.fromstring(resp.text)


    # regions
    # regions