Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Verified Commit 240d359d authored by Nivesh Krishna's avatar Nivesh Krishna
Browse files

use lite for DDG

parent 629d5730
Loading
Loading
Loading
Loading
+292 −127
Original line number Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""
 DuckDuckGo (Web)
DuckDuckGo Lite
~~~~~~~~~~~~~~~
"""

from json import loads
from urllib.parse import urlencode
from searx.utils import match_language, extract_text
from typing import TYPE_CHECKING
import re
from searx.network import get
from lxml.html import fromstring
from urllib.parse import urlencode
import json
import babel
import lxml.html

from searx import (
    network,
    locales,
    redislib,
    external_bang,
)
from searx import redisdb
from searx.utils import (
    eval_xpath,
    eval_xpath_getindex,
    extract_text,
)
from searx.enginelib.traits import EngineTraits
from searx.exceptions import SearxEngineAPIException

if TYPE_CHECKING:
    import logging

    logger: logging.Logger

traits: EngineTraits

about = {
    "website": "https://duckduckgo.com/",
    "wikidata_id": "Q12805",
    "official_api_documentation": "https://duckduckgo.com/api",
    "website": 'https://lite.duckduckgo.com/lite/',
    "wikidata_id": 'Q12805',
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
    "results": 'HTML',
}

send_accept_language_header = True
@@ -26,157 +49,299 @@ language).
"""

# engine dependent config
categories = ["general"]
categories = ['general', 'web']
paging = True
supported_languages_url = "https://duckduckgo.com/util/u172.js"
number_of_results = 10
time_range_support = True
safesearch = True
VQD_REGEX = r"vqd='(\d+-\d+)'"
language_aliases = {
    "ca-ES": "ct-ca",
    "de-AT": "de-de",
    "de-CH": "de-de",
    "es-AR": "es-es",
    "es-CL": "es-es",
    "es-MX": "es-es",
    "fr-BE": "be-fr",
    "fr-CA": "ca-fr",
    "fr-CH": "ch-fr",
    "ar-SA": "ar-XA",
    "es-419": "es-XL",
    "ja": "jp-JP",
    "ko": "kr-KR",
    "sl-SI": "sl-SL",
    "zh-TW": "tzh-TW",
    "zh-HK": "tzh-HK",
}
safesearch = True  # user can't select but the results are filtered

url = 'https://lite.duckduckgo.com/lite/'
# url_ping = 'https://duckduckgo.com/t/sl_l'

# search-url
url = "https://links.duckduckgo.com/d.js?"
url_ping = "https://duckduckgo.com/t/sl_h"
time_range_dict = {"day": "d", "week": "w", "month": "m", "year": "y"}
time_range_dict = {'day': 'd', 'week': 'w', 'month': 'm', 'year': 'y'}
form_data = {'v': 'l', 'api': 'd.js', 'o': 'json'}


# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
    if lang == "all":
        return None
def cache_vqd(query, value):
    """Caches a ``vqd`` value from a query.

    The vqd value depends on the query string and is needed for the follow up
    pages or the images loaded by a XMLHttpRequest:

    - DuckDuckGo Web: `https://links.duckduckgo.com/d.js?q=...&vqd=...`
    - DuckDuckGo Images: `https://duckduckgo.com/i.js?q=...&vqd=...`

    The value is written through the client returned by ``redisdb.client()``
    under the key ``'SearXNG_ddg_vqd'`` followed by
    ``redislib.secret_hash(query)``.  When no client is available the call is a
    no-op and nothing is cached.

    :param query: query string the ``vqd`` value belongs to
    :param value: the ``vqd`` value to store
    """
    c = redisdb.client()
    if c:
        logger.debug("cache vqd value: %s", value)
        # the raw query never becomes part of the key, only its secret hash
        key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
        # ex=600: presumably an expiry of 600 seconds (redis ``SET ... EX``)
        c.set(key, value, ex=600)


def get_vqd(query, headers):
    resp = get(f"https://duckduckgo.com/?q={query}&ia=web", headers=headers)
    resp = re.findall(VQD_REGEX, resp.text)
    return resp[0]
    """Returns the ``vqd`` that fits to the *query*.  If there is no ``vqd`` cached
    (:py:obj:`cache_vqd`) the query is sent to DDG to get a vqd value from the
    response.

    """
    value = None
    c = redisdb.client()
    if c:
        key = 'SearXNG_ddg_vqd' + redislib.secret_hash(query)
        value = c.get(key)
        if value:
            value = value.decode('utf-8')
            logger.debug("re-use cached vqd value: %s", value)
            return value

def request(query, params):
    query_url = 'https://duckduckgo.com/?{query}&iar=images'.format(query=urlencode({'q': query}))
    res = network.get(query_url, headers=headers)
    content = res.text
    if content.find('vqd=\'') == -1:
        raise SearxEngineAPIException('Request failed')
    value = content[content.find('vqd=\'') + 5 :]
    value = value[: value.find('\'')]
    logger.debug("new vqd value: %s", value)
    cache_vqd(query, value)
    return value


def get_ddg_lang(eng_traits: EngineTraits, sxng_locale, default='en_US'):
    """Get DuckDuckGo's language identifier from SearXNG's locale.

    The locale is looked up in ``eng_traits.custom['lang_region']`` first; when
    it is not listed there, the result of
    ``eng_traits.get_language(sxng_locale, default)`` is returned instead.

    DuckDuckGo defines its languages by region codes (see
    :py:obj:`fetch_traits`).

    To get region and language of a DDG service use:

    .. code:: python

       eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)
       eng_lang = get_ddg_lang(traits, params['searxng_locale'])

    It might be confusing, but the ``l`` value of the cookie is what SearXNG
    calls the *region*:

    .. code:: python

        # !ddi paris :es-AR --> {'ad': 'es_AR', 'ah': 'ar-es', 'l': 'ar-es'}
        params['cookies']['ad'] = eng_lang
        params['cookies']['ah'] = eng_region
        params['cookies']['l'] = eng_region

    .. hint::

       `DDG-lite <https://lite.duckduckgo.com/lite>`__ does not offer a language
       selection to the user, only a region can be selected by the user
       (``eng_region`` from the example above).  DDG-lite stores the selected
       region in a cookie::

         params['cookies']['kl'] = eng_region  # 'ar-es'

    :param eng_traits: engine traits providing ``custom['lang_region']`` and
        ``get_language``
    :param sxng_locale: SearXNG locale to translate
    :param default: value handed to ``eng_traits.get_language`` as its default
    """
    # NOTE: the get_language() fallback is evaluated eagerly, even when the
    # locale is present in the 'lang_region' mapping.
    return eng_traits.custom['lang_region'].get(sxng_locale, eng_traits.get_language(sxng_locale, default))


ddg_reg_map = {
    'tw-tzh': 'zh_TW',
    'hk-tzh': 'zh_HK',
    'ct-ca': 'skip',  # ct-ca and es-ca both map to ca_ES
    'es-ca': 'ca_ES',
    'id-en': 'id_ID',
    'no-no': 'nb_NO',
    'jp-jp': 'ja_JP',
    'kr-kr': 'ko_KR',
    'xa-ar': 'ar_SA',
    'sl-sl': 'sl_SI',
    'th-en': 'th_TH',
    'vn-en': 'vi_VN',
}
    if params["safesearch"] == 2:  # STRICT
        del query_dict["t"]
        query_dict["p"] = 1
        query_dict.update(
            {
                "videxp": "a",
                "nadse": "b",
                "eclsexp": "a",
                "stiaexp": "a",
                "tjsexp": "b",
                "related": "b",
                "msnexp": "a",

ddg_lang_map = {
    # use ar --> ar_EG (Egypt's arabic)
    "ar_DZ": 'lang_region',
    "ar_JO": 'lang_region',
    "ar_SA": 'lang_region',
    # use bn --> bn_BD
    'bn_IN': 'lang_region',
    # use de --> de_DE
    'de_CH': 'lang_region',
    # use en --> en_US,
    'en_AU': 'lang_region',
    'en_CA': 'lang_region',
    'en_GB': 'lang_region',
    # Esperanto
    'eo_XX': 'eo',
    # use es --> es_ES,
    'es_AR': 'lang_region',
    'es_CL': 'lang_region',
    'es_CO': 'lang_region',
    'es_CR': 'lang_region',
    'es_EC': 'lang_region',
    'es_MX': 'lang_region',
    'es_PE': 'lang_region',
    'es_UY': 'lang_region',
    'es_VE': 'lang_region',
    # use fr --> rf_FR
    'fr_CA': 'lang_region',
    'fr_CH': 'lang_region',
    'fr_BE': 'lang_region',
    # use nl --> nl_NL
    'nl_BE': 'lang_region',
    # use pt --> pt_PT
    'pt_BR': 'lang_region',
    # skip these languages
    'od_IN': 'skip',
    'io_XX': 'skip',
    'tokipona_XX': 'skip',
}
        )
    elif params["safesearch"] == 1:  # MODERATE
        query_dict["ex"] = -1
        query_dict.update({"nadse": "b", "eclsexp": "b", "tjsexp": "b"})
    else:  # OFF
        query_dict["ex"] = -2
        query_dict.update({"nadse": "b", "eclsexp": "b", "tjsexp": "b"})

    params["allow_redirects"] = False
    params["data"] = query_dict
    params["cookies"]["kl"] = params["data"]["kl"]
    if params["time_range"] in time_range_dict:
        params["data"]["df"] = time_range_dict[params["time_range"]]
        params["cookies"]["df"] = time_range_dict[params["time_range"]]
    params["url"] = url + urlencode(params["data"])


def request(query, params):
    """Assemble the POST request sent to DuckDuckGo Lite.

    Tokens of *query* that start with ``!`` and are known external bangs are
    wrapped in single quotes.  The form data (``q``, paging offsets, ``vqd``,
    region and time range) and the matching cookies are then written into
    *params*, which is returned.
    """

    def _quote_bang(token):
        # quote ddg bangs
        known_bang = token.startswith('!') and external_bang.get_node(external_bang.EXTERNAL_BANGS, token[1:])
        return f"'{token}'" if known_bang else token

    # whitespace-only pieces produced by the capturing split are dropped
    query = ' '.join(_quote_bang(token) for token in re.split(r'(\s+)', query) if token.strip())

    eng_region = traits.get_region(params['searxng_locale'], traits.all_locale)

    params['url'] = url
    params['method'] = 'POST'

    data = params['data']
    data['q'] = query

    # The API is not documented, so we do some reverse engineering and emulate
    # what https://lite.duckduckgo.com/lite/ does when you press "next Page"
    # link again and again ..

    headers = params['headers']
    headers['Content-Type'] = 'application/x-www-form-urlencoded'
    headers['Referer'] = 'https://google.com/'

    # the initial page has no offset, the second page starts at 30 and every
    # following page adds another 50
    pageno = params['pageno']
    if pageno >= 2:
        offset = 30 + (pageno - 2) * 50
        data['s'] = offset
        data['dc'] = offset + 1

    # request needs a vqd argument
    data['vqd'] = get_vqd(query, headers)

    # initial page does not have additional data in the input form
    if pageno > 1:
        for field, fallback in (('o', 'json'), ('api', 'd.js'), ('nextParams', ''), ('v', 'l')):
            data[field] = form_data.get(field, fallback)

    data['kl'] = eng_region
    params['cookies']['kl'] = eng_region

    data['df'] = ''
    if params['time_range'] in time_range_dict:
        time_range_code = time_range_dict[params['time_range']]
        data['df'] = time_range_code
        params['cookies']['df'] = time_range_code

    logger.debug("param data: %s", params['data'])
    logger.debug("param cookies: %s", params['cookies'])
    return params


def response(resp):

    if resp.status_code == 303:
        return []

    # parse the response
    results = []
    doc = lxml.html.fromstring(resp.text)

    data = re.findall(
        r"DDG\.pageLayout\.load\('d',(\[.+\])\);DDG\.duckbar\.load\('images'",
        str(resp.text),
    )
    try:
        search_data = loads(data[0].replace("/\t/g", "    "))
    except IndexError:
        return

    if len(search_data) == 1 and ("n" not in search_data[0]):
        only_result = search_data[0]
        if (
            (only_result.get("da") is not None and only_result.get("t") == "EOF")
            or only_result.get("a") is not None
            or only_result.get("d") == "google.com search"
        ):
            return

    for search_result in search_data:
        if "n" in search_result:
    result_table = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')

    if len(result_table) == 2:
        # some locales (at least China) does not have a "next page" button and
        # the layout of the HTML tables is different.
        result_table = result_table[1]
    elif not len(result_table) >= 3:
        # no more results
        return []
    else:
        result_table = result_table[2]
        # update form data from response
        form = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table//input/..')
        if len(form):

            form = form[0]
            form_data['v'] = eval_xpath(form, '//input[@name="v"]/@value')[0]
            form_data['api'] = eval_xpath(form, '//input[@name="api"]/@value')[0]
            form_data['o'] = eval_xpath(form, '//input[@name="o"]/@value')[0]
            logger.debug('form_data: %s', form_data)

            value = eval_xpath(form, '//input[@name="vqd"]/@value')[0]
            query = resp.search_params['data']['q']
            cache_vqd(query, value)

    tr_rows = eval_xpath(result_table, './/tr')
    # In the last <tr> is the form of the 'previous/next page' links
    tr_rows = tr_rows[:-1]

    len_tr_rows = len(tr_rows)
    offset = 0

    while len_tr_rows >= offset + 4:

        # assemble table rows we need to scrap
        tr_title = tr_rows[offset]
        tr_content = tr_rows[offset + 1]
        offset += 4

        # ignore sponsored Adds <tr class="result-sponsored">
        if tr_content.get('class') == 'result-sponsored':
            continue

        title = extract_text(fromstring(search_result.get("t")))
        a_tag = eval_xpath_getindex(tr_title, './/td//a[@class="result-link"]', 0, None)
        if a_tag is None:
            continue

        content = extract_text(fromstring(search_result.get("a")))
        td_content = eval_xpath_getindex(tr_content, './/td[@class="result-snippet"]', 0, None)
        if td_content is None:
            continue

        results.append(
            {
                'title': a_tag.text_content(),
                'content': extract_text(td_content),
                'url': a_tag.get('href'),
            }
        )

        results.append({"title": title, "content": content, "url": search_result.get("u")})
    return results


def fetch_traits(engine_traits: EngineTraits):
    """Fetch languages & regions from DuckDuckGo.

    # response is a js file with regions as an embedded object
    response_page = resp.text
    response_page = response_page[response_page.find("regions:{") + 8 :]
    response_page = response_page[: response_page.find("}") + 1]
    SearXNG's ``all`` locale maps DuckDuckGo's "Alle regions" (``wt-wt``).
    DuckDuckGo's language "Browsers prefered language" (``wt_WT``) makes no
    sense in a SearXNG request since SearXNG's ``all`` will not add a
    ``Accept-Language`` HTTP header.  The value in ``engine_traits.all_locale``
    is ``wt-wt`` (the region).

    regions_json = loads(response_page)
    supported_languages = map((lambda x: x[3:] + "-" + x[:2].upper()), regions_json.keys())
    Beside regions DuckDuckGo also defines its lanaguages by region codes.  By
    example these are the english languages in DuckDuckGo:

    - en_US
    - en_AU

searx/engines/duckduckgo_lite.py

deleted100644 → 0
+0 −186
Original line number Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# lint: pylint
"""DuckDuckGo Lite
"""

from json import loads

from lxml.html import fromstring

from searx.utils import (
    dict_subset,
    eval_xpath,
    eval_xpath_getindex,
    extract_text,
    match_language,
)
from searx.network import get

# about
# Engine metadata: upstream website, Wikidata item, API usage and result format.
about = {
    "website": "https://lite.duckduckgo.com/lite/",
    "wikidata_id": "Q12805",
    "official_api_documentation": "https://duckduckgo.com/api",
    "use_official_api": False,
    "require_api_key": False,
    "results": "HTML",
}

# engine dependent config
categories = ["general", "web"]
paging = True
# NOTE(review): presumably fetched by the engine framework and handed to
# ``_fetch_supported_languages``, which parses a ``regions:{...}`` object.
supported_languages_url = "https://duckduckgo.com/util/u588.js"
time_range_support = True
send_accept_language_header = True

# Alias table passed to ``match_language`` by ``get_region_code``.
language_aliases = {
    "ar-SA": "ar-XA",
    "es-419": "es-XL",
    "ja": "jp-JP",
    "ko": "kr-KR",
    "sl-SI": "sl-SL",
    "zh-TW": "tzh-TW",
    "zh-HK": "tzh-HK",
}

# Maps the request's ``time_range`` value to the ``df`` form field / cookie
# value set in ``request``.
time_range_dict = {"day": "d", "week": "w", "month": "m", "year": "y"}

# search-url
# ``url`` is the POST target used by ``request``; ``url_ping`` is requested
# with GET by ``response`` before the result page is parsed.
url = "https://lite.duckduckgo.com/lite/"
url_ping = "https://duckduckgo.com/t/sl_l"

# match query's language to a region code that duckduckgo will accept
def get_region_code(lang, lang_list=None):
    """Return the DuckDuckGo region code for *lang*, or ``None`` for ``"all"``.

    *lang* is resolved with ``match_language`` against *lang_list* (an empty
    list when omitted) and ``language_aliases``, with ``"wt-WT"`` passed as the
    last argument (presumably the fallback).  The two leading hyphen-separated
    parts of the match are swapped and lower-cased.
    """
    if lang == "all":
        return None

    matched = match_language(lang, lang_list or [], language_aliases, "wt-WT")
    pieces = matched.split("-")

    # country code goes first
    return f"{pieces[1].lower()}-{pieces[0].lower()}"


def request(query, params):
    """Fill *params* with the POST request for DuckDuckGo Lite and return it.

    Sets the target URL, the form fields (query, paging offsets, region and
    time range), the request headers and the ``kl`` / ``df`` cookies.
    """
    params["url"] = url
    params["method"] = "POST"

    form = params["data"]
    form["q"] = query

    # The API is not documented, so we do some reverse engineering and emulate
    # what https://lite.duckduckgo.com/lite/ does when you press "next Page"
    # link again and again ..

    headers = params["headers"]
    headers["Content-Type"] = "application/x-www-form-urlencoded"
    headers["Referer"] = "https://lite.duckduckgo.com/"

    page = params["pageno"]

    # the initial page has no offset, the second page starts at 30 and each
    # following page adds another 50
    if page >= 2:
        offset = 30 + (page - 2) * 50
        form["s"] = offset
        form["dc"] = offset + 1

    # the second page (and more pages) needs 'o' and 'api' arguments
    if page > 1:
        form.update({"o": "json", "api": "d.js"})

    # the third page (and more pages) needs some more arguments
    if page > 2:
        form.update({"nextParams": "", "v": "", "vqd": ""})

    region_code = get_region_code(params["language"], supported_languages)
    if region_code:
        form["kl"] = region_code
        params["cookies"]["kl"] = region_code

    form["df"] = ""
    if params["time_range"] in time_range_dict:
        time_range_code = time_range_dict[params["time_range"]]
        form["df"] = time_range_code
        params["cookies"]["df"] = time_range_code

    logger.debug("param data: %s", params["data"])
    logger.debug("param cookies: %s", params["cookies"])
    return params


# get response from search-request
def response(resp):
    """Parse a DuckDuckGo Lite result page into a list of result dicts.

    A GET request to ``url_ping`` is issued first, re-using a subset of the
    headers of the search request.  A 303 response or a page with fewer than
    three filter tables yields an empty list.  Otherwise the third table is
    walked in groups of four rows: the first row of a group holds the result
    link, the second one the snippet.
    """
    ping_headers = dict_subset(resp.request.headers, ["User-Agent", "Accept-Encoding", "Accept", "Cookie"])
    get(url_ping, headers=ping_headers)

    if resp.status_code == 303:
        return []

    doc = fromstring(resp.text)
    tables = eval_xpath(doc, '//html/body/form/div[@class="filters"]/table')
    if len(tables) < 3:
        # no more results
        return []

    # In the last <tr> is the form of the 'previous/next page' links
    rows = eval_xpath(tables[2], ".//tr")[:-1]

    results = []
    # only complete groups of four rows are scraped
    for start in range(0, len(rows) - 3, 4):
        title_row = rows[start]
        snippet_row = rows[start + 1]

        # ignore sponsored Adds <tr class="result-sponsored">
        if snippet_row.get("class") == "result-sponsored":
            continue

        link = eval_xpath_getindex(title_row, './/td//a[@class="result-link"]', 0, None)
        if link is None:
            continue

        snippet = eval_xpath_getindex(snippet_row, './/td[@class="result-snippet"]', 0, None)
        if snippet is None:
            continue

        results.append(
            {
                "title": link.text_content(),
                "content": extract_text(snippet),
                "url": link.get("href"),
            }
        )

    return results


# get supported languages from their site
def _fetch_supported_languages(resp):
    """Extract the supported languages from DuckDuckGo's regions script.

    ``resp.text`` is a JavaScript file with the regions as an embedded object.
    The text from the ``{`` of ``regions:{`` up to the first ``}`` is parsed as
    JSON, and every key such as ``us-en`` is turned into ``en-US``.
    """
    js_source = resp.text

    # "+ 8" skips "regions:" so that the slice starts at the opening brace
    tail = js_source[js_source.find("regions:{") + 8 :]
    regions = loads(tail[: tail.find("}") + 1])

    return [f"{key[3:]}-{key[:2].upper()}" for key in regions.keys()]