Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Verified Commit 3af143b8 authored by Nivesh Krishna's avatar Nivesh Krishna
Browse files

fix google images

parent 54f35e5b
Loading
Loading
Loading
Loading
+69 −158
Original line number Original line Diff line number Diff line
# SPDX-License-Identifier: AGPL-3.0-or-later
# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google (Images)
# lint: pylint
"""This is the implementation of the google images engine using the google
internal API used by the Google Go Android app.


For detailed description of the *REST-full* API see: `Query Parameter
This internal API offers results in
Definitions`_.


.. _admonition:: Content-Security-Policy (CSP)
- JSON (_fmt:json)
- Protobuf (_fmt:pb)
- Protobuf compressed? (_fmt:pc)
- HTML (_fmt:html)
- Protobuf encoded in JSON (_fmt:jspb).


   This engine needs to allow images from the `data URLs`_ (prefixed with the
   ``data:`` scheme).::

     Header set Content-Security-Policy "img-src 'self' data: ;"

.. _Query Parameter Definitions:
   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
.. _data URLs:
   https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs
"""
"""


from urllib.parse import urlencode, unquote
from urllib.parse import urlencode
from lxml import html
from json import loads

from searx import logger
from searx.utils import (
    eval_xpath,
    eval_xpath_list,
    eval_xpath_getindex,
    extract_text,
)


from searx.engines.google import (
from searx.engines.google import (
    get_lang_info,
    get_lang_info,
@@ -35,13 +23,9 @@ from searx.engines.google import (
)
)


# pylint: disable=unused-import
# pylint: disable=unused-import
from searx.engines.google import (
from searx.engines.google import supported_languages_url, _fetch_supported_languages
    supported_languages_url
    ,  _fetch_supported_languages
)
# pylint: enable=unused-import


logger = logger.getChild('google images')
# pylint: enable=unused-import


# about
# about
about = {
about = {
@@ -50,83 +34,52 @@ about = {
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "official_api_documentation": 'https://developers.google.com/custom-search',
    "use_official_api": False,
    "use_official_api": False,
    "require_api_key": False,
    "require_api_key": False,
    "results": 'HTML',
    "results": 'JSON',
}
}


# engine dependent config
# engine dependent config
categories = ['images']
categories = ['images', 'web']
paging = False
paging = True
use_locale_domain = True
use_locale_domain = True
time_range_support = True
time_range_support = True
safesearch = True
safesearch = True
send_accept_language_header = True


filter_mapping = {
filter_mapping = {0: 'images', 1: 'active', 2: 'active'}
    0: 'images',
    1: 'active',
    2: 'active'
}


def scrap_out_thumbs(dom):
    """Scrap out thumbnail data from <script> tags.
    """
    ret_val = {}
    for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'):
        _script = script.text
        # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....');
        _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1)
        _thumb_no = _thumb_no.replace("'", "")
        _img_data = _img_data.replace("'", "")
        _img_data = _img_data.replace(r"\/", r"/")
        ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=")
    return ret_val


def scrap_img_by_id(script, data_id):
    """Get full image URL by data-id in parent element
    """
    img_url = ''
    _script = script.split('\n')
    for i, line in enumerate(_script):
        if 'gstatic.com/images' in line and data_id in line and i + 1 < len(_script):
            url_line = _script[i + 1]
            img_url = url_line.split('"')[1]
            img_url = unquote(img_url.replace(r'\u00', r'%'))
    return img_url




def request(query, params):
def request(query, params):
    """Google-Video search request"""
    """Google-Image search request"""


    lang_info = get_lang_info(
    lang_info = get_lang_info(params, supported_languages, language_aliases, False)
        # pylint: disable=undefined-variable
        params, supported_languages, language_aliases, False
    )


    query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({
    query_url = (
        'https://'
        + lang_info['subdomain']
        + '/search'
        + "?"
        + urlencode(
            {
                'q': query,
                'q': query,
                'tbm': "isch",
                'tbm': "isch",
                **lang_info['params'],
                **lang_info['params'],
                'ie': "utf8",
                'ie': "utf8",
                'oe': "utf8",
                'oe': "utf8",
        'ucbcd': 1,
                'asearch': 'isch',
        'num': 30,
                'async': '_fmt:json,p:1,ijn:' + str(params['pageno']),
    })
            }
        )
    )


    if params['time_range'] in time_range_dict:
    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    logger.debug("query_url --> %s", query_url)
    params['url'] = query_url
    params['url'] = query_url


    logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language'))
    params['cookies']['CONSENT'] = "YES+"
    params['headers'].update(lang_info['headers'])
    params['headers'].update(lang_info['headers'])
    params['headers']['Accept'] = (
    params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip'
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    params['headers']['Accept'] = '*/*'
    )
    return params
    return params




@@ -136,76 +89,34 @@ def response(resp):


    detect_google_sorry(resp)
    detect_google_sorry(resp)


    # convert the text to dom
    json_start = resp.text.find('{"ischj":')
    dom = html.fromstring(resp.text)
    json_data = loads(resp.text[json_start:])
    img_bas64_map = scrap_out_thumbs(dom)

    img_src_script = eval_xpath_getindex(
    for item in json_data["ischj"]["metadata"]:
        dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text


        result_item = {
    # parse results
            'url': item["result"]["referrer_url"],
    #
            'title': item["result"]["page_title"],
    # root element::
            'content': item["text_in_grid"]["snippet"],
    #     <div id="islmp" ..>
            'source': item["result"]["site_title"],
    # result div per image::
            'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}',
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
            'img_src': item["original_image"]["url"],
    #     The data-id matches to a item in a json-data structure in::
            'thumbnail_src': item["thumbnail"]["url"],
    #         <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
            'template': 'images.html',
    #     In this structure the link to the origin PNG, JPG or whatever is given
        }
    # first link per image-div contains an <img> with the data-iid for base64 encoded image data::

    #      <img class="rg_i Q4LuWd" data-iid="0"
        author = item["result"].get('iptc', {}).get('creator')
    # second link per image-div is the target link::
        if author:
    #      <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
            result_item['author'] = ', '.join(author)
    # the second link also contains two div tags with the *description* and *publisher*::

    #      <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
        copyright_notice = item["result"].get('iptc', {}).get('copyright_notice')
    #      <div class="fxgdke">en.wikipedia.org</div>
        if copyright_notice:

            result_item['source'] += ' / ' + copyright_notice
    root = eval_xpath(dom, '//div[@id="islmp"]')

    if not root:
        file_size = item.get('gsa', {}).get('file_size')
        logger.error("did not find root element id='islmp'")
        if file_size:
        return results
            result_item['source'] += ' (%s)' % file_size


    root = root[0]
        results.append(result_item)
    for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'):

        img_alt = eval_xpath_getindex(img_node, '@alt', 0)

        img_base64_id = eval_xpath(img_node, '@data-iid')
        if img_base64_id:
            img_base64_id = img_base64_id[0]
            thumbnail_src = img_bas64_map[img_base64_id]
        else:
            thumbnail_src = eval_xpath(img_node, '@src')
            if not thumbnail_src:
                thumbnail_src = eval_xpath(img_node, '@data-src')
            if thumbnail_src:
                thumbnail_src = thumbnail_src[0]
            else:
                thumbnail_src = ''

        link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0)
        url = eval_xpath_getindex(link_node, '@href', 0)

        pub_nodes = eval_xpath(link_node, './div/div')
        pub_descr = img_alt
        pub_source = ''
        if pub_nodes:
            pub_descr = extract_text(pub_nodes[0])
            pub_source = extract_text(pub_nodes[1])

        img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0)
        src_url = scrap_img_by_id(img_src_script, img_src_id)
        if not src_url:
            src_url = thumbnail_src

        results.append({
            'url': url,
            'title': img_alt,
            'content': pub_descr,
            'source': pub_source,
            'img_src': src_url,
            # 'img_format': img_format,
            'thumbnail_src': thumbnail_src,
            'template': 'images.html'
        })


    return results
    return results