Loading searx/engines/google_images.py +69 −158 Original line number Original line Diff line number Diff line # SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later """Google (Images) # lint: pylint """This is the implementation of the google images engine using the google internal API used the Google Go Android app. For detailed description of the *REST-full* API see: `Query Parameter This internal API offer results in Definitions`_. .. _admonition:: Content-Security-Policy (CSP) - JSON (_fmt:json) - Protobuf (_fmt:pb) - Protobuf compressed? (_fmt:pc) - HTML (_fmt:html) - Protobuf encoded in JSON (_fmt:jspb). This engine needs to allow images from the `data URLs`_ (prefixed with the ``data:` scheme).:: Header set Content-Security-Policy "img-src 'self' data: ;" .. _Query Parameter Definitions: https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ """ from urllib.parse import urlencode, unquote from urllib.parse import urlencode from lxml import html from json import loads from searx import logger from searx.utils import ( eval_xpath, eval_xpath_list, eval_xpath_getindex, extract_text, ) from searx.engines.google import ( from searx.engines.google import ( get_lang_info, get_lang_info, Loading @@ -35,13 +23,9 @@ from searx.engines.google import ( ) ) # pylint: disable=unused-import # pylint: disable=unused-import from searx.engines.google import ( from searx.engines.google import supported_languages_url, _fetch_supported_languages supported_languages_url , _fetch_supported_languages ) # pylint: enable=unused-import logger = logger.getChild('google images') # pylint: enable=unused-import # about # about about = { about = { Loading @@ -50,83 +34,52 @@ about = { "official_api_documentation": 'https://developers.google.com/custom-search', "official_api_documentation": 'https://developers.google.com/custom-search', "use_official_api": False, "use_official_api": False, "require_api_key": False, "require_api_key": False, "results": 'HTML', "results": 'JSON', } } # engine dependent config # engine dependent config categories = ['images'] categories = ['images', 'web'] paging = False paging = True use_locale_domain = True use_locale_domain = True time_range_support = True time_range_support = True safesearch = True safesearch = True send_accept_language_header = True filter_mapping = { filter_mapping = {0: 'images', 1: 'active', 2: 'active'} 0: 'images', 1: 'active', 2: 'active' } def scrap_out_thumbs(dom): """Scrap out thumbnail data from <script> tags. """ ret_val = {} for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'): _script = script.text # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....'); _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1) _thumb_no = _thumb_no.replace("'", "") _img_data = _img_data.replace("'", "") _img_data = _img_data.replace(r"\/", r"/") ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=") return ret_val def scrap_img_by_id(script, data_id): """Get full image URL by data-id in parent element """ img_url = '' _script = script.split('\n') for i, line in enumerate(_script): if 'gstatic.com/images' in line and data_id in line and i + 1 < len(_script): url_line = _script[i + 1] img_url = url_line.split('"')[1] img_url = unquote(img_url.replace(r'\u00', r'%')) return img_url def request(query, params): def request(query, params): """Google-Video search request""" """Google-Image search request""" lang_info = get_lang_info( lang_info = get_lang_info(params, supported_languages, language_aliases, False) # pylint: disable=undefined-variable params, supported_languages, language_aliases, False ) query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ query_url = ( 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode( { 'q': query, 'q': query, 'tbm': "isch", 'tbm': "isch", **lang_info['params'], **lang_info['params'], 'ie': "utf8", 'ie': "utf8", 'oe': "utf8", 'oe': "utf8", 'ucbcd': 1, 'asearch': 'isch', 'num': 30, 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), }) } ) ) if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict: query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) logger.debug("query_url --> %s", query_url) params['url'] = query_url params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['cookies']['CONSENT'] = "YES+" params['headers'].update(lang_info['headers']) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip' 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' params['headers']['Accept'] = '*/*' ) return params return params Loading @@ -136,76 +89,34 @@ def response(resp): detect_google_sorry(resp) detect_google_sorry(resp) # convert the text to dom json_start = resp.text.find('{"ischj":') dom = html.fromstring(resp.text) json_data = loads(resp.text[json_start:]) img_bas64_map = scrap_out_thumbs(dom) img_src_script = eval_xpath_getindex( for item in json_data["ischj"]["metadata"]: dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text result_item = { # parse results 'url': item["result"]["referrer_url"], # 'title': item["result"]["page_title"], # root element:: 'content': item["text_in_grid"]["snippet"], # <div id="islmp" ..> 'source': item["result"]["site_title"], # result div per image:: 'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}', # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..." 'img_src': item["original_image"]["url"], # The data-id matches to a item in a json-data structure in:: 'thumbnail_src': item["thumbnail"]["url"], # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ... 'template': 'images.html', # In this structure the link to the origin PNG, JPG or whatever is given } # first link per image-div contains a <img> with the data-iid for bas64 encoded image data:: # <img class="rg_i Q4LuWd" data-iid="0" author = item["result"].get('iptc', {}).get('creator') # second link per image-div is the target link:: if author: # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper"> result_item['author'] = ', '.join(author) # the second link also contains two div tags with the *description* and *publisher*:: # <div class="WGvvNb">The Sacrament of the Last Supper ...</div> copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') # <div class="fxgdke">en.wikipedia.org</div> if copyright_notice: result_item['source'] += ' / ' + copyright_notice root = eval_xpath(dom, '//div[@id="islmp"]') if not root: file_size = item.get('gsa', {}).get('file_size') logger.error("did not find root element id='islmp'") if file_size: return results result_item['source'] += ' (%s)' % file_size root = root[0] results.append(result_item) for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'): img_alt = eval_xpath_getindex(img_node, '@alt', 0) img_base64_id = eval_xpath(img_node, '@data-iid') if img_base64_id: img_base64_id = img_base64_id[0] thumbnail_src = img_bas64_map[img_base64_id] else: thumbnail_src = eval_xpath(img_node, '@src') if not thumbnail_src: thumbnail_src = eval_xpath(img_node, '@data-src') if thumbnail_src: thumbnail_src = thumbnail_src[0] else: thumbnail_src = '' link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0) url = eval_xpath_getindex(link_node, '@href', 0) pub_nodes = eval_xpath(link_node, './div/div') pub_descr = img_alt pub_source = '' if pub_nodes: pub_descr = extract_text(pub_nodes[0]) pub_source = extract_text(pub_nodes[1]) img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0) src_url = scrap_img_by_id(img_src_script, img_src_id) if not src_url: src_url = thumbnail_src results.append({ 'url': url, 'title': img_alt, 'content': pub_descr, 'source': pub_source, 'img_src': src_url, # 'img_format': img_format, 'thumbnail_src': thumbnail_src, 'template': 'images.html' }) return results return results Loading
searx/engines/google_images.py +69 −158 Original line number Original line Diff line number Diff line # SPDX-License-Identifier: AGPL-3.0-or-later # SPDX-License-Identifier: AGPL-3.0-or-later """Google (Images) # lint: pylint """This is the implementation of the google images engine using the google internal API used the Google Go Android app. For detailed description of the *REST-full* API see: `Query Parameter This internal API offer results in Definitions`_. .. _admonition:: Content-Security-Policy (CSP) - JSON (_fmt:json) - Protobuf (_fmt:pb) - Protobuf compressed? (_fmt:pc) - HTML (_fmt:html) - Protobuf encoded in JSON (_fmt:jspb). This engine needs to allow images from the `data URLs`_ (prefixed with the ``data:` scheme).:: Header set Content-Security-Policy "img-src 'self' data: ;" .. _Query Parameter Definitions: https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions .. _data URLs: https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URIs """ """ from urllib.parse import urlencode, unquote from urllib.parse import urlencode from lxml import html from json import loads from searx import logger from searx.utils import ( eval_xpath, eval_xpath_list, eval_xpath_getindex, extract_text, ) from searx.engines.google import ( from searx.engines.google import ( get_lang_info, get_lang_info, Loading @@ -35,13 +23,9 @@ from searx.engines.google import ( ) ) # pylint: disable=unused-import # pylint: disable=unused-import from searx.engines.google import ( from searx.engines.google import supported_languages_url, _fetch_supported_languages supported_languages_url , _fetch_supported_languages ) # pylint: enable=unused-import logger = logger.getChild('google images') # pylint: enable=unused-import # about # about about = { about = { Loading @@ -50,83 +34,52 @@ about = { "official_api_documentation": 'https://developers.google.com/custom-search', "official_api_documentation": 'https://developers.google.com/custom-search', "use_official_api": False, "use_official_api": False, "require_api_key": False, "require_api_key": False, "results": 'HTML', "results": 'JSON', } } # engine dependent config # engine dependent config categories = ['images'] categories = ['images', 'web'] paging = False paging = True use_locale_domain = True use_locale_domain = True time_range_support = True time_range_support = True safesearch = True safesearch = True send_accept_language_header = True filter_mapping = { filter_mapping = {0: 'images', 1: 'active', 2: 'active'} 0: 'images', 1: 'active', 2: 'active' } def scrap_out_thumbs(dom): """Scrap out thumbnail data from <script> tags. """ ret_val = {} for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'): _script = script.text # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....'); _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1) _thumb_no = _thumb_no.replace("'", "") _img_data = _img_data.replace("'", "") _img_data = _img_data.replace(r"\/", r"/") ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=") return ret_val def scrap_img_by_id(script, data_id): """Get full image URL by data-id in parent element """ img_url = '' _script = script.split('\n') for i, line in enumerate(_script): if 'gstatic.com/images' in line and data_id in line and i + 1 < len(_script): url_line = _script[i + 1] img_url = url_line.split('"')[1] img_url = unquote(img_url.replace(r'\u00', r'%')) return img_url def request(query, params): def request(query, params): """Google-Video search request""" """Google-Image search request""" lang_info = get_lang_info( lang_info = get_lang_info(params, supported_languages, language_aliases, False) # pylint: disable=undefined-variable params, supported_languages, language_aliases, False ) query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ query_url = ( 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode( { 'q': query, 'q': query, 'tbm': "isch", 'tbm': "isch", **lang_info['params'], **lang_info['params'], 'ie': "utf8", 'ie': "utf8", 'oe': "utf8", 'oe': "utf8", 'ucbcd': 1, 'asearch': 'isch', 'num': 30, 'async': '_fmt:json,p:1,ijn:' + str(params['pageno']), }) } ) ) if params['time_range'] in time_range_dict: if params['time_range'] in time_range_dict: query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) logger.debug("query_url --> %s", query_url) params['url'] = query_url params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['cookies']['CONSENT'] = "YES+" params['headers'].update(lang_info['headers']) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( params['headers']['User-Agent'] = 'NSTN/3.60.474802233.release Dalvik/2.1.0 (Linux; U; Android 12; US) gzip' 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' params['headers']['Accept'] = '*/*' ) return params return params Loading @@ -136,76 +89,34 @@ def response(resp): detect_google_sorry(resp) detect_google_sorry(resp) # convert the text to dom json_start = resp.text.find('{"ischj":') dom = html.fromstring(resp.text) json_data = loads(resp.text[json_start:]) img_bas64_map = scrap_out_thumbs(dom) img_src_script = eval_xpath_getindex( for item in json_data["ischj"]["metadata"]: dom, '//script[contains(., "AF_initDataCallback({key: ")]', 1).text result_item = { # parse results 'url': item["result"]["referrer_url"], # 'title': item["result"]["page_title"], # root element:: 'content': item["text_in_grid"]["snippet"], # <div id="islmp" ..> 'source': item["result"]["site_title"], # result div per image:: 'img_format': f'{item["original_image"]["width"]} x {item["original_image"]["height"]}', # <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..." 'img_src': item["original_image"]["url"], # The data-id matches to a item in a json-data structure in:: 'thumbnail_src': item["thumbnail"]["url"], # <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ... 'template': 'images.html', # In this structure the link to the origin PNG, JPG or whatever is given } # first link per image-div contains a <img> with the data-iid for bas64 encoded image data:: # <img class="rg_i Q4LuWd" data-iid="0" author = item["result"].get('iptc', {}).get('creator') # second link per image-div is the target link:: if author: # <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper"> result_item['author'] = ', '.join(author) # the second link also contains two div tags with the *description* and *publisher*:: # <div class="WGvvNb">The Sacrament of the Last Supper ...</div> copyright_notice = item["result"].get('iptc', {}).get('copyright_notice') # <div class="fxgdke">en.wikipedia.org</div> if copyright_notice: result_item['source'] += ' / ' + copyright_notice root = eval_xpath(dom, '//div[@id="islmp"]') if not root: file_size = item.get('gsa', {}).get('file_size') logger.error("did not find root element id='islmp'") if file_size: return results result_item['source'] += ' (%s)' % file_size root = root[0] results.append(result_item) for img_node in eval_xpath_list(root, './/img[contains(@class, "rg_i")]'): img_alt = eval_xpath_getindex(img_node, '@alt', 0) img_base64_id = eval_xpath(img_node, '@data-iid') if img_base64_id: img_base64_id = img_base64_id[0] thumbnail_src = img_bas64_map[img_base64_id] else: thumbnail_src = eval_xpath(img_node, '@src') if not thumbnail_src: thumbnail_src = eval_xpath(img_node, '@data-src') if thumbnail_src: thumbnail_src = thumbnail_src[0] else: thumbnail_src = '' link_node = eval_xpath_getindex(img_node, '../../../a[2]', 0) url = eval_xpath_getindex(link_node, '@href', 0) pub_nodes = eval_xpath(link_node, './div/div') pub_descr = img_alt pub_source = '' if pub_nodes: pub_descr = extract_text(pub_nodes[0]) pub_source = extract_text(pub_nodes[1]) img_src_id = eval_xpath_getindex(img_node, '../../../@data-id', 0) src_url = scrap_img_by_id(img_src_script, img_src_id) if not src_url: src_url = thumbnail_src results.append({ 'url': url, 'title': img_alt, 'content': pub_descr, 'source': pub_source, 'img_src': src_url, # 'img_format': img_format, 'thumbnail_src': thumbnail_src, 'template': 'images.html' }) return results return results