# SPDX-License-Identifier: AGPL-3.0-or-later
"""Google (Images)

:website:     https://images.google.com (redirected to subdomain www.)
:provide-api: yes (https://developers.google.com/custom-search/)
:using-api:   not the official, since it needs registration to another service
:results:     HTML
:stable:      no
:template:    images.html
:parse:       url, title, content, source, thumbnail_src, img_src

For detailed description of the *REST-full* API see: `Query Parameter
Definitions`_.

.. _admonition:: Content-Security-Policy (CSP)

   This engine needs to allow images from the `data URLs`_ (prefixed with the
   ``data:`` scheme)::

     Header set Content-Security-Policy "img-src 'self' data: ;"

.. _Query Parameter Definitions:
   https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions
"""

from flask_babel import gettext
from lxml import html

from searx import logger
from searx.url_utils import urlencode, urlparse
from searx.utils import eval_xpath
from searx.engines.xpath import extract_text

# pylint: disable=unused-import
from searx.engines.google import (
    supported_languages_url,
    _fetch_supported_languages,
)
# pylint: enable=unused-import

from searx.engines.google import (
    get_lang_country,
    google_domains,
    time_range_dict,
)

logger = logger.getChild('google images')

# engine dependent config

categories = ['images']
paging = False
language_support = True
use_locale_domain = True
time_range_support = True
safesearch = True

# maps searx's safesearch levels (0/1/2) onto google's ``safe`` URL parameter
filter_mapping = {
    0: 'images',
    1: 'active',
    2: 'active'
}


def scrap_out_thumbs(dom):
    """Scrap out thumbnail data from <script> tags.

    Returns a dict mapping google's thumbnail number (``data-iid``) to the
    base64 encoded ``data:`` image URL embedded in the page's scripts.
    """
    ret_val = dict()
    for script in eval_xpath(dom, '//script[contains(., "_setImgSrc(")]'):
        _script = script.text
        # _setImgSrc('0','data:image\/jpeg;base64,\/9j\/4AAQSkZJR ....');
        _thumb_no, _img_data = _script[len("_setImgSrc("):-2].split(",", 1)
        _thumb_no = _thumb_no.replace("'", "")
        _img_data = _img_data.replace("'", "")
        _img_data = _img_data.replace(r"\/", r"/")
        # google escapes '=' padding as \x3d inside the script text
        ret_val[_thumb_no] = _img_data.replace(r"\x3d", "=")
    return ret_val


def request(query, params):
    """Google-Image search request"""

    language, country, lang_country = get_lang_country(
        # pylint: disable=undefined-variable
        params, supported_languages, language_aliases
    )
    subdomain = 'www.' + google_domains.get(country.upper(), 'google.com')

    query_url = 'https://' + subdomain + '/search' + "?" + urlencode({
        'q': query,
        'tbm': "isch",
        'hl': lang_country,
        'lr': "lang_" + language,
        'ie': "utf8",
        'oe': "utf8",
        'num': 30,
    })

    if params['time_range'] in time_range_dict:
        query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]})
    if params['safesearch']:
        query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]})

    params['url'] = query_url
    logger.debug("query_url --> %s", query_url)

    params['headers']['Accept-Language'] = (
        "%s,%s;q=0.8,%s;q=0.5" % (lang_country, language, language))
    logger.debug(
        "HTTP Accept-Language --> %s", params['headers']['Accept-Language'])
    params['headers']['Accept'] = (
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8'
    )
    # params['google_subdomain'] = subdomain
    return params


def response(resp):
    """Get response from google's search request"""
    results = []

    # detect google sorry
    resp_url = urlparse(resp.url)
    if resp_url.path.startswith('/sorry'):
        raise RuntimeWarning(gettext('CAPTCHA required'))

    # which subdomain ?
    # subdomain = resp.search_params.get('google_subdomain')

    # convert the text to dom
    dom = html.fromstring(resp.text)
    img_base64_map = scrap_out_thumbs(dom)

    # parse results
    #
    # root element::
    #     <div id="islmp" ..>
    # result div per image::
    #     <div jsmodel="tTXmib"> / <div jsaction="..." data-id="..."
    # The data-id matches to a item in a json-data structure in::
    #     <script nonce="I+vqelcy/01CKiBJi5Z1Ow">AF_initDataCallback({key: 'ds:1', ... data:function(){return [ ...
    # In this structure the link to the origin PNG, JPG or whatever is given
    # (we do not blow out the link there, you could still implement that)
    # first link per image-div contains a <img> with the data-iid for base64 encoded image data::
    #     <img class="rg_i Q4LuWd" data-iid="0"
    # second link per image-div is the target link::
    #     <a class="VFACy kGQAp" href="https://en.wikipedia.org/wiki/The_Sacrament_of_the_Last_Supper">
    # the second link also contains two div tags with the *description* and *publisher*::
    #     <div class="WGvvNb">The Sacrament of the Last Supper ...</div>
    #     <div class="fxgdke">en.wikipedia.org</div>

    root = eval_xpath(dom, '//div[@id="islmp"]')
    if not root:
        logger.error("did not find root element id='islmp'")
        return results

    root = root[0]
    for img_node in eval_xpath(root, './/img[contains(@class, "rg_i")]'):

        try:
            img_alt = eval_xpath(img_node, '@alt')[0]

            img_base64_id = eval_xpath(img_node, '@data-iid')
            if img_base64_id:
                img_base64_id = img_base64_id[0]
                thumbnail_src = img_base64_map[img_base64_id]
            else:
                thumbnail_src = eval_xpath(img_node, '@src')
                if not thumbnail_src:
                    thumbnail_src = eval_xpath(img_node, '@data-src')
                if thumbnail_src:
                    thumbnail_src = thumbnail_src[0]
                else:
                    thumbnail_src = ''

            link_node = eval_xpath(img_node, '../../../a[2]')[0]
            url = eval_xpath(link_node, '@href')[0]

            pub_nodes = eval_xpath(link_node, './div/div')
            pub_descr = img_alt
            pub_source = ''
            if pub_nodes:
                pub_descr = extract_text(pub_nodes[0])
                pub_source = extract_text(pub_nodes[1])

            results.append({
                'url': url,
                'title': img_alt,
                'content': pub_descr,
                'source': pub_source,
                'img_src': url,
                # 'img_format': img_format,
                'thumbnail_src': thumbnail_src,
                'template': 'images.html'
            })
        except Exception as e:  # pylint: disable=broad-except
            logger.error(e, exc_info=True)
            # from lxml import etree
            # logger.debug(etree.tostring(img_node, pretty_print=True))
            # import pdb
            # pdb.set_trace()
            continue

    return results