Loading searx/engines/google.py +47 −96 Original line number Diff line number Diff line # SPDX-License-Identifier: AGPL-3.0-or-later """Google (Web) For detailed description of the *REST-full* API see: `Query Parameter Definitions`_. .. _Query Parameter Definitions: https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions """ # pylint: disable=invalid-name, missing-function-docstring # pylint: disable=invalid-name, missing-function-docstring, too-many-branches from urllib.parse import urlencode from lxml import html Loading Loading @@ -108,8 +106,9 @@ filter_mapping = { # specific xpath variables # ------------------------ # google results are grouped into <div class="g" ../> results_xpath = '//div[@class="g"]' # google results are grouped into <div class="g ..." ../> results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]' results_xpath_mobile_ui = '//div[contains(@class, "g ")]' # google *sections* are no usual *results*, we ignore them g_section_with_header = './g-section-with-header' Loading @@ -121,8 +120,8 @@ title_xpath = './/h3[1]' # href=...> href_xpath = './/div[@class="yuRUbf"]//a/@href' # in the result group there is <div class="IsZvec" ../> containing he *content* content_xpath = './/div[@class="IsZvec"]' # in the result group there is <div class="VwiC3b ..." ../> containing the *content* content_xpath = './/div[contains(@class, "VwiC3b")]' # Suggestions are links placed in a *card-section*, we extract only the text # from the links not the links itself. Loading @@ -134,113 +133,42 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a' def get_lang_info(params, lang_list, custom_aliases, supported_any_language): """Composing various language properties for the google engines. This function is called by the various google engines (google itself, google-images, -news, -scholar, -videos). :param dict param: request parameters of the engine :param list lang_list: list of supported languages of the engine :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` :param dict lang_list: custom aliases for non standard language codes (used when calling :py:func:`searx.utils.match_language) :param bool supported_any_language: When a language is not specified, the language interpretation is left up to Google to decide how the search results should be delivered. This argument is ``True`` for the google engine and ``False`` for the other engines (google-images, -news, -scholar, -videos). :rtype: dict :returns: Py-Dictionary with the key/value pairs: language: Return value from :py:func:`searx.utils.match_language country: The country code (e.g. US, AT, CA, FR, DE ..) subdomain: Google subdomain :py:obj:`google_domains` that fits to the country code. params: Py-Dictionary with additional request arguments (can be passed to :py:func:`urllib.parse.urlencode`). headers: Py-Dictionary with additional HTTP headers (can be passed to request's headers) """ ret_val = { 'language' : None, 'country' : None, 'subdomain' : None, 'params' : {}, 'headers' : {}, } # language ... ret_val = {} _lang = params['language'] _any_language = _lang.lower() == 'all' if _any_language: _lang = 'en-US' language = match_language(_lang, lang_list, custom_aliases) ret_val['language'] = language # country ... # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...) _l = _lang.split('-') # the country code (US, AT, CA) if len(_l) == 2: country = _l[1] else: country = _l[0].upper() if country == 'EN': country = 'US' ret_val['country'] = country # subdomain ... # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR) lang_country = '%s-%s' % (language, country) # subdomain ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com') # params & headers lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..) # hl parameter: # https://developers.google.com/custom-search/docs/xml_results#hlsp The # Interface Language: # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages ret_val['params']['hl'] = lang_list.get(lang_country, language) # lr parameter: # The lr (language restrict) parameter restricts search results to # documents written in a particular language. # https://developers.google.com/custom-search/docs/xml_results#lrsp # Language Collection Values: # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections ret_val['params'] = {} ret_val['headers'] = {} if _any_language and supported_any_language: # interpretation is left up to Google (based on whoogle) # # - add parameter ``source=lnt`` # - don't use parameter ``lr`` # - don't add a ``Accept-Language`` HTTP header. # based on whoogle ret_val['params']['source'] = 'lnt' else: # restricts search results to documents written in a particular # language. ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 ret_val['headers']['Accept-Language'] = ','.join([ lang_country, Loading @@ -249,6 +177,18 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): '*;q=0.5', ]) # lr parameter: # https://developers.google.com/custom-search/docs/xml_results#lrsp # Language Collection Values: # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language ret_val['params']['hl'] = lang_country if lang_country in lang_list else language # hl parameter: # https://developers.google.com/custom-search/docs/xml_results#hlsp The # Interface Language: # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages return ret_val def detect_google_sorry(resp): Loading Loading @@ -287,8 +227,11 @@ def request(query, params): query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) logger.debug("query_url --> %s", query_url) params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) if use_mobile_ui: params['headers']['Accept'] = '*/*' Loading Loading @@ -330,7 +273,12 @@ def response(resp): logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): _results_xpath = results_xpath if use_mobile_ui: _results_xpath = results_xpath_mobile_ui for result in eval_xpath_list(dom, _results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): Loading @@ -341,24 +289,27 @@ def response(resp): title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug('ingoring <div class="g" ../> section: missing title') logger.debug('ingoring item from the result_xpath list: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0, None) if url is None: continue content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) if content is None: logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title) continue logger.debug('add link to results: %s', title) results.append({ 'url': url, 'title': title, 'content': content }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(result, pretty_print=True)) # import pdb # pdb.set_trace() continue # parse suggestion Loading searx/engines/google_scholar.py +27 −20 Original line number Diff line number Diff line Loading @@ -82,27 +82,32 @@ def request(query, params): params, supported_languages, language_aliases, False ) logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) # subdomain is: scholar.google.xy lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({ query_url = ( 'https://' + lang_info['subdomain'] + '/scholar' + "?" + urlencode( { 'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset, }) } ) ) query_url += time_range_url(params) logger.debug("query_url --> %s", query_url) params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' # params['google_subdomain'] = subdomain return params Loading Loading @@ -139,11 +144,13 @@ def response(resp): if pub_type: title = title + " " + pub_type results.append({ results.append( { 'url': url, 'title': title, 'content': content, }) } ) # parse suggestion for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): Loading searx/engines/google_videos.py +54 −50 Original line number Diff line number Diff line Loading @@ -36,13 +36,9 @@ from searx.engines.google import ( get_lang_info, time_range_dict, filter_mapping, results_xpath, g_section_with_header, title_xpath, href_xpath, content_xpath, suggestion_xpath, spelling_suggestion_xpath, detect_google_sorry, ) Loading @@ -53,6 +49,7 @@ from searx.engines.google import ( ) # pylint: enable=unused-import logger = logger.getChild('google videos') # about about = { "website": 'https://www.google.com', Loading @@ -63,11 +60,9 @@ about = { "results": 'HTML', } logger = logger.getChild('google video') # engine dependent config categories = ['videos'] categories = ['videos', 'web'] paging = False language_support = True use_locale_domain = True Loading @@ -76,16 +71,32 @@ safesearch = True RE_CACHE = {} def _re(regexpr): """returns compiled regular expression""" RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) return RE_CACHE[regexpr] def scrap_out_thumbs_src(dom): ret_val = {} thumb_name = 'dimg_' for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): _script = script.text # "dimg_35":"https://i.ytimg.c....", _dimurl = _re("s='([^']*)").findall(_script) for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script): v = v.replace(r'\u003d', '=') v = v.replace(r'\u0026', '&') ret_val[k] = v logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) return ret_val def scrap_out_thumbs(dom): """Scrap out thumbnail data from <script> tags. """ """Scrap out thumbnail data from <script> tags.""" ret_val = {} thumb_name = 'vidthumb' thumb_name = 'dimg_' for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): _script = script.text Loading @@ -95,20 +106,11 @@ def scrap_out_thumbs(dom): if not _imgdata: continue # var ii=['vidthumb4','vidthumb7'] # var ii=['dimg_17'] for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): # At least the equal sign in the URL needs to be decoded ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") # {google.ldidly=-1;google.ldi={"vidthumb8":"https://... for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): _script = script.text for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) : match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val) if match: # At least the equal sign in the URL needs to be decoded ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=") logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) return ret_val Loading @@ -121,27 +123,30 @@ def request(query, params): params, supported_languages, language_aliases, False ) query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ query_url = ( 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode( { 'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8", }) } ) ) if params['time_range'] in time_range_dict: query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) logger.debug("query_url --> %s", query_url) params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' return params Loading @@ -154,31 +159,33 @@ def response(resp): # convert the text to dom dom = html.fromstring(resp.text) vidthumb_imgdata = scrap_out_thumbs(dom) thumbs_src = scrap_out_thumbs_src(dom) logger.debug(str(thumbs_src)) # parse results for result in eval_xpath_list(dom, results_xpath): for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): # google *sections* # ignore google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) url = eval_xpath_getindex(result, href_xpath, 0) c_node = eval_xpath_getindex(result, content_xpath, 0) # <img id="vidthumb1" ...> img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None) # ingnore articles without an image id / e.g. news articles img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) if img_id is None: logger.error("no img_id found in item %s (news article?)", len(results) + 1) continue img_src = vidthumb_imgdata.get(img_id, None) if not img_src: logger.error("no vidthumb imgdata for: %s" % img_id) img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0) img_src = thumbs_src.get(img_id, "") length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]')) content = extract_text(eval_xpath(c_node, './/div[2]/span')) pub_info = extract_text(eval_xpath(c_node, './/div[2]/div')) title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span')) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) content = extract_text(c_node) pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) results.append({ 'url': url, Loading @@ -195,7 +202,4 @@ def response(resp): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) return results Loading
searx/engines/google.py +47 −96 Original line number Diff line number Diff line # SPDX-License-Identifier: AGPL-3.0-or-later """Google (Web) For detailed description of the *REST-full* API see: `Query Parameter Definitions`_. .. _Query Parameter Definitions: https://developers.google.com/custom-search/docs/xml_results#WebSearch_Query_Parameter_Definitions """ # pylint: disable=invalid-name, missing-function-docstring # pylint: disable=invalid-name, missing-function-docstring, too-many-branches from urllib.parse import urlencode from lxml import html Loading Loading @@ -108,8 +106,9 @@ filter_mapping = { # specific xpath variables # ------------------------ # google results are grouped into <div class="g" ../> results_xpath = '//div[@class="g"]' # google results are grouped into <div class="g ..." ../> results_xpath = '//div[@id="search"]//div[contains(@class, "g ")]' results_xpath_mobile_ui = '//div[contains(@class, "g ")]' # google *sections* are no usual *results*, we ignore them g_section_with_header = './g-section-with-header' Loading @@ -121,8 +120,8 @@ title_xpath = './/h3[1]' # href=...> href_xpath = './/div[@class="yuRUbf"]//a/@href' # in the result group there is <div class="IsZvec" ../> containing he *content* content_xpath = './/div[@class="IsZvec"]' # in the result group there is <div class="VwiC3b ..." ../> containing the *content* content_xpath = './/div[contains(@class, "VwiC3b")]' # Suggestions are links placed in a *card-section*, we extract only the text # from the links not the links itself. Loading @@ -134,113 +133,42 @@ spelling_suggestion_xpath = '//div[@class="med"]/p/a' def get_lang_info(params, lang_list, custom_aliases, supported_any_language): """Composing various language properties for the google engines. This function is called by the various google engines (google itself, google-images, -news, -scholar, -videos). :param dict param: request parameters of the engine :param list lang_list: list of supported languages of the engine :py:obj:`ENGINES_LANGUAGES[engine-name] <searx.data.ENGINES_LANGUAGES>` :param dict lang_list: custom aliases for non standard language codes (used when calling :py:func:`searx.utils.match_language) :param bool supported_any_language: When a language is not specified, the language interpretation is left up to Google to decide how the search results should be delivered. This argument is ``True`` for the google engine and ``False`` for the other engines (google-images, -news, -scholar, -videos). :rtype: dict :returns: Py-Dictionary with the key/value pairs: language: Return value from :py:func:`searx.utils.match_language country: The country code (e.g. US, AT, CA, FR, DE ..) subdomain: Google subdomain :py:obj:`google_domains` that fits to the country code. params: Py-Dictionary with additional request arguments (can be passed to :py:func:`urllib.parse.urlencode`). headers: Py-Dictionary with additional HTTP headers (can be passed to request's headers) """ ret_val = { 'language' : None, 'country' : None, 'subdomain' : None, 'params' : {}, 'headers' : {}, } # language ... ret_val = {} _lang = params['language'] _any_language = _lang.lower() == 'all' if _any_language: _lang = 'en-US' language = match_language(_lang, lang_list, custom_aliases) ret_val['language'] = language # country ... # the requested language from params (en, en-US, de, de-AT, fr, fr-CA, ...) _l = _lang.split('-') # the country code (US, AT, CA) if len(_l) == 2: country = _l[1] else: country = _l[0].upper() if country == 'EN': country = 'US' ret_val['country'] = country # subdomain ... # the combination (en-US, en-EN, de-DE, de-AU, fr-FR, fr-FR) lang_country = '%s-%s' % (language, country) # subdomain ret_val['subdomain'] = 'www.' + google_domains.get(country.upper(), 'google.com') # params & headers lang_country = '%s-%s' % (language, country) # (en-US, en-EN, de-DE, de-AU, fr-FR ..) # hl parameter: # https://developers.google.com/custom-search/docs/xml_results#hlsp The # Interface Language: # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages ret_val['params']['hl'] = lang_list.get(lang_country, language) # lr parameter: # The lr (language restrict) parameter restricts search results to # documents written in a particular language. # https://developers.google.com/custom-search/docs/xml_results#lrsp # Language Collection Values: # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections ret_val['params'] = {} ret_val['headers'] = {} if _any_language and supported_any_language: # interpretation is left up to Google (based on whoogle) # # - add parameter ``source=lnt`` # - don't use parameter ``lr`` # - don't add a ``Accept-Language`` HTTP header. # based on whoogle ret_val['params']['source'] = 'lnt' else: # restricts search results to documents written in a particular # language. ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language # Accept-Language: fr-CH, fr;q=0.8, en;q=0.6, *;q=0.5 ret_val['headers']['Accept-Language'] = ','.join([ lang_country, Loading @@ -249,6 +177,18 @@ def get_lang_info(params, lang_list, custom_aliases, supported_any_language): '*;q=0.5', ]) # lr parameter: # https://developers.google.com/custom-search/docs/xml_results#lrsp # Language Collection Values: # https://developers.google.com/custom-search/docs/xml_results_appendices#languageCollections ret_val['params']['lr'] = "lang_" + lang_country if lang_country in lang_list else language ret_val['params']['hl'] = lang_country if lang_country in lang_list else language # hl parameter: # https://developers.google.com/custom-search/docs/xml_results#hlsp The # Interface Language: # https://developers.google.com/custom-search/docs/xml_results_appendices#interfaceLanguages return ret_val def detect_google_sorry(resp): Loading Loading @@ -287,8 +227,11 @@ def request(query, params): query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) logger.debug("query_url --> %s", query_url) params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) if use_mobile_ui: params['headers']['Accept'] = '*/*' Loading Loading @@ -330,7 +273,12 @@ def response(resp): logger.error(e, exc_info=True) # parse results for result in eval_xpath_list(dom, results_xpath): _results_xpath = results_xpath if use_mobile_ui: _results_xpath = results_xpath_mobile_ui for result in eval_xpath_list(dom, _results_xpath): # google *sections* if extract_text(eval_xpath(result, g_section_with_header)): Loading @@ -341,24 +289,27 @@ def response(resp): title_tag = eval_xpath_getindex(result, title_xpath, 0, default=None) if title_tag is None: # this not one of the common google results *section* logger.debug('ingoring <div class="g" ../> section: missing title') logger.debug('ingoring item from the result_xpath list: missing title') continue title = extract_text(title_tag) url = eval_xpath_getindex(result, href_xpath, 0, None) if url is None: continue content = extract_text(eval_xpath_getindex(result, content_xpath, 0, default=None), allow_none=True) if content is None: logger.debug('ingoring item from the result_xpath list: missing content of title "%s"', title) continue logger.debug('add link to results: %s', title) results.append({ 'url': url, 'title': title, 'content': content }) except Exception as e: # pylint: disable=broad-except logger.error(e, exc_info=True) # from lxml import etree # logger.debug(etree.tostring(result, pretty_print=True)) # import pdb # pdb.set_trace() continue # parse suggestion Loading
searx/engines/google_scholar.py +27 −20 Original line number Diff line number Diff line Loading @@ -82,27 +82,32 @@ def request(query, params): params, supported_languages, language_aliases, False ) logger.debug("HTTP header Accept-Language --> %s", lang_info['headers']['Accept-Language']) # subdomain is: scholar.google.xy lang_info['subdomain'] = lang_info['subdomain'].replace("www.", "scholar.") query_url = 'https://'+ lang_info['subdomain'] + '/scholar' + "?" + urlencode({ query_url = ( 'https://' + lang_info['subdomain'] + '/scholar' + "?" + urlencode( { 'q': query, **lang_info['params'], 'ie': "utf8", 'oe': "utf8", 'start': offset, }) } ) ) query_url += time_range_url(params) logger.debug("query_url --> %s", query_url) params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' # params['google_subdomain'] = subdomain return params Loading Loading @@ -139,11 +144,13 @@ def response(resp): if pub_type: title = title + " " + pub_type results.append({ results.append( { 'url': url, 'title': title, 'content': content, }) } ) # parse suggestion for suggestion in eval_xpath(dom, '//div[contains(@class, "gs_qsuggest_wrap")]//li//a'): Loading
searx/engines/google_videos.py +54 −50 Original line number Diff line number Diff line Loading @@ -36,13 +36,9 @@ from searx.engines.google import ( get_lang_info, time_range_dict, filter_mapping, results_xpath, g_section_with_header, title_xpath, href_xpath, content_xpath, suggestion_xpath, spelling_suggestion_xpath, detect_google_sorry, ) Loading @@ -53,6 +49,7 @@ from searx.engines.google import ( ) # pylint: enable=unused-import logger = logger.getChild('google videos') # about about = { "website": 'https://www.google.com', Loading @@ -63,11 +60,9 @@ about = { "results": 'HTML', } logger = logger.getChild('google video') # engine dependent config categories = ['videos'] categories = ['videos', 'web'] paging = False language_support = True use_locale_domain = True Loading @@ -76,16 +71,32 @@ safesearch = True RE_CACHE = {} def _re(regexpr): """returns compiled regular expression""" RE_CACHE[regexpr] = RE_CACHE.get(regexpr, re.compile(regexpr)) return RE_CACHE[regexpr] def scrap_out_thumbs_src(dom): ret_val = {} thumb_name = 'dimg_' for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): _script = script.text # "dimg_35":"https://i.ytimg.c....", _dimurl = _re("s='([^']*)").findall(_script) for k, v in _re('(' + thumb_name + '[0-9]*)":"(http[^"]*)').findall(_script): v = v.replace(r'\u003d', '=') v = v.replace(r'\u0026', '&') ret_val[k] = v logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) return ret_val def scrap_out_thumbs(dom): """Scrap out thumbnail data from <script> tags. """ """Scrap out thumbnail data from <script> tags.""" ret_val = {} thumb_name = 'vidthumb' thumb_name = 'dimg_' for script in eval_xpath_list(dom, '//script[contains(., "_setImagesSrc")]'): _script = script.text Loading @@ -95,20 +106,11 @@ def scrap_out_thumbs(dom): if not _imgdata: continue # var ii=['vidthumb4','vidthumb7'] # var ii=['dimg_17'] for _vidthumb in _re(r"(%s\d+)" % thumb_name).findall(_script): # At least the equal sign in the URL needs to be decoded ret_val[_vidthumb] = _imgdata[0].replace(r"\x3d", "=") # {google.ldidly=-1;google.ldi={"vidthumb8":"https://... for script in eval_xpath_list(dom, '//script[contains(., "google.ldi={")]'): _script = script.text for key_val in _re(r'"%s\d+\":\"[^\"]*"' % thumb_name).findall( _script) : match = _re(r'"(%s\d+)":"(.*)"' % thumb_name).search(key_val) if match: # At least the equal sign in the URL needs to be decoded ret_val[match.group(1)] = match.group(2).replace(r"\u003d", "=") logger.debug("found %s imgdata for: %s", thumb_name, ret_val.keys()) return ret_val Loading @@ -121,27 +123,30 @@ def request(query, params): params, supported_languages, language_aliases, False ) query_url = 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode({ query_url = ( 'https://' + lang_info['subdomain'] + '/search' + "?" + urlencode( { 'q': query, 'tbm': "vid", **lang_info['params'], 'ie': "utf8", 'oe': "utf8", }) } ) ) if params['time_range'] in time_range_dict: query_url += '&' + urlencode({'tbs': 'qdr:' + time_range_dict[params['time_range']]}) if params['safesearch']: query_url += '&' + urlencode({'safe': filter_mapping[params['safesearch']]}) logger.debug("query_url --> %s", query_url) params['url'] = query_url logger.debug("HTTP header Accept-Language --> %s", lang_info.get('Accept-Language')) params['headers'].update(lang_info['headers']) params['headers']['Accept'] = ( 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' ) params['headers']['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8' return params Loading @@ -154,31 +159,33 @@ def response(resp): # convert the text to dom dom = html.fromstring(resp.text) vidthumb_imgdata = scrap_out_thumbs(dom) thumbs_src = scrap_out_thumbs_src(dom) logger.debug(str(thumbs_src)) # parse results for result in eval_xpath_list(dom, results_xpath): for result in eval_xpath_list(dom, '//div[contains(@class, "g ")]'): # google *sections* # ignore google *sections* if extract_text(eval_xpath(result, g_section_with_header)): logger.debug("ingoring <g-section-with-header>") continue title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) url = eval_xpath_getindex(result, href_xpath, 0) c_node = eval_xpath_getindex(result, content_xpath, 0) # <img id="vidthumb1" ...> img_id = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@id', 0, default=None) # ingnore articles without an image id / e.g. news articles img_id = eval_xpath_getindex(result, './/g-img/img/@id', 0, default=None) if img_id is None: logger.error("no img_id found in item %s (news article?)", len(results) + 1) continue img_src = vidthumb_imgdata.get(img_id, None) if not img_src: logger.error("no vidthumb imgdata for: %s" % img_id) img_src = eval_xpath_getindex(c_node, './div[1]//a/g-img/img/@src', 0) img_src = thumbs_src.get(img_id, "") length = extract_text(eval_xpath(c_node, './/div[1]//a/div[3]')) content = extract_text(eval_xpath(c_node, './/div[2]/span')) pub_info = extract_text(eval_xpath(c_node, './/div[2]/div')) title = extract_text(eval_xpath_getindex(result, title_xpath, 0)) url = eval_xpath_getindex(result, './/div[@class="dXiKIc"]//a/@href', 0) length = extract_text(eval_xpath(result, './/div[contains(@class, "P7xzyf")]/span/span')) c_node = eval_xpath_getindex(result, './/div[@class="Uroaid"]', 0) content = extract_text(c_node) pub_info = extract_text(eval_xpath(result, './/div[@class="Zg1NU"]')) results.append({ 'url': url, Loading @@ -195,7 +202,4 @@ def response(resp): # append suggestion results.append({'suggestion': extract_text(suggestion)}) for correction in eval_xpath_list(dom, spelling_suggestion_xpath): results.append({'correction': extract_text(correction)}) return results