Loading searx/engines/yahoo_news.py +18 −7 Original line number Diff line number Diff line Loading @@ -23,15 +23,15 @@ paging = True language_support = True # search-url search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}' # noqa search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}' # noqa # specific xpath variables results_xpath = '//div[@class="res"]' results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' url_xpath = './/h3/a/@href' title_xpath = './/h3/a' content_xpath = './/div[@class="abstr"]' publishedDate_xpath = './/span[@class="timestamp"]' suggestion_xpath = '//div[@id="satat"]//a' content_xpath = './/div[@class="compText"]' publishedDate_xpath = './/span[contains(@class,"tri")]' suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' # do search-request Loading @@ -48,11 +48,18 @@ def request(query, params): lang=language) # TODO required? params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\ .format(lang=language) return params def sanitize_url(url): if ".yahoo.com/" in url: return re.sub(u"\;\_ylt\=.+$", "", url) else: return url # get response from search-request def response(resp): results = [] Loading @@ -61,13 +68,17 @@ def response(resp): # parse results for result in dom.xpath(results_xpath): url = parse_url(extract_url(result.xpath(url_xpath), search_url)) urls = result.xpath(url_xpath) if len(urls) != 1: continue url = sanitize_url(parse_url(extract_url(urls, search_url))) title = extract_text(result.xpath(title_xpath)[0]) content = extract_text(result.xpath(content_xpath)[0]) # parse publishedDate publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) # still useful ? if re.match("^[0-9]+ minute(s|) ago$", publishedDate): publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) # noqa else: Loading searx/tests/engines/test_yahoo_news.py +71 −82 Original line number Diff line number Diff line Loading @@ -39,8 +39,9 @@ class TestYahooNewsEngine(SearxTestCase): self.assertEqual(yahoo_news.response(response), []) html = """ <div class="res"> <div> <ol class=" reg searchCenterMiddle"> <li class="first"> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -48,10 +49,14 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo! Finance</span> <span class="timestamp">Feb 03 09:45am</span> <div class="abstr"> <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">May 01 10:00 AM</span> </div> <div class="compText"> This is the content </div> </li> </div> """ response = mock.Mock(text=html) Loading @@ -63,8 +68,9 @@ class TestYahooNewsEngine(SearxTestCase): self.assertEqual(results[0]['content'], 'This is the content') html = """ <div class="res"> <div> <ol class=" reg searchCenterMiddle"> <li class="first"> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -72,13 +78,16 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo!</span> <span class="timestamp">2 hours, 22 minutes ago</span> <div class="abstr"> This is the content <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">2 hours, 22 minutes ago</span> </div> <div class="compText"> This is the content </div> <div class="res"> <div> </li> <li> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -86,13 +95,16 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo!</span> <span class="timestamp">22 minutes ago</span> <div class="abstr"> This is the content <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">22 minutes ago</span> </div> <div class="compText"> This is the content </div> <div class="res"> <div> </li> <li> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -100,11 +112,15 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo!</span> <span class="timestamp">Feb 03 09:45am 1900</span> <div class="abstr"> This is the content <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">Feb 03 09:45AM 1900</span> </div> <div class="compText"> This is the content </div> </li> </ol> """ response = mock.Mock(text=html) results = yahoo_news.response(response) Loading @@ -114,30 +130,3 @@ class TestYahooNewsEngine(SearxTestCase): self.assertEqual(results[0]['url'], 'http://this.is.the.url/') self.assertEqual(results[0]['content'], 'This is the content') self.assertEqual(results[2]['publishedDate'].year, datetime.now().year) html = """ <li class="b_algo" u="0|5109|4755453613245655|UAGjXgIrPH5yh-o5oNHRx_3Zta87f_QO"> <div Class="sa_mc"> <div class="sb_tlst"> <h2> <a href="http://this.should.be.the.link/" h="ID=SERP,5124.1"> <strong>This</strong> should be the title</a> </h2> </div> <div class="sb_meta"> <cite> <strong>this</strong>.meta.com</cite> <span class="c_tlbxTrg"> <span class="c_tlbxH" H="BASE:CACHEDPAGEDEFAULT" K="SERP,5125.1"> </span> </span> </div> <p> <strong>This</strong> should be the content.</p> </div> </li> """ response = mock.Mock(text=html) results = yahoo_news.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 0) Loading
searx/engines/yahoo_news.py +18 −7 Original line number Diff line number Diff line Loading @@ -23,15 +23,15 @@ paging = True language_support = True # search-url search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&fl=1&vl=lang_{lang}' # noqa search_url = 'https://news.search.yahoo.com/search?{query}&b={offset}&{lang}=uh3_news_web_gs_1&pz=10&xargs=0&vl=lang_{lang}' # noqa # specific xpath variables results_xpath = '//div[@class="res"]' results_xpath = '//ol[contains(@class,"searchCenterMiddle")]//li' url_xpath = './/h3/a/@href' title_xpath = './/h3/a' content_xpath = './/div[@class="abstr"]' publishedDate_xpath = './/span[@class="timestamp"]' suggestion_xpath = '//div[@id="satat"]//a' content_xpath = './/div[@class="compText"]' publishedDate_xpath = './/span[contains(@class,"tri")]' suggestion_xpath = '//div[contains(@class,"VerALSOTRY")]//a' # do search-request Loading @@ -48,11 +48,18 @@ def request(query, params): lang=language) # TODO required? params['cookies']['sB'] = 'fl=1&vl=lang_{lang}&sh=1&rw=new&v=1'\ params['cookies']['sB'] = '"v=1&vm=p&fl=1&vl=lang_{lang}&sh=1&pn=10&rw=new'\ .format(lang=language) return params def sanitize_url(url): if ".yahoo.com/" in url: return re.sub(u"\;\_ylt\=.+$", "", url) else: return url # get response from search-request def response(resp): results = [] Loading @@ -61,13 +68,17 @@ def response(resp): # parse results for result in dom.xpath(results_xpath): url = parse_url(extract_url(result.xpath(url_xpath), search_url)) urls = result.xpath(url_xpath) if len(urls) != 1: continue url = sanitize_url(parse_url(extract_url(urls, search_url))) title = extract_text(result.xpath(title_xpath)[0]) content = extract_text(result.xpath(content_xpath)[0]) # parse publishedDate publishedDate = extract_text(result.xpath(publishedDate_xpath)[0]) # still useful ? if re.match("^[0-9]+ minute(s|) ago$", publishedDate): publishedDate = datetime.now() - timedelta(minutes=int(re.match(r'\d+', publishedDate).group())) # noqa else: Loading
searx/tests/engines/test_yahoo_news.py +71 −82 Original line number Diff line number Diff line Loading @@ -39,8 +39,9 @@ class TestYahooNewsEngine(SearxTestCase): self.assertEqual(yahoo_news.response(response), []) html = """ <div class="res"> <div> <ol class=" reg searchCenterMiddle"> <li class="first"> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -48,10 +49,14 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo! Finance</span> <span class="timestamp">Feb 03 09:45am</span> <div class="abstr"> <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">May 01 10:00 AM</span> </div> <div class="compText"> This is the content </div> </li> </div> """ response = mock.Mock(text=html) Loading @@ -63,8 +68,9 @@ class TestYahooNewsEngine(SearxTestCase): self.assertEqual(results[0]['content'], 'This is the content') html = """ <div class="res"> <div> <ol class=" reg searchCenterMiddle"> <li class="first"> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -72,13 +78,16 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo!</span> <span class="timestamp">2 hours, 22 minutes ago</span> <div class="abstr"> This is the content <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">2 hours, 22 minutes ago</span> </div> <div class="compText"> This is the content </div> <div class="res"> <div> </li> <li> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -86,13 +95,16 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo!</span> <span class="timestamp">22 minutes ago</span> <div class="abstr"> This is the content <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">22 minutes ago</span> </div> <div class="compText"> This is the content </div> <div class="res"> <div> </li> <li> <div class="compTitle"> <h3> <a class="yschttl spt" href="http://this.is.the.url" target="_blank"> This is Loading @@ -100,11 +112,15 @@ class TestYahooNewsEngine(SearxTestCase): </a> </h3> </div> <span class="url">Business via Yahoo!</span> <span class="timestamp">Feb 03 09:45am 1900</span> <div class="abstr"> This is the content <div> <span class="cite">Business via Yahoo!</span> <span class="tri fc-2nd ml-10">Feb 03 09:45AM 1900</span> </div> <div class="compText"> This is the content </div> </li> </ol> """ response = mock.Mock(text=html) results = yahoo_news.response(response) Loading @@ -114,30 +130,3 @@ class TestYahooNewsEngine(SearxTestCase): self.assertEqual(results[0]['url'], 'http://this.is.the.url/') self.assertEqual(results[0]['content'], 'This is the content') self.assertEqual(results[2]['publishedDate'].year, datetime.now().year) html = """ <li class="b_algo" u="0|5109|4755453613245655|UAGjXgIrPH5yh-o5oNHRx_3Zta87f_QO"> <div Class="sa_mc"> <div class="sb_tlst"> <h2> <a href="http://this.should.be.the.link/" h="ID=SERP,5124.1"> <strong>This</strong> should be the title</a> </h2> </div> <div class="sb_meta"> <cite> <strong>this</strong>.meta.com</cite> <span class="c_tlbxTrg"> <span class="c_tlbxH" H="BASE:CACHEDPAGEDEFAULT" K="SERP,5125.1"> </span> </span> </div> <p> <strong>This</strong> should be the content.</p> </div> </li> """ response = mock.Mock(text=html) results = yahoo_news.response(response) self.assertEqual(type(results), list) self.assertEqual(len(results), 0)