[fix] bing_news based on RSS output format (62cc2a56) · Commits · e / infra / spot

searx/engines/bing_news.py

+60 −51

Original line number	Diff line number	Diff line
		@@ -6,18 +6,17 @@
		max. 5000 query/month

		@using-api no (because of query limit)
		@results HTML (using search portal)
		@stable no (HTML can change)
		@parse url, title, content, publishedDate
		@results RSS (using search portal)
		@stable yes (except perhaps for the images)
		@parse url, title, content, publishedDate, thumbnail
		"""

		from urllib import urlencode
		from cgi import escape
		from lxml import html
		from datetime import datetime, timedelta
		from urlparse import urlparse, parse_qsl
		from datetime import datetime
		from dateutil import parser
		import re
		from searx.engines.xpath import extract_text
		from lxml import etree
		from searx.utils import list_get

		# engine dependent config
		categories = ['news']
		@@ -26,7 +25,25 @@ language_support = True

		# search-url
		base_url = 'https://www.bing.com/'
		search_string = 'news/search?{query}&first={offset}'
		search_string = 'news/search?{query}&first={offset}&format=RSS'


		# remove click
		def url_cleanup(url_string):
		    """Strip Bing's click-tracking redirect from a result URL.

		    Links of the form ``https://www.bing.com/news/apiclick.aspx?...&url=<target>``
		    are replaced by the decoded ``url`` query parameter. Every other URL is
		    returned unchanged.

		    :param url_string: URL taken from the ``<link>`` element of an RSS item.
		    :returns: the target URL, or ``url_string`` itself when it is not a
		        click-tracking link or carries no ``url`` parameter.
		    """
		    parsed_url = urlparse(url_string)
		    is_click_link = (parsed_url.netloc == 'www.bing.com'
		                     and parsed_url.path == '/news/apiclick.aspx')
		    if is_click_link:
		        query = dict(parse_qsl(parsed_url.query))
		        # Fall back to the tracking URL rather than None, so the caller
		        # always gets a usable string for the result's 'url' field.
		        return query.get('url', url_string)
		    return url_string


		# replace the http://*bing4.com/th?id=... by https://www.bing.com/th?id=...
		def image_url_cleanup(url_string):
		    """Rewrite a ``*bing4.com/th?id=...`` thumbnail URL to its HTTPS form.

		    URLs whose host ends with ``bing4.com`` and whose path is ``/th`` become
		    ``https://www.bing.com/th?id=<id>``; all other query parameters are
		    dropped. Any other URL is returned unchanged.

		    :param url_string: thumbnail URL taken from an RSS item.
		    :returns: the rewritten URL, or ``url_string`` itself when it does not
		        match the pattern or has no ``id`` parameter.
		    """
		    parsed_url = urlparse(url_string)
		    if parsed_url.netloc.endswith('bing4.com') and parsed_url.path == '/th':
		        image_id = dict(parse_qsl(parsed_url.query)).get('id')
		        # Without an id there is nothing to rewrite; concatenating None
		        # would raise TypeError, so keep the original URL instead.
		        if image_id is not None:
		            return "https://www.bing.com/th?id=" + image_id
		    return url_string


		# do search-request
		@@ -42,8 +59,6 @@ def request(query, params):
		query=urlencode({'q': query, 'setmkt': language}),
		offset=offset)

		params['cookies']['_FP'] = "ui=en-US"

		params['url'] = base_url + search_path

		return params
		@@ -53,38 +68,19 @@ def request(query, params):
		def response(resp):
		results = []

		dom = html.fromstring(resp.content)
		rss = etree.fromstring(resp.content)

		ns = rss.nsmap

		# parse results
		for result in dom.xpath('//div[@class="sn_r"]'):
		link = result.xpath('.//div[@class="newstitle"]/a')[0]
		url = link.attrib.get('href')
		title = extract_text(link)
		contentXPath = result.xpath('.//div[@class="sn_txt"]/div//span[@class="sn_snip"]')
		content = escape(extract_text(contentXPath))

		# parse publishedDate
		publishedDateXPath = result.xpath('.//div[@class="sn_txt"]/div'
		'//div[contains(@class,"sn_ST")]'
		'//span[contains(@class,"sn_tm")]')

		publishedDate = escape(extract_text(publishedDateXPath))

		if re.match("^[0-9]+ minute(s\|) ago$", publishedDate):
		timeNumbers = re.findall(r'\d+', publishedDate)
		publishedDate = datetime.now() - timedelta(minutes=int(timeNumbers[0]))
		elif re.match("^[0-9]+ hour(s\|) ago$", publishedDate):
		timeNumbers = re.findall(r'\d+', publishedDate)
		publishedDate = datetime.now() - timedelta(hours=int(timeNumbers[0]))
		elif re.match("^[0-9]+ hour(s\|), [0-9]+ minute(s\|) ago$", publishedDate):
		timeNumbers = re.findall(r'\d+', publishedDate)
		publishedDate = datetime.now()\
		- timedelta(hours=int(timeNumbers[0]))\
		- timedelta(minutes=int(timeNumbers[1]))
		elif re.match("^[0-9]+ day(s\|) ago$", publishedDate):
		timeNumbers = re.findall(r'\d+', publishedDate)
		publishedDate = datetime.now() - timedelta(days=int(timeNumbers[0]))
		else:
		for item in rss.xpath('./channel/item'):
		# url / title / content
		url = url_cleanup(item.xpath('./link/text()')[0])
		title = list_get(item.xpath('./title/text()'), 0, url)
		content = list_get(item.xpath('./description/text()'), 0, '')

		# publishedDate
		publishedDate = list_get(item.xpath('./pubDate/text()'), 0)
		try:
		publishedDate = parser.parse(publishedDate, dayfirst=False)
		except TypeError:
		@@ -92,7 +88,20 @@ def response(resp):
		except ValueError:
		publishedDate = datetime.now()

		# thumbnail
		thumbnail = list_get(item.xpath('./News:Image/text()', namespaces=ns), 0)
		if thumbnail is not None:
		thumbnail = image_url_cleanup(thumbnail)

		# append result
		if thumbnail is not None:
		results.append({'template': 'videos.html',
		'url': url,
		'title': title,
		'publishedDate': publishedDate,
		'content': content,
		'thumbnail': thumbnail})
		else:
		results.append({'url': url,
		'title': title,
		'publishedDate': publishedDate,

searx/tests/engines/test_bing_news.py

+88 −186

Original line number	Diff line number	Diff line
		@@ -2,6 +2,7 @@ from collections import defaultdict
		import mock
		from searx.engines import bing_news
		from searx.testing import SearxTestCase
		import lxml


		class TestBingNewsEngine(SearxTestCase):
		@@ -16,14 +17,10 @@ class TestBingNewsEngine(SearxTestCase):
		self.assertIn(query, params['url'])
		self.assertIn('bing.com', params['url'])
		self.assertIn('fr', params['url'])
		self.assertIn('_FP', params['cookies'])
		self.assertIn('en', params['cookies']['_FP'])

		dicto['language'] = 'all'
		params = bing_news.request(query, dicto)
		self.assertIn('en', params['url'])
		self.assertIn('_FP', params['cookies'])
		self.assertIn('en', params['cookies']['_FP'])

		def test_response(self):
		self.assertRaises(AttributeError, bing_news.response, None)
		@@ -37,200 +34,105 @@ class TestBingNewsEngine(SearxTestCase):
		response = mock.Mock(content='<html></html>')
		self.assertEqual(bing_news.response(response), [])

		html = """
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">44 minutes ago</span>
		</div>
		</div>
		</div>
		</div>
		"""
		html = """<?xml version="1.0" encoding="utf-8" ?>
		<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&setmkt=en-US&first=1&format=RSS">
		<channel>
		<title>python - Bing News</title>
		<link>https://www.bing.com:443/news/search?q=python&setmkt=en-US&first=1&format=RSS</link>
		<description>Search results</description>
		<image>
		<url>http://10.53.64.9/rsslogo.gif</url>
		<title>test</title>
		<link>https://www.bing.com:443/news/search?q=test&setmkt=en-US&first=1&format=RSS</link>
		</image>
		<copyright>Copyright</copyright>
		<item>
		<title>Title</title>
		<link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&aid=&tid=c237eccc50bd4758b106a5e3c94fce09&url=http%3a%2f%2furl.of.article%2f&c=xxxxxxxxx&mkt=en-us</link>
		<description>Article Content</description>
		<pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
		<News:Source>Infoworld</News:Source>
		<News:Image>http://a1.bing4.com/th?id=ON.13371337133713371337133713371337&pid=News</News:Image>
		<News:ImageSize>w={0}&h={1}&c=7</News:ImageSize>
		<News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
		<News:ImageMaxWidth>620</News:ImageMaxWidth>
		<News:ImageMaxHeight>413</News:ImageMaxHeight>
		</item>
		<item>
		<title>Another Title</title>
		<link>https://www.bing.com/news/apiclick.aspx?ref=FexRss&aid=&tid=c237eccc50bd4758b106a5e3c94fce09&url=http%3a%2f%2fanother.url.of.article%2f&c=xxxxxxxxx&mkt=en-us</link>
		<description>Another Article Content</description>
		<pubDate>Tue, 02 Jun 2015 13:37:00 GMT</pubDate>
		</item>
		</channel>
		</rss>""" # noqa
		response = mock.Mock(content=html)
		results = bing_news.response(response)
		self.assertEqual(type(results), list)
		self.assertEqual(len(results), 1)
		self.assertEqual(len(results), 2)
		self.assertEqual(results[0]['title'], 'Title')
		self.assertEqual(results[0]['url'], 'http://url.of.article/')
		self.assertEqual(results[0]['content'], 'Article Content')
		self.assertEqual(results[0]['thumbnail'], 'https://www.bing.com/th?id=ON.13371337133713371337133713371337')
		self.assertEqual(results[1]['title'], 'Another Title')
		self.assertEqual(results[1]['url'], 'http://another.url.of.article/')
		self.assertEqual(results[1]['content'], 'Another Article Content')
		self.assertNotIn('thumbnail', results[1])

		html = """
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">44 minutes ago</span>
		</div>
		</div>
		</div>
		</div>
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">3 hours, 44 minutes ago</span>
		</div>
		</div>
		</div>
		</div>
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">44 hours ago</span>
		</div>
		</div>
		</div>
		</div>
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">2 days ago</span>
		</div>
		</div>
		</div>
		</div>
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">27/01/2015</span>
		</div>
		</div>
		</div>
		</div>
		<div class="sn_r">
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">Il y a 3 heures</span>
		</div>
		</div>
		</div>
		</div>
		"""
		html = """<?xml version="1.0" encoding="utf-8" ?>
		<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&setmkt=en-US&first=1&format=RSS">
		<channel>
		<title>python - Bing News</title>
		<link>https://www.bing.com:443/news/search?q=python&setmkt=en-US&first=1&format=RSS</link>
		<description>Search results</description>
		<image>
		<url>http://10.53.64.9/rsslogo.gif</url>
		<title>test</title>
		<link>https://www.bing.com:443/news/search?q=test&setmkt=en-US&first=1&format=RSS</link>
		</image>
		<copyright>Copyright</copyright>
		<item>
		<title>Title</title>
		<link>http://another.url.of.article/</link>
		<description>Article Content</description>
		<pubDate>garbage</pubDate>
		<News:Source>Infoworld</News:Source>
		<News:Image>http://another.bing.com/image</News:Image>
		<News:ImageSize>w={0}&h={1}&c=7</News:ImageSize>
		<News:ImageKeepOriginalRatio></News:ImageKeepOriginalRatio>
		<News:ImageMaxWidth>620</News:ImageMaxWidth>
		<News:ImageMaxHeight>413</News:ImageMaxHeight>
		</item>
		</channel>
		</rss>""" # noqa
		response = mock.Mock(content=html)
		results = bing_news.response(response)
		self.assertEqual(type(results), list)
		self.assertEqual(len(results), 6)
		self.assertEqual(len(results), 1)
		self.assertEqual(results[0]['title'], 'Title')
		self.assertEqual(results[0]['url'], 'http://another.url.of.article/')
		self.assertEqual(results[0]['content'], 'Article Content')
		self.assertEqual(results[0]['thumbnail'], 'http://another.bing.com/image')

		html = """<?xml version="1.0" encoding="utf-8" ?>
		<rss version="2.0" xmlns:News="https://www.bing.com:443/news/search?q=python&setmkt=en-US&first=1&format=RSS">
		<channel>
		<title>python - Bing News</title>
		<link>https://www.bing.com:443/news/search?q=python&setmkt=en-US&first=1&format=RSS</link>
		<description>Search results</description>
		<image>
		<url>http://10.53.64.9/rsslogo.gif</url>
		<title>test</title>
		<link>https://www.bing.com:443/news/search?q=test&setmkt=en-US&first=1&format=RSS</link>
		</image>
		</channel>
		</rss>""" # noqa

		html = """
		<div class="newstitle">
		<a href="http://url.of.article/" target="_blank" h="ID=news,5022.1">
		Title
		</a>
		</div>
		<div class="sn_img">
		<a href="http://url.of.article2/" target="_blank" h="ID=news,5024.1">
		<img class="rms_img" height="80" id="emb1" src="/image.src" title="Title" width="80" />
		</a>
		</div>
		<div class="sn_txt">
		<div class="sn_oi">
		<span class="sn_snip">Article Content</span>
		<div class="sn_ST">
		<cite class="sn_src">metronews.fr</cite>
		·
		<span class="sn_tm">44 minutes ago</span>
		</div>
		</div>
		</div>
		"""
		response = mock.Mock(content=html)
		results = bing_news.response(response)
		self.assertEqual(type(results), list)
		self.assertEqual(len(results), 0)

		html = """<?xml version="1.0" encoding="utf-8" ?>gabarge"""
		response = mock.Mock(content=html)
		self.assertRaises(lxml.etree.XMLSyntaxError, bing_news.response, response)

searx/utils.py

+8 −0

Original line number	Diff line number	Diff line
		@@ -228,6 +228,14 @@ def prettify_url(url):
		return url


		# get element in list or default value
		def list_get(a_list, index, default=None):
		    """Return ``a_list[index]``, or ``default`` when the index is out of range.

		    :param a_list: sequence to read from.
		    :param index: position to fetch; negative values count from the end.
		    :param default: value returned when ``index`` does not exist.
		    :returns: the element at ``index`` or ``default``.
		    """
		    # A length comparison is always true for negative indexes, so an
		    # out-of-range negative index (e.g. -1 on an empty list) would raise;
		    # catching IndexError covers both directions.
		    try:
		        return a_list[index]
		    except IndexError:
		        return default


		def get_blocked_engines(engines, cookies):
		if 'blocked_engines' not in cookies:
		return [(engine_name, category) for engine_name in engines