Merge pull request #2109 from ahmad-alkadri/fix/highlight-full-word (6d72ef3c) · Commits · e / infra / spot

searx/webutils.py

+53 −16

Original line number	Diff line number	Diff line
		@@ -113,31 +113,68 @@ def prettify_url(url, max_length=74):
		return url


		def contains_cjko(s: str) -> bool:
		    """Report whether ``s`` holds at least one Chinese, Japanese or Korean character.

		    The check is a single regex character class assembled from Unicode
		    ranges: Chinese characters, Japanese hiragana, katakana and kanji, and
		    Korean hangul syllables and jamo.

		    Args:
		        s (str): text to inspect.

		    Returns:
		        bool: True when any character of ``s`` falls inside one of the
		        ranges, False otherwise.
		    """
		    cjk_ranges = [
		        '\u4e00-\u9fff',  # Chinese characters
		        '\u3040-\u309f',  # Japanese hiragana
		        '\u30a0-\u30ff',  # Japanese katakana
		        '\u4e00-\u9faf',  # Japanese kanji
		        '\uac00-\ud7af',  # Korean hangul syllables
		        '\u1100-\u11ff',  # Korean hangul jamo
		    ]
		    char_class = '[' + ''.join(cjk_ranges) + ']'
		    return re.search(char_class, s) is not None


		def regex_highlight_cjk(word: str) -> str:
		    """Build the regex pattern used to find ``word`` for highlighting.

		    The pattern depends on whether the word contains CJK characters (as
		    reported by ``contains_cjko``):

		    * a word containing CJK characters is matched wherever it occurs in the
		      text, regardless of the characters around it;
		    * any other word is matched only as a standalone word, i.e. when it is
		      neither directly preceded nor directly followed by a word character.

		    Args:
		        word (str): the literal word to match; it is regex-escaped here.

		    Returns:
		        str: the regex pattern for the word; group 1 captures the word itself.
		    """
		    escaped = re.escape(word)
		    if contains_cjko(word):
		        return fr'({escaped})'
		    # A lookbehind is used instead of ``\b``: ``\b`` needs a word character on
		    # one side, so it can never match in front of a word that starts with a
		    # non-word character (".net", "#tag") at the start of the text or after
		    # whitespace. For words starting with a word character both are equivalent.
		    return fr'(?<!\w)({escaped})(?!\w)'


		def highlight_content(content, query):
		    """Wrap the words of ``query`` found in ``content`` in highlight markup.

		    The query is split on whitespace, single and double quotes are removed
		    from every word, and each remaining word is searched case-insensitively
		    using the pattern returned by ``regex_highlight_cjk``. Every match is
		    replaced by ``<span class="highlight">MATCH</span>``, keeping the casing
		    found in ``content``.

		    Args:
		        content: the text to highlight.
		        query: the search query whose words should be highlighted.

		    Returns:
		        None when ``content`` is empty, ``content`` unchanged when it contains
		        a ``<`` (treated as HTML) or when no word matches, otherwise the text
		        with the highlight markup inserted.
		    """
		    if not content:
		        return None

		    # ignoring html contents
		    # TODO better html content detection
		    if '<' in content:
		        return content

		    words = []
		    for chunk in query.split():
		        word = chunk.replace("'", "").replace('"', '')
		        if word and word not in words:
		            words.append(word)
		    if not words:
		        return content

		    # Longest word first, so that at any position the alternation prefers the
		    # longer of two overlapping words.
		    words.sort(key=len, reverse=True)
		    combined = '|'.join(regex_highlight_cjk(word) for word in words)

		    # One single pass over the text: markup inserted for one word can never be
		    # matched again by another word (e.g. "span" or "highlight"), and no match
		    # is wrapped twice. A function replacement is used so that backslashes in
		    # the matched text are not interpreted as template escapes.
		    return re.sub(
		        combined,
		        lambda match: f'<span class="highlight">{match.group(0)}</span>',
		        content,
		        flags=re.I | re.U,
		    )

tests/unit/test_webutils.py

+10 −9

Original line number	Diff line number	Diff line
		@@ -28,9 +28,9 @@ class TestWebUtils(SearxTestCase):

		content = 'a'
		query = 'test'
		self.assertEqual(webutils.highlight_content(content, query), content)
		self.assertEqual(webutils.highlight_content(content, query), 'a')
		query = 'a test'
		self.assertEqual(webutils.highlight_content(content, query), content)
		self.assertEqual(webutils.highlight_content(content, query), '<span class="highlight">a</span>')

		data = (
		('" test "', 'a test string', 'a <span class="highlight">test</span> string'),
		@@ -38,22 +38,23 @@ class TestWebUtils(SearxTestCase):
		(
		'a test',
		'this is a test string that matches entire query',
		'this is <span class="highlight">a test</span> string that matches entire query',
		'this is <span class="highlight">a</span> <span class="highlight">test</span> string that matches entire query',
		),
		(
		'this a test',
		'this is a string to test.',
		(
		'<span class="highlight">this</span> is<span class="highlight"> a </span>'
		'string to <span class="highlight">test</span>.'
		'<span class="highlight">this</span> is <span class="highlight">a</span> string to <span class="highlight">test</span>.'
		),
		),
		(
		'match this "exact phrase"',
		'this string contains the exact phrase we want to match',
		(
		'<span class="highlight">this</span> string contains the <span class="highlight">exact</span>'
		' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>'
		''.join(
		[
		'<span class="highlight">this</span> string contains the <span class="highlight">exact</span> ',
		'<span class="highlight">phrase</span> we want to <span class="highlight">match</span>',
		]
		),
		),
		)