# Excerpt of searx/webutils.py: the highlighting helpers that follow prettify_url().

# Character-class body listing the Unicode ranges treated as CJK.  The Japanese
# kanji range U+4E00-U+9FAF lies inside U+4E00-U+9FFF, so a single entry covers
# both Chinese characters and kanji.
_CJK_RANGES = (
    '\u4e00-\u9fff'  # Chinese characters / Japanese kanji
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\uac00-\ud7af'  # Korean hangul syllables
    '\u1100-\u11ff'  # Korean hangul jamo
)

# Compiled once at import time instead of on every call.
_CJK_RE = re.compile(f'[{_CJK_RANGES}]')


def contains_cjko(s: str) -> bool:
    """Check whether a string contains Chinese, Japanese or Korean characters.

    The check is a regex search for any character inside the Unicode ranges
    listed in ``_CJK_RANGES``.

    Args:
        s (str): string to be checked.

    Returns:
        bool: True if ``s`` contains at least one character from those
        ranges, False otherwise (including for the empty string).
    """
    return _CJK_RE.search(s) is not None


def regex_highlight_cjk(word: str) -> str:
    """Build the regex pattern used to highlight ``word``.

    The word is escaped with ``re.escape`` and wrapped in one capture group.

    - If the word contains a CJK character, the pattern matches the word
      anywhere in the text, because such text is usually written without
      whitespace between words.
    - Otherwise the pattern only matches the word standing alone: ``\\b`` is
      required in front of it and no word character may follow it.  ``\\b``
      needs a word/non-word transition, so a word that starts with
      punctuation only matches directly after a word character.

    Args:
        word (str): the word to be matched.

    Returns:
        str: the regex pattern for the word.
    """
    rword = re.escape(word)
    # re.escape() leaves CJK characters in the string, so testing the raw
    # word gives the same answer as testing the escaped one.
    if contains_cjko(word):
        return fr'({rword})'
    return fr'\b({rword})(?!\w)'


def highlight_content(content, query):
    """Wrap every occurrence of the query terms in ``content`` in a highlight span.

    The query is split on whitespace, single and double quotes are removed
    from each term, and each remaining term is matched case-insensitively
    with the pattern from :func:`regex_highlight_cjk`.  Every match is
    replaced by ``<span class="highlight">MATCH</span>``, keeping the text
    exactly as it appears in ``content``.

    Args:
        content (str): plain text to highlight.
        query (str): the search query.

    Returns:
        ``None`` if ``content`` is empty or ``None``; ``content`` unchanged
        if it contains ``<`` (treated as HTML) or if no term matches;
        otherwise the highlighted text.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content

    # Strip quotes from each whitespace-separated term, then drop duplicates
    # and terms that end up empty (e.g. a lone '"').
    terms = dict.fromkeys(term.replace("'", '').replace('"', '') for term in query.split())
    terms.pop('', None)
    if not terms:
        return content

    # Longest term first: when two terms can match at the same position
    # (possible for CJK terms, which are not word-bounded) the longer one wins.
    patterns = [regex_highlight_cjk(term) for term in sorted(terms, key=len, reverse=True)]
    combined = re.compile('|'.join(patterns), flags=re.I | re.U)

    # One pass over the original text, so the inserted markup is never
    # searched again (a term such as "span" or "class" cannot corrupt it).
    # The callable replacement keeps backslashes in the matched text literal
    # instead of having them parsed as replacement-template escapes.
    return combined.sub(lambda m: f'<span class="highlight">{m.group(0)}</span>', content)
# Excerpt of searx/webutils.py: the highlighting helpers that follow prettify_url().

# Character-class body listing the Unicode ranges treated as CJK.  The Japanese
# kanji range U+4E00-U+9FAF lies inside U+4E00-U+9FFF, so a single entry covers
# both Chinese characters and kanji.
_CJK_RANGES = (
    '\u4e00-\u9fff'  # Chinese characters / Japanese kanji
    '\u3040-\u309f'  # Japanese hiragana
    '\u30a0-\u30ff'  # Japanese katakana
    '\uac00-\ud7af'  # Korean hangul syllables
    '\u1100-\u11ff'  # Korean hangul jamo
)

# Compiled once at import time instead of on every call.
_CJK_RE = re.compile(f'[{_CJK_RANGES}]')


def contains_cjko(s: str) -> bool:
    """Check whether a string contains Chinese, Japanese or Korean characters.

    The check is a regex search for any character inside the Unicode ranges
    listed in ``_CJK_RANGES``.

    Args:
        s (str): string to be checked.

    Returns:
        bool: True if ``s`` contains at least one character from those
        ranges, False otherwise (including for the empty string).
    """
    return _CJK_RE.search(s) is not None


def regex_highlight_cjk(word: str) -> str:
    """Build the regex pattern used to highlight ``word``.

    The word is escaped with ``re.escape`` and wrapped in one capture group.

    - If the word contains a CJK character, the pattern matches the word
      anywhere in the text, because such text is usually written without
      whitespace between words.
    - Otherwise the pattern only matches the word standing alone: ``\\b`` is
      required in front of it and no word character may follow it.  ``\\b``
      needs a word/non-word transition, so a word that starts with
      punctuation only matches directly after a word character.

    Args:
        word (str): the word to be matched.

    Returns:
        str: the regex pattern for the word.
    """
    rword = re.escape(word)
    # re.escape() leaves CJK characters in the string, so testing the raw
    # word gives the same answer as testing the escaped one.
    if contains_cjko(word):
        return fr'({rword})'
    return fr'\b({rword})(?!\w)'


def highlight_content(content, query):
    """Wrap every occurrence of the query terms in ``content`` in a highlight span.

    The query is split on whitespace, single and double quotes are removed
    from each term, and each remaining term is matched case-insensitively
    with the pattern from :func:`regex_highlight_cjk`.  Every match is
    replaced by ``<span class="highlight">MATCH</span>``, keeping the text
    exactly as it appears in ``content``.

    Args:
        content (str): plain text to highlight.
        query (str): the search query.

    Returns:
        ``None`` if ``content`` is empty or ``None``; ``content`` unchanged
        if it contains ``<`` (treated as HTML) or if no term matches;
        otherwise the highlighted text.
    """
    if not content:
        return None

    # ignoring html contents
    # TODO better html content detection
    if '<' in content:
        return content

    # Strip quotes from each whitespace-separated term, then drop duplicates
    # and terms that end up empty (e.g. a lone '"').
    terms = dict.fromkeys(term.replace("'", '').replace('"', '') for term in query.split())
    terms.pop('', None)
    if not terms:
        return content

    # Longest term first: when two terms can match at the same position
    # (possible for CJK terms, which are not word-bounded) the longer one wins.
    patterns = [regex_highlight_cjk(term) for term in sorted(terms, key=len, reverse=True)]
    combined = re.compile('|'.join(patterns), flags=re.I | re.U)

    # One pass over the original text, so the inserted markup is never
    # searched again (a term such as "span" or "class" cannot corrupt it).
    # The callable replacement keeps backslashes in the matched text literal
    # instead of having them parsed as replacement-template escapes.
    return combined.sub(lambda m: f'<span class="highlight">{m.group(0)}</span>', content)
tests/unit/test_webutils.py +10 −9 Original line number Diff line number Diff line Loading @@ -28,9 +28,9 @@ class TestWebUtils(SearxTestCase): content = 'a' query = 'test' self.assertEqual(webutils.highlight_content(content, query), content) self.assertEqual(webutils.highlight_content(content, query), 'a') query = 'a test' self.assertEqual(webutils.highlight_content(content, query), content) self.assertEqual(webutils.highlight_content(content, query), '<span class="highlight">a</span>') data = ( ('" test "', 'a test string', 'a <span class="highlight">test</span> string'), Loading @@ -38,22 +38,23 @@ class TestWebUtils(SearxTestCase): ( 'a test', 'this is a test string that matches entire query', 'this is <span class="highlight">a test</span> string that matches entire query', 'this is <span class="highlight">a</span> <span class="highlight">test</span> string that matches entire query', ), ( 'this a test', 'this is a string to test.', ( '<span class="highlight">this</span> is<span class="highlight"> a </span>' 'string to <span class="highlight">test</span>.' '<span class="highlight">this</span> is <span class="highlight">a</span> string to <span class="highlight">test</span>.' ), ), ( 'match this "exact phrase"', 'this string contains the exact phrase we want to match', ( '<span class="highlight">this</span> string contains the <span class="highlight">exact</span>' ' <span class="highlight">phrase</span> we want to <span class="highlight">match</span>' ''.join( [ '<span class="highlight">this</span> string contains the <span class="highlight">exact</span> ', '<span class="highlight">phrase</span> we want to <span class="highlight">match</span>', ] ), ), ) Loading