Loading searx/engines/lemmy.py +6 −12 Original line number Diff line number Diff line Loading @@ -42,10 +42,9 @@ Implementations from datetime import datetime from urllib.parse import urlencode from markdown_it import MarkdownIt from flask_babel import gettext from searx.utils import html_to_text from searx.utils import markdown_to_text about = { "website": 'https://lemmy.ml/', Loading Loading @@ -78,11 +77,6 @@ def request(query, params): return params def _format_content(content): html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content) return html_to_text(html) def _get_communities(json): results = [] Loading @@ -97,7 +91,7 @@ def _get_communities(json): { 'url': result['community']['actor_id'], 'title': result['community']['title'], 'content': _format_content(result['community'].get('description', '')), 'content': markdown_to_text(result['community'].get('description', '')), 'img_src': result['community'].get('icon', result['community'].get('banner')), 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'), 'metadata': metadata, Loading @@ -114,7 +108,7 @@ def _get_users(json): { 'url': result['person']['actor_id'], 'title': result['person']['name'], 'content': _format_content(result['person'].get('bio', '')), 'content': markdown_to_text(result['person'].get('bio', '')), } ) Loading @@ -140,7 +134,7 @@ def _get_posts(json): content = result['post'].get('body', '').strip() if content: content = _format_content(content) content = markdown_to_text(content) results.append( { Loading @@ -164,7 +158,7 @@ def _get_comments(json): content = result['comment'].get('content', '').strip() if content: content = _format_content(content) content = markdown_to_text(content) metadata = ( f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" Loading @@ -176,7 +170,7 @@ def _get_comments(json): { 'url': result['comment']['ap_id'], 'title': result['post']['name'], 'content': _format_content(result['comment']['content']), 'content': markdown_to_text(result['comment']['content']), 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'), 'metadata': metadata, } Loading searx/utils.py +24 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ from os.path import splitext, join from random import choice from html.parser import HTMLParser from urllib.parse import urljoin, urlparse from markdown_it import MarkdownIt from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult Loading Loading @@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str: return s.get_text() def markdown_to_text(markdown_str: str) -> str: """Extract text from a Markdown string Args: * markdown_str (str): string Markdown Returns: * str: extracted text Examples: >>> markdown_to_text('[example](https://example.com)') 'example' >>> markdown_to_text('## Headline') 'Headline' """ html_str = ( MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str) ) return html_to_text(html_str) def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]: """Extract text from a lxml result Loading Loading
searx/engines/lemmy.py +6 −12 Original line number Diff line number Diff line Loading @@ -42,10 +42,9 @@ Implementations from datetime import datetime from urllib.parse import urlencode from markdown_it import MarkdownIt from flask_babel import gettext from searx.utils import html_to_text from searx.utils import markdown_to_text about = { "website": 'https://lemmy.ml/', Loading Loading @@ -78,11 +77,6 @@ def request(query, params): return params def _format_content(content): html = MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(content) return html_to_text(html) def _get_communities(json): results = [] Loading @@ -97,7 +91,7 @@ def _get_communities(json): { 'url': result['community']['actor_id'], 'title': result['community']['title'], 'content': _format_content(result['community'].get('description', '')), 'content': markdown_to_text(result['community'].get('description', '')), 'img_src': result['community'].get('icon', result['community'].get('banner')), 'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'), 'metadata': metadata, Loading @@ -114,7 +108,7 @@ def _get_users(json): { 'url': result['person']['actor_id'], 'title': result['person']['name'], 'content': _format_content(result['person'].get('bio', '')), 'content': markdown_to_text(result['person'].get('bio', '')), } ) Loading @@ -140,7 +134,7 @@ def _get_posts(json): content = result['post'].get('body', '').strip() if content: content = _format_content(content) content = markdown_to_text(content) results.append( { Loading @@ -164,7 +158,7 @@ def _get_comments(json): content = result['comment'].get('content', '').strip() if content: content = _format_content(content) content = markdown_to_text(content) metadata = ( f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}" Loading @@ -176,7 +170,7 @@ def _get_comments(json): { 'url': result['comment']['ap_id'], 'title': result['post']['name'], 'content': _format_content(result['comment']['content']), 'content': markdown_to_text(result['comment']['content']), 'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'), 'metadata': metadata, } Loading
searx/utils.py +24 −0 Original line number Diff line number Diff line Loading @@ -15,6 +15,7 @@ from os.path import splitext, join from random import choice from html.parser import HTMLParser from urllib.parse import urljoin, urlparse from markdown_it import MarkdownIt from lxml import html from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult Loading Loading @@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str: return s.get_text() def markdown_to_text(markdown_str: str) -> str: """Extract text from a Markdown string Args: * markdown_str (str): string Markdown Returns: * str: extracted text Examples: >>> markdown_to_text('[example](https://example.com)') 'example' >>> markdown_to_text('## Headline') 'Headline' """ html_str = ( MarkdownIt("commonmark", {"typographer": True}).enable(["replacements", "smartquotes"]).render(markdown_str) ) return html_to_text(html_str) def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]: """Extract text from a lxml result Loading