[mod] utils.py: add markdown_to_text helper function (a3d7e9c2) · Commits · e / infra / spot

searx/engines/lemmy.py

+6 −12

Original line number	Diff line number	Diff line
		@@ -42,10 +42,9 @@ Implementations
		from datetime import datetime
		from urllib.parse import urlencode

		from markdown_it import MarkdownIt
		from flask_babel import gettext

		from searx.utils import html_to_text
		from searx.utils import markdown_to_text

		about = {
		"website": 'https://lemmy.ml/',
		@@ -78,11 +77,6 @@ def request(query, params):
		return params


		def _format_content(content):
		    """Render Markdown ``content`` to HTML and return its extracted text."""
		    # CommonMark preset with typographic replacements and smart quotes enabled.
		    renderer = MarkdownIt("commonmark", {"typographer": True})
		    renderer = renderer.enable(["replacements", "smartquotes"])
		    return html_to_text(renderer.render(content))


		def _get_communities(json):
		results = []

		@@ -97,7 +91,7 @@ def _get_communities(json):
		{
		'url': result['community']['actor_id'],
		'title': result['community']['title'],
		'content': _format_content(result['community'].get('description', '')),
		'content': markdown_to_text(result['community'].get('description', '')),
		'img_src': result['community'].get('icon', result['community'].get('banner')),
		'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
		'metadata': metadata,
		@@ -114,7 +108,7 @@ def _get_users(json):
		{
		'url': result['person']['actor_id'],
		'title': result['person']['name'],
		'content': _format_content(result['person'].get('bio', '')),
		'content': markdown_to_text(result['person'].get('bio', '')),
		}
		)

		@@ -140,7 +134,7 @@ def _get_posts(json):

		content = result['post'].get('body', '').strip()
		if content:
		content = _format_content(content)
		content = markdown_to_text(content)

		results.append(
		{
		@@ -164,7 +158,7 @@ def _get_comments(json):

		content = result['comment'].get('content', '').strip()
		if content:
		content = _format_content(content)
		content = markdown_to_text(content)

		metadata = (
		f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
		@@ -176,7 +170,7 @@ def _get_comments(json):
		{
		'url': result['comment']['ap_id'],
		'title': result['post']['name'],
		'content': _format_content(result['comment']['content']),
		'content': markdown_to_text(result['comment']['content']),
		'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
		'metadata': metadata,
		}

searx/utils.py

+24 −0

Original line number	Diff line number	Diff line
		@@ -15,6 +15,7 @@ from os.path import splitext, join
		from random import choice
		from html.parser import HTMLParser
		from urllib.parse import urljoin, urlparse
		from markdown_it import MarkdownIt

		from lxml import html
		from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
		@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
		return s.get_text()


		def markdown_to_text(markdown_str: str) -> str:
		    """Render a Markdown string to HTML and return the text of that HTML

		    The string is rendered by a ``MarkdownIt`` parser using the ``commonmark``
		    preset with the ``typographer`` option set and the ``replacements`` and
		    ``smartquotes`` rules enabled; the resulting HTML is then handed to
		    :py:func:`html_to_text`.

		    Args:
		        * markdown_str (str): Markdown source string

		    Returns:
		        * str: text extracted from the rendered HTML

		    Examples:
		        >>> markdown_to_text('[example](https://example.com)')
		        'example'

		        >>> markdown_to_text('## Headline')
		        'Headline'
		    """
		    # Build and configure the parser step by step instead of one long chain.
		    parser = MarkdownIt("commonmark", {"typographer": True})
		    parser = parser.enable(["replacements", "smartquotes"])
		    return html_to_text(parser.render(markdown_str))


		def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
		"""Extract text from a lxml result