Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit a3d7e9c2 authored by Bnyro, committed by Markus Heiser
Browse files

[mod] utils.py: add markdown_to_text helper function

parent 668b1d55
Loading
Loading
Loading
Loading
+6 −12
Original line number Diff line number Diff line
@@ -42,10 +42,9 @@ Implementations
from datetime import datetime
from urllib.parse import urlencode

from markdown_it import MarkdownIt
from flask_babel import gettext

from searx.utils import html_to_text
from searx.utils import markdown_to_text

about = {
    "website": 'https://lemmy.ml/',
@@ -78,11 +77,6 @@ def request(query, params):
    return params


def _format_content(content):
    """Render the Markdown in ``content`` to HTML and return its plain text."""
    # CommonMark preset with the typographer option switched on, plus the
    # "replacements" and "smartquotes" rules enabled.
    renderer = MarkdownIt("commonmark", {"typographer": True})
    renderer.enable(["replacements", "smartquotes"])
    rendered = renderer.render(content)
    return html_to_text(rendered)


def _get_communities(json):
    results = []

@@ -97,7 +91,7 @@ def _get_communities(json):
            {
                'url': result['community']['actor_id'],
                'title': result['community']['title'],
                'content': _format_content(result['community'].get('description', '')),
                'content': markdown_to_text(result['community'].get('description', '')),
                'img_src': result['community'].get('icon', result['community'].get('banner')),
                'publishedDate': datetime.strptime(counts['published'][:19], '%Y-%m-%dT%H:%M:%S'),
                'metadata': metadata,
@@ -114,7 +108,7 @@ def _get_users(json):
            {
                'url': result['person']['actor_id'],
                'title': result['person']['name'],
                'content': _format_content(result['person'].get('bio', '')),
                'content': markdown_to_text(result['person'].get('bio', '')),
            }
        )

@@ -140,7 +134,7 @@ def _get_posts(json):

        content = result['post'].get('body', '').strip()
        if content:
            content = _format_content(content)
            content = markdown_to_text(content)

        results.append(
            {
@@ -164,7 +158,7 @@ def _get_comments(json):

        content = result['comment'].get('content', '').strip()
        if content:
            content = _format_content(content)
            content = markdown_to_text(content)

        metadata = (
            f"▲ {result['counts']['upvotes']} ▼ {result['counts']['downvotes']}"
@@ -176,7 +170,7 @@ def _get_comments(json):
            {
                'url': result['comment']['ap_id'],
                'title': result['post']['name'],
                'content': _format_content(result['comment']['content']),
                'content': markdown_to_text(result['comment']['content']),
                'publishedDate': datetime.strptime(result['comment']['published'][:19], '%Y-%m-%dT%H:%M:%S'),
                'metadata': metadata,
            }
+24 −0
Original line number Diff line number Diff line
@@ -15,6 +15,7 @@ from os.path import splitext, join
from random import choice
from html.parser import HTMLParser
from urllib.parse import urljoin, urlparse
from markdown_it import MarkdownIt

from lxml import html
from lxml.etree import ElementBase, XPath, XPathError, XPathSyntaxError, _ElementStringResult, _ElementUnicodeResult
@@ -158,6 +159,29 @@ def html_to_text(html_str: str) -> str:
    return s.get_text()


def markdown_to_text(markdown_str: str) -> str:
    """Render a Markdown string to HTML and return the text of the result

    Args:
        * markdown_str (str): Markdown source

    Returns:
        * str: text extracted from the rendered HTML

    Examples:
        >>> markdown_to_text('[example](https://example.com)')
        'example'

        >>> markdown_to_text('## Headline')
        'Headline'
    """

    # CommonMark preset with the typographer option switched on, plus the
    # "replacements" and "smartquotes" rules enabled.
    parser = MarkdownIt("commonmark", {"typographer": True})
    parser.enable(["replacements", "smartquotes"])

    rendered = parser.render(markdown_str)
    return html_to_text(rendered)


def extract_text(xpath_results, allow_none: bool = False) -> Optional[str]:
    """Extract text from a lxml result