Donate to e Foundation | Murena handsets with /e/OS | Own a part of Murena! Learn more

Commit 310d7b70 authored by Nicolas Gelot's avatar Nicolas Gelot
Browse files

Merge remote-tracking branch 'asciimoo/master' into dev

parents 2cc736bd 629b36d4
Loading
Loading
Loading
Loading
+2 −1
Original line number Diff line number Diff line
flask==1.0.2
flask-babel==0.11.2
jinja2==2.10
flask-babel==0.12.2
lxml==4.3.3
pygments==2.3.1
python-dateutil==2.8.0
+61 −0
Original line number Diff line number Diff line
"""
 APK Mirror

 @website     https://www.apkmirror.com

 @using-api   no
 @results     HTML
 @stable      no (HTML can change)
 @parse       url, title, thumbnail_src
"""

from lxml import html
from searx.engines.xpath import extract_text
from searx.url_utils import urlencode

# engine dependent config
categories = ['it']  # APKMirror serves Android APKs -> searx "it" category
paging = True  # the site exposes numbered result pages (see {pageno} below)

# I am not 100% certain about this, as apkmirror appears to be a wordpress site,
# which might support time_range searching. If you want to implement it, go ahead.
time_range_support = False

# search-url
base_url = 'https://www.apkmirror.com'
# post_type/searchtype pin the search to APK release pages; {pageno} and
# {query} are filled in by request()
search_url = base_url + '/?post_type=app_release&searchtype=apk&page={pageno}&{query}'


# do search-request
# do search-request
def request(query, params):
    """Fill in the request URL for *query* on the requested result page.

    Mutates and returns ``params`` as the searx engine protocol expects.
    """
    encoded_query = urlencode({'s': query})
    params['url'] = search_url.format(pageno=params['pageno'], query=encoded_query)
    return params


# get response from search-request
# get response from search-request
def response(resp):
    """Parse the APKMirror result page into searx result dicts.

    Returns a list of ``{'url', 'title', 'thumbnail_src'}`` dicts, one per
    app row.  Rows missing an expected link or image (ads, layout changes)
    are skipped instead of raising ``IndexError``/``TypeError`` and losing
    the whole page of results.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath('.//div[@id="content"]/div[@class="listWidget"]/div[@class="appRow"]'):

        # guard: a row without the title link cannot produce a result
        links = result.xpath('.//h5/a')
        if not links:
            continue
        link = links[0]
        href = link.attrib.get('href')
        if not href:
            continue

        url = base_url + href + '#downloads'
        title = extract_text(link)

        # guard: thumbnail image may be absent on malformed rows
        images = result.xpath('.//img')
        src = images[0].attrib.get('src') if images else None
        if not src:
            continue
        # ask the CDN for a 64x64 thumbnail instead of the 32x32 listing icon
        thumbnail_src = base_url + src.replace('&w=32&h=32', '&w=64&h=64')

        res = {
            'url': url,
            'title': title,
            'thumbnail_src': thumbnail_src
        }

        # append result
        results.append(res)

    # return results
    return results
+7 −4
Original line number Diff line number Diff line
@@ -35,9 +35,12 @@ site_url = 'https://duckduckgo.com/?{query}&iar=images&iax=1&ia=images'

# run query in site to get vqd number needed for requesting images
# TODO: find a way to get this number without an extra request (is it a hash of the query?)
def get_vqd(query):
    res = get(site_url.format(query=urlencode({'q': query})))
def get_vqd(query, headers):
    query_url = site_url.format(query=urlencode({'q': query}))
    res = get(query_url, headers=headers)
    content = res.text
    if content.find('vqd=\'') == -1:
        raise Exception('Request failed')
    vqd = content[content.find('vqd=\'') + 5:]
    vqd = vqd[:vqd.find('\'')]
    return vqd
@@ -47,7 +50,7 @@ def get_vqd(query):
def request(query, params):
    # to avoid running actual external requests when testing
    if 'is_test' not in params:
        vqd = get_vqd(query)
        vqd = get_vqd(query, params['headers'])
    else:
        vqd = '12345'

@@ -74,7 +77,7 @@ def response(resp):
    try:
        res_json = loads(content)
    except:
        return []
        raise Exception('Cannot parse results')

    # parse results
    for result in res_json['results']:
+21 −17
Original line number Diff line number Diff line
@@ -11,7 +11,6 @@
"""

from datetime import date, timedelta
from json import loads
from lxml import html
from searx.url_utils import urlencode, urlparse, parse_qs

@@ -39,7 +38,6 @@ time_range_dict = {'day': 'd',
# do search-request
def request(query, params):
    search_options = {
        'ijn': params['pageno'] - 1,
        'start': (params['pageno'] - 1) * number_of_results
    }

@@ -53,7 +51,7 @@ def request(query, params):
        search_options['tbs'] = time_range_custom_attr.format(start=start, end=end)

    if safesearch and params['safesearch']:
        search_options['safe'] = 'on'
        search_options['safe'] = 'active'

    params['url'] = search_url.format(query=urlencode({'q': query}),
                                      search_options=urlencode(search_options))
@@ -63,24 +61,30 @@ def request(query, params):

# get response from search-request
def response(resp):
    dom = html.fromstring(resp.text)

    results = []
    for element in dom.xpath('//div[@id="search"] //td'):
        link = element.xpath('./a')[0]

    dom = html.fromstring(resp.text)
        google_url = urlparse(link.xpath('.//@href')[0])
        query = parse_qs(google_url.query)
        source_url = next(iter(query.get('q', [])), None)

    # parse results
    for img in dom.xpath('//a'):
        r = {
            'title': ' '.join(img.xpath('.//div[class="rg_ilmbg"]//text()')),
        title_parts = element.xpath('./cite//following-sibling::*/text()')
        title_parts.extend(element.xpath('./cite//following-sibling::text()')[:-1])

        result = {
            'title': ''.join(title_parts),
            'content': '',
            'template': 'images.html',
            'url': source_url,
            'img_src': source_url,
            'thumbnail_src': next(iter(link.xpath('.//img //@src')), None)
        }
        url = urlparse(img.xpath('.//@href')[0])
        query = parse_qs(url.query)
        r['url'] = query['imgrefurl'][0]
        r['img_src'] = query['imgurl'][0]
        r['thumbnail_src'] = r['img_src']
        # append result
        results.append(r)

    # return results

        if not source_url or not result['thumbnail_src']:
            continue

        results.append(result)
    return results
+34 −36
Original line number Diff line number Diff line
@@ -8,7 +8,8 @@
# @stable      no
# @parse       url, title, content, publishedDate, thumbnail, embedded

from lxml import html
from functools import reduce
from json import loads
from searx.engines.xpath import extract_text
from searx.utils import list_get
from searx.url_utils import quote_plus
@@ -34,20 +35,6 @@ embedded_url = '<iframe width="540" height="304" ' +\

base_youtube_url = 'https://www.youtube.com/watch?v='

# specific xpath variables
results_xpath = "//ol/li/div[contains(@class, 'yt-lockup yt-lockup-tile yt-lockup-video vve-check')]"
url_xpath = './/h3/a/@href'
title_xpath = './/div[@class="yt-lockup-content"]/h3/a'
content_xpath = './/div[@class="yt-lockup-content"]/div[@class="yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2"]'


# returns extract_text on the first result selected by the xpath or None
def extract_text_from_dom(result, xpath):
    r = result.xpath(xpath)
    if len(r) > 0:
        return extract_text(r[0])
    return None


# do search-request
def request(query, params):
@@ -63,18 +50,29 @@ def request(query, params):
def response(resp):
    results = []

    dom = html.fromstring(resp.text)
    results_data = resp.text[resp.text.find('ytInitialData'):]
    results_data = results_data[results_data.find('{'):results_data.find(';\n')]

    # parse results
    for result in dom.xpath(results_xpath):
        videoid = list_get(result.xpath('@data-context-item-id'), 0)
    results_json = loads(results_data) if results_data else {}
    sections = results_json.get('contents', {})\
                           .get('twoColumnSearchResultsRenderer', {})\
                           .get('primaryContents', {})\
                           .get('sectionListRenderer', {})\
                           .get('contents', [])

    for section in sections:
        for video_container in section.get('itemSectionRenderer', {}).get('contents', []):
            video = video_container.get('videoRenderer', {})
            videoid = video.get('videoId')
            if videoid is not None:
                url = base_youtube_url + videoid
                thumbnail = 'https://i.ytimg.com/vi/' + videoid + '/hqdefault.jpg'

            title = extract_text_from_dom(result, title_xpath) or videoid
            content = extract_text_from_dom(result, content_xpath)

                title = video.get('title', {}).get('simpleText', videoid)
                description_snippet = video.get('descriptionSnippet', {})
                if 'runs' in description_snippet:
                    content = reduce(lambda a, b: a + b.get('text', ''), description_snippet.get('runs'), '')
                else:
                    content = description_snippet.get('simpleText', '')
                embedded = embedded_url.format(videoid=videoid)

                # append result
Loading