#  Startpage (Web)
#
# @website     https://startpage.com
# @provide-api no (nothing found)
#
# @using-api   no
# @results     HTML
# @stable      no (HTML can change)
# @parse       url, title, content
#
# @todo        paging

asciimoo's avatar
asciimoo committed
13
from lxml import html
14
15
from dateutil import parser
from datetime import datetime, timedelta
16
import re
Cqoicebordel's avatar
Cqoicebordel committed
17
from searx.engines.xpath import extract_text
18
19
20

# engine dependent config
categories = ['general']

# Startpage has a mechanism to block "bot" searches (probably based on
# the qid parameter); handling it would require storing qids between
# multiple search calls.
#
# paging = False
language_support = True
asciimoo's avatar
asciimoo committed
27

28
29
30
# search-url: the search endpoint is the 'do/search' path below base_url
base_url = 'https://startpage.com/'
search_url = base_url + 'do/search'
asciimoo's avatar
asciimoo committed
31

32
33
34
# specific xpath variables
#
# Older notes on the page layout:
#   ads xpath: //div[@id="results"]/div[@id="sponsored"]//div[@class="result"]
#   not ads:   div[@class="result"] are the direct children of div[@id="results"]
# NOTE(review): these notes describe div elements while results_xpath
# below selects li elements -- confirm they still apply.

# one node per search result
results_xpath = '//li[contains(@class, "search-result") and contains(@class, "search-item")]'
# anchor carrying the result url (href) and title, relative to a result node
link_xpath = './/h3/a'
# snippet paragraph, relative to a result node
content_xpath = './p[@class="search-item__body"]'
asciimoo's avatar
asciimoo committed
38

asciimoo's avatar
asciimoo committed
39

40
# do search-request
asciimoo's avatar
asciimoo committed
41
def request(query, params):
    """Fill ``params`` with the Startpage search request and return it.

    Sets ``params['url']`` to ``search_url``, ``params['method']`` to
    ``'POST'`` and ``params['data']`` to the form fields:

    * ``query``: the search terms as given,
    * ``startat``: ``(params['pageno'] - 1) * 10``,
    * ``with_language``: ``'lang_'`` followed by the part of
      ``params['language']`` before the first ``'-'``.

    The same ``params`` object is mutated in place and returned.
    """
    # result offset of the requested page (pages are 1-based)
    start_index = 10 * (params['pageno'] - 1)

    params['url'] = search_url
    params['method'] = 'POST'

    form_fields = {'query': query, 'startat': start_index}
    params['data'] = form_fields

    # set language: only the part before the first '-' is sent
    language_code = params['language'].split('-')[0]
    form_fields['with_language'] = 'lang_' + language_code

    return params


55
# get response from search-request
asciimoo's avatar
asciimoo committed
56
57
# Links matching any of these patterns are dropped from the results:
# google ad urls and startpage / ixquick search urls.
_blocked_url_patterns = (
    re.compile(r"^http(s|)://(www\.)?google\.[a-z]+/aclk.*$"),
    re.compile(r"^http(s|)://(www\.)?startpage\.com/do/search\?.*$"),
    re.compile(r"^http(s|)://(www\.)?ixquick\.com/do/search\?.*$"),
)

# Snippet prefix like "2 Sep 2014 ... "; group 1 is the date text.
_absolute_date_prefix = re.compile(
    r"^((?:[1-9]|[1-2][0-9]|3[0-1]) [A-Z][a-z]{2} [0-9]{4}) \.\.\. ")

# Snippet prefix like "5 days ago ... "; group 1 is the number of days.
_relative_date_prefix = re.compile(r"^([0-9]+) days? ago \.\.\. ")


def _split_published_date(content):
    """Split a leading date marker off a result snippet.

    Returns a ``(published_date, content)`` tuple. When ``content``
    starts with an absolute date ("2 Sep 2014 ... ") or a relative one
    ("5 days ago ... "), ``published_date`` is the matching ``datetime``
    and the marker is removed from the returned content. Otherwise, or
    when the marker cannot be turned into a valid date, ``published_date``
    is ``None`` and ``content`` is returned unchanged.
    """
    absolute = _absolute_date_prefix.match(content)
    if absolute:
        try:
            published_date = parser.parse(absolute.group(1), dayfirst=True)
        except (ValueError, OverflowError):
            # the prefix looks like a date but is not one (e.g. "31 Feb
            # 2014"); keep the snippet intact instead of failing the page
            return None, content
        return published_date, content[absolute.end():]

    relative = _relative_date_prefix.match(content)
    if relative:
        try:
            # calculate datetime
            published_date = datetime.now() - timedelta(days=int(relative.group(1)))
        except OverflowError:
            # day count too large for timedelta / datetime
            return None, content
        return published_date, content[relative.end():]

    return None, content


def response(resp):
    """Parse a Startpage result page into a list of result dicts.

    ``resp.text`` is parsed as HTML. Every node selected by
    ``results_xpath`` that has a link (``link_xpath``) with a non-empty
    ``href`` yields one dict with the keys ``url``, ``title`` and
    ``content``; ``publishedDate`` is added when the snippet starts with
    a recognised date marker. Results whose url matches one of
    ``_blocked_url_patterns`` are skipped.
    """
    results = []

    dom = html.fromstring(resp.text)

    # parse results
    for result in dom.xpath(results_xpath):
        links = result.xpath(link_xpath)
        if not links:
            continue
        link = links[0]

        url = link.attrib.get('href')
        if not url:
            # anchor without href: nothing to link to, and the url
            # filters below cannot match against None
            continue

        # block google-ad, startpage and ixquick search url's
        if any(pattern.match(url) for pattern in _blocked_url_patterns):
            continue

        title = extract_text(link)

        # evaluate the content xpath only once per result
        content_nodes = result.xpath(content_xpath)
        content = extract_text(content_nodes) if content_nodes else ''

        published_date, content = _split_published_date(content)

        # append result
        entry = {'url': url,
                 'title': title,
                 'content': content}
        if published_date is not None:
            entry['publishedDate'] = published_date
        results.append(entry)

    # return results
    return results