-
Notifications
You must be signed in to change notification settings - Fork 4
/
envato.py
executable file
·157 lines (122 loc) · 4.49 KB
/
envato.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/env python3
# see http://www.ibm.com/developerworks/aix/library/au-pythocli/index.html
import csv
import optparse
import urllib
from urllib.parse import urljoin
import requests
import sys
from bs4 import BeautifulSoup, NavigableString
from tabulate import tabulate
def main():
    """Parse the command line, scrape the search results and print them.

    Options:
        --category/-c  category to restrict the search to (default: none).
        --pages/-p     number of result pages to fetch (default: 1).
        --search/-s    term to search for (default: none).
        --output/-o    output format, ``table`` (default) or ``csv``.

    Exits with the message ``Unknown output format`` when ``--output`` is
    neither ``table`` nor ``csv``.
    """
    p = optparse.OptionParser()
    p.add_option(
        '--category', '-c', default='',
        help="The category to choose (site-templates, wordpress, psd-templates, marketing, ecommerce, "
             + "cms-themes, muse-templates, blogging, courses, sketch-templates, forums, "
             + "static-site-generators, typeengine-themes)")
    # type='int' lets optparse report a usage error for non-numeric input
    # instead of an uncaught ValueError traceback.
    p.add_option('--pages', '-p', default=1, type='int', help="Number of pages to fetch")
    p.add_option('--search', '-s', default='', help="Term to search for")
    p.add_option('--output', '-o', default='table', help="The output format (csv or table)")
    options, _arguments = p.parse_args()

    # extract options
    category = options.category
    max_page_count = options.pages
    search_term = options.search
    output_format = options.output

    # Reject a bad output format before any network request is made, so a
    # typo does not cost a full scrape.
    if output_format not in ('table', 'csv'):
        sys.exit('Unknown output format')

    # fetch pages
    pages = fetch_html_pages(max_page_count, search_term, category)

    # extract items
    items = extract_items(pages)

    # generate output
    if output_format == 'table':
        output_table(items)
    else:
        output_csv(items)
def extract_items(pages):
    """Collect the item dicts found in the given HTML pages.

    Args:
        pages: iterable of HTML documents (strings).

    Returns:
        A list with one dict per product entry, as produced by
        ``extract_item``, in page order. Pages that contain no element with
        the ``product-list`` class contribute no items.
    """
    items = []
    for page in pages:
        soup = BeautifulSoup(page, 'html.parser')
        product_list = soup.find(attrs={'class': 'product-list'})
        if product_list is None:
            # No product list on this page; indexing the empty result list
            # here used to raise IndexError.
            continue
        # Direct children include whitespace/text nodes; only the element
        # children are product entries.
        items.extend(
            extract_item(child)
            for child in product_list.contents
            if not isinstance(child, NavigableString)
        )
    return items
def fetch_html_pages(page_count, search_term, category):
    """Download up to ``page_count`` search result pages.

    Args:
        page_count: maximum number of pages to request, starting at page 1.
        search_term: term passed to the search.
        category: category passed to the search.

    Returns:
        A list with the HTML text of every page fetched. Fetching stops
        early when a request is answered with a 302 redirect, which is
        treated as "requested page is past the last page".

    Exits the program with a message on any other redirect history or on a
    non-200 final status code.
    """
    pages = []
    for page_number in range(1, page_count + 1):
        url = get_url(page_number, term=search_term, category=category)
        # A timeout keeps the tool from hanging forever on a stalled server.
        r = requests.get(url, timeout=30)
        if r.history:
            # 302 means last page was exceeded
            if r.history[0].status_code == 302:
                break
            sys.exit('Unexpected history when requesting ' + url)
        if r.status_code != 200:
            # unexpected status code (Response has no ``status`` attribute;
            # the code lives in ``status_code``)
            sys.exit('HTTP code is ' + str(r.status_code))
        # only save text to save space
        pages.append(r.text)
    return pages
def output_csv(items):
    """Write the items to standard output as CSV with a header row.

    Args:
        items: list of dicts. The column set is the union of all keys in
            first-seen order, so items that lack a key (or introduce one
            that the first item does not have) are written with an empty
            cell instead of raising ValueError.

    Nothing is written when ``items`` is empty.
    """
    if not items:
        # No rows means no way to derive a header; avoid IndexError.
        return

    # dict.fromkeys de-duplicates while keeping first-seen order.
    fieldnames = list(dict.fromkeys(key for item in items for key in item))
    list_writer = csv.DictWriter(
        sys.stdout,
        fieldnames=fieldnames,
        restval='',
        delimiter=',',
        quotechar='"',
        quoting=csv.QUOTE_MINIMAL,
    )
    list_writer.writeheader()
    list_writer.writerows(items)
def output_table(items):
    """Print the items as a text table, using the dict keys as headers."""
    print(tabulate(items, headers='keys'))
def get_url(page=1, term='', category=''):
    """Return the themeforest.net search URL for one result page.

    Args:
        page: result page number.
        term: search term.
        category: category to search in.

    All other query parameters are fixed: list view, sorted by sales, and
    no date, price, sales-rank or rating filter.
    """
    # https://docs.python.org/3/library/urllib.parse.html#urllib.parse.urlencode
    params = [
        ('page', page),
        ('utf8', '✓'),
        ('term', term),
        ('referrer', 'search'),
        ('view', 'list'),       # grid
        ('sort', 'sales'),      # empty (is newest), sales, rating, price-asc, price-desc
        ('date', ''),           # this-year, this-month, this-week, this-day
        # site-templates, wordpress, psd-templates, marketing, ecommerce, cms-themes, muse-templates,
        # blogging, courses, sketch-templates, forums, static-site-generators, typeengine-themes
        ('category', category),
        ('price_min', ''),      # int
        ('price_max', ''),      # int
        ('sales', ''),          # rank-0 (no sales) to rank-4 (top sellers)
        ('rating_min', ''),     # empty, 1 to 4
    ]
    encoded = urllib.parse.urlencode(params, doseq=True)
    return 'https://themeforest.net/search?' + encoded
def extract_item(li):
    """Build the item dict for one product-list entry.

    Args:
        li: the parsed element of a single product entry; it must contain
            an ``h3`` with a link and an element with the
            ``product-list__price`` class.

    Returns:
        A dict with the keys ``name`` (heading text), ``link`` (absolute
        item URL) and ``price`` (price text without ``$``). The key
        ``demo`` (absolute preview URL) is added only when the entry holds
        exactly one ``item-thumbnail__preview`` element.

    Raises:
        IndexError: if the heading or the price element is missing.
    """
    template = {}

    # heading
    heading = li.find_all("h3")[0]
    template['name'] = heading.text.strip()

    # template url
    template['link'] = make_link_absolute(heading.a['href'])

    # price
    price_element = li.find_all(attrs={'class': 'product-list__price'})[0]
    template['price'] = price_element.text.strip().replace('$', '')

    # demo url: search once and reuse the result instead of querying twice
    previews = li.find_all(attrs={'class': 'item-thumbnail__preview'})
    if len(previews) == 1:
        template['demo'] = make_link_absolute(previews[0].a['href'])

    return template
def make_link_absolute(url):
    """Resolve ``url`` against the themeforest.net root and return the result."""
    site_root = 'https://themeforest.net/'
    return urljoin(site_root, url)
# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()