#!/usr/bin/python
#
# written by @eric_capuano
# https://github.com/ecapuano/web-traffic-generator
#
# published under MIT license :) do what you want.
#
# 20170714 shyft ADDED python 2.7 and 3.x compatibility and generic config

from __future__ import print_function

import random
import re
import time

import requests

try:
    import config
except ImportError:
    class ConfigClass:  # minimal config in case you don't have config.py
        clickDepth = 5  # how deep to browse from the rootURL
        minWait = 1  # minimum number of seconds to wait between HTTP requests
        maxWait = 3  # maximum number of seconds to wait between HTTP requests
        debug = True  # set to True to enable useful console output

        # use this single-item list to test how a site responds to this crawler;
        # be sure to comment out the list below it.
        # rootURLs = ["https://digg.com/"]
        rootURLs = [
            "https://www.reddit.com"
        ]

        # items can be a URL ("https://t.co") or a simple substring to check for ("amazon")
        blacklist = [
            'facebook.com',
            'pinterest.com'
        ]

        # must use a valid user agent or sites will hate you
        userAgent = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_3) ' \
                    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'

    config = ConfigClass
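
# If you prefer a standalone config, a minimal config.py might look like the
# sketch below (same attribute names as the fallback above; the values shown
# are illustrative, not recommendations):
#
#   clickDepth = 5
#   minWait = 1
#   maxWait = 3
#   debug = True
#   rootURLs = ["https://www.reddit.com"]
#   blacklist = ['facebook.com', 'pinterest.com']
#   userAgent = 'Mozilla/5.0 ...'  # any realistic browser UA string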

def doRequest(url):
    """Fetch url; return the Response on success, or False on failure."""
    global dataMeter
    global goodRequests
    global badRequests

    # randint is inclusive on both ends, so maxWait can actually be chosen
    sleepTime = random.randint(config.minWait, config.maxWait)

    if config.debug:
        print("requesting: %s" % url)

    headers = {'user-agent': config.userAgent}

    try:
        r = requests.get(url, headers=headers, timeout=5)
    except requests.exceptions.RequestException:
        time.sleep(30)  # else we'll enter a 100% CPU loop in a net-down situation
        return False

    status = r.status_code
    pageSize = len(r.content)
    dataMeter += pageSize

    if config.debug:
        print("Page size: %s" % pageSize)
        if dataMeter > 1000000:
            print("Data meter: %s MB" % (dataMeter / 1000000))
        else:
            print("Data meter: %s bytes" % dataMeter)

    if status != 200:
        badRequests += 1
        if config.debug:
            print("Response status: %s" % status)
        if status == 429:
            if config.debug:
                print("We're making requests too frequently... sleeping longer...")
            sleepTime += 30
    else:
        goodRequests += 1

    # sleep for a random number of seconds between requests
    if config.debug:
        print("Good requests: %s" % goodRequests)
        print("Bad requests: %s" % badRequests)
        print("Sleeping for %s seconds..." % sleepTime)
    time.sleep(sleepTime)

    return r
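
# Callers get a requests.Response back even for non-200 statuses, and False
# when the request itself failed, so the result should be truth-tested, e.g.
# (example.com is purely illustrative):
#   page = doRequest("https://example.com")
#   if page:
#       print(page.status_code)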

def getLinks(page):
    """Return all absolute http(s) links found on a page, minus blacklisted ones."""
    links = []
    pattern = r"(?:href\=\")(https?:\/\/[^\"]+)(?:\")"
    matches = re.findall(pattern, page.text)

    for match in matches:  # check all matches against config.blacklist
        if not any(bl in match for bl in config.blacklist):
            links.insert(0, match)

    return links
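
# For example, a page whose body contains (hypothetical markup)
#   <a href="https://example.com/a">A</a> <a href="/about">About</a>
# yields ["https://example.com/a"]; relative links such as "/about" are
# skipped because the pattern only matches absolute http(s):// hrefs.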

def browse(urls):
    currURL = 1
    urlCount = len(urls)

    for url in urls:
        page = doRequest(url)  # hit the current root URL
        if page:
            links = getLinks(page)  # extract links from the page
        else:
            if config.debug:
                print("Error requesting %s" % url)
            continue

        depth = 0
        while depth < config.clickDepth:
            if config.debug:
                print("------------------------------------------------------")
                print("config.blacklist: %s" % config.blacklist)

            # refresh the link count, which changes throughout the loop
            linkCount = len(links)

            if linkCount > 1:  # make sure we have more than one link to use
                if config.debug:
                    print("URL: %s / %s -- Depth: %s / %s"
                          % (currURL, urlCount, depth, config.clickDepth))
                    print("Choosing random link from total: %s" % linkCount)

                # randrange(n) picks from 0..n-1, so every link is reachable
                randomLink = random.randrange(linkCount)
                if config.debug:
                    print("Link chosen: %s of %s" % (randomLink, linkCount))
                clickLink = links[randomLink]

                try:
                    # browse to the randomly chosen link
                    sub_page = doRequest(clickLink)
                    if not sub_page:
                        if config.debug:
                            print("Error requesting %s" % clickLink)
                        break

                    # make sure we have more than one link to pick from
                    checkLinkCount = len(getLinks(sub_page))
                    if checkLinkCount > 1:
                        # extract links from the new page
                        links = getLinks(sub_page)
                    else:
                        # otherwise retry with the current link list
                        if config.debug:
                            print("Not enough links found! Found: %s -- "
                                  "Going back up a level" % checkLinkCount)
                        config.blacklist.insert(0, clickLink)
                        # remove the dead-end link from our list
                        del links[randomLink]
                except Exception:
                    if config.debug:
                        print("Exception on URL: %s -- "
                              "removing from list and trying again!" % clickLink)
                    # TODO: inspect the exception type for better debugging
                    config.blacklist.insert(0, clickLink)
                    # remove the dead-end link from our list
                    del links[randomLink]

                # increment the counter whether the request was successful or not,
                # so that we don't end up in an infinite failed-request loop
                depth += 1
            else:
                # we land here when a path dead-ends;
                # could implement logic to simply restart at the same root
                if config.debug:
                    print("Hit a dead end... Moving to the next root URL")
                depth = config.clickDepth

        currURL += 1  # move on to the next root URL

    if config.debug:
        print("Done.")

# initialize our global variables
dataMeter = 0
goodRequests = 0
badRequests = 0

while True:
    print("Traffic generator started...")
    print("----------------------------")
    print("https://github.com/ecapuano/web-traffic-generator")
    print("")
    print("Clicking %s links deep into %s different root URLs, "
          % (config.clickDepth, len(config.rootURLs)))
    print("waiting between %s and %s seconds between requests. "
          % (config.minWait, config.maxWait))
    print("")
    print("This script will run indefinitely. Ctrl+C to stop.")

    browse(config.rootURLs)
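
# To run (assuming the requests library is installed):
#   pip install requests
#   python gen.py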