refactor: refactor functions to handle redirections and improve error handling (#53)

- Modify the `executeGET` function signature to include an additional boolean parameter `isRedirection`
- Add a comment explaining that crawls are temporarily paused for individual hosts that exceed the configured maximum concurrent requests per domain, except when the request is a redirection
- Replace the function `isRedirection` with `isStatusCodeRedirect` in the `executeGET` function
- Modify the `captureAsset` function signature to include an additional boolean parameter `isRedirection`
- Replace the function `isRedirection` with `isStatusCodeRedirect` in the `captureAsset` function
- Rework `SetupLogging` so that failures to create the Elasticsearch hooks are logged and retried instead of causing a panic (a standalone sketch of this retry pattern follows immediately after this list)
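A standalone sketch of that retry pattern, for reference; the bounded attempt budget, the pause between attempts, and the `create` parameter are illustrative and not part of the commit, which retries indefinitely by jumping back to a `newHook:` label:

package logging // illustrative package name, not from the repository

import (
	"fmt"
	"time"

	"github.com/sirupsen/logrus"
)

// createHookWithRetry calls a hook constructor until it succeeds or the attempt
// budget is exhausted, logging each failure along the way. The commit achieves
// the same effect with an unbounded retry via goto.
func createHookWithRetry(create func() (logrus.Hook, error), maxAttempts int) (logrus.Hook, error) {
	var lastErr error
	for attempt := 1; attempt <= maxAttempts; attempt++ {
		hook, err := create()
		if err == nil {
			return hook, nil
		}
		lastErr = err
		logrus.Error(err)
		time.Sleep(time.Second)
	}
	return nil, fmt.Errorf("hook creation failed after %d attempts: %w", maxAttempts, lastErr)
}

Each of the three `elogrus.NewAsyncElasticHook` calls in the diff below could be wrapped in such a closure, which would also give the retry loop an upper bound.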

Signed-off-by: Corentin Barreau <corentin@archive.org>
CorentinB authored Aug 3, 2023
1 parent 2724577 commit 7ce9f24
Showing 3 changed files with 22 additions and 18 deletions.
28 changes: 14 additions & 14 deletions internal/pkg/crawl/capture.go
@@ -21,7 +21,7 @@ import (
"github.com/CorentinB/Zeno/internal/pkg/frontier"
)

func (c *Crawl) executeGET(item *frontier.Item, req *http.Request) (resp *http.Response, err error) {
func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection bool) (resp *http.Response, err error) {
var (
executionStart = time.Now()
newItem *frontier.Item
@@ -48,14 +48,17 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request) (resp *http.R
time.Sleep(time.Second)
}

// Temporarily pause crawls for individual hosts if they are over our configured maximum concurrent requests per domain.
for c.shouldPause(item.Host) {
time.Sleep(time.Millisecond * time.Duration(c.RateLimitDelay))
}
// Temporarily pause crawls for individual hosts if they are over our configured maximum concurrent requests per domain.
// If the request is a redirection, we do not pause the crawl because we want to follow the redirection.
if !isRedirection {
	for c.shouldPause(item.Host) {
		time.Sleep(time.Millisecond * time.Duration(c.RateLimitDelay))
	}

	c.CrawlPool.Incr(item.Host)

	defer c.CrawlPool.Decr(item.Host)
}

// Retry on 429 error
for retry := 0; retry < c.MaxRetry; retry++ {
@@ -109,7 +112,7 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request) (resp *http.R
c.logCrawlSuccess(executionStart, resp.StatusCode, item)

// If a redirection is caught, then we execute the redirection
if isRedirection(resp.StatusCode) {
if isStatusCodeRedirect(resp.StatusCode) {
if resp.Header.Get("location") == utils.URLToString(req.URL) || item.Redirect >= c.MaxRedirect {
return resp, nil
}
@@ -160,10 +163,7 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request) (resp *http.R
newReq.Header.Set("User-Agent", c.UserAgent)
newReq.Header.Set("Referer", utils.URLToString(newItem.ParentItem.URL))

resp, err = c.executeGET(newItem, newReq)
if err != nil {
return resp, err
}
return c.executeGET(newItem, newReq, true)
}

return resp, nil
@@ -186,7 +186,7 @@ func (c *Crawl) captureAsset(item *frontier.Item, cookies []*http.Cookie) error
req.AddCookie(cookies[i])
}

resp, err = c.executeGET(item, req)
resp, err = c.executeGET(item, req, false)
if err != nil && err.Error() == "URL from redirection has already been seen" {
return nil
} else if err != nil {
@@ -234,7 +234,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
}

// Execute request
resp, err = c.executeGET(item, req)
resp, err = c.executeGET(item, req, false)
if err != nil && err.Error() == "URL from redirection has already been seen" {
return
} else if err != nil {
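Read in isolation, the per-host gating that capture.go now applies before issuing a GET looks roughly like the sketch below; `hostBusy` and `rateLimitDelay` are stand-ins for the crawl's `shouldPause` and `RateLimitDelay`, and the function name is not from the repository:

package crawlsketch // illustrative only

import "time"

// waitForHostSlot blocks while the host is over its concurrency budget, unless
// the request is itself a redirect follow-up, which is let through immediately
// so the redirect chain can complete.
func waitForHostSlot(host string, isRedirection bool, hostBusy func(string) bool, rateLimitDelay time.Duration) {
	if isRedirection {
		return
	}
	for hostBusy(host) {
		time.Sleep(rateLimitDelay)
	}
}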
2 changes: 1 addition & 1 deletion internal/pkg/crawl/utils.go
@@ -107,7 +107,7 @@ func (c *Crawl) shouldPause(host string) bool {
}
}

func isRedirection(statusCode int) bool {
func isStatusCodeRedirect(statusCode int) bool {
if statusCode == 300 || statusCode == 301 ||
statusCode == 302 || statusCode == 307 ||
statusCode == 308 {
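The renamed helper treats exactly 300, 301, 302, 307 and 308 as redirects. An equivalent formulation using the named constants from `net/http`, shown here only for readability and not part of the commit:

package crawl // illustrative only

import "net/http"

// isStatusCodeRedirect reports whether the status code is one of the redirect
// codes the crawler follows: 300, 301, 302, 307 or 308.
func isStatusCodeRedirect(statusCode int) bool {
	switch statusCode {
	case http.StatusMultipleChoices, // 300
		http.StatusMovedPermanently, // 301
		http.StatusFound, // 302
		http.StatusTemporaryRedirect, // 307
		http.StatusPermanentRedirect: // 308
		return true
	default:
		return false
	}
}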
10 changes: 7 additions & 3 deletions internal/pkg/utils/utils.go
@@ -37,19 +37,23 @@ func SetupLogging(jobPath string, liveStats bool, esURL string) (logInfo, logWar
}

go func() {
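// The label below lets hook creation be retried: on any error the goroutine logs it and jumps back here instead of panicking (descriptive comment added for clarity, not part of the commit).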
newHook:
hookInfo, err := elogrus.NewAsyncElasticHook(client, hostname, logrus.InfoLevel, "zeno-"+time.Now().Format("2006.01.02"))
if err != nil {
logrus.Panic(err)
logrus.Error(err)
goto newHook
}

hookWarning, err := elogrus.NewAsyncElasticHook(client, hostname, logrus.WarnLevel, "zeno-"+time.Now().Format("2006.01.02"))
if err != nil {
logrus.Panic(err)
logrus.Error(err)
goto newHook
}

hookError, err := elogrus.NewAsyncElasticHook(client, hostname, logrus.ErrorLevel, "zeno-"+time.Now().Format("2006.01.02"))
if err != nil {
logrus.Panic(err)
logrus.Error(err)
goto newHook
}

logInfo.Hooks.Add(hookInfo)
