Merge pull request #139 from internetarchive/fix-seencheck
Fix seencheck re-implementation with current queue
CorentinB authored Aug 21, 2024
2 parents ca6aa89 + f6a38bf, commit f3ae9cf
Showing 6 changed files with 89 additions and 52 deletions.
cmd/get.go (2 changes: 1 addition & 1 deletion)
@@ -34,7 +34,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
 	getCmd.PersistentFlags().String("cookies", "", "File containing cookies that will be used for requests.")
 	getCmd.PersistentFlags().Bool("keep-cookies", false, "Keep a global cookie jar")
 	getCmd.PersistentFlags().Bool("headless", false, "Use headless browsers instead of standard GET requests.")
-	getCmd.PersistentFlags().Bool("local-seencheck", false, "Simple local seencheck to avoid re-crawling of URIs.")
+	getCmd.PersistentFlags().Bool("disable-seencheck", false, "Disable the (remote or local) seencheck that avoids re-crawling of URIs.")
 	getCmd.PersistentFlags().Bool("json", false, "Output logs in JSON")
 	getCmd.PersistentFlags().Bool("debug", false, "")
 	getCmd.PersistentFlags().Bool("api", false, "Enable API")
config/config.go (2 changes: 1 addition & 1 deletion)
@@ -58,7 +58,7 @@ type Config struct {
 	HQBatchSize      int64 `mapstructure:"hq-batch-size"`
 	KeepCookies      bool  `mapstructure:"keep-cookies"`
 	Headless         bool  `mapstructure:"headless"`
-	LocalSeencheck   bool  `mapstructure:"local-seencheck"`
+	DisableSeencheck bool  `mapstructure:"disable-seencheck"`
 	JSON             bool  `mapstructure:"json"`
 	Debug            bool  `mapstructure:"debug"`
 	LiveStats        bool  `mapstructure:"live-stats"`
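
The mapstructure tags above imply the usual viper-style decoding, where every bound source (CLI flag, env var, config file) is unmarshaled into this struct in one pass. A hedged sketch of that wiring; Load is hypothetical, not necessarily Zeno's actual loader:

package config

import "github.com/spf13/viper"

// Load decodes all bound configuration sources into Config via the
// mapstructure tags, so "disable-seencheck" lands in DisableSeencheck.
func Load() (*Config, error) {
	var c Config
	if err := viper.Unmarshal(&c); err != nil {
		return nil, err
	}
	return &c, nil
}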
internal/pkg/crawl/capture.go (101 changes: 53 additions & 48 deletions)
@@ -149,18 +149,20 @@ func (c *Crawl) executeGET(item *queue.Item, req *http.Request, isRedirection bool
 
 	// Seencheck the URL
 	if c.UseSeencheck {
-		found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "seed")
-		if found {
-			return nil, errors.New("URL from redirection has already been seen")
-		}
-	} else if c.UseHQ {
-		isNewURL, err := c.HQSeencheckURL(URL)
-		if err != nil {
-			return resp, err
-		}
-
-		if !isNewURL {
-			return nil, errors.New("URL from redirection has already been seen")
+		if c.UseHQ {
+			isNewURL, err := c.HQSeencheckURL(URL)
+			if err != nil {
+				return resp, err
+			}
+
+			if !isNewURL {
+				return nil, errors.New("URL from redirection has already been seen")
+			}
+		} else {
+			found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "seed")
+			if found {
+				return nil, errors.New("URL from redirection has already been seen")
+			}
 		}
 	}
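
This hunk changes precedence, not just shape: before, the local seencheck and HQ were alternatives (else if), so enabling the local one shadowed HQ on this path; now UseSeencheck gates the whole check and HQ is preferred as the backend when available. A minimal standalone sketch of the resulting decision; function and parameter names are hypothetical, not the Zeno API:

// shouldSkipRedirect reports whether a redirect target should be skipped.
// Precedence after this commit: the seencheck toggle gates everything;
// when it is on, the HQ (remote) backend wins over the local table.
func shouldSkipRedirect(useSeencheck, useHQ bool, seenHQ, seenLocal func(u string) bool, u string) bool {
	if !useSeencheck {
		return false // seencheck disabled entirely: never skip
	}
	if useHQ {
		return seenHQ(u) // remote seencheck via crawl HQ
	}
	return seenLocal(u) // local seencheck table
}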

@@ -432,18 +434,19 @@ func (c *Crawl) Capture(item *queue.Item) error {
 	// because we already archived the URLs, we just want them to be added
 	// to the seencheck table.
 	if c.UseSeencheck {
-		for _, cfstreamURL := range cfstreamURLs {
-			c.Seencheck.SeencheckURL(cfstreamURL, "asset")
-		}
-	} else if c.UseHQ {
-		_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
-				"urls": cfstreamURLs,
-			})).Error("error while seenchecking assets via HQ")
+		if c.UseHQ {
+			_, err := c.HQSeencheckURLs(utils.StringSliceToURLSlice(cfstreamURLs))
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, item.URL, map[string]interface{}{
+					"urls": cfstreamURLs,
+				})).Error("error while seenchecking assets via HQ")
+			}
+		} else {
+			for _, cfstreamURL := range cfstreamURLs {
+				c.Seencheck.SeencheckURL(cfstreamURL, "asset")
+			}
 		}
 	}
 
 	// Log the archived URLs
 	for _, cfstreamURL := range cfstreamURLs {
 		c.Log.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
@@ -511,38 +514,40 @@ func (c *Crawl) Capture(item *queue.Item) error {
 	// seencheck DB. If they are, then they are skipped.
 	// Else, if we use HQ, then we use HQ's seencheck.
 	if c.UseSeencheck {
-		seencheckedBatch := []*url.URL{}
-
-		for _, URL := range assets {
-			found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset")
-			if found {
-				continue
-			}
-			seencheckedBatch = append(seencheckedBatch, URL)
-		}
-
-		if len(seencheckedBatch) == 0 {
-			return err
-		}
-
-		assets = seencheckedBatch
-	} else if c.UseHQ {
-		seencheckedURLs, err := c.HQSeencheckURLs(assets)
-		// We ignore the error here because we don't want to slow down the crawl
-		// if HQ is down or if the request failed. So if we get an error, we just
-		// continue with the original list of assets.
-		if err != nil {
-			c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
-				"urls":      assets,
-				"parentHop": item.Hop,
-				"parentUrl": utils.URLToString(item.URL),
-			})).Error("error while seenchecking assets via HQ")
-		} else {
-			assets = seencheckedURLs
-		}
-
-		if len(assets) == 0 {
-			return err
-		}
+		if c.UseHQ {
+			seencheckedURLs, err := c.HQSeencheckURLs(assets)
+			// We ignore the error here because we don't want to slow down the crawl
+			// if HQ is down or if the request failed. So if we get an error, we just
+			// continue with the original list of assets.
+			if err != nil {
+				c.Log.WithFields(c.genLogFields(err, nil, map[string]interface{}{
+					"urls":      assets,
+					"parentHop": item.Hop,
+					"parentUrl": utils.URLToString(item.URL),
+				})).Error("error while seenchecking assets via HQ")
+			} else {
+				assets = seencheckedURLs
+			}
+
+			if len(assets) == 0 {
+				return err
+			}
+		} else {
+			seencheckedBatch := []*url.URL{}
+
+			for _, URL := range assets {
+				found := c.Seencheck.SeencheckURL(utils.URLToString(URL), "asset")
+				if found {
+					continue
+				}
+				seencheckedBatch = append(seencheckedBatch, URL)
+			}
+
+			if len(seencheckedBatch) == 0 {
+				return err
+			}
+
+			assets = seencheckedBatch
+		}
 	}
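
One design choice in the hunk above worth noting: the HQ branch is fail-open. If the HQ call errors, the crawl logs and keeps the unfiltered asset list instead of stalling, accepting possible duplicate fetches; only an empty post-filter list aborts early. A self-contained sketch of the policy (names hypothetical, not the Zeno API):

// filterFailOpen applies a remote seencheck filter but falls back to the
// unfiltered input when the remote call errors, trading possible duplicate
// fetches for crawl availability.
func filterFailOpen(urls []string, remote func([]string) ([]string, error)) []string {
	filtered, err := remote(urls)
	if err != nil {
		return urls // fail open: keep everything rather than blocking the crawl
	}
	return filtered
}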
internal/pkg/crawl/config.go (2 changes: 1 addition & 1 deletion)
@@ -227,7 +227,7 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) {
 		})
 	}
 
-	c.UseSeencheck = config.LocalSeencheck
+	c.UseSeencheck = !config.DisableSeencheck
 	c.HTTPTimeout = config.HTTPTimeout
 	c.MaxConcurrentRequestsPerDomain = config.MaxConcurrentRequestsPerDomain
 	c.RateLimitDelay = config.ConcurrentSleepLength
internal/pkg/crawl/crawl.go (27 changes: 26 additions & 1 deletion)
@@ -2,6 +2,7 @@
 package crawl
 
 import (
+	"os"
 	"path"
 	"sync"
 	"time"
@@ -155,7 +156,7 @@ func (c *Crawl) Start() (err error) {
 		go c.HQProducer()
 		go c.HQFinisher()
 		go c.HQWebsocket()
-	} else {
+	} else if len(c.SeedList) > 0 {
 		// Temporarily disable handover as it's not needed
 		enableBackHandover := make(chan struct{})
 		syncHandover := make(chan struct{})
@@ -166,6 +167,27 @@ func (c *Crawl) Start() (err error) {
 
 			<-syncHandover
 		}
+
+		// Dedupe the seeds list
+		if c.UseSeencheck {
+			c.Log.Info("Seenchecking seeds list..")
+
+			var seencheckedSeeds []queue.Item
+			var duplicates int
+			for i := 0; i < len(c.SeedList); i++ {
+				if c.Seencheck.SeencheckURL(c.SeedList[i].URL.String(), "seed") {
+					duplicates++
+					continue
+				}
+
+				seencheckedSeeds = append(seencheckedSeeds, c.SeedList[i])
+			}
+
+			c.SeedList = seencheckedSeeds
+
+			c.Log.Info("Seencheck done", "duplicates", duplicates)
+		}
+
 		// Push the seed list to the queue
 		c.Log.Info("Pushing seeds in the local queue..")
 		for i := 0; i < len(c.SeedList); i += 100000 {
@@ -199,6 +221,9 @@ func (c *Crawl) Start() (err error) {
 		close(syncHandover)
 
 		c.Log.Info("All seeds are now in queue")
+	} else {
+		c.Log.Info("No seeds to crawl")
+		os.Exit(0)
 	}
 
 	// Start the workers pool by building all the workers and starting them
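
The seed-dedup pass added above relies on the seencheck's record-and-report behavior: judging by its use here and in capture.go, SeencheckURL returns true for an already-seen URL and records unseen ones as a side effect, so a single linear pass both filters the seed list and primes the table. A runnable sketch with simplified types (strings instead of queue.Item; the closure stands in for the Seencheck store):

package main

import "fmt"

// dedupe keeps the first occurrence of each seed and counts duplicates,
// mirroring the loop added to Start above.
func dedupe(seeds []string, seen func(url string) bool) (kept []string, duplicates int) {
	for _, s := range seeds {
		if seen(s) { // true means already recorded: drop the duplicate
			duplicates++
			continue
		}
		kept = append(kept, s)
	}
	return kept, duplicates
}

func main() {
	table := map[string]bool{}
	seen := func(u string) bool { // stand-in for Seencheck.SeencheckURL(u, "seed")
		if table[u] {
			return true
		}
		table[u] = true
		return false
	}

	kept, dups := dedupe([]string{"https://a.example", "https://a.example", "https://b.example"}, seen)
	fmt.Println(kept, dups) // [https://a.example https://b.example] 1
}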
internal/pkg/crawl/outlinks.go (7 changes: 7 additions & 0 deletions)
@@ -84,6 +84,13 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *queue.Item, wg *sync.WaitGroup) {
 			continue
 		}
 
+		// Seencheck the outlink
+		if c.UseSeencheck {
+			if c.Seencheck.SeencheckURL(utils.URLToString(outlink), "seed") {
+				continue
+			}
+		}
+
 		if c.DomainsCrawl && strings.Contains(item.URL.Host, outlink.Host) && item.Hop == 0 {
 			newItem, err := queue.NewItem(outlink, item.URL, "seed", 0, "", false)
 			if err != nil {
