Skip to content

Commit

Permalink
Use Go's .String() method & remove 'get list' URL validator
Browse files Browse the repository at this point in the history
  • Loading branch information
CorentinB committed Aug 22, 2023
1 parent 56277ed commit 8c6768a
Show file tree
Hide file tree
Showing 10 changed files with 52 additions and 280 deletions.
30 changes: 15 additions & 15 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection

// If a redirection is caught, then we execute the redirection
if isStatusCodeRedirect(resp.StatusCode) {
if resp.Header.Get("location") == utils.URLToString(req.URL) || item.Redirect >= c.MaxRedirect {
if resp.Header.Get("location") == req.URL.String() || item.Redirect >= c.MaxRedirect {
return resp, nil
}
defer resp.Body.Close()
Expand All @@ -135,7 +135,7 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection

// Seencheck the URL
if c.Seencheck {
found := c.seencheckURL(utils.URLToString(URL), "seed")
found := c.seencheckURL(URL.String(), "seed")
if found {
return nil, errors.New("URL from redirection has already been seen")
}
Expand All @@ -154,14 +154,14 @@ func (c *Crawl) executeGET(item *frontier.Item, req *http.Request, isRedirection
newItem.Redirect = item.Redirect + 1

// Prepare GET request
newReq, err = http.NewRequest("GET", utils.URLToString(URL), nil)
newReq, err = http.NewRequest("GET", URL.String(), nil)
if err != nil {
return resp, err
}

// Set new request headers on the new request :(
newReq.Header.Set("User-Agent", c.UserAgent)
newReq.Header.Set("Referer", utils.URLToString(newItem.ParentItem.URL))
newReq.Header.Set("Referer", newItem.ParentItem.URL.String())

return c.executeGET(newItem, newReq, true)
}
Expand All @@ -173,12 +173,12 @@ func (c *Crawl) captureAsset(item *frontier.Item, cookies []*http.Cookie) error
var resp *http.Response

// Prepare GET request
req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
req, err := http.NewRequest("GET", item.URL.String(), nil)
if err != nil {
return err
}

req.Header.Set("Referer", utils.URLToString(item.ParentItem.URL))
req.Header.Set("Referer", item.ParentItem.URL.String())
req.Header.Set("User-Agent", c.UserAgent)

// Apply cookies obtained from the original URL captured
Expand Down Expand Up @@ -216,14 +216,14 @@ func (c *Crawl) Capture(item *frontier.Item) {
}(item)

// Prepare GET request
req, err := http.NewRequest("GET", utils.URLToString(item.URL), nil)
req, err := http.NewRequest("GET", item.URL.String(), nil)
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while preparing GET request")
return
}

if item.Hop > 0 && item.ParentItem != nil {
req.Header.Set("Referer", utils.URLToString(item.ParentItem.URL))
req.Header.Set("Referer", item.ParentItem.URL.String())
}

req.Header.Set("User-Agent", c.UserAgent)
Expand Down Expand Up @@ -259,7 +259,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
go c.queueOutlinks(utils.MakeAbsolute(item.URL, utils.StringSliceToURLSlice(discovered)), item, &waitGroup)

// Store the base URL to turn relative links into absolute links later
base, err := url.Parse(utils.URLToString(resp.Request.URL))
base, err := url.Parse(resp.Request.URL.String())
if err != nil {
logError.WithFields(c.genLogFields(err, item.URL, nil)).Error("error while parsing base URL")
return
Expand Down Expand Up @@ -329,7 +329,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
for _, cfstreamURL := range cfstreamURLs {
logInfo.WithFields(c.genLogFields(err, cfstreamURL, map[string]interface{}{
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"parentUrl": item.URL.String(),
"type": "asset",
})).Info("URL archived")
}
Expand Down Expand Up @@ -393,7 +393,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
seencheckedBatch := []*url.URL{}

for _, URL := range assets {
found := c.seencheckURL(utils.URLToString(URL), "asset")
found := c.seencheckURL(URL.String(), "asset")
if found {
continue
} else {
Expand All @@ -415,7 +415,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
logError.WithFields(c.genLogFields(err, nil, map[string]interface{}{
"urls": assets,
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"parentUrl": item.URL.String(),
})).Error("error while seenchecking assets via HQ")
} else {
assets = seencheckedURLs
Expand All @@ -434,7 +434,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
c.Frontier.QueueCount.Incr(-1)

// Just making sure we do not over archive by archiving the original URL
if utils.URLToString(item.URL) == utils.URLToString(asset) {
if item.URL.String() == asset.String() {
continue
}

Expand All @@ -446,7 +446,7 @@ func (c *Crawl) Capture(item *frontier.Item) {

// If the URL matches any excluded string, we ignore it
for _, excludedString := range c.ExcludedStrings {
if strings.Contains(utils.URLToString(asset), excludedString) {
if strings.Contains(asset.String(), excludedString) {
excluded = true
break
}
Expand All @@ -470,7 +470,7 @@ func (c *Crawl) Capture(item *frontier.Item) {
if err != nil {
logError.WithFields(c.genLogFields(err, &asset, map[string]interface{}{
"parentHop": item.Hop,
"parentUrl": utils.URLToString(item.URL),
"parentUrl": item.URL.String(),
"type": "asset",
})).Error("error while capturing asset")
return
Expand Down
10 changes: 5 additions & 5 deletions internal/pkg/crawl/hq.go
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,8 @@ func (c *Crawl) HQProducer() {
// listen to the discovered channel and add the URLs to the discoveredArray
for discoveredItem := range c.HQProducerChannel {
discoveredURL := gocrawlhq.URL{
Value: utils.URLToString(discoveredItem.URL),
Via: utils.URLToString(discoveredItem.ParentItem.URL),
Value: discoveredItem.URL.String(),
Via: discoveredItem.ParentItem.URL.String(),
}

for i := 0; uint8(i) < discoveredItem.Hop; i++ {
Expand Down Expand Up @@ -189,7 +189,7 @@ func (c *Crawl) HQFinisher() {
}

locallyCrawledTotal += int(finishedItem.LocallyCrawled)
finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: utils.URLToString(finishedItem.URL)})
finishedArray = append(finishedArray, gocrawlhq.URL{ID: finishedItem.ID, Value: finishedItem.URL.String()})

if len(finishedArray) == int(math.Ceil(float64(c.Workers)/2)) {
for {
Expand Down Expand Up @@ -232,7 +232,7 @@ func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, e

for _, URL := range URLs {
discoveredURLs = append(discoveredURLs, gocrawlhq.URL{
Value: utils.URLToString(URL),
Value: URL.String(),
})
}

Expand Down Expand Up @@ -265,7 +265,7 @@ func (c *Crawl) HQSeencheckURLs(URLs []*url.URL) (seencheckedBatch []*url.URL, e

func (c *Crawl) HQSeencheckURL(URL *url.URL) (bool, error) {
discoveredURL := gocrawlhq.URL{
Value: utils.URLToString(URL),
Value: URL.String(),
}

discoveredResponse, err := c.HQClient.Discovered([]gocrawlhq.URL{discoveredURL}, "asset", false, true)
Expand Down
6 changes: 3 additions & 3 deletions internal/pkg/crawl/log.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,9 @@ func (c *Crawl) genLogFields(err interface{}, URL interface{}, additionalFields
case string:
fields["url"] = URLValue
case *url.URL:
fields["url"] = utils.URLToString(URLValue)
fields["url"] = URLValue.String()
case url.URL:
fields["url"] = utils.URLToString(&URLValue)
fields["url"] = URLValue.String()
default:
}

Expand All @@ -85,7 +85,7 @@ func (c *Crawl) logCrawlSuccess(executionStart time.Time, statusCode int, item *
fields["hop"] = item.Hop
fields["type"] = item.Type
fields["executionTime"] = time.Since(executionStart).Milliseconds()
fields["url"] = utils.URLToString(item.URL)
fields["url"] = item.URL.String()

logInfo.WithFields(fields).Info("URL archived")
}
2 changes: 1 addition & 1 deletion internal/pkg/crawl/outlinks.go
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ func (c *Crawl) queueOutlinks(outlinks []*url.URL, item *frontier.Item, wg *sync

// If the outlink matches any excluded string, we ignore it
for _, excludedString := range c.ExcludedStrings {
if strings.Contains(utils.URLToString(outlink), excludedString) {
if strings.Contains(outlink.String(), excludedString) {
excluded = true
break
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@ import (
"strconv"
"strings"

"github.com/CorentinB/Zeno/internal/pkg/utils"
"github.com/CorentinB/warc"
"github.com/PuerkitoBio/goquery"
)
Expand Down Expand Up @@ -95,13 +94,13 @@ func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.Cu
resp.Body.Close()

// Get the new URL
resp, err = httpClient.Get(utils.URLToString(location))
resp, err = httpClient.Get(location.String())
if err != nil {
return archivedURLs, err
}
defer resp.Body.Close()

archivedURLs = append(archivedURLs, utils.URLToString(location))
archivedURLs = append(archivedURLs, location.String())
}

if resp.StatusCode != 200 {
Expand Down Expand Up @@ -136,7 +135,7 @@ func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.Cu
}

// Get the video ID from the watchPageURL, it's the string between the first slash after the host and the second slash
videoID := strings.Replace(strings.Replace(utils.URLToString(watchPageURL), "/watch", "", 1), "https://"+watchPageURL.Host+"/", "", 1)
videoID := strings.Replace(strings.Replace(watchPageURL.String(), "/watch", "", 1), "https://"+watchPageURL.Host+"/", "", 1)

// Build the iframe URL
iframeURLString := baseURL.Scheme + "://" + baseURL.Host + "/embed/" + iframeFilename + "?videoId=" + videoID
Expand All @@ -151,13 +150,13 @@ func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.Cu
// we will look for the iframe-player JS file
var iframePlayerURL string

iframeURLResp, err := httpClient.Get(utils.URLToString(iframeURL))
iframeURLResp, err := httpClient.Get(iframeURL.String())
if err != nil {
return archivedURLs, err
}
defer iframeURLResp.Body.Close()

archivedURLs = append(archivedURLs, utils.URLToString(iframeURL))
archivedURLs = append(archivedURLs, iframeURL.String())

// Check that the status code is 200
if iframeURLResp.StatusCode == 301 {
Expand All @@ -170,13 +169,13 @@ func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.Cu
iframeURLResp.Body.Close()

// Get the new URL
iframeURLResp, err = httpClient.Get(utils.URLToString(location))
iframeURLResp, err = httpClient.Get(location.String())
if err != nil {
return archivedURLs, err
}
defer iframeURLResp.Body.Close()

archivedURLs = append(archivedURLs, utils.URLToString(location))
archivedURLs = append(archivedURLs, location.String())
}

if iframeURLResp.StatusCode != 200 {
Expand Down Expand Up @@ -229,13 +228,13 @@ func GetJSFiles(doc *goquery.Document, watchPageURL *url.URL, httpClient warc.Cu
iframePlayerResp.Body.Close()

// Get the new URL
iframePlayerResp, err = httpClient.Get(utils.URLToString(location))
iframePlayerResp, err = httpClient.Get(location.String())
if err != nil {
return archivedURLs, err
}
defer iframePlayerResp.Body.Close()

archivedURLs = append(archivedURLs, utils.URLToString(location))
archivedURLs = append(archivedURLs, location.String())
}

if iframePlayerResp.StatusCode != 200 {
Expand Down Expand Up @@ -321,11 +320,11 @@ func GetSegments(URL *url.URL, httpClient warc.CustomHTTPClient) (URLs []*url.UR
)

// Replace /watch with /manifest/video.mpd if the URL ends with /watch, else, raise an error
if len(utils.URLToString(URL)) < 6 {
if len(URL.String()) < 6 {
return nil, errors.New("cloudflaresteam.GetSegments: URL too short")
} else {
if strings.HasSuffix(utils.URLToString(URL), "/watch") {
mpdURL = strings.Replace(utils.URLToString(URL), "/watch", "/manifest/video.mpd?parentOrigin="+URL.Scheme+"://"+URL.Host, 1)
if strings.HasSuffix(URL.String(), "/watch") {
mpdURL = strings.Replace(URL.String(), "/watch", "/manifest/video.mpd?parentOrigin="+URL.Scheme+"://"+URL.Host, 1)
} else {
return nil, errors.New("cloudflaresteam.GetSegments: URL does not end with /watch")
}
Expand Down
3 changes: 1 addition & 2 deletions internal/pkg/frontier/item.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ package frontier
import (
"net/url"

"github.com/CorentinB/Zeno/internal/pkg/utils"
"github.com/zeebo/xxh3"
)

Expand Down Expand Up @@ -31,7 +30,7 @@ func NewItem(URL *url.URL, parentItem *Item, itemType string, hop uint8, ID stri
item.Host = URL.Host
item.Hop = hop
item.ParentItem = parentItem
item.Hash = xxh3.HashString(utils.URLToString(URL))
item.Hash = xxh3.HashString(URL.String())
item.Type = itemType

return item
Expand Down
21 changes: 10 additions & 11 deletions internal/pkg/frontier/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ import (
"net/url"
"os"

"github.com/CorentinB/Zeno/internal/pkg/utils"
"github.com/gosuri/uilive"
"github.com/sirupsen/logrus"
)
Expand All @@ -34,7 +33,7 @@ func IsSeedList(path string) (seeds []Item, err error) {

// Initialize scanner
scanner := bufio.NewScanner(file)

logrus.WithFields(logrus.Fields{
"path": path,
}).Info("Start reading input list")
Expand All @@ -50,14 +49,14 @@ func IsSeedList(path string) (seeds []Item, err error) {
continue
}

err = utils.ValidateURL(URL)
if err != nil {
logrus.WithFields(logrus.Fields{
"url": scanner.Text(),
"err": err.Error(),
}).Debug("This is not a valid URL")
continue
}
// err = utils.ValidateURL(URL)
// if err != nil {
// logrus.WithFields(logrus.Fields{
// "url": scanner.Text(),
// "err": err.Error(),
// }).Debug("this is not a valid URL")
// continue
// }

item := NewItem(URL, nil, "seed", 0, "")
seeds = append(seeds, *item)
Expand All @@ -72,7 +71,7 @@ func IsSeedList(path string) (seeds []Item, err error) {
}

if len(seeds) == 0 {
return seeds, errors.New("Seed list's content invalid")
return seeds, errors.New("seed list's content invalid")
}

return seeds, nil
Expand Down
5 changes: 1 addition & 4 deletions internal/pkg/utils/atom_bool.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,5 @@ func (b *TAtomBool) Set(value bool) {

// Get returns the value of an atomic boolean
func (b *TAtomBool) Get() bool {
if atomic.LoadInt32(&(b.flag)) != 0 {
return true
}
return false
return atomic.LoadInt32(&(b.flag)) != 0
}
8 changes: 4 additions & 4 deletions internal/pkg/utils/url.go
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,8 @@ func DedupeURLs(URLs []*url.URL) []*url.URL {
list := []*url.URL{}

for _, entry := range URLs {
if _, value := keys[URLToString(entry)]; !value {
keys[URLToString(entry)] = true
if _, value := keys[entry.String()]; !value {
keys[entry.String()] = true

if entry.Scheme == "http" || entry.Scheme == "https" {
list = append(list, entry)
Expand All @@ -47,14 +47,14 @@ func DedupeURLs(URLs []*url.URL) []*url.URL {

// ValidateURL validates a *url.URL
func ValidateURL(u *url.URL) error {
valid := govalidator.IsURL(URLToString(u))
valid := govalidator.IsURL(u.String())

if u.Scheme != "http" && u.Scheme != "https" {
valid = false
}

if !valid {
return errors.New("Not a valid URL")
return errors.New("not a valid URL")
}

return nil
Expand Down
Loading

0 comments on commit 8c6768a

Please sign in to comment.