Skip to content

Commit

Permalink
Add: Telegram support (#56)
Browse files Browse the repository at this point in the history
Signed-off-by: Corentin Barreau <corentin@archive.org>
  • Loading branch information
CorentinB authored Oct 9, 2023
1 parent 0569f25 commit 6a3708b
Show file tree
Hide file tree
Showing 3 changed files with 54 additions and 0 deletions.
15 changes: 15 additions & 0 deletions internal/pkg/crawl/assets.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,21 @@ func (c *Crawl) extractAssets(base *url.URL, item *frontier.Item, doc *goquery.D
}
}

// Scan the style attribute of every element in the document and collect
// candidate asset references (intended for background-image).
// NOTE(review): the pattern below captures the contents of ANY parenthesized
// group in the style value (optionally wrapped in quotes), not only url(...)
// of background-image, so values such as rgb(...) arguments would also be
// appended to rawAssets — confirm this is filtered later.
doc.Find("*").Each(func(index int, item *goquery.Selection) {
style, exists := item.Attr("style")
if exists {
// NOTE(review): the regexp is recompiled for every element that has a
// style attribute; it is loop-invariant and could be compiled once.
re := regexp.MustCompile(`(?:\(['"]?)(.*?)(?:['"]?\))`)
matches := re.FindAllStringSubmatch(style, -1)

// match is the index into matches; element [1] of each entry is the
// single capture group (the text between the parentheses).
for match := range matches {
if len(matches[match]) > 0 {
rawAssets = append(rawAssets, matches[match][1])
}
}
}
})

// Extract assets on the page (images, scripts, videos..)
if !utils.StringInSlice("img", c.DisabledHTMLTags) {
doc.Find("img").Each(func(index int, item *goquery.Selection) {
Expand Down
12 changes: 12 additions & 0 deletions internal/pkg/crawl/capture.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"time"

"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/cloudflarestream"
"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/telegram"
"github.com/CorentinB/Zeno/internal/pkg/crawl/sitespecific/tiktok"
"github.com/CorentinB/Zeno/internal/pkg/utils"
"github.com/PuerkitoBio/goquery"
Expand Down Expand Up @@ -233,6 +234,17 @@ func (c *Crawl) Capture(item *frontier.Item) {
req = tiktok.AddHeaders(req)
}

// For a Telegram URL that is not already an embed URL, also capture its
// embed variant before executing the request for the original item.
if telegram.IsTelegramURL(utils.URLToString(item.URL)) && !telegram.IsTelegramEmbedURL(utils.URLToString(item.URL)) {
// If the URL is a Telegram URL, we make an embed URL out of it
embedURL := telegram.CreateEmbedURL(item.URL)

// Then we create an item
// (the embed item reuses the original item's type, hop and ID, with the
// original item passed as second argument)
embedItem := frontier.NewItem(embedURL, item, item.Type, item.Hop, item.ID)

// And capture it
// NOTE(review): this is a synchronous recursive call; it terminates only
// if IsTelegramEmbedURL recognizes the URL built by CreateEmbedURL —
// verify for URLs that already carry a query string.
c.Capture(embedItem)
}

// Execute request
resp, err = c.executeGET(item, req, false)
if err != nil && err.Error() == "URL from redirection has already been seen" {
Expand Down
27 changes: 27 additions & 0 deletions internal/pkg/crawl/sitespecific/telegram/telegram.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
package telegram

import (
"net/url"
"strings"
)

// IsTelegramEmbedURL reports whether url is a Telegram (t.me) URL whose query
// string carries the embed=1 parameter.
//
// The parameter is looked up anywhere in the query string, not only in first
// position: CreateEmbedURL appends "&embed=1&mode=tme" when the source URL
// already has a query (e.g. "?single"), and such URLs must still be recognized
// as embed URLs, otherwise callers that derive and re-capture the embed URL
// would never stop.
func IsTelegramEmbedURL(url string) bool {
	if !strings.Contains(url, "/t.me/") {
		return false
	}

	// Drop the fragment so that a '?' inside it is not mistaken for a query.
	if frag := strings.IndexByte(url, '#'); frag >= 0 {
		url = url[:frag]
	}

	start := strings.IndexByte(url, '?')
	if start < 0 {
		return false
	}

	// Compare whole parameters so that e.g. "embed=10" or "noembed=1" do not
	// match.
	for _, param := range strings.Split(url[start+1:], "&") {
		if param == "embed=1" {
			return true
		}
	}

	return false
}

// IsTelegramURL reports whether url contains the "/t.me/" marker, i.e. looks
// like a link to Telegram's t.me domain followed by a path.
func IsTelegramURL(url string) bool {
	const tmeMarker = "/t.me/"

	found := strings.Contains(url, tmeMarker)

	return found
}

// CreateEmbedURL returns a copy of URL with the "embed=1&mode=tme" parameters
// added to its raw query. Existing query content is kept verbatim and the new
// parameters are appended after it; URL itself is left unmodified.
func CreateEmbedURL(URL *url.URL) *url.URL {
	const embedParams = "embed=1&mode=tme"

	// Work on a value copy so the caller's URL keeps its original query.
	clone := new(url.URL)
	*clone = *URL

	switch clone.RawQuery {
	case "":
		clone.RawQuery = embedParams
	default:
		clone.RawQuery = clone.RawQuery + "&" + embedParams
	}

	return clone
}

0 comments on commit 6a3708b

Please sign in to comment.