Skip to content

Commit

Permalink
add: --warc-size
Browse files (browse the repository at this point in the history)
  • Loading branch information
CorentinB committed Oct 10, 2024
1 parent ae9aa49 commit 30ee22c
Show file tree
Hide file tree
Showing 4 changed files with 5 additions and 0 deletions.
1 change: 1 addition & 0 deletions cmd/get.go
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ func getCMDsFlags(getCmd *cobra.Command) {
getCmd.PersistentFlags().Bool("disable-assets-capture", false, "Disable assets capture.")
getCmd.PersistentFlags().Int("warc-dedupe-size", 1024, "Minimum size to deduplicate WARC records with revisit records.")
getCmd.PersistentFlags().String("cdx-cookie", "", "Pass custom cookie during CDX requests. Example: 'cdx_auth_token=test_value'")
getCmd.PersistentFlags().Int("warc-size", 1024, "Size of the WARC files in MB.")

// Logging flags
getCmd.PersistentFlags().Bool("live-stats", false, "Enable live stats but disable logging. (implies --no-stdout-log)")
Expand Down
1 change: 1 addition & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ type Config struct {
WARCOperator string `mapstructure:"warc-operator"`
CDXDedupeServer string `mapstructure:"warc-cdx-dedupe-server"`
WARCTempDir string `mapstructure:"warc-temp-dir"`
WARCSize int `mapstructure:"warc-size"`
CDXCookie string `mapstructure:"cdx-cookie"`
HQAddress string `mapstructure:"hq-address"`
HQKey string `mapstructure:"hq-key"`
Expand Down
2 changes: 2 additions & 0 deletions internal/pkg/crawl/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ type Crawl struct {
WARCFullOnDisk bool
WARCPoolSize int
WARCDedupeSize int
WARCSize int
DisableLocalDedupe bool
CertValidation bool
WARCCustomCookie string
Expand Down Expand Up @@ -253,6 +254,7 @@ func GenerateCrawlConfig(config *config.Config) (*Crawl, error) {
c.WARCPoolSize = config.WARCPoolSize
c.WARCDedupeSize = config.WARCDedupeSize
c.WARCCustomCookie = config.CDXCookie
c.WARCSize = config.WARCSize

c.API = config.API
c.APIPort = config.APIPort
Expand Down
1 change: 1 addition & 0 deletions internal/pkg/crawl/warc.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ func (c *Crawl) initWARCRotatorSettings() *warc.RotatorSettings {
rotatorSettings.Prefix = c.WARCPrefix
rotatorSettings.WarcinfoContent.Set("software", fmt.Sprintf("Zeno %s", utils.GetVersion().Version))
rotatorSettings.WARCWriterPoolSize = c.WARCPoolSize
rotatorSettings.WarcSize = float64(c.WARCSize)

if len(c.WARCOperator) > 0 {
rotatorSettings.WarcinfoContent.Set("operator", c.WARCOperator)
Expand Down

0 comments on commit 30ee22c

Please sign in to comment.