Skip to content

Commit

Permalink
Merge pull request #5 from paracrawl/extrafiles
Browse files: browse the repository at this point in the history
specify which files to shard with -f option
  • Loading branch information
lpla authored Jul 15, 2020
2 parents 0e41be7 + e097175 commit c00c93c
Showing 1 changed file with 4 additions and 0 deletions.
4 changes: 4 additions & 0 deletions cmd/giashard/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package main

import (
"errors"
"strings"
"flag"
"fmt"
"log"
Expand All @@ -12,11 +13,13 @@ import (
// Command-line configuration, populated by the flags registered in init.
var (
	// outdir is the output location (-o).
	outdir string
	// shards is the shard-count exponent (-n); the flag help describes it as 2^n.
	shards uint
	// batchsize is the batch size in MB (-b); main scales it by 1024*1024.
	batchsize int64
	// fileslist is the comma-separated list of files to shard (-f).
	fileslist string

	// schema names the columns to shard. main replaces this initial value
	// with the split contents of fileslist once the flags have been parsed.
	schema = []string{"url", "mime", "plain_text"}
)

func init() {
flag.StringVar(&outdir, "o", ".", "Output location")
flag.StringVar(&fileslist, "f", "plain_text,url,mime", "Files to shard, separated by commas")
flag.UintVar(&shards, "n", 8, "Number of shards (2^n)")
flag.Int64Var(&batchsize, "b", 100, "Batch size in MB")
flag.Usage = func() {
Expand All @@ -34,6 +37,7 @@ significant part of the hostname in a url and batch is approximately fixed size.
func main() {
log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
flag.Parse()
schema = strings.Split(fileslist, ",")

w, err := giashard.NewShard(outdir, shards, batchsize * 1024 * 1024, "url", append(schema, "source")...)
if err != nil {
Expand Down

0 comments on commit c00c93c

Please sign in to comment.