Skip to content

Commit

Permalink
Merge pull request #7 from paracrawl/additional-suffix-domains
Browse files Browse the repository at this point in the history
Add -d option to load in additional public domain suffixes
  • Loading branch information
jelmervdl authored May 12, 2021
2 parents e87d049 + 88d8662 commit 1e48f2a
Show file tree
Hide file tree
Showing 3 changed files with 26 additions and 0 deletions.
11 changes: 11 additions & 0 deletions cmd/giashard/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ var outdir string
// Package-level settings for the giashard command. The flag-backed ones are
// registered in init; the quoted text is each flag's own description.
var (
	// shards backs the -n flag: "Number of shards (2^n)".
	shards uint

	// batchsize backs the -b flag: "Batch size in MB".
	batchsize int64

	// fileslist backs the -f flag: "Files to shard, separated by commas".
	fileslist string

	// domainList backs the -d flag: "Additional public suffix entries".
	// main hands it to giashard.AddRulesToDefaultList when it is non-empty.
	domainList string

	// schema holds the column names to shard. main overwrites this initial
	// value with fileslist split on commas.
	schema = []string{"url", "mime", "plain_text"}
)

Expand All @@ -22,6 +23,7 @@ func init() {
flag.StringVar(&fileslist, "f", "plain_text,url,mime", "Files to shard, separated by commas")
flag.UintVar(&shards, "n", 8, "Number of shards (2^n)")
flag.Int64Var(&batchsize, "b", 100, "Batch size in MB")
flag.StringVar(&domainList, "d", "", "Additional public suffix entries")
flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] input directories\n", os.Args[0])
flag.PrintDefaults()
Expand All @@ -39,6 +41,15 @@ func main() {
flag.Parse()
schema = strings.Split(fileslist, ",")

if domainList != "" {
count, err := giashard.AddRulesToDefaultList(domainList)
if err != nil {
log.Fatalf("Error loading domain list: %v", err)
} else {
log.Printf("Loaded %d additional public suffix domains.", count)
}
}

w, err := giashard.NewShard(outdir, shards, batchsize * 1024 * 1024, "url", append(schema, "source")...)
if err != nil {
log.Fatalf("Error opening output shards: %v", err)
Expand Down
10 changes: 10 additions & 0 deletions cmd/giashardid/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,12 @@ import (

// Flag-backed settings for the giashardid command, registered in init. The
// quoted text is each flag's own description.
var (
	// shards backs the -n flag: "Number of shards (2^n)".
	shards uint

	// slugs backs the -s flag: "Print slugs instead of shards".
	slugs bool

	// domainList backs the -d flag: "Additional public suffix entries".
	// main hands it to giashard.AddRulesToDefaultList when it is non-empty.
	domainList string
)

func init() {
flag.UintVar(&shards, "n", 8, "Number of shards (2^n)")
flag.BoolVar(&slugs, "s", false, "Print slugs instead of shards")
flag.StringVar(&domainList, "d", "", "Additional public suffix entries")
flag.Usage = func() {
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] [url]\n", os.Args[0])
flag.PrintDefaults()
Expand Down Expand Up @@ -47,6 +49,14 @@ func main() {
log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
flag.Parse()

if domainList != "" {
count, err := giashard.AddRulesToDefaultList(domainList)
if err != nil {
log.Fatalf("Error loading domain list: %v", err)
} else {
log.Printf("Loaded %d additional public suffix domains.", count)
}
}

for url := range urls() {
if slugs {
Expand Down
5 changes: 5 additions & 0 deletions shard.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,11 @@ func (s *Shard)Close() (err error) {
return
}

// AddRulesToDefaultList loads the public suffix rules found in the file at
// path domainList into publicsuffix.DefaultList. No parser options are
// supplied (nil is passed), so the library's defaults apply.
//
// It returns the number of rules reported by LoadFile. If LoadFile fails, the
// count is 0 and the error is returned unchanged.
//
// NOTE(review): rules parsed before a failure may already have been added to
// the default list; confirm against the publicsuffix documentation.
func AddRulesToDefaultList(domainList string) (added int, err error) {
	rules, err := publicsuffix.DefaultList.LoadFile(domainList, nil)
	if err != nil {
		// Never report a rule count alongside an error.
		return 0, err
	}
	return len(rules), nil
}

func Slug(key string) (slug string, err error) {
// parse the url to get the domain name
url, e := url.Parse(key)
Expand Down

0 comments on commit 1e48f2a

Please sign in to comment.