Skip to content

Commit 3f291bb

Browse files
committed
Implement a new option to avoid the Bash argument count limit
Also, include a 'go.mod' file for proper, modern local development
1 parent 1e48f2a commit 3f291bb

File tree

2 files changed

+78
-23
lines changed

2 files changed

+78
-23
lines changed

cmd/giashard/main.go

Lines changed: 65 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,18 @@
11
package main
22

33
import (
4+
"bufio"
45
"errors"
5-
"strings"
66
"flag"
77
"fmt"
8+
"github.com/paracrawl/giashard"
89
"log"
910
"os"
10-
"github.com/paracrawl/giashard"
11+
"strings"
1112
)
1213

1314
var outdir string
15+
var dirslist string
1416
var shards uint
1517
var batchsize int64
1618
var fileslist string
@@ -20,22 +22,55 @@ var schema = []string{"url", "mime", "plain_text"}
2022

2123
func init() {
2224
flag.StringVar(&outdir, "o", ".", "Output location")
25+
flag.StringVar(&dirslist, "l", "", "Input file listing all input directories")
2326
flag.StringVar(&fileslist, "f", "plain_text,url,mime", "Files to shard, separated by commas")
2427
flag.UintVar(&shards, "n", 8, "Number of shards (2^n)")
2528
flag.Int64Var(&batchsize, "b", 100, "Batch size in MB")
2629
flag.StringVar(&domainList, "d", "", "Additional public suffix entries")
2730
flag.Usage = func() {
28-
fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] input directories\n", os.Args[0])
31+
_, err := fmt.Fprintf(flag.CommandLine.Output(), "Usage: %s [flags] input directories\n", os.Args[0])
32+
if err != nil {
33+
return
34+
}
2935
flag.PrintDefaults()
30-
fmt.Fprintf(flag.CommandLine.Output(),
31-
`Shards together the directories give on input. They are assumed to be in the
36+
_, err = fmt.Fprintf(flag.CommandLine.Output(),
37+
`Shards together the directories given on input. They are assumed to be in the
3238
standard Paracrawl column storage format. The output is a tree of directories
3339
of the form: outdir/shard/batch where shard is computed as a hash of the
3440
significant part of the hostname in a url and batch is approximately fixed size.
3541
`)
42+
if err != nil {
43+
return
44+
}
3645
}
3746
}
3847

48+
func processfile(source string, schema []string, w *giashard.Shard, hostname string) {
49+
log.Printf("Processing input: %v", source)
50+
r, err := giashard.NewColumnReader(source, schema...)
51+
if err != nil {
52+
log.Printf("Error opening input reader: %v", err)
53+
return
54+
}
55+
56+
// provenance data - where is this from
57+
provdata := []byte(fmt.Sprintf("%s:%s", hostname, source))
58+
for row := range r.Rows() {
59+
row["source"] = provdata
60+
if err := w.WriteRow(row); err != nil {
61+
if errors.Is(err, giashard.ShardError) { // not fatal
62+
log.Print(err)
63+
continue
64+
}
65+
log.Fatalf("Error writing row: %v", err)
66+
}
67+
}
68+
69+
err = r.Close()
70+
if err != nil {
71+
return
72+
}
73+
}
3974
func main() {
4075
log.SetFlags(log.Ldate | log.Ltime | log.Lshortfile)
4176
flag.Parse()
@@ -50,40 +85,47 @@ func main() {
5085
}
5186
}
5287

53-
w, err := giashard.NewShard(outdir, shards, batchsize * 1024 * 1024, "url", append(schema, "source")...)
88+
w, err := giashard.NewShard(outdir, shards, batchsize*1024*1024, "url", append(schema, "source")...)
5489
if err != nil {
5590
log.Fatalf("Error opening output shards: %v", err)
5691
}
57-
defer w.Close()
92+
defer func(w *giashard.Shard) {
93+
var err = w.Close()
94+
if err != nil {
95+
96+
}
97+
}(w)
5898

5999
hostname, err := os.Hostname()
60100
if err != nil {
61101
log.Fatalf("Error getting local hostname: %v", err)
62102
}
63103

64-
for i:=0; i<flag.NArg(); i++ {
104+
for i := 0; i < flag.NArg(); i++ {
65105
source := flag.Arg(i)
106+
processfile(source, schema, w, hostname)
107+
}
66108

67-
log.Printf("Processing input: %v", source)
68-
r, err := giashard.NewColumnReader(source, schema...)
109+
if dirslist != "" {
110+
file, err := os.Open(dirslist)
69111
if err != nil {
70-
log.Printf("Error opening input reader: %v", err)
71-
continue
112+
log.Fatal(err)
72113
}
114+
defer func(file *os.File) {
115+
var err = file.Close()
116+
if err != nil {
73117

74-
// provenance data - where is this from
75-
provdata := []byte(fmt.Sprintf("%s:%s", hostname, source))
76-
for row := range r.Rows() {
77-
row["source"] = provdata
78-
if err := w.WriteRow(row); err != nil {
79-
if errors.Is(err, giashard.ShardError) { // not fatal
80-
log.Print(err)
81-
continue
82-
}
83-
log.Fatalf("Error writing row: %v", err)
84118
}
119+
}(file)
120+
121+
scanner := bufio.NewScanner(file)
122+
for scanner.Scan() {
123+
source := scanner.Text()
124+
processfile(source, schema, w, hostname)
85125
}
86126

87-
r.Close()
127+
if err := scanner.Err(); err != nil {
128+
log.Fatal(err)
129+
}
88130
}
89131
}

go.mod

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
module github.com/paracrawl/giashard
2+
3+
go 1.18
4+
5+
require (
6+
github.com/weppos/publicsuffix-go v0.15.0
7+
gopkg.in/yaml.v2 v2.4.0
8+
)
9+
10+
require (
11+
golang.org/x/net v0.0.0-20200202094626-16171245cfb2 // indirect
12+
golang.org/x/text v0.3.0 // indirect
13+
)

0 commit comments

Comments
 (0)