-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
127 lines (110 loc) · 3.04 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
package main
import (
"bytes"
"context"
"flag"
"log"
"os"
"strings"
"github.com/apache/arrow/go/v14/arrow"
"github.com/apache/arrow/go/v14/arrow/array"
"github.com/apache/arrow/go/v14/arrow/memory"
"github.com/apache/arrow/go/v14/parquet/file"
"github.com/apache/arrow/go/v14/parquet/pqarrow"
)
// main generates a small sample Parquet file, writes it to the path given by
// -input, re-encodes it with copyData, and writes the copy to the path given
// by -output.
//
// Both flags are required. They are checked before any work is done so that a
// missing flag is reported as a usage error rather than as a failed write to
// an empty path.
func main() {
	var inputName string
	var outputName string
	flag.StringVar(&inputName, "input", "", "input parquet file")
	flag.StringVar(&outputName, "output", "", "output parquet file")
	flag.Parse()

	if inputName == "" || outputName == "" {
		flag.Usage()
		log.Fatal("both -input and -output are required")
	}

	inputData, err := makeData()
	if err != nil {
		log.Fatalf("generating sample data: %v", err)
	}
	if err := os.WriteFile(inputName, inputData, 0644); err != nil {
		log.Fatalf("writing input file: %v", err)
	}

	outputData, err := copyData(inputData)
	if err != nil {
		log.Fatalf("copying data: %v", err)
	}
	if err := os.WriteFile(outputName, outputData, 0644); err != nil {
		log.Fatalf("writing output file: %v", err)
	}
}
func copyData(data []byte) ([]byte, error) {
input := bytes.NewReader(data)
fileReader, err := file.NewParquetReader(input)
if err != nil {
return nil, err
}
defer fileReader.Close()
arrowReader, err := pqarrow.NewFileReader(fileReader, pqarrow.ArrowReadProperties{}, memory.DefaultAllocator)
if err != nil {
return nil, err
}
output := &bytes.Buffer{}
fileWriter := file.NewParquetWriter(output, fileReader.MetaData().Schema.Root())
ctx := context.Background()
numFields := len(arrowReader.Manifest.Fields)
numRowGroups := fileReader.NumRowGroups()
for rowGroupIndex := 0; rowGroupIndex < numRowGroups; rowGroupIndex += 1 {
rowGroupReader := arrowReader.RowGroup(rowGroupIndex)
rowGroupWriter := fileWriter.AppendRowGroup()
for fieldNum := 0; fieldNum < numFields; fieldNum += 1 {
arr, err := rowGroupReader.Column(fieldNum).Read(ctx)
if err != nil {
return nil, err
}
colWriter, err := pqarrow.NewArrowColumnWriter(arr, 0, int64(arr.Len()), arrowReader.Manifest, rowGroupWriter, fieldNum)
if err != nil {
return nil, err
}
if err := colWriter.Write(ctx); err != nil {
return nil, err
}
}
}
if err := fileWriter.Close(); err != nil {
return nil, err
}
return output.Bytes(), nil
}
// makeData builds a one-row Arrow record with two struct columns and returns
// it encoded as an in-memory Parquet file.
//
// The schema has a nullable "source" struct (string "dataset", float64
// "confidence") and a non-nullable "bbox" struct (float64 "minx"). The row is
// parsed from an embedded JSON document and written with the default pqarrow
// writer properties.
func makeData() ([]byte, error) {
	schema := arrow.NewSchema([]arrow.Field{
		{Name: "source", Nullable: true, Type: arrow.StructOf(
			arrow.Field{Name: "dataset", Nullable: true, Type: arrow.BinaryTypes.String},
			arrow.Field{Name: "confidence", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
		)},
		{Name: "bbox", Nullable: false, Type: arrow.StructOf(
			arrow.Field{Name: "minx", Nullable: true, Type: arrow.PrimitiveTypes.Float64},
		)},
	}, nil)

	// Object members must be comma-separated for the document to be valid
	// JSON; without the commas the record cannot be parsed.
	record, _, err := array.RecordFromJSON(memory.DefaultAllocator, schema, strings.NewReader(`[
		{
			"source": {
				"dataset": "test",
				"confidence": 100
			},
			"bbox": {
				"minx": -180
			}
		}
	]`))
	if err != nil {
		return nil, err
	}
	defer record.Release()

	output := &bytes.Buffer{}
	writer, err := pqarrow.NewFileWriter(schema, output, nil, pqarrow.DefaultWriterProps())
	if err != nil {
		return nil, err
	}

	if err := writer.Write(record); err != nil {
		return nil, err
	}
	if err := writer.Close(); err != nil {
		return nil, err
	}
	return output.Bytes(), nil
}