-
Notifications
You must be signed in to change notification settings - Fork 360
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
use a worker pool to ingest entries faster #1964
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
@@ -1,7 +1,9 @@ | ||||||
package cmd | ||||||
|
||||||
import ( | ||||||
"context" | ||||||
"strings" | ||||||
"sync" | ||||||
|
||||||
"github.com/spf13/cobra" | ||||||
"github.com/treeverse/lakefs/cmd/lakectl/cmd/store" | ||||||
|
@@ -12,6 +14,32 @@ const ingestSummaryTemplate = ` | |||||
Staged {{ .Objects | yellow }} external objects (total of {{ .Bytes | human_bytes | yellow }}) | ||||||
` | ||||||
|
||||||
type stageRequest struct { | ||||||
ctx context.Context | ||||||
repository string | ||||||
branch string | ||||||
params *api.StageObjectParams | ||||||
body api.StageObjectJSONRequestBody | ||||||
} | ||||||
|
||||||
type stageResponse struct { | ||||||
resp *api.StageObjectResponse | ||||||
error error | ||||||
} | ||||||
|
||||||
func stageWorker(wg *sync.WaitGroup, requests <-chan *stageRequest, responses chan<- *stageResponse) { | ||||||
defer wg.Done() | ||||||
client := getClient() | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we share clients? I think HTTP connection pooling/reuse anyway happens at a level below, so it might manage to reduce connection re-use which could increase performance (or decrease performance...). There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. From a different angle - getClient should be called once, it can fail and we do not want to dump multiple errors. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||
for req := range requests { | ||||||
resp, err := client.StageObjectWithResponse( | ||||||
req.ctx, req.repository, req.branch, req.params, req.body) | ||||||
responses <- &stageResponse{ | ||||||
resp: resp, | ||||||
error: err, | ||||||
} | ||||||
} | ||||||
} | ||||||
|
||||||
var ingestCmd = &cobra.Command{ | ||||||
Use: "ingest --from <object store URI> --to <lakeFS path URI> [--dry-run]", | ||||||
Short: "Ingest objects from an external source into a lakeFS branch (without actually copying them)", | ||||||
|
@@ -21,51 +49,77 @@ var ingestCmd = &cobra.Command{ | |||||
dryRun := MustBool(cmd.Flags().GetBool("dry-run")) | ||||||
from := MustString(cmd.Flags().GetString("from")) | ||||||
to := MustString(cmd.Flags().GetString("to")) | ||||||
concurrency := MustInt(cmd.Flags().GetInt("concurrency")) | ||||||
lakefsURI := MustParsePathURI("to", to) | ||||||
|
||||||
// initialize worker pool | ||||||
var wg sync.WaitGroup | ||||||
wg.Add(concurrency) | ||||||
requests := make(chan *stageRequest) | ||||||
responses := make(chan *stageResponse) | ||||||
for w := 0; w < concurrency; w++ { | ||||||
go stageWorker(&wg, requests, responses) | ||||||
} | ||||||
|
||||||
summary := struct { | ||||||
Objects int64 | ||||||
Bytes int64 | ||||||
}{} | ||||||
client := getClient() | ||||||
err := store.Walk(ctx, from, func(e store.ObjectStoreEntry) error { | ||||||
if dryRun { | ||||||
Fmt("%s\n", e) | ||||||
|
||||||
// iterate entries and feed our pool | ||||||
go func() { | ||||||
err := store.Walk(ctx, from, func(e store.ObjectStoreEntry) error { | ||||||
if dryRun { | ||||||
Fmt("%s\n", e) | ||||||
return nil | ||||||
} | ||||||
key := e.RelativeKey | ||||||
if lakefsURI.Path != nil && *lakefsURI.Path != "" { | ||||||
path := *lakefsURI.Path | ||||||
if strings.HasSuffix(*lakefsURI.Path, "/") { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This can be done once outside the loop - just set There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||
key = path + key | ||||||
} else { | ||||||
key = path + "/" + key | ||||||
} | ||||||
} | ||||||
mtime := e.Mtime.Unix() | ||||||
requests <- &stageRequest{ | ||||||
ctx: ctx, | ||||||
repository: lakefsURI.Repository, | ||||||
branch: lakefsURI.Ref, | ||||||
params: &api.StageObjectParams{ | ||||||
Path: key, | ||||||
}, | ||||||
body: api.StageObjectJSONRequestBody{ | ||||||
Checksum: e.ETag, | ||||||
Mtime: &mtime, | ||||||
PhysicalAddress: e.Address, | ||||||
SizeBytes: e.Size, | ||||||
}, | ||||||
} | ||||||
return nil | ||||||
}) | ||||||
if err != nil { | ||||||
DieFmt("error walking object store: %v", err) | ||||||
} | ||||||
key := e.RelativeKey | ||||||
if lakefsURI.Path != nil && *lakefsURI.Path != "" { | ||||||
path := *lakefsURI.Path | ||||||
if strings.HasSuffix(*lakefsURI.Path, "/") { | ||||||
key = path + key | ||||||
} else { | ||||||
key = path + "/" + key | ||||||
} | ||||||
close(requests) // we're done feeding work! | ||||||
wg.Wait() // until all responses have been written | ||||||
close(responses) // so we're also done with responses | ||||||
}() | ||||||
|
||||||
for response := range responses { | ||||||
DieOnResponseError(response.resp, response.error) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we just die - so no point in passing errors from the worker - can reduce all the specific response channel and just process responses There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||
summary.Objects += 1 | ||||||
summary.Bytes += api.Int64Value(response.resp.JSON201.SizeBytes) | ||||||
|
||||||
// update every 10k records | ||||||
if summary.Objects%10000 == 0 { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider using time since instead of round numbers or optional progress bar There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done |
||||||
Write("Staged {{ .Objects | green }} objects so far...\n", summary) | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
might give a cute single-line "animation". (But you'd have to write that There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice idea. Done. |
||||||
} | ||||||
mtime := e.Mtime.Unix() | ||||||
resp, err := client.StageObjectWithResponse(ctx, | ||||||
lakefsURI.Repository, | ||||||
lakefsURI.Ref, | ||||||
&api.StageObjectParams{ | ||||||
Path: key, | ||||||
}, | ||||||
api.StageObjectJSONRequestBody{ | ||||||
Checksum: e.ETag, | ||||||
Mtime: &mtime, | ||||||
PhysicalAddress: e.Address, | ||||||
SizeBytes: e.Size, | ||||||
}, | ||||||
) | ||||||
DieOnResponseError(resp, err) | ||||||
|
||||||
if verbose { | ||||||
Write("Staged "+fsStatTemplate+"\n", resp.JSON201) | ||||||
Write("Staged "+fsStatTemplate+"\n", response.resp.JSON201) | ||||||
} | ||||||
summary.Objects += 1 | ||||||
summary.Bytes += api.Int64Value(resp.JSON201.SizeBytes) | ||||||
return nil | ||||||
}) | ||||||
if err != nil { | ||||||
DieFmt("error walking object store: %v", err) | ||||||
} | ||||||
|
||||||
// print summary | ||||||
|
@@ -81,5 +135,6 @@ func init() { | |||||
_ = ingestCmd.MarkFlagRequired("to") | ||||||
ingestCmd.Flags().Bool("dry-run", false, "only print the paths to be ingested") | ||||||
ingestCmd.Flags().BoolP("verbose", "v", false, "print stats for each individual object staged") | ||||||
ingestCmd.Flags().IntP("concurrency", "C", 64, "max concurrent API calls to make to the lakeFS server") | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why capital 'C'?
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. because we use |
||||||
rootCmd.AddCommand(ingestCmd) | ||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
ctx can be passed to the workers once on creation.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done