Skip to content

Commit

Permalink
Merge #47093
Browse files Browse the repository at this point in the history
47093: ycsb: only use column families when beneficial r=nvanbenschoten a=nvanbenschoten

This commit updates the `workload/ycsb` load generator to default its
`--families` flag (if unspecified) to the value that is most beneficial
for performance of that workload.

The determination of whether a workload benefits from column families or
not is based on the following experiments:

#### ycsb/[A-F]/nodes=3 (3x n1-standard-8 VMs):

| workload | --families=false | --families=true | better with families? |
|----------|-----------------:|----------------:|-----------------------|
| A        |         11,743.5 |        17,760.5 | true                  |
| B        |         35,232.3 |        32,982.2 | false                 |
| C        |         45,454.7 |        44,112.5 | false                 |
| D        |         36,091.0 |        35,615.1 | false                 |
| E        |          5,774.9 |         2,604.8 | false                 |
| F        |          4,933.1 |         8,259.7 | true                  |

#### ycsb/[A-F]/nodes=3/cpu=32 (3x n1-standard-32 VMs):

| workload | --families=false | --families=true | better with families? |
|----------|-----------------:|----------------:|-----------------------|
| A        |         14,144.1 |        27,179.4 | true                  |
| B        |         96,669.6 |       104,567.5 | true                  |
| C        |        137,463.3 |       131,953.7 | false                 |
| D        |        103,188.6 |        95,285.7 | false                 |
| E        |         10,417.5 |         7,913.6 | false                 |
| F        |          5,782.3 |        15,532.1 | true                  |

This addresses the recent regression we saw on `ycsb/E/nodes=3` due to
9124c4f.

There's an argument to be made that this determination should go in the
roachtest instead to avoid complicating the `--families` flag. I've
decided not to do that, but would be happy to discuss if others feel
like this is problematic.

Co-authored-by: Nathan VanBenschoten <nvanbenschoten@gmail.com>
  • Loading branch information
craig[bot] and nvanbenschoten committed Apr 7, 2020
2 parents 6df26fd + 7bc1af1 commit c257a8e
Showing 1 changed file with 85 additions and 6 deletions.
91 changes: 85 additions & 6 deletions pkg/workload/ycsb/ycsb.go
Original file line number Diff line number Diff line change
Expand Up @@ -149,34 +149,41 @@ func (g *ycsb) Flags() workload.Flags { return g.flags }
func (g *ycsb) Hooks() workload.Hooks {
return workload.Hooks{
Validate: func() error {
g.workload = strings.ToUpper(g.workload)
switch g.workload {
case "A", "a":
case "A":
g.readFreq = 0.5
g.updateFreq = 0.5
g.requestDistribution = "zipfian"
case "B", "b":
case "B":
g.readFreq = 0.95
g.updateFreq = 0.05
g.requestDistribution = "zipfian"
case "C", "c":
case "C":
g.readFreq = 1.0
g.requestDistribution = "zipfian"
case "D", "d":
case "D":
g.readFreq = 0.95
g.insertFreq = 0.05
g.requestDistribution = "latest"
case "E", "e":
case "E":
g.scanFreq = 0.95
g.insertFreq = 0.05
g.requestDistribution = "zipfian"
case "F", "f":
case "F":
g.readFreq = 0.5
g.readModifyWriteFreq = 0.5
g.requestDistribution = "zipfian"
default:
return errors.Errorf("Unknown workload: %q", g.workload)
}

if !g.flags.Lookup(`families`).Changed {
// If `--families` was not specified, default its value to the
// configuration that we expect to lead to better performance.
g.families = preferColumnFamilies(g.workload)
}

if g.recordCount == 0 {
g.recordCount = g.insertStart + g.insertCount
}
Expand All @@ -188,6 +195,78 @@ func (g *ycsb) Hooks() workload.Hooks {
}
}

// preferColumnFamilies reports whether enabling column families is expected to
// yield better throughput for the named YCSB workload. The workload must be one
// of the upper-case letters "A" through "F"; any other value panics.
func preferColumnFamilies(workload string) bool {
	// The choices below are backed by ycsb roachtest runs taken on 80da27b
	// (04/04/2020).
	//
	// ycsb/[A-F]/nodes=3 (3x n1-standard-8 VMs):
	//
	// | workload | --families=false | --families=true | better with families? |
	// |----------|-----------------:|----------------:|-----------------------|
	// | A        |         11,743.5 |        17,760.5 | true                  |
	// | B        |         35,232.3 |        32,982.2 | false                 |
	// | C        |         45,454.7 |        44,112.5 | false                 |
	// | D        |         36,091.0 |        35,615.1 | false                 |
	// | E        |          5,774.9 |         2,604.8 | false                 |
	// | F        |          4,933.1 |         8,259.7 | true                  |
	//
	// ycsb/[A-F]/nodes=3/cpu=32 (3x n1-standard-32 VMs):
	//
	// | workload | --families=false | --families=true | better with families? |
	// |----------|-----------------:|----------------:|-----------------------|
	// | A        |         14,144.1 |        27,179.4 | true                  |
	// | B        |         96,669.6 |       104,567.5 | true                  |
	// | C        |        137,463.3 |       131,953.7 | false                 |
	// | D        |        103,188.6 |        95,285.7 | false                 |
	// | E        |         10,417.5 |         7,913.6 | false                 |
	// | F        |          5,782.3 |        15,532.1 | true                  |
	//
	var useFamilies bool
	switch workload {
	case "A", "F":
		// Both workloads are highly contended.
		//
		// A: 50% single-row lookups, 50% single-column updates.
		// F: 50% single-row lookups, 50% single-column updates issued as
		//    multi-statement read-modify-write transactions.
		//
		// With column families, updates to different columns of the same row
		// no longer contend with one another, so families are on by default.
		useFamilies = true
	case "B":
		// B is 95% single-row lookups and 5% single-column updates. It is less
		// contended than A, yet contention still becomes the bottleneck as
		// concurrency grows. Column families make the lookups slower and the
		// updates faster (see the tables above), which pays off at higher
		// concurrency levels but not at lower ones. Larger YCSB deployments
		// are the priority, so families are on by default.
		useFamilies = true
	case "C", "D", "E":
		// None of these workloads gains anything from column families.
		//
		// C: no contention; consists entirely of single-row lookups, which
		//    column families slow down.
		// D: no contention; 95% single-row lookups and 5% single-row
		//    insertion, both of which column families slow down.
		// E: moderate contention; 95% multi-row scans and 5% single-row
		//    insertion, both of which column families slow down.
		//
		// Families are therefore off by default for all three.
		useFamilies = false
	default:
		panic(fmt.Sprintf("unexpected workload: %s", workload))
	}
	return useFamilies
}

var usertableColTypes = []coltypes.T{
coltypes.Bytes, coltypes.Bytes, coltypes.Bytes, coltypes.Bytes, coltypes.Bytes, coltypes.Bytes,
coltypes.Bytes, coltypes.Bytes, coltypes.Bytes, coltypes.Bytes, coltypes.Bytes,
Expand Down

0 comments on commit c257a8e

Please sign in to comment.