From 4d09ed1dc595b2f677286f399fd7c1a3ca5a7f7f Mon Sep 17 00:00:00 2001
From: Yao Yue
Date: Thu, 11 Jul 2019 10:20:15 -0700
Subject: [PATCH 1/2] add calculator script

---
 scripts/capacity/calculator.py | 231 +++++++++++++++++++++++++++++++++
 1 file changed, 231 insertions(+)
 create mode 100644 scripts/capacity/calculator.py

diff --git a/scripts/capacity/calculator.py b/scripts/capacity/calculator.py
new file mode 100644
index 000000000..2c834952e
--- /dev/null
+++ b/scripts/capacity/calculator.py
@@ -0,0 +1,231 @@
+from __future__ import print_function
+import argparse
+from math import ceil, floor, log
+import textwrap
+
+
+# constants: units
+K = 1000
+M = K * 1000
+KB = 1024
+MB = 1024 * KB
+GB = 1024 * MB
+
+# constants: defaults
+DEFAULT_QPS = 100  # (K)
+DEFAULT_NKEY = 100  # (M)
+DEFAULT_NCONN = 5 * K
+DEFAULT_FAILURE_DOMAIN = 5.0  # 5% of the nodes may be lost at once
+DEFAULT_SIZE = 64  # slimcache only
+
+MAX_HOST_LIMIT = 10  # based on platform / job size constraint
+
+# constants: pelikan related
+CONN_OVERHEAD = 33 * KB  # 2 16KiB buffers, one channel, and stream overhead
+SAFETY_BUF = 128  # in MB
+BASE_OVERHEAD = 10  # in MB
+KQPS = 30  # much lower than single-instance max, picked to scale to 10 jobs/host
+HASH_OVERHEAD = {'twemcache': 8, 'slimcache': 0}
+ITEM_OVERHEAD = {'twemcache': 40 + 8, 'slimcache': 6 + 8}  # ITEM_HDR_SIZE + CAS
+KEYVAL_ALIGNMENT = 8  # in bytes
+NITEM_ALIGNMENT = 512  # so memory allocation is always 4K (page size) aligned
+
+# constants: job related
+CPU_PER_JOB = 2.0
+DISK_PER_JOB = 3  # in GB
+RAM_CANDIDATES = [4, 8]  # in GB
+RACK_TO_HOST_RATIO = 2.0  # a somewhat arbitrary ratio between rack/host-limit
+FAILURE_DOMAIN_LOWER = 0.5
+FAILURE_DOMAIN_UPPER = 20.0
+WARNING_THRESHOLD = 1000  # alert when too many jobs are needed
+
+
+def hash_parameters(nkey, runnable):
+    hash_power = int(ceil(log(nkey, 2)))
+    ram_hash = int(ceil(1.0 * HASH_OVERHEAD[runnable] * (2 ** hash_power) / MB))
+    return (hash_power, ram_hash)
+
+
+def calculate(args):
+    """calculate job configuration according to requirements.
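+
+    As a worked example of the item math (illustrative numbers, derived from
+    the constants above): for twemcache with size=200, item_size comes out to
+    8 * ceil((40 + 8 + 200) / 8) = 248 bytes, so 100M keys need about
+    23.1 GiB for items alone, before the hash table (8 * 2**hash_power bytes)
+    and per-connection buffers are added.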
+
+    For twemcache, returns a dict with:
+        cpu, ram, disk,
+        hash_power, slab_mem,
+        instance, host_limit, rack_limit,
+        bottleneck
+    For slimcache, returns a dict with:
+        cpu, ram, disk,
+        item_size, nitem,
+        instance, host_limit, rack_limit,
+        bottleneck
+    """
+    if args.failure_domain < FAILURE_DOMAIN_LOWER or args.failure_domain > FAILURE_DOMAIN_UPPER:
+        print('ERROR: failure domain should be between {:.1f}% and {:.1f}%'.format(
+            FAILURE_DOMAIN_LOWER, FAILURE_DOMAIN_UPPER))
+        raise SystemExit(1)  # abort: the formulas below assume a valid failure domain
+
+    # first calculate njob disregarding memory; note both njob & bottleneck are not yet final
+    njob_qps = int(ceil(1.0 * args.qps / KQPS))
+    njob_fd = int(ceil(100.0 / args.failure_domain))
+    if njob_qps >= njob_fd:
+        bottleneck = 'qps'
+        njob = njob_qps
+    else:
+        bottleneck = 'failure domain'
+        njob = njob_fd
+
+    # then calculate njob (vector) assuming memory-bound
+
+    # all ram-related values in this function are in MB
+    # amount of ram needed to store dataset, factoring in overhead
+    item_size = int(KEYVAL_ALIGNMENT * ceil(1.0 * (ITEM_OVERHEAD[args.runnable] + args.size) /
+                                            KEYVAL_ALIGNMENT))
+    ram_data = 1.0 * item_size * args.nkey * M / MB
+    # per-job memory overhead, in MB
+    ram_conn = int(ceil(1.0 * CONN_OVERHEAD * args.nconn / MB))
+    ram_fixed = BASE_OVERHEAD + SAFETY_BUF
+
+    njob_mem = []
+    sorted_ram = sorted(args.ram)
+    for ram in sorted_ram:
+        ram = ram * GB / MB  # change unit to MB
+        n_low = int(ceil(ram_data / ram))  # number of shards, lower bound
+        nkey_per_shard = 1.0 * args.nkey * M / n_low  # number of keys per shard, upper bound
+        hash_power, ram_hash = hash_parameters(nkey_per_shard, args.runnable)  # upper bound for both
+        n = int(ceil(ram_data / (ram - ram_fixed - ram_conn - ram_hash)))
+        njob_mem.append(n)
+
+    # get final njob count; prefer larger ram if it reduces njob, which means:
+    # if cluster needs higher job ram AND more instances due to memory, update njob
+    # if cluster is memory-bound with smaller job ram but qps-bound with larger ones, use higher ram
+    # otherwise, use smaller job ram and keep njob value unchanged
+    index = 0  # if qps bound, use smallest ram setting
+    for i, n in reversed(list(enumerate(njob_mem))[1:]):
+        if n > njob or njob_mem[i - 1] > njob:
+            bottleneck = 'memory'
+            index = i
+            njob = max(njob, n)
+            break
+    if njob > WARNING_THRESHOLD:
+        print('WARNING: more than {} instances needed, please verify input.'.format(WARNING_THRESHOLD))
+
+    # recalculate hash parameters with the final job count
+    nkey_per_shard = 1.0 * (sorted_ram[index] * GB - ram_fixed * MB - ram_conn * MB) / item_size
+    # only used by twemcache
+    hash_power, ram_hash = hash_parameters(nkey_per_shard, args.runnable)
+    slab_mem = sorted_ram[index] * GB / MB - ram_fixed - ram_conn - ram_hash
+    # only used by slimcache
+    nitem = int(NITEM_ALIGNMENT * floor(nkey_per_shard / NITEM_ALIGNMENT))
+
+    rack_limit = int(floor(njob * args.failure_domain / 100))  # >= 1 given how we calculate njob
+    host_limit = int(floor(min(MAX_HOST_LIMIT, max(1, rack_limit / RACK_TO_HOST_RATIO))))
+
+    ret = {
+        'cpu': CPU_PER_JOB,
+        'ram': sorted_ram[index],
+        'disk': DISK_PER_JOB,
+        'instance': njob,
+        'rack_limit': rack_limit,
+        'host_limit': host_limit,
+        'bottleneck': bottleneck}
+    if args.runnable == 'twemcache':
+        ret['hash_power'] = hash_power
+        ret['slab_mem'] = slab_mem
+    elif args.runnable == 'slimcache':
+        ret['item_size'] = item_size
+        ret['nitem'] = nitem
+
+    return ret
+
+
+def format_input(args):
+    return textwrap.dedent('''
+        Requirement:
+          qps: {} K
+          key-val size: {}
+          number of keys: {} M
+          data, computed: {:.1f} GB
+          number of conn: {} per server
+          failure domain: {:.1f} %
+
+        '''.format(args.qps, args.size, args.nkey,
+                   1.0 * args.size * args.nkey * M / GB,
+                   args.nconn, args.failure_domain))
+
+
+def twemcache_format_output(config):
+    return textwrap.dedent('''
+        pelikan_twemcache config:
+          hash_power: {}
+          slab_mem: {} MB
+
+        job config:
+          cpu: {}
+          ram: {} GB
+          disk: {} GB
+          instances: {}
+          host limit: {}
+          rack limit: {}
+
+        '''.format(config['hash_power'], config['slab_mem'],
+                   config['cpu'], config['ram'], config['disk'],
+                   config['instance'], config['host_limit'], config['rack_limit']))
+
+
+def slimcache_format_output(config):
+    return textwrap.dedent('''
+        pelikan_slimcache config:
+          item_size: {}
+          nitem: {}
+
+        job config:
+          cpu: {}
+          ram: {} GB
+          disk: {} GB
+          instances: {}
+          host limit: {}
+          rack limit: {}
+
+        '''.format(config['item_size'], config['nitem'],
+                   config['cpu'], config['ram'], config['disk'],
+                   config['instance'], config['host_limit'], config['rack_limit']))
+
+
+# parser for calculator: to be included by the generator as a parent parser
+parser = argparse.ArgumentParser(
+    formatter_class=argparse.RawDescriptionHelpFormatter,
+    description=textwrap.dedent("""
+        This script calculates the resource requirements of a pelikan cluster
+        (twemcache or slimcache) based on input. It has to be run from the top
+        level directory of the source tree.\n
+
+        Optional arguments that probably should be overridden:
+          qps, size, nkey, nconn
+
+        Optional arguments:
+          failure_domain (defaults to 5%, acceptable range {:.1f}% - {:.1f}%)
+        """.format(FAILURE_DOMAIN_LOWER, FAILURE_DOMAIN_UPPER)),
+    usage='%(prog)s [options]')
+
+parser.add_argument('--qps', dest='qps', type=int, default=DEFAULT_QPS,
+                    help='queries per second in *thousands/K*, rounded up')
+parser.add_argument('--size', dest='size', type=int, default=DEFAULT_SIZE,
+                    help='key+value size in bytes, average for twemcache, max for slimcache')
+parser.add_argument('--nkey', dest='nkey', type=int, default=DEFAULT_NKEY,
+                    help='number of keys in *millions/M*, rounded up')
+parser.add_argument('--nconn', dest='nconn', type=int, default=DEFAULT_NCONN,
+                    help='number of connections to each server')
+parser.add_argument('--failure_domain', dest='failure_domain', type=float,
+                    default=DEFAULT_FAILURE_DOMAIN,
+                    help='percentage of servers/data that may be lost simultaneously')
+parser.add_argument('--ram', nargs='+', type=int, default=RAM_CANDIDATES,
+                    help='list of container ram sizes (in GB) to consider; need not be sorted')
+# end of parser
+
+if __name__ == "__main__":
+    # add runnable as a positional option instead of subparser (as in aurora.py) to avoid import
+    parser.add_argument('runnable', choices=['twemcache', 'slimcache'], help='flavor of backend')
+    format_output = {'twemcache': twemcache_format_output, 'slimcache': slimcache_format_output}
+    args = parser.parse_args()
+    print(format_input(args))
+    config = calculate(args)
+    print(format_output[args.runnable](config))
+    print('Cluster sizing is primarily driven by {}.\n'.format(config['bottleneck']))

From 34f688d57a481dc33f1e3bc466e67e54b8ea1137 Mon Sep 17 00:00:00 2001
From: Yao Yue
Date: Thu, 11 Jul 2019 10:26:39 -0700
Subject: [PATCH 2/2] some production-like examples

---
 scripts/capacity/README.md | 40 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 scripts/capacity/README.md

diff --git a/scripts/capacity/README.md b/scripts/capacity/README.md
new file mode 100644
index 000000000..5408622a6
--- /dev/null
+++ b/scripts/capacity/README.md
@@ -0,0 +1,40 @@
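+## Sizing logic
+
+A rough sketch of how `calculator.py` picks the instance count (this only
+summarizes the script; `usable_ram_mb` here stands for job ram minus the
+hash-table, connection, and fixed overheads computed in `calculate()`):
+
+```python
+# pseudocode summary; qps is given in thousands, 30 kqps is the per-job budget
+njob = max(ceil(qps / 30.0),               # throughput bound
+           ceil(100.0 / failure_domain),   # failure-domain bound
+           ceil(data_mb / usable_ram_mb))  # memory bound, per candidate ram size
+```
+
+When several job ram sizes are offered via `--ram`, a larger size is picked
+only if it lowers the instance count.
+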
+## Examples
+
+Below are some examples where cluster sizing is driven by different factors.
+Note that `twemcache` asks for the _average_ key+val size in bytes via the
+`--size` option, while `slimcache` asks for the size of the _largest_
+key+value in bytes via the same option. They are otherwise similar.
+
+Twemcache examples:
+
+```sh
+# throughput driven
+python3 calculator.py twemcache --qps 1000 --size 200 --nkey 500 --nconn 2000 --failure_domain 5
+
+# memory driven
+python3 calculator.py twemcache --qps 1000 --size 1000 --nkey 500 --nconn 2000 --failure_domain 5
+python3 calculator.py twemcache --qps 1000 --size 400 --nkey 500 --nconn 2000 --failure_domain 5
+
+# failure domain driven
+python3 calculator.py twemcache --qps 1000 --size 200 --nkey 500 --nconn 2000 --failure_domain 0.5
+```
+
+Slimcache examples:
+
+```sh
+# throughput driven
+python3 calculator.py slimcache --qps 1000 --size 45 --nkey 500 --nconn 2000 --failure_domain 5
+
+# memory driven
+python3 calculator.py slimcache --qps 1000 --size 45 --nkey 4000 --nconn 2000 --failure_domain 5
+python3 calculator.py slimcache --qps 1000 --size 80 --nkey 5000 --nconn 2000 --failure_domain 5
+
+# failure domain driven
+python3 calculator.py slimcache --qps 1000 --size 48 --nkey 500 --nconn 2000 --failure_domain 0.5
+```
+
+To see the full list of options:
+
+```sh
+python3 calculator.py twemcache -h
+python3 calculator.py slimcache -h
+```
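+
+## Programmatic use
+
+The calculator can also be driven from another Python script. A minimal
+sketch (not part of the calculator itself; it assumes this directory is on
+`sys.path`, and the flag values below are arbitrary examples):
+
+```python
+# build a Namespace mirroring the CLI flags, then call calculate() directly;
+# the returned dict uses the keys documented in calculator.py
+from argparse import Namespace
+
+from calculator import calculate
+
+args = Namespace(runnable='twemcache', qps=1000, size=200, nkey=500,
+                 nconn=2000, failure_domain=5.0, ram=[4, 8])
+config = calculate(args)
+print(config['instance'], config['ram'], config['bottleneck'])
+```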