feat(mocknet): add a make-backup command (#10931)
It is often useful to stop all the nodes and make backups of their data
dirs so that we can reset to that point later. For example, when testing
resharding it's nice to have a backup at a point right before the
resharding epoch starts so that we can test the resharding part over and
over without having to start from the beginning. This PR adds a
`make_backup` RPC method to `neard_runner.py` that stops neard, copies
its data dir, and remembers the `backup_id` parameter associated with
that copy. The `reset` command can then be given the same `backup_id`
to restore to that backup.
marcelo-gonzalez authored Apr 4, 2024
1 parent cbf977a commit 22b603c
Showing 4 changed files with 194 additions and 37 deletions.
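
For illustration, the new runner RPCs could be exercised directly over HTTP roughly as in the sketch below. The method and parameter names (make_backup, ls_backups, reset, backup_id, description) are taken from this diff; the runner address and the plain JSON-RPC 2.0 POST framing are assumptions made for the example, not something this page documents (the mocknet scripts normally go through the node_handle.py wrappers shown further down).

# Hedged sketch: drive the new neard_runner RPCs directly over HTTP.
# RUNNER_URL is a placeholder; the real host/port depend on how the runner is deployed.
import requests

RUNNER_URL = 'http://127.0.0.1:3000'  # assumption, not taken from this PR


def runner_rpc(method, params=None):
    # Minimal JSON-RPC 2.0 request body; the runner dispatches on the method name.
    body = {'jsonrpc': '2.0', 'id': 1, 'method': method, 'params': params or {}}
    return requests.post(RUNNER_URL, json=body).json()


# Stop neard and snapshot its data dir under a named backup ID.
runner_rpc('make_backup', {
    'backup_id': 'pre-resharding',
    'description': 'right before the resharding epoch',
})
# List the backups the runner knows about, then roll back to one of them.
print(runner_rpc('ls_backups'))
runner_rpc('reset', {'backup_id': 'pre-resharding'})
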
143 changes: 117 additions & 26 deletions pytest/tests/mocknet/helpers/neard_runner.py
@@ -2,13 +2,15 @@
# python script to handle neard process management.

import argparse
import datetime
from enum import Enum
import fcntl
import json
import jsonrpc
import logging
import os
import psutil
import re
import requests
import shutil
import signal
@@ -61,6 +63,10 @@ def __init__(self, request, client_address, server):
self.dispatcher.add_method(server.neard_runner.do_reset, name="reset")
self.dispatcher.add_method(server.neard_runner.do_update_binaries,
name="update_binaries")
self.dispatcher.add_method(server.neard_runner.do_make_backup,
name="make_backup")
self.dispatcher.add_method(server.neard_runner.do_ls_backups,
name="ls_backups")
super().__init__(request, client_address, server)

def do_GET(self):
@@ -108,6 +114,10 @@ class TestState(Enum):
STOPPED = 6
RESETTING = 7
ERROR = 8
MAKING_BACKUP = 9


backup_id_pattern = re.compile(r'^[0-9a-zA-Z.][0-9a-zA-Z_\-.]+$')


class NeardRunner:
@@ -140,6 +150,8 @@ def __init__(self, args):
'neard_process': None,
'current_neard_path': None,
'state': TestState.NONE.value,
'backups': {},
'state_data': None,
}
# protects self.data, and its representation on disk,
# because both the rpc server and the main loop touch them concurrently
@@ -190,9 +202,11 @@ def parse_binaries_config(self):
})
return binaries

def set_current_neard_path(self, path):
self.data['current_neard_path'] = path

def reset_current_neard_path(self):
        self.set_current_neard_path(self.data['binaries'][0]['system_path'])

# tries to download the binaries specified in config.json, saving them in $home/binaries/
# if force is set to true all binaries will be downloaded, otherwise only the missing ones
@@ -355,6 +369,7 @@ def do_new_test(self,
validator_account_id = None
validator_public_key = None

self.data['backups'] = {}
self.set_state(TestState.AWAITING_NETWORK_INIT)
self.save_data()

@@ -461,25 +476,54 @@ def do_stop(self):
self.set_state(TestState.STOPPED)
self.save_data()

        def do_reset(self, backup_id=None):
with self.lock:
state = self.get_state()
logging.info(f"do_reset {state}")
if state != TestState.RUNNING and state != TestState.STOPPED:
raise jsonrpc.exceptions.JSONRPCDispatchException(
code=-32600,
message='Cannot reset data dir as test state is not ready')

backups = self.data.get('backups', {})
if backup_id is not None and backup_id != 'start' and backup_id not in backups:
raise jsonrpc.exceptions.JSONRPCDispatchException(
code=-32600, message=f'backup ID {backup_id} not known')

if backup_id is None or backup_id == 'start':
path = self.data['binaries'][0]['system_path']
else:
path = backups[backup_id]['neard_path']

if state == TestState.RUNNING:
self.kill_neard()
                self.set_state(TestState.RESETTING, data=backup_id)
                self.set_current_neard_path(path)
                self.save_data()

def do_make_backup(self, backup_id, description=None):
with self.lock:
state = self.get_state()
if state != TestState.RUNNING and state != TestState.STOPPED:
                    raise jsonrpc.exceptions.JSONRPCDispatchException(
                        code=-32600,
                        message='Cannot make backup as test state is not ready')

if backup_id_pattern.match(backup_id) is None:
raise jsonrpc.exceptions.JSONRPCDispatchException(
code=-32600, message=f'invalid backup ID: {backup_id}')

if backup_id in self.data.get('backups', {}):
raise jsonrpc.exceptions.JSONRPCDispatchException(
code=-32600, message=f'backup {backup_id} already exists')
if state == TestState.RUNNING:
self.kill_neard()
self.making_backup(backup_id, description)
self.save_data()

def do_ls_backups(self):
with self.lock:
return self.data.get('backups', {})

def do_update_binaries(self):
with self.lock:
@@ -690,14 +734,19 @@ def check_upgrade_neard(self):
start_neard = True

if start_neard:
                self.set_current_neard_path(neard_path)
self.start_neard()

def get_state(self):
return TestState(self.data['state'])

        def set_state(self, state, data=None):
self.data['state'] = state.value
self.data['state_data'] = data

def making_backup(self, backup_id, description=None):
backup_data = {'backup_id': backup_id, 'description': description}
self.set_state(TestState.MAKING_BACKUP, data=backup_data)

def network_init(self):
# wait til we get a network_init RPC
@@ -822,6 +871,41 @@ def check_amend_genesis(self):
self.set_state(TestState.STATE_ROOTS)
self.save_data()

def make_backup(self):
now = str(datetime.datetime.now())
backup_data = self.data['state_data']
name = backup_data['backup_id']
description = backup_data.get('description', None)

backup_dir = self.home_path('backups', name)
if os.path.exists(backup_dir):
# we already checked that this backup ID didn't already exist, so if this path
# exists, someone probably manually added it. for now just set the state to ERROR
# and make the human intervene, but it shouldn't happen in practice
                logging.warning(f'{backup_dir} already exists')
self.set_state(TestState.ERROR)
return
logging.info(f'copying data dir to {backup_dir}')
shutil.copytree(self.target_near_home_path('data'),
backup_dir,
dirs_exist_ok=True)
logging.info(f'copied data dir to {backup_dir}')

backups = self.data.get('backups', {})
if name in backups:
                # shouldn't happen if we check this in do_make_backup(), but fine to be paranoid and at least warn here
                logging.warning(
                    f'backup {name} already existed in data.json, but it was not present before'
                )
backups[name] = {
'time': now,
'description': description,
'neard_path': self.data['current_neard_path']
}
self.data['backups'] = backups
self.set_state(TestState.STOPPED)
self.save_data()

def check_genesis_state(self):
path, running, exit_code = self.poll_neard()
if not running:
@@ -843,26 +927,31 @@ def check_genesis_state(self):
except FileNotFoundError:
pass
os.mkdir(self.home_path('backups'))
self.making_backup(
'start',
description='initial test state after state root computation'
)
self.save_data()
self.make_backup()
except requests.exceptions.ConnectionError:
pass

def reset_near_home(self):
backup_id = self.data['state_data']
if backup_id is None:
backup_id = 'start'
backup_path = self.home_path('backups', backup_id)
if not os.path.exists(backup_path):
logging.error(f'backup dir {backup_path} does not exist')
self.set_state(TestState.ERROR)
                self.save_data()
                return
try:
logging.info("removing the old directory")
shutil.rmtree(self.target_near_home_path('data'))
except FileNotFoundError:
pass
            logging.info(f'restoring data dir from backup at {backup_path}')
            shutil.copytree(backup_path, self.target_near_home_path('data'))
logging.info('data dir restored')
self.set_state(TestState.STOPPED)
self.save_data()
@@ -882,6 +971,8 @@ def main_loop(self):
self.check_upgrade_neard()
elif state == TestState.RESETTING:
self.reset_near_home()
elif state == TestState.MAKING_BACKUP:
self.make_backup()
time.sleep(10)

def serve(self, port):
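
As an aside on the backup_id_pattern check added in neard_runner.py above (and repeated in mirror.py below): it accepts IDs of at least two characters whose first character is alphanumeric or '.', with '_', '-' and '.' allowed afterwards. A standalone sketch, not part of the diff, showing which IDs pass:

import re

# Same pattern as backup_id_pattern in neard_runner.py / make_backup_cmd in mirror.py.
backup_id_pattern = re.compile(r'^[0-9a-zA-Z.][0-9a-zA-Z_\-.]+$')

for candidate in ['start', 'pre-resharding.2024-04-04', 'x', 'bad id', '-leading-dash']:
    verdict = 'valid' if backup_id_pattern.match(candidate) else 'rejected'
    print(f'{candidate!r}: {verdict}')

# 'start' and 'pre-resharding.2024-04-04' pass; 'x' is rejected because the trailing
# '+' requires at least one more character after the first; 'bad id' (space) and
# '-leading-dash' are rejected by the character classes.
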
55 changes: 54 additions & 1 deletion pytest/tests/mocknet/mirror.py
@@ -7,6 +7,7 @@
import json
import random
from rc import pmap
import re
import sys
import time

@@ -219,13 +220,55 @@ def reset_cmd(args, traffic_generator, nodes):
)
if sys.stdin.readline().strip() != 'yes':
sys.exit()
if args.backup_id is None:
backups = nodes[0].neard_runner_ls_backups()
backups_msg = 'ID | Time | Description\n'
if 'start' not in backups:
backups_msg += 'start | None | initial test state after state root computation\n'
for backup_id, backup_data in backups.items():
backups_msg += f'{backup_id} | {backup_data.get("time")} | {backup_data.get("description")}\n'

        print(f'Backups as reported by {nodes[0].name()}:\n\n{backups_msg}')
print('please enter a backup ID here:')
args.backup_id = sys.stdin.readline().strip()
if args.backup_id != 'start' and args.backup_id not in backups:
print(
f'Given backup ID ({args.backup_id}) was not in the list given')
sys.exit()

all_nodes = nodes + [traffic_generator]
    pmap(lambda node: node.neard_runner_reset(backup_id=args.backup_id),
         all_nodes)
logger.info(
'Data dir reset in progress. Run the `status` command to see when this is finished. Until it is finished, neard runners may not respond to HTTP requests.'
)


def make_backup_cmd(args, traffic_generator, nodes):
if not args.yes:
print(
'this will stop all nodes and create a new backup of their home dirs. continue? [yes/no]'
)
if sys.stdin.readline().strip() != 'yes':
sys.exit()

if args.backup_id is None:
print('please enter a backup ID:')
args.backup_id = sys.stdin.readline().strip()
if re.match(r'^[0-9a-zA-Z.][0-9a-zA-Z_\-.]+$', args.backup_id) is None:
sys.exit('invalid backup ID')
if args.description is None:
print('please enter a description (enter nothing to skip):')
description = sys.stdin.readline().strip()
if len(description) > 0:
args.description = description

all_nodes = nodes + [traffic_generator]
pmap(
lambda node: node.neard_runner_make_backup(
backup_id=args.backup_id, description=args.description), all_nodes)


def stop_nodes_cmd(args, traffic_generator, nodes):
pmap(lambda node: node.neard_runner_stop(), nodes + [traffic_generator])

@@ -373,13 +416,23 @@ def update_binaries_cmd(args, traffic_generator, nodes):
help='stop the traffic generator, but leave the other nodes running')
stop_parser.set_defaults(func=stop_traffic_cmd)

backup_parser = subparsers.add_parser('make-backup',
help='''
Stops all nodes and has them make a backup of the data dir that can later be restored with the reset command
''')
backup_parser.add_argument('--yes', action='store_true')
backup_parser.add_argument('--backup-id', type=str)
backup_parser.add_argument('--description', type=str)
backup_parser.set_defaults(func=make_backup_cmd)

reset_parser = subparsers.add_parser('reset',
help='''
The new_test command saves the data directory after the genesis state roots are computed so that
the test can be reset from the start without having to do that again. This command resets all nodes'
data dirs to what was saved then, so that start-traffic will start the test all over again.
''')
reset_parser.add_argument('--yes', action='store_true')
reset_parser.add_argument('--backup-id', type=str)
reset_parser.set_defaults(func=reset_cmd)

# It re-uses the same binary urls because it's quite easy to do it with the
15 changes: 13 additions & 2 deletions pytest/tests/mocknet/node_handle.py
@@ -100,8 +100,19 @@ def neard_runner_network_init(self, validators, boot_nodes, epoch_length,
def neard_runner_ready(self):
return self.neard_runner_jsonrpc('ready')

def neard_runner_make_backup(self, backup_id, description=None):
return self.neard_runner_jsonrpc('make_backup',
params={
'backup_id': backup_id,
'description': description
})

def neard_runner_ls_backups(self):
return self.neard_runner_jsonrpc('ls_backups')

def neard_runner_reset(self, backup_id=None):
return self.neard_runner_jsonrpc('reset',
params={'backup_id': backup_id})

def neard_runner_update_binaries(self):
return self.neard_runner_jsonrpc('update_binaries')
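
Putting the new wrappers together, a driver script could use them roughly as follows. This is a hedged sketch: it assumes `nodes` is a list of already-constructed node handles (their construction is not part of this diff) and, per the note in mirror.py, a real run would wait for the backups to finish (e.g. via the status command) before resetting.

from rc import pmap  # the same parallel-map helper mirror.py uses


def make_backup_on_all(nodes, backup_id, description=None):
    # Stop every node and snapshot its data dir under the given backup ID.
    pmap(
        lambda node: node.neard_runner_make_backup(backup_id=backup_id,
                                                   description=description),
        nodes)


def reset_all_to(nodes, backup_id):
    # Show what one node reports, then roll every node back to that backup.
    print(nodes[0].neard_runner_ls_backups())
    pmap(lambda node: node.neard_runner_reset(backup_id=backup_id), nodes)
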