Skip to content

Commit

Permalink
nsfs fixes
Browse files Browse the repository at this point in the history
Add special handling for the SIGABRT signal and for too many worker panics.
Add read_bucket_sdk_config_info to remove errors when running nsfs in simple mode.

Signed-off-by: jackyalbo <jacky.albo@gmail.com>
  • Loading branch information
jackyalbo committed Feb 11, 2024
1 parent 8ef4d07 commit 8614140
Show file tree
Hide file tree
Showing 2 changed files with 27 additions and 2 deletions.
1 change: 1 addition & 0 deletions src/cmd/nsfs.js
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,7 @@ class NsfsObjectSDK extends ObjectSDK {
this._get_bucket_namespace = bucket_name => this._simple_get_single_bucket_namespace(bucket_name);
this.load_requesting_account = auth_req => this._simple_load_requesting_account(auth_req);
this.read_bucket_sdk_policy_info = bucket_name => this._simple_read_bucket_sdk_policy_info(bucket_name);
this.read_bucket_sdk_config_info = () => undefined;
this.read_bucket_usage_info = () => undefined;
this.read_bucket_sdk_website_info = () => undefined;
this.read_bucket_sdk_namespace_info = () => undefined;
Expand Down
28 changes: 26 additions & 2 deletions src/util/fork_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ const op_stats = {};
* @returns {boolean} true if workers were started.
*/
function start_workers(metrics_port, count = 0) {
let last_panic;
if (cluster.isPrimary && count > 0) {
for (let i = 0; i < count; ++i) {
const worker = cluster.fork();
Expand All @@ -38,11 +39,34 @@ function start_workers(metrics_port, count = 0) {
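// We don't want to leave the process with a partial set of workers,
// so if any worker exits, we will print an error message in the logs and start a new one,
// unless the exit indicates a fatal condition (SIGABRT or repeated panics), in which case the main process exits.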
cluster.on('exit', (worker, code, signal) => {
console.warn('WORKER exit', { id: worker.id, pid: worker.process.pid, code, signal }, 'starting a new one.');
console.warn('WORKER exit', { id: worker.id, pid: worker.process.pid, code, signal });
new NoobaaEvent(NoobaaEvent.FORK_EXIT).create_event(undefined, { id: worker.id, pid: worker.process.pid,
code: code, signal: signal}, undefined);
// SIGABRT is used for abnormal program termination - https://www.qnx.com/developers/docs/6.4.1/dinkum_en/ecpp/stdlib.html
// We saw it happen both when memory ran out and when too many processes were started.
// In these cases we won't restart the workers and will exit the main process.
// For testing (Linux only) - limit virtual memory: ulimit -v xxx (kb)
// limit user processes: ulimit -u xxx
if (signal === 'SIGABRT') {
console.error('EXIT ON WORKER ERROR - received SIGABRT (abort) signal');
new NoobaaEvent(NoobaaEvent.ENDPOINT_CRASHED).create_event(undefined, { id: worker.id, pid: worker.process.pid,
code: code, signal: signal}, "received SIGABRT (abort) signal");
process.exit(1);
}
if (code === 1) { // when we panic, we exit the process with exit code = 1
const now = Date.now();
const min = 10; // not allowed to get more than 2 panics in 10 minutes
if (last_panic && now - last_panic < min * 60 * 1000) {
console.error(`EXIT ON WORKER ERROR - received more than 2 panics in the last ${min} minutes`);
new NoobaaEvent(NoobaaEvent.ENDPOINT_CRASHED).create_event(undefined, { id: worker.id, pid: worker.process.pid,
code: code, signal: signal}, `received more than 2 panics in the last ${min} minutes`);
process.exit(1);
}
console.warn('Encountered a panic, not killing main process until enough panics', now, last_panic);
last_panic = now;
}
const new_worker = cluster.fork();
console.warn('WORKER started', { id: new_worker.id, pid: new_worker.process.pid });
console.warn('WORKER re-started', { id: new_worker.id, pid: new_worker.process.pid });
});
for (const id in cluster.workers) {
if (id) {
Expand Down

0 comments on commit 8614140

Please sign in to comment.