tcmur_device.c

/*
 * Copyright (c) 2017 Red Hat, Inc.
 *
 * This file is licensed to you under your choice of the GNU Lesser
 * General Public License, version 2.1 or any later version (LGPLv2.1 or
 * later), or the Apache License 2.0.
 */

#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <inttypes.h>
#include <pthread.h>
#include <errno.h>
#include <unistd.h>

#include "libtcmu_log.h"
#include "libtcmu_common.h"
#include "tcmu-runner.h"
#include "tcmur_device.h"
#include "tcmur_cmd_handler.h"
#include "tcmur_work.h"
#include "tcmu_runner_priv.h"
#include "target.h"

/*
 * tcmu_dev_in_recovery - report whether the device is marked as recovering.
 * @dev: device to check
 *
 * Samples TCMUR_DEV_FLAG_IN_RECOVERY while holding rdev_lock and returns
 * true if the flag was set at that moment.
 */
bool tcmu_dev_in_recovery(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool recovering;

	pthread_mutex_lock(&rdev->rdev_lock);
	recovering = (rdev->flags & TCMUR_DEV_FLAG_IN_RECOVERY) != 0;
	pthread_mutex_unlock(&rdev->rdev_lock);

	return recovering;
}

/*
 * __tcmu_reopen_dev - close the device if it is open and try to open it again.
 * @dev: device to reopen
 * @retries: number of additional open() attempts allowed after the first
 *	     one. A negative value keeps retrying until the open succeeds or
 *	     TCMUR_DEV_FLAG_STOPPING is set.
 *
 * TCMUR_DEV_FLAG_IN_RECOVERY must be set before calling. The flag is cleared
 * on every return path of this function.
 *
 * Returns 0 if the device was reopened or if TCMUR_DEV_FLAG_STOPPING was
 * already set on entry, the aio_wait_for_empty_queue() error if the queue
 * could not be drained, and otherwise the result of the last open() attempt
 * (-EIO if no attempt was made).
 */
int __tcmu_reopen_dev(struct tcmu_device *dev, int retries)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	int ret, attempt = 0;
	bool needs_close = false;

	pthread_mutex_lock(&rdev->rdev_lock);
	assert(rdev->flags & TCMUR_DEV_FLAG_IN_RECOVERY);
	if (rdev->flags & TCMUR_DEV_FLAG_STOPPING) {
		/* Nothing to do. The "done" label expects rdev_lock held. */
		ret = 0;
		goto done;
	}
	pthread_mutex_unlock(&rdev->rdev_lock);

	if (pthread_self() != rdev->cmdproc_thread)
		/*
		 * The cmdproc thread could be starting to execute a new IO.
		 * Make sure sync cmd handler callbacks for cmds like INQUIRY
		 * are completed.
		 */
		tcmu_dev_flush_ring(dev);

	/*
	 * Make sure all the in-flight IOs have finished before flushing
	 * the event_work. Otherwise, just after we flush the event_work
	 * a new timed out IO callback could come in and fire a new event
	 * work, which may access the device while we are closing it
	 * later.
	 */
	tcmu_dev_dbg(dev, "Waiting for outstanding commands to complete\n");
	ret = aio_wait_for_empty_queue(rdev);
	if (ret) {
		/* Re-take rdev_lock because "done" unlocks it. */
		pthread_mutex_lock(&rdev->rdev_lock);
		goto done;
	}

	tcmur_flush_work(rdev->event_work);

	/*
	 * Clear IS_OPEN under the lock, but call the handler's close()
	 * only after the lock has been dropped.
	 */
	pthread_mutex_lock(&rdev->rdev_lock);
	if (rdev->flags & TCMUR_DEV_FLAG_IS_OPEN)
		needs_close = true;
	rdev->flags &= ~TCMUR_DEV_FLAG_IS_OPEN;
	pthread_mutex_unlock(&rdev->rdev_lock);

	if (needs_close) {
		tcmu_dev_dbg(dev, "Closing device.\n");
		rhandler->close(dev);
	}

	/*
	 * The loop condition is evaluated with rdev_lock held. The lock is
	 * dropped around the handler's open() call and the sleep, and is
	 * held again when the loop exits.
	 */
	pthread_mutex_lock(&rdev->rdev_lock);
	ret = -EIO;
	while (ret != 0 && !(rdev->flags & TCMUR_DEV_FLAG_STOPPING) &&
	       (retries < 0 || attempt <= retries)) {
		pthread_mutex_unlock(&rdev->rdev_lock);

		tcmu_dev_dbg(dev, "Opening device. Attempt %d\n", attempt);
		ret = rhandler->open(dev, true);
		if (ret) {
			/* Pause between failed attempts to avoid a busy loop. */
			sleep(1);
		}

		pthread_mutex_lock(&rdev->rdev_lock);
		if (!ret) {
			rdev->flags |= TCMUR_DEV_FLAG_IS_OPEN;
			rdev->lock_lost = false;
		}
		attempt++;
	}

done:
	/* Reached with rdev_lock held on every path. */
	rdev->flags &= ~TCMUR_DEV_FLAG_IN_RECOVERY;
	pthread_mutex_unlock(&rdev->rdev_lock);

	return ret;
}

/*
 * tcmu_reopen_dev - close and open device.
 * @dev: device to reopen
 * @retries: number of times to retry open() call. -1 indicates infinite.
 *
 * Marks the device as in recovery and hands off to __tcmu_reopen_dev().
 * Returns -EBUSY without doing anything if the device is already marked
 * as in recovery; otherwise returns what __tcmu_reopen_dev() returns.
 */
int tcmu_reopen_dev(struct tcmu_device *dev, int retries)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool already_recovering;

	/* Test and set the recovery flag in one critical section. */
	pthread_mutex_lock(&rdev->rdev_lock);
	already_recovering = (rdev->flags & TCMUR_DEV_FLAG_IN_RECOVERY) != 0;
	if (!already_recovering)
		rdev->flags |= TCMUR_DEV_FLAG_IN_RECOVERY;
	pthread_mutex_unlock(&rdev->rdev_lock);

	if (already_recovering)
		return -EBUSY;

	return __tcmu_reopen_dev(dev, retries);
}

/*
 * tcmu_cancel_recovery - wait until the device is no longer in recovery.
 * @dev: device to wait on
 *
 * Polls TCMUR_DEV_FLAG_IN_RECOVERY once per second, under rdev_lock, and
 * returns when the flag is observed to be clear.
 */
void tcmu_cancel_recovery(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool recovering;

	/*
	 * Only file and qcow can be canceled in their open/close calls, but
	 * they do not support recovery, so wait here for rbd/glfs type of
	 * handlers to fail/complete normally to avoid a segfault.
	 */
	tcmu_dev_dbg(dev, "Waiting on recovery thread\n");
	for (;;) {
		pthread_mutex_lock(&rdev->rdev_lock);
		recovering = (rdev->flags & TCMUR_DEV_FLAG_IN_RECOVERY) != 0;
		pthread_mutex_unlock(&rdev->rdev_lock);

		if (!recovering)
			break;

		sleep(1);
	}
	tcmu_dev_dbg(dev, "Recovery thread wait done\n");
}

static void __tcmu_report_event(void *data)
{
	struct tcmu_device *dev = data;
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	int ret;

	/*
	 * For cmd timeouts and unbalanced systems we will get a burst so wait
	 * a second to batch up the updates.
	 */
	sleep(1);

	pthread_mutex_lock(&rdev->rdev_lock);
	if (rdev->flags & TCMUR_DEV_FLAG_IS_OPEN) {
		ret = rhandler->report_event(dev);
		if (ret)
			tcmu_dev_err(dev, "Could not report events. Error %d.\n", ret);
	}
	pthread_mutex_unlock(&rdev->rdev_lock);
}

static void tcmu_report_event(struct tcmu_device *dev)
{
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	int ret;

	if (!rhandler->report_event)
		return;

	ret = tcmur_run_work(rdev->event_work, dev, __tcmu_report_event);
	if (!ret)
		return;

	if (ret != -EBUSY)
		tcmu_dev_err(dev, "Could not execute event work. Error %d", ret);
}

/*
 * __tcmu_notify_conn_lost - queue the device for recovery after a lost
 *			     connection.
 * @dev: device that has lost its connection
 *
 * The callers in this file invoke this with rdev_lock held.
 *
 * Returns true if the device was added to the recovery list (and
 * conn_lost_cnt was bumped), false if the device is stopping, is already
 * in recovery, or could not be added to the list.
 */
static bool __tcmu_notify_conn_lost(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);

	/*
	 * Although there are 2 checks for STOPPING in __tcmu_reopen_dev
	 * which is called a little later by the recovery thread, the
	 * STOPPING check is still needed here.
	 *
	 * In device removal, tcmu_get_alua_grps will never get access to
	 * the configfs dir resource which is held by the kernel in
	 * configfs_rmdir, thus tcmulib_cmd->done() will never get a chance
	 * to clear tracked_aio_ops. This will cause a deadlock in
	 * dev_removed which is polling tracked_aio_ops.
	 */
	if (rdev->flags &
	    (TCMUR_DEV_FLAG_STOPPING | TCMUR_DEV_FLAG_IN_RECOVERY))
		return false;

	tcmu_dev_err(dev, "Handler connection lost (lock state %d)\n",
		     rdev->lock_state);

	/* A non-zero return means the device was not queued. */
	if (tcmu_add_dev_to_recovery_list(dev))
		return false;

	rdev->conn_lost_cnt++;
	return true;
}

/**
 * tcmu_notify_conn_lost - notify runner the device instance has lost its
 *			   connection to its backend storage.
 * @dev: device that has lost its connection
 *
 * Handlers should call this function when they detect they cannot reach their
 * backend storage/medium/cache, so new commands will not be queued until
 * the device has been reopened.
 *
 * An event is reported only when this call actually queued the device for
 * recovery.
 */
void tcmu_notify_conn_lost(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool queued;

	pthread_mutex_lock(&rdev->rdev_lock);
	queued = __tcmu_notify_conn_lost(dev);
	pthread_mutex_unlock(&rdev->rdev_lock);

	if (!queued)
		return;

	/* Reported after rdev_lock has been dropped. */
	tcmu_report_event(dev);
}

/*
 * __tcmu_notify_lock_lost - record that the device lock was lost.
 * @dev: device that has lost the lock
 *
 * Marks the lock as lost, resets lock_state to unlocked, bumps
 * lock_lost_cnt and queues an event report. The callers in this file
 * invoke it with rdev_lock held.
 */
static void __tcmu_notify_lock_lost(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);

	rdev->lock_lost_cnt++;
	rdev->lock_state = TCMUR_DEV_LOCK_UNLOCKED;
	rdev->lock_lost = true;

	tcmu_report_event(dev);
}

/**
 * tcmu_notify_lock_lost - notify runner the device instance has lost the lock
 * @dev: device that has lost the lock
 *
 * Handlers should call this function when they detect they have lost
 * the lock, so runner can re-acquire. It must be called before completing
 * a command that had caused the failure.
 *
 * The notification is dropped if a write lock acquisition is currently in
 * progress.
 */
void tcmu_notify_lock_lost(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool reacquiring;

	pthread_mutex_lock(&rdev->rdev_lock);
	tcmu_dev_warn(dev, "Async lock drop. Old state %d\n", rdev->lock_state);

	/*
	 * We could be getting stale IO completions. If we are trying to
	 * reacquire the lock do not change state.
	 */
	reacquiring = rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKING;
	if (!reacquiring)
		__tcmu_notify_lock_lost(dev);

	pthread_mutex_unlock(&rdev->rdev_lock);
}

/*
 * tcmu_release_dev_lock - release the write lock if this device holds it.
 * @dev: device whose lock should be released
 *
 * Does nothing unless lock_state is TCMUR_DEV_LOCK_WRITE_LOCKED. If the
 * device is closed the state is simply reset to unlocked; otherwise the
 * handler's unlock() is called (without rdev_lock held) and the state is
 * reset to unlocked regardless of the handler's result.
 */
void tcmu_release_dev_lock(struct tcmu_device *dev)
{
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool call_handler = false;
	int rc;

	pthread_mutex_lock(&rdev->rdev_lock);
	if (rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED) {
		if (rdev->flags & TCMUR_DEV_FLAG_IS_OPEN) {
			call_handler = true;
		} else {
			tcmu_dev_dbg(dev, "Device is closed so unlock is not needed\n");
			rdev->lock_state = TCMUR_DEV_LOCK_UNLOCKED;
		}
	}
	pthread_mutex_unlock(&rdev->rdev_lock);

	if (!call_handler)
		return;

	rc = rhandler->unlock(dev);
	if (rc != TCMU_STS_OK)
		tcmu_dev_warn(dev, "Lock not cleanly released. Ret %d.\n",
			      rc);

	/*
	 * If we don't have a clean unlock we still report success and set
	 * to unlocked to prevent new IO from executing in case the lock
	 * is in a state where it cannot be fenced.
	 */
	pthread_mutex_lock(&rdev->rdev_lock);
	rdev->lock_state = TCMUR_DEV_LOCK_UNLOCKED;
	pthread_mutex_unlock(&rdev->rdev_lock);
}

/*
 * tcmu_get_lock_tag - ask the handler for the current lock holder's tag.
 * @dev: device to query
 * @tag: passed to the handler's get_lock_tag() callback to receive the tag
 *
 * Returns 0 right away when the device does not use explicit failover.
 * Returns TCMU_STS_BUSY if the device is not open, or if the handler could
 * not be reached (in which case the connection is also reported as lost).
 * Otherwise returns TCMU_STS_OK or TCMU_STS_NO_LOCK_HOLDERS as reported by
 * the handler.
 *
 * If the handler reports TCMU_STS_FENCED the device is reopened and the
 * query is repeated, at most once.
 */
int tcmu_get_lock_tag(struct tcmu_device *dev, uint16_t *tag)
{
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	bool is_open, reopened = false;
	int ret;

	if (rdev->failover_type != TCMUR_DEV_FAILOVER_EXPLICIT)
		return 0;

	pthread_mutex_lock(&rdev->rdev_lock);
	is_open = (rdev->flags & TCMUR_DEV_FLAG_IS_OPEN) != 0;
	pthread_mutex_unlock(&rdev->rdev_lock);

	/*
	 * Return tmp error until the recovery thread is able to
	 * start up.
	 */
	if (!is_open)
		return TCMU_STS_BUSY;

	for (;;) {
		ret = rhandler->get_lock_tag(dev, tag);
		tcmu_dev_dbg(dev, "Got rc %d tag %hu\n", ret, *tag);

		/* Either a tag was found or nobody holds the lock yet. */
		if (ret == TCMU_STS_OK || ret == TCMU_STS_NO_LOCK_HOLDERS)
			return ret;

		/* Timeouts and all other errors are not retried. */
		if (ret != TCMU_STS_FENCED)
			break;

		/*
		 * This is safe without blocking/flushing because it
		 * is called from the main IO thread and will wait for
		 * commands started before it via the aio wait call.
		 */
		tcmu_dev_dbg(dev, "Could not access dev. Try reopen.\n");
		if (tcmu_reopen_dev(dev, 0) || reopened)
			break;

		reopened = true;
	}

	tcmu_dev_dbg(dev, "Could not reach device to get locker id\n");
	/*
	 * In spc4r37 and newer
	 * "5.15.2.7 Target port asymmetric access state reporting"
	 * states that the initiator should consider the info
	 * returned through our enabled port current for that
	 * enabled port. If a RTPG sent through another port
	 * returns different info, then the info for the enabled
	 * port returned through the enabled port should be
	 * considered current.
	 *
	 * ESX though assumes the all port info in a RTPG to be
	 * current so we drop the session here to prevent sending
	 * inconsistent info. We probably want to do this regardless
	 * of ESX, because that value is returned when the handler
	 * cannot connect to the cluster so all requests are
	 * going to fail.
	 */
	tcmu_notify_conn_lost(dev);

	/*
	 * To try and not return inconsistent info and not look
	 * like a hard device error, fail the command so it is
	 * retried and the retry will be handled like other commands
	 * during session level recovery.
	 */
	return TCMU_STS_BUSY;
}

/*
 * tcmu_acquire_dev_lock - try to take the device lock through the handler.
 * @dev: device to lock
 * @tag: tag passed through to the handler's lock() callback
 *
 * Waits for outstanding commands, reopens the device first if the lock was
 * lost or the device is closed, and then calls the handler's lock(). If
 * lock_state is TCMUR_DEV_LOCK_READ_LOCKING the handler is not called and
 * the acquisition is treated as successful. A TCMU_STS_FENCED result from
 * the handler triggers one reopen-and-retry.
 *
 * On every path the kernel device is blocked, lock_state is updated to
 * reflect the outcome, and the device is unblocked again before returning.
 *
 * Returns TCMU_STS_OK on success, TCMU_STS_HW_ERR if the command queue
 * could not be drained, TCMU_STS_FENCED if the device could not be
 * reopened, and otherwise whatever the handler's lock() returned.
 */
int tcmu_acquire_dev_lock(struct tcmu_device *dev, uint16_t tag)
{
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	int retries = 0, ret = TCMU_STS_HW_ERR;
	bool reopen;

	tcmu_dev_dbg(dev, "Waiting for outstanding commands to complete\n");
	if (aio_wait_for_empty_queue(rdev)) {
		/* ret still holds its initial TCMU_STS_HW_ERR here. */
		tcmu_dev_err(dev, "Not able to flush queue before taking lock.\n");
		goto done;
	}

	/*
	 * Since we are here the lock state must be one of:
	 * for implicit:
	 *    TCMUR_DEV_LOCK_READ_LOCKING
	 *    TCMUR_DEV_LOCK_WRITE_LOCKING
	 *
	 * for explicit:
	 *    TCMUR_DEV_LOCK_UNLOCKED
	 *    TCMUR_DEV_LOCK_UNKNOWN
	 */

	reopen = false;
	pthread_mutex_lock(&rdev->rdev_lock);
	if (rdev->lock_lost || !(rdev->flags & TCMUR_DEV_FLAG_IS_OPEN))
		reopen = true;
	pthread_mutex_unlock(&rdev->rdev_lock);

retry:
	/* NOTE(review): lock_state is read here without rdev_lock held. */
	tcmu_dev_dbg(dev, "lock call state %d retries %d. tag %hu reopen %d\n",
		     rdev->lock_state, retries, tag, reopen);

	if (reopen) {
		tcmu_dev_dbg(dev, "Try to reopen device. %d\n", retries);
		ret = tcmu_reopen_dev(dev, 0);
		if (ret) {
			tcmu_dev_err(dev, "Could not reopen device while taking lock. Err %d.\n",
				     ret);
			/* We were fenced and were not able to clear it. */
			ret = TCMU_STS_FENCED;
			goto drop_conn;
		}
	}

	/* A read lock does not involve the handler's lock() callback. */
	pthread_mutex_lock(&rdev->rdev_lock);
	if (rdev->lock_state == TCMUR_DEV_LOCK_READ_LOCKING) {
		pthread_mutex_unlock(&rdev->rdev_lock);
		ret = TCMU_STS_OK;
		goto done;
	}
	pthread_mutex_unlock(&rdev->rdev_lock);

	ret = rhandler->lock(dev, tag);
	if (ret == TCMU_STS_FENCED) {
		/* Allow a single reopen-and-retry when fenced. */
		if (retries < 1) {
			reopen = true;
			retries++;
			goto retry;
		}
		/*
		 * If we can't unfence ourself the lock is probably bouncing
		 * due to path initialization. Allow the other node to hold the
		 * lock for now, and the initiator will drive retries.
		 */
	}

drop_conn:
	/* Only a timeout is reported as a lost connection. */
	if (ret == TCMU_STS_TIMEOUT) {
		tcmu_dev_dbg(dev, "Fail handler device connection.\n");
		tcmu_notify_conn_lost(dev);
	}

done:
	/* Block and flush stale IO from the kernel device and ring. */
	tcmu_cfgfs_dev_exec_action(dev, "block_dev", 1);
	/*
	 * Handle race where cmd could be in tcmur_generic_handle_cmd before
	 * the aio handler. For explicit ALUA, we execute the lock call from
	 * the main io processing thread, so we only flush here for implicit.
	 */
	if (pthread_self() != rdev->cmdproc_thread)
		tcmu_dev_flush_ring(dev);

	/* TODO: set UA based on bgly's patches */
	pthread_mutex_lock(&rdev->rdev_lock);
	if (ret != TCMU_STS_OK) {
		rdev->lock_state = TCMUR_DEV_LOCK_UNLOCKED;
		tcmu_dev_info(dev, "Lock acquisition unsuccessful\n");
	} else {
		if (rdev->lock_state == TCMUR_DEV_LOCK_READ_LOCKING) {
			rdev->lock_state = TCMUR_DEV_LOCK_READ_LOCKED;
			tcmu_dev_info(dev, "Read lock acquisition successful\n");
		} else if (rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKING) {
			rdev->lock_state = TCMUR_DEV_LOCK_WRITE_LOCKED;
			tcmu_dev_info(dev, "Write lock acquisition successful\n");
		} else {
			/*
			 * For explicit transition it will always acquire the write lock.
			 */
			rdev->lock_state = TCMUR_DEV_LOCK_WRITE_LOCKED;
			tcmu_dev_info(dev, "Write lock acquisition successful\n");
		}
	}

	/* The device is unblocked while rdev_lock is still held. */
	tcmu_cfgfs_dev_exec_action(dev, "block_dev", 0);

	pthread_mutex_unlock(&rdev->rdev_lock);

	return ret;
}

/*
 * tcmu_update_dev_lock_state - resync the cached lock state with the handler.
 * @dev: device to check
 *
 * Does nothing if the handler has no get_lock_state callback. Otherwise
 * the handler is asked for its lock state (a closed device counts as
 * TCMUR_DEV_LOCK_UNKNOWN), and if runner believes it holds the write lock
 * while the handler disagrees, the lock is marked as lost.
 */
void tcmu_update_dev_lock_state(struct tcmu_device *dev)
{
	struct tcmur_handler *rhandler = tcmu_get_runner_handler(dev);
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);
	int handler_state;

	if (!rhandler->get_lock_state)
		return;

	pthread_mutex_lock(&rdev->rdev_lock);
	if (rdev->flags & TCMUR_DEV_FLAG_IS_OPEN) {
		/* Query the handler without holding rdev_lock. */
		pthread_mutex_unlock(&rdev->rdev_lock);
		handler_state = rhandler->get_lock_state(dev);
		pthread_mutex_lock(&rdev->rdev_lock);
	} else {
		tcmu_dev_dbg(dev, "device closed.\n");
		handler_state = TCMUR_DEV_LOCK_UNKNOWN;
	}

	if (handler_state != TCMUR_DEV_LOCK_WRITE_LOCKED &&
	    rdev->lock_state == TCMUR_DEV_LOCK_WRITE_LOCKED) {
		tcmu_dev_dbg(dev, "Updated out of sync lock state.\n");
		__tcmu_notify_lock_lost(dev);
	}
	pthread_mutex_unlock(&rdev->rdev_lock);
}

/*
 * tcmur_dev_set_private - store an opaque pointer in the device's
 *			   hm_private slot.
 * @dev: device to attach the pointer to
 * @private: pointer to store; it is saved as-is
 */
void tcmur_dev_set_private(struct tcmu_device *dev, void *private)
{
	struct tcmur_device *owner = tcmu_dev_get_private(dev);

	owner->hm_private = private;
}

/*
 * tcmur_dev_get_private - fetch the pointer held in the device's
 *			   hm_private slot.
 * @dev: device to read the pointer from
 *
 * Returns the current value of hm_private.
 */
void *tcmur_dev_get_private(struct tcmu_device *dev)
{
	struct tcmur_device *owner = tcmu_dev_get_private(dev);
	void *stored = owner->hm_private;

	return stored;
}

/*
 * tcmu_notify_cmd_timed_out - record a command timeout on the device.
 * @dev: device on which a command timed out
 *
 * Bumps cmd_timed_out_cnt and treats the timeout as a lost connection.
 * An event is reported whether or not the device was actually queued for
 * recovery.
 */
void tcmu_notify_cmd_timed_out(struct tcmu_device *dev)
{
	struct tcmur_device *rdev = tcmu_dev_get_private(dev);

	pthread_mutex_lock(&rdev->rdev_lock);
	rdev->cmd_timed_out_cnt += 1;
	/* The result is deliberately unused; see the report below. */
	(void)__tcmu_notify_conn_lost(dev);
	pthread_mutex_unlock(&rdev->rdev_lock);

	tcmu_report_event(dev);
}