Merge tag 'misc-habanalabs-next-2021-10-18' of https://git.kernel.org…

…/pub/scm/linux/kernel/git/ogabbay/linux into char-misc-next Oded writes: This tag contains habanalabs driver changes for v5.16: - Add a new uAPI (under the memory ioctl) to request from the driver to export a DMA-BUF object that represents a memory region on the device's DRAM. This is needed to enable peer-to-peer over PCIe between habana device and an RDMA adapter (e.g. mlnx5 or efa rdma adapter). - Add debugfs node to dynamically configure CS timeout. Up until now, it was only configurable through kernel module parameter. - Fetch more comprehensive power information from the firmware. - Always take timestamp when waiting for user interrupt, as the user needs that information to optimize the graph runtime compilation. - Modify user interrupt to look on 64-bit user value as fence, instead of 32-bit. - Bypass reset in case of repeated h/w error event after device reset. This is to prevent endless loop of resets to the device. - Fix several bugs in multi CS completion code. - Fix race condition in fd close/open. - Update to latest firmware headers - Add select CRC32 in kconfig - Small fixes, cosmetics * tag 'misc-habanalabs-next-2021-10-18' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux: (25 commits) habanalabs: refactor fence handling in hl_cs_poll_fences habanalabs: context cleanup cosmetics habanalabs: simplify wait for interrupt with timestamp flow habanalabs: initialize hpriv fields before adding new node habanalabs: Unify frequency set/get functionality habanalabs: select CRC32 habanalabs: add support for dma-buf exporter habanalabs: define uAPI to export FD for DMA-BUF habanalabs: fix NULL pointer dereference habanalabs: fix race condition in multi CS completion habanalabs: use only u32 habanalabs: update firmware files habanalabs: bypass reset for continuous h/w error event habanalabs: take timestamp on wait for interrupt habanalabs: prevent race between fd close/open habanalabs: refactor reset log message habanalabs: define soft-reset as inference op habanalabs: fix debugfs device memory MMU VA translation habanalabs: add support for a long interrupt target value habanalabs: remove redundant cs validity checks ...
plbossart · Oct 19, 2021 · be24dd4 · be24dd4
2 parents 2b74240 + b2faac3
commit be24dd4
Show file tree

Hide file tree

Showing 27 changed files with 1,309 additions and 305 deletions.
diff --git a/Documentation/ABI/testing/debugfs-driver-habanalabs b/Documentation/ABI/testing/debugfs-driver-habanalabs
@@ -226,6 +226,12 @@ Description: Gets the state dump occurring on a CS timeout or failure.
  Writing an integer X discards X state dumps, so that the
  next read would return X+1-st newest state dump.
 
+What: /sys/kernel/debug/habanalabs/hl<n>/timeout_locked
+Date: Sep 2021
+KernelVersion: 5.16
+Contact: obitton@habana.ai
+Description: Sets the command submission timeout value in seconds.
+
 What: /sys/kernel/debug/habanalabs/hl<n>/stop_on_err
 Date: Mar 2020
 KernelVersion: 5.6

diff --git a/drivers/misc/habanalabs/Kconfig b/drivers/misc/habanalabs/Kconfig
@@ -8,6 +8,8 @@ config HABANA_AI
  depends on PCI && HAS_IOMEM
  select GENERIC_ALLOCATOR
  select HWMON
+ select DMA_SHARED_BUFFER
+ select CRC32
  help
  Enables PCIe card driver for Habana's AI Processors (AIP) that are
  designed to accelerate Deep Learning inference and training workloads.

diff --git a/drivers/misc/habanalabs/common/Makefile b/drivers/misc/habanalabs/common/Makefile
@@ -11,4 +11,4 @@ HL_COMMON_FILES := common/habanalabs_drv.o common/device.o common/context.o \
  common/command_buffer.o common/hw_queue.o common/irq.o \
  common/sysfs.o common/hwmon.o common/memory.o \
  common/command_submission.o common/firmware_if.o \
- common/state_dump.o
+ common/state_dump.o common/hwmgr.o
diff --git a/drivers/misc/habanalabs/common/command_submission.c b/drivers/misc/habanalabs/common/command_submission.c
@@ -143,6 +143,7 @@ static void hl_fence_init(struct hl_fence *fence, u64 sequence)
  fence->cs_sequence = sequence;
  fence->error = 0;
  fence->timestamp = ktime_set(0, 0);
+ fence->mcs_handling_done = false;
  init_completion(&fence->completion);
 }
 
@@ -431,11 +432,10 @@ static void cs_handle_tdr(struct hl_device *hdev, struct hl_cs *cs)
  /* Don't cancel TDR in case this CS was timedout because we might be
  * running from the TDR context
  */
- if (cs && (cs->timedout ||
- hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT))
+ if (cs->timedout || hdev->timeout_jiffies == MAX_SCHEDULE_TIMEOUT)
  return;
 
- if (cs && cs->tdr_active)
+ if (cs->tdr_active)
  cancel_delayed_work_sync(&cs->work_tdr);
 
  spin_lock(&hdev->cs_mirror_lock);
@@ -536,10 +536,21 @@ static void complete_multi_cs(struct hl_device *hdev, struct hl_cs *cs)
  mcs_compl->timestamp =
  ktime_to_ns(fence->timestamp);
  complete_all(&mcs_compl->completion);
+
+ /*
+ * Setting mcs_handling_done inside the lock ensures
+ * at least one fence have mcs_handling_done set to
+ * true before wait for mcs finish. This ensures at
+ * least one CS will be set as completed when polling
+ * mcs fences.
+ */
+ fence->mcs_handling_done = true;
  }
 
  spin_unlock(&mcs_compl->lock);
  }
+ /* In case CS completed without mcs completion initialized */
+ fence->mcs_handling_done = true;
 }
 
 static inline void cs_release_sob_reset_handler(struct hl_device *hdev,
@@ -2371,32 +2382,48 @@ static int hl_cs_poll_fences(struct multi_cs_data *mcs_data)
  break;
  }
 
- mcs_data->stream_master_qid_map |= fence->stream_master_qid_map;
-
- if (status == CS_WAIT_STATUS_BUSY)
- continue;
-
- mcs_data->completion_bitmap |= BIT(i);
-
- /*
- * best effort to extract timestamp. few notes:
- * - if even single fence is gone we cannot extract timestamp
- * (as fence not exist anymore)
- * - for all completed CSs we take the earliest timestamp.
- * for this we have to validate that:
- * 1. given timestamp was indeed set
- * 2. the timestamp is earliest of all timestamps so far
- */
+ switch (status) {
+ case CS_WAIT_STATUS_BUSY:
+ /* CS did not finished, keep waiting on its QID*/
+ mcs_data->stream_master_qid_map |=
+ fence->stream_master_qid_map;
+ break;
+ case CS_WAIT_STATUS_COMPLETED:
+ /*
+ * Using mcs_handling_done to avoid possibility of mcs_data
+ * returns to user indicating CS completed before it finished
+ * all of its mcs handling, to avoid race the next time the
+ * user waits for mcs.
+ */
+ if (!fence->mcs_handling_done)
+ break;
 
- if (status == CS_WAIT_STATUS_GONE) {
+ mcs_data->completion_bitmap |= BIT(i);
+ /*
+ * For all completed CSs we take the earliest timestamp.
+ * For this we have to validate that the timestamp is
+ * earliest of all timestamps so far.
+ */
+ if (mcs_data->update_ts &&
+ (ktime_compare(fence->timestamp, first_cs_time) < 0))
+ first_cs_time = fence->timestamp;
+ break;
+ case CS_WAIT_STATUS_GONE:
  mcs_data->update_ts = false;
  mcs_data->gone_cs = true;
- } else if (mcs_data->update_ts &&
- (ktime_compare(fence->timestamp,
- ktime_set(0, 0)) > 0) &&
- (ktime_compare(fence->timestamp, first_cs_time) < 0)) {
- first_cs_time = fence->timestamp;
+ /*
+ * It is possible to get an old sequence numbers from user
+ * which related to already completed CSs and their fences
+ * already gone. In this case, CS set as completed but
+ * no need to consider its QID for mcs completion.
+ */
+ mcs_data->completion_bitmap |= BIT(i);
+ break;
+ default:
+ dev_err(hdev->dev, "Invalid fence status\n");
+ return -EINVAL;
  }
+
  }
 
  hl_fences_put(mcs_data->fence_arr, arr_len);
@@ -2740,13 +2767,14 @@ static int hl_cs_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
 static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  u32 timeout_us, u64 user_address,
- u32 target_value, u16 interrupt_offset,
- enum hl_cs_wait_status *status)
+ u64 target_value, u16 interrupt_offset,
+ enum hl_cs_wait_status *status,
+ u64 *timestamp)
 {
  struct hl_user_pending_interrupt *pend;
  struct hl_user_interrupt *interrupt;
  unsigned long timeout, flags;
- u32 completion_value;
+ u64 completion_value;
  long completion_rc;
  int rc = 0;
 
@@ -2780,15 +2808,17 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  /* We check for completion value as interrupt could have been received
  * before we added the node to the wait list
  */
- if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) {
+ if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
  dev_err(hdev->dev, "Failed to copy completion value from user\n");
  rc = -EFAULT;
  goto remove_pending_user_interrupt;
  }
 
- if (completion_value >= target_value)
+ if (completion_value >= target_value) {
  *status = CS_WAIT_STATUS_COMPLETED;
- else
+ /* There was no interrupt, we assume the completion is now. */
+ pend->fence.timestamp = ktime_get();
+ } else
  *status = CS_WAIT_STATUS_BUSY;
 
  if (!timeout_us || (*status == CS_WAIT_STATUS_COMPLETED))
@@ -2812,7 +2842,7 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  reinit_completion(&pend->fence.completion);
  spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
 
- if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 4)) {
+ if (copy_from_user(&completion_value, u64_to_user_ptr(user_address), 8)) {
  dev_err(hdev->dev, "Failed to copy completion value from user\n");
  rc = -EFAULT;
 
@@ -2839,6 +2869,8 @@ static int _hl_interrupt_wait_ioctl(struct hl_device *hdev, struct hl_ctx *ctx,
  list_del(&pend->wait_list_node);
  spin_unlock_irqrestore(&interrupt->wait_list_lock, flags);
 
+ *timestamp = ktime_to_ns(pend->fence.timestamp);
+
  kfree(pend);
  hl_ctx_put(ctx);
 
@@ -2852,6 +2884,7 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
  struct asic_fixed_properties *prop;
  union hl_wait_cs_args *args = data;
  enum hl_cs_wait_status status;
+ u64 timestamp;
  int rc;
 
  prop = &hdev->asic_prop;
@@ -2881,7 +2914,8 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
  rc = _hl_interrupt_wait_ioctl(hdev, hpriv->ctx,
  args->in.interrupt_timeout_us, args->in.addr,
- args->in.target, interrupt_offset, &status);
+ args->in.target, interrupt_offset, &status,
+ &timestamp);
 
  if (rc) {
  if (rc != -EINTR)
@@ -2893,6 +2927,11 @@ static int hl_interrupt_wait_ioctl(struct hl_fpriv *hpriv, void *data)
 
  memset(args, 0, sizeof(*args));
 
+ if (timestamp) {
+ args->out.timestamp_nsec = timestamp;
+ args->out.flags |= HL_WAIT_CS_STATUS_FLAG_TIMESTAMP_VLD;
+ }
+
  switch (status) {
  case CS_WAIT_STATUS_COMPLETED:
  args->out.status = HL_WAIT_CS_STATUS_COMPLETED;

diff --git a/drivers/misc/habanalabs/common/context.c b/drivers/misc/habanalabs/common/context.c
@@ -181,12 +181,6 @@ int hl_ctx_create(struct hl_device *hdev, struct hl_fpriv *hpriv)
  return rc;
 }
 
-void hl_ctx_free(struct hl_device *hdev, struct hl_ctx *ctx)
-{
- if (kref_put(&ctx->refcount, hl_ctx_do_release) == 1)
- return;
-}
-
 int hl_ctx_init(struct hl_device *hdev, struct hl_ctx *ctx, bool is_kernel_ctx)
 {
  int rc = 0;
@@ -392,7 +386,7 @@ void hl_ctx_mgr_fini(struct hl_device *hdev, struct hl_ctx_mgr *mgr)
  idp = &mgr->ctx_handles;
 
  idr_for_each_entry(idp, ctx, id)
- hl_ctx_free(hdev, ctx);
+ kref_put(&ctx->refcount, hl_ctx_do_release);
 
  idr_destroy(&mgr->ctx_handles);
  mutex_destroy(&mgr->ctx_lock);

diff --git a/drivers/misc/habanalabs/common/debugfs.c b/drivers/misc/habanalabs/common/debugfs.c
@@ -1167,6 +1167,45 @@ static ssize_t hl_state_dump_write(struct file *f, const char __user *buf,
  return count;
 }
 
+static ssize_t hl_timeout_locked_read(struct file *f, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ struct hl_device *hdev = entry->hdev;
+ char tmp_buf[200];
+ ssize_t rc;
+
+ if (*ppos)
+ return 0;
+
+ sprintf(tmp_buf, "%d\n",
+ jiffies_to_msecs(hdev->timeout_jiffies) / 1000);
+ rc = simple_read_from_buffer(buf, strlen(tmp_buf) + 1, ppos, tmp_buf,
+ strlen(tmp_buf) + 1);
+
+ return rc;
+}
+
+static ssize_t hl_timeout_locked_write(struct file *f, const char __user *buf,
+ size_t count, loff_t *ppos)
+{
+ struct hl_dbg_device_entry *entry = file_inode(f)->i_private;
+ struct hl_device *hdev = entry->hdev;
+ u32 value;
+ ssize_t rc;
+
+ rc = kstrtouint_from_user(buf, count, 10, &value);
+ if (rc)
+ return rc;
+
+ if (value)
+ hdev->timeout_jiffies = msecs_to_jiffies(value * 1000);
+ else
+ hdev->timeout_jiffies = MAX_SCHEDULE_TIMEOUT;
+
+ return count;
+}
+
 static const struct file_operations hl_data32b_fops = {
  .owner = THIS_MODULE,
  .read = hl_data_read32,
@@ -1240,6 +1279,12 @@ static const struct file_operations hl_state_dump_fops = {
  .write = hl_state_dump_write
 };
 
+static const struct file_operations hl_timeout_locked_fops = {
+ .owner = THIS_MODULE,
+ .read = hl_timeout_locked_read,
+ .write = hl_timeout_locked_write
+};
+
 static const struct hl_info_list hl_debugfs_list[] = {
  {"command_buffers", command_buffers_show, NULL},
  {"command_submission", command_submission_show, NULL},
@@ -1421,6 +1466,12 @@ void hl_debugfs_add_device(struct hl_device *hdev)
  dev_entry,
  &hl_state_dump_fops);
 
+ debugfs_create_file("timeout_locked",
+ 0644,
+ dev_entry->root,
+ dev_entry,
+ &hl_timeout_locked_fops);
+
  for (i = 0, entry = dev_entry->entry_arr ; i < count ; i++, entry++) {
  debugfs_create_file(hl_debugfs_list[i].name,
  0444,