Skip to content

Commit

Permalink
drm/i915: Coordinate i915_active with its own mutex
Browse files Browse the repository at this point in the history
Forgo the struct_mutex serialisation for i915_active, and interpose its
own mutex handling for active/retire.

This is a multi-layered sleight-of-hand. First, we had to ensure that no
active/retire callbacks accidentally inverted the mutex ordering rules,
nor assumed that they were themselves serialised by struct_mutex. More
challenging, though, is the rule for updating elements of the active
rbtree. Instead of the whole i915_active now being serialised by
struct_mutex, allocations/rotations of the tree are serialised by the
i915_active.mutex and individual nodes are serialised by the caller
using the i915_timeline.mutex (we need to use nested spinlocks to
interact with the dma_fence callback lists).

The pain point here is that instead of a single mutex around execbuf, we
now have to take a mutex for each active tracker (one for each vma,
context, etc) and a couple of spinlocks for each fence update. The improvement in
fine grained locking allowing for multiple concurrent clients
(eventually!) should be worth it in typical loads.

v2: Add some comments that barely elucidate anything :(

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Reviewed-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20191004134015.13204-6-chris@chris-wilson.co.uk
  • Loading branch information
ickle committed Oct 4, 2019
1 parent 274cbf2 commit b1e3177
Show file tree
Hide file tree
Showing 24 changed files with 298 additions and 572 deletions.
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/display/intel_frontbuffer.c
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ intel_frontbuffer_get(struct drm_i915_gem_object *obj)
front->obj = obj;
kref_init(&front->ref);
atomic_set(&front->bits, 0);
i915_active_init(i915, &front->write,
i915_active_init(&front->write,
frontbuffer_active,
i915_active_may_sleep(frontbuffer_retire));

Expand Down
3 changes: 1 addition & 2 deletions drivers/gpu/drm/i915/display/intel_overlay.c
Original file line number Diff line number Diff line change
Expand Up @@ -1360,8 +1360,7 @@ void intel_overlay_setup(struct drm_i915_private *dev_priv)
overlay->contrast = 75;
overlay->saturation = 146;

i915_active_init(dev_priv,
&overlay->last_flip,
i915_active_init(&overlay->last_flip,
NULL, intel_overlay_last_flip_retire);

ret = get_registers(overlay, OVERLAY_NEEDS_PHYSICAL(dev_priv));
Expand Down
4 changes: 1 addition & 3 deletions drivers/gpu/drm/i915/gem/i915_gem_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -868,20 +868,18 @@ static int context_barrier_task(struct i915_gem_context *ctx,
void (*task)(void *data),
void *data)
{
struct drm_i915_private *i915 = ctx->i915;
struct context_barrier_task *cb;
struct i915_gem_engines_iter it;
struct intel_context *ce;
int err = 0;

lockdep_assert_held(&i915->drm.struct_mutex);
GEM_BUG_ON(!task);

cb = kmalloc(sizeof(*cb), GFP_KERNEL);
if (!cb)
return -ENOMEM;

i915_active_init(i915, &cb->base, NULL, cb_retire);
i915_active_init(&cb->base, NULL, cb_retire);
err = i915_active_acquire(&cb->base);
if (err) {
kfree(cb);
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/i915/gem/i915_gem_object_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#define __I915_GEM_OBJECT_TYPES_H__

#include <drm/drm_gem.h>
#include <uapi/drm/i915_drm.h>

#include "i915_active.h"
#include "i915_selftest.h"
Expand Down
9 changes: 3 additions & 6 deletions drivers/gpu/drm/i915/gem/i915_gem_pm.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,11 @@ static void call_idle_barriers(struct intel_engine_cs *engine)
struct llist_node *node, *next;

llist_for_each_safe(node, next, llist_del_all(&engine->barrier_tasks)) {
struct i915_active_request *active =
struct dma_fence_cb *cb =
container_of((struct list_head *)node,
typeof(*active), link);
typeof(*cb), node);

INIT_LIST_HEAD(&active->link);
RCU_INIT_POINTER(active->request, NULL);

active->retire(active, NULL);
cb->func(NULL, cb);
}
}

Expand Down
4 changes: 2 additions & 2 deletions drivers/gpu/drm/i915/gt/intel_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,7 @@ intel_context_init(struct intel_context *ce,

mutex_init(&ce->pin_mutex);

i915_active_init(ctx->i915, &ce->active,
i915_active_init(&ce->active,
__intel_context_active, __intel_context_retire);
}

Expand Down Expand Up @@ -307,7 +307,7 @@ int intel_context_prepare_remote_request(struct intel_context *ce,
return err;

/* Queue this switch after current activity by this context. */
err = i915_active_request_set(&tl->last_request, rq);
err = i915_active_fence_set(&tl->last_request, rq);
mutex_unlock(&tl->mutex);
if (err)
return err;
Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gt/intel_engine_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ node_create(struct intel_engine_pool *pool, size_t sz)
return ERR_PTR(-ENOMEM);

node->pool = pool;
i915_active_init(engine->i915, &node->active, pool_active, pool_retire);
i915_active_init(&node->active, pool_active, pool_retire);

obj = i915_gem_object_create_internal(engine->i915, sz);
if (IS_ERR(obj)) {
Expand Down
10 changes: 5 additions & 5 deletions drivers/gpu/drm/i915/gt/intel_reset.c
Original file line number Diff line number Diff line change
Expand Up @@ -844,10 +844,10 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
*/
spin_lock_irqsave(&timelines->lock, flags);
list_for_each_entry(tl, &timelines->active_list, link) {
struct i915_request *rq;
struct dma_fence *fence;

rq = i915_active_request_get_unlocked(&tl->last_request);
if (!rq)
fence = i915_active_fence_get(&tl->last_request);
if (!fence)
continue;

spin_unlock_irqrestore(&timelines->lock, flags);
Expand All @@ -859,8 +859,8 @@ static bool __intel_gt_unset_wedged(struct intel_gt *gt)
* (I915_FENCE_TIMEOUT) so this wait should not be unbounded
* in the worst case.
*/
dma_fence_default_wait(&rq->fence, false, MAX_SCHEDULE_TIMEOUT);
i915_request_put(rq);
dma_fence_default_wait(fence, false, MAX_SCHEDULE_TIMEOUT);
dma_fence_put(fence);

/* Restart iteration after dropping lock */
spin_lock_irqsave(&timelines->lock, flags);
Expand Down
7 changes: 3 additions & 4 deletions drivers/gpu/drm/i915/gt/intel_timeline.c
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,7 @@ cacheline_alloc(struct intel_timeline_hwsp *hwsp, unsigned int cacheline)
cl->hwsp = hwsp;
cl->vaddr = page_pack_bits(vaddr, cacheline);

i915_active_init(hwsp->gt->i915, &cl->active,
__cacheline_active, __cacheline_retire);
i915_active_init(&cl->active, __cacheline_active, __cacheline_retire);

return cl;
}
Expand Down Expand Up @@ -255,7 +254,7 @@ int intel_timeline_init(struct intel_timeline *timeline,

mutex_init(&timeline->mutex);

INIT_ACTIVE_REQUEST(&timeline->last_request, &timeline->mutex);
INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
INIT_LIST_HEAD(&timeline->requests);

i915_syncmap_init(&timeline->sync);
Expand Down Expand Up @@ -443,7 +442,7 @@ __intel_timeline_get_seqno(struct intel_timeline *tl,
* free it after the current request is retired, which ensures that
* all writes into the cacheline from previous requests are complete.
*/
err = i915_active_ref(&tl->hwsp_cacheline->active, tl, rq);
err = i915_active_ref(&tl->hwsp_cacheline->active, tl, &rq->fence);
if (err)
goto err_cacheline;

Expand Down
9 changes: 5 additions & 4 deletions drivers/gpu/drm/i915/gt/intel_timeline_types.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,12 +58,13 @@ struct intel_timeline {
*/
struct list_head requests;

/* Contains an RCU guarded pointer to the last request. No reference is
/*
* Contains an RCU guarded pointer to the last request. No reference is
* held to the request, users must carefully acquire a reference to
* the request using i915_active_request_get_request_rcu(), or hold the
* struct_mutex.
* the request using i915_active_fence_get(), or manage the RCU
* protection themselves (cf the i915_active_fence API).
*/
struct i915_active_request last_request;
struct i915_active_fence last_request;

/**
* We track the most recent seqno that we wait on in every context so
Expand Down
16 changes: 6 additions & 10 deletions drivers/gpu/drm/i915/gt/selftest_context.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,24 +47,20 @@ static int context_sync(struct intel_context *ce)

mutex_lock(&tl->mutex);
do {
struct i915_request *rq;
struct dma_fence *fence;
long timeout;

rcu_read_lock();
rq = rcu_dereference(tl->last_request.request);
if (rq)
rq = i915_request_get_rcu(rq);
rcu_read_unlock();
if (!rq)
fence = i915_active_fence_get(&tl->last_request);
if (!fence)
break;

timeout = i915_request_wait(rq, 0, HZ / 10);
timeout = dma_fence_wait_timeout(fence, false, HZ / 10);
if (timeout < 0)
err = timeout;
else
i915_request_retire_upto(rq);
i915_request_retire_upto(to_request(fence));

i915_request_put(rq);
dma_fence_put(fence);
} while (!err);
mutex_unlock(&tl->mutex);

Expand Down
10 changes: 7 additions & 3 deletions drivers/gpu/drm/i915/gt/selftest_lrc.c
Original file line number Diff line number Diff line change
Expand Up @@ -1172,9 +1172,13 @@ static struct i915_request *dummy_request(struct intel_engine_cs *engine)
if (!rq)
return NULL;

INIT_LIST_HEAD(&rq->active_list);
rq->engine = engine;

spin_lock_init(&rq->lock);
INIT_LIST_HEAD(&rq->fence.cb_list);
rq->fence.lock = &rq->lock;
rq->fence.ops = &i915_fence_ops;

i915_sched_node_init(&rq->sched);

/* mark this request as permanently incomplete */
Expand Down Expand Up @@ -1267,8 +1271,8 @@ static int live_suppress_wait_preempt(void *arg)
}

/* Disable NEWCLIENT promotion */
__i915_active_request_set(&i915_request_timeline(rq[i])->last_request,
dummy);
__i915_active_fence_set(&i915_request_timeline(rq[i])->last_request,
&dummy->fence);
i915_request_add(rq[i]);
}

Expand Down
2 changes: 1 addition & 1 deletion drivers/gpu/drm/i915/gt/selftests/mock_timeline.c
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ void mock_timeline_init(struct intel_timeline *timeline, u64 context)

mutex_init(&timeline->mutex);

INIT_ACTIVE_REQUEST(&timeline->last_request, &timeline->mutex);
INIT_ACTIVE_FENCE(&timeline->last_request, &timeline->mutex);
INIT_LIST_HEAD(&timeline->requests);

i915_syncmap_init(&timeline->sync);
Expand Down
3 changes: 0 additions & 3 deletions drivers/gpu/drm/i915/gvt/scheduler.c
Original file line number Diff line number Diff line change
Expand Up @@ -385,11 +385,8 @@ intel_gvt_workload_req_alloc(struct intel_vgpu_workload *workload)
{
struct intel_vgpu *vgpu = workload->vgpu;
struct intel_vgpu_submission *s = &vgpu->submission;
struct drm_i915_private *dev_priv = vgpu->gvt->dev_priv;
struct i915_request *rq;

lockdep_assert_held(&dev_priv->drm.struct_mutex);

if (workload->req)
return 0;

Expand Down
Loading

0 comments on commit b1e3177

Please sign in to comment.