// SPDX-License-Identifier: MIT
/*
 * Copyright © 2014 Intel Corporation
 */

/**
 * DOC: Logical Rings, Logical Ring Contexts and Execlists
 *
 * Motivation:
 * GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
 * These expanded contexts enable a number of new abilities, especially
 * "Execlists" (also implemented in this file).
 *
 * One of the main differences with the legacy HW contexts is that logical
 * ring contexts incorporate many more things to the context's state, like
 * PDPs or ringbuffer control registers:
 *
 * The reason why PDPs are included in the context is straightforward: as
 * PPGTTs (per-process GTTs) are actually per-context, having the PDPs
 * contained there means you don't need to do a ppgtt->switch_mm yourself;
 * instead, the GPU will do it for you on the context switch.
 *
 * But what about the ringbuffer control registers (head, tail, etc.)?
 * Shouldn't we just need a set of those per engine command streamer? This is
 * where the name "Logical Rings" starts to make sense: by virtualizing the
 * rings, the engine cs shifts to a new "ring buffer" with every context
 * switch. When you want to submit a workload to the GPU you: A) choose your
 * context, B) find its appropriate virtualized ring, C) write commands to it
 * and then, finally, D) tell the GPU to switch to that context.
 *
 * Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
 * to a context is via a context execution list, ergo "Execlists".
 *
 * LRC implementation:
 * Regarding the creation of contexts, we have:
 *
 * - One global default context.
 * - One local default context for each opened fd.
 * - One local extra context for each context create ioctl call.
 *
 * Now that ringbuffers belong per-context (and not per-engine, like before)
 * and that contexts are uniquely tied to a given engine (and not reusable,
 * like before) we need:
 *
 * - One ringbuffer per-engine inside each context.
 * - One backing object per-engine inside each context.
 *
 * The global default context starts its life with these new objects fully
 * allocated and populated. The local default context for each opened fd is
 * more complex, because we don't know at creation time which engine is going
 * to use them. To handle this, we have implemented a deferred creation of LR
 * contexts:
 *
 * The local context starts its life as a hollow or blank holder, that only
 * gets populated for a given engine once we receive an execbuffer. If later
 * on we receive another execbuffer ioctl for the same context but a different
 * engine, we allocate/populate a new ringbuffer and context backing object and
 * so on.
 *
 * Finally, regarding local contexts created using the ioctl call: as they are
 * only allowed with the render ring, we can allocate & populate them right
 * away (no need to defer anything, at least for now).
 *
 * Execlists implementation:
 * Execlists are the new method by which, on gen8+ hardware, workloads are
 * submitted for execution (as opposed to the legacy, ringbuffer-based, method).
 * This method works as follows:
 *
 * When a request is committed, its commands (the BB start and any leading or
 * trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
 * for the appropriate context. The tail pointer in the hardware context is not
 * updated at this time, but instead, kept by the driver in the ringbuffer
 * structure. A structure representing this request is added to a request queue
 * for the appropriate engine: this structure contains a copy of the context's
 * tail after the request was written to the ring buffer and a pointer to the
 * context itself.
 *
 * If the engine's request queue was empty before the request was added, the
 * queue is processed immediately. Otherwise the queue will be processed during
 * a context switch interrupt. In any case, elements on the queue will get sent
 * (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
 * globally unique 20-bit submission ID.
 *
 * When execution of a request completes, the GPU updates the context status
 * buffer with a context complete event and generates a context switch interrupt.
 * During the interrupt handling, the driver examines the events in the buffer:
 * for each context complete event, if the announced ID matches that on the head
 * of the request queue, then that request is retired and removed from the queue.
 *
 * After processing, if any requests were retired and the queue is not empty
 * then a new execution list can be submitted. The two requests at the front of
 * the queue are next to be submitted but since a context may not occur twice in
 * an execution list, if subsequent requests have the same ID as the first then
 * the two requests must be combined. This is done simply by discarding requests
 * at the head of the queue until either only one request is left (in which case
 * we use a NULL second context) or the first two requests have unique IDs.
 *
 * By always executing the first two requests in the queue the driver ensures
 * that the GPU is kept as busy as possible. In the case where a single context
 * completes but a second context is still executing, the request for this second
 * context will be at the head of the queue when we remove the first one. This
 * request will then be resubmitted along with a new request for a different context,
 * which will cause the hardware to continue executing the second request and queue
 * the new request (the GPU detects the condition of a context getting preempted
 * with the same context and optimizes the context switch flow by not doing
 * preemption, but just sampling the new tail pointer).
 *
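 * As an illustrative sketch (not tied to any particular workload): if the
 * queue holds requests for contexts [A, A, B, C], the two A requests are
 * combined (submitting the later A tail covers both on the same ring), B
 * fills the second ELSP slot, and C must wait for a later submission after
 * the next context switch interrupt.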
 */
#include <linux/interrupt.h>

#include "i915_drv.h"
#include "i915_trace.h"
#include "i915_vgpu.h"
#include "gen8_engine_cs.h"
#include "intel_breadcrumbs.h"
#include "intel_context.h"
#include "intel_engine_pm.h"
#include "intel_engine_stats.h"
#include "intel_execlists_submission.h"
#include "intel_gt.h"
#include "intel_gt_irq.h"
#include "intel_gt_pm.h"
#include "intel_gt_requests.h"
#include "intel_lrc.h"
#include "intel_lrc_reg.h"
#include "intel_mocs.h"
#include "intel_reset.h"
#include "intel_ring.h"
#include "intel_workarounds.h"
#include "shmem_utils.h"

#define RING_EXECLIST_QFULL		(1 << 0x2)
#define RING_EXECLIST1_VALID		(1 << 0x3)
#define RING_EXECLIST0_VALID		(1 << 0x4)
#define RING_EXECLIST_ACTIVE_STATUS	(3 << 0xE)
#define RING_EXECLIST1_ACTIVE		(1 << 0x11)
#define RING_EXECLIST0_ACTIVE		(1 << 0x12)

#define GEN8_CTX_STATUS_IDLE_ACTIVE	(1 << 0)
#define GEN8_CTX_STATUS_PREEMPTED	(1 << 1)
#define GEN8_CTX_STATUS_ELEMENT_SWITCH	(1 << 2)
#define GEN8_CTX_STATUS_ACTIVE_IDLE	(1 << 3)
#define GEN8_CTX_STATUS_COMPLETE	(1 << 4)
#define GEN8_CTX_STATUS_LITE_RESTORE	(1 << 15)

#define GEN8_CTX_STATUS_COMPLETED_MASK \
	 (GEN8_CTX_STATUS_COMPLETE | GEN8_CTX_STATUS_PREEMPTED)

#define GEN12_CTX_STATUS_SWITCHED_TO_NEW_QUEUE	(0x1) /* lower csb dword */
#define GEN12_CTX_SWITCH_DETAIL(csb_dw)	((csb_dw) & 0xF) /* upper csb dword */
#define GEN12_CSB_SW_CTX_ID_MASK		GENMASK(25, 15)
#define GEN12_IDLE_CTX_ID		0x7FF
#define GEN12_CSB_CTX_VALID(csb_dw) \
	(FIELD_GET(GEN12_CSB_SW_CTX_ID_MASK, csb_dw) != GEN12_IDLE_CTX_ID)

/* Typical size of the average request (2 pipecontrols and a MI_BB) */
#define EXECLISTS_REQUEST_SIZE 64 /* bytes */

struct virtual_engine {
	struct intel_engine_cs base;
	struct intel_context context;
	struct rcu_work rcu;

	/*
	 * We allow only a single request through the virtual engine at a time
	 * (each request in the timeline waits for the completion fence of
	 * the previous before being submitted). By restricting ourselves to
	 * only submitting a single request, each request is placed on to a
	 * physical engine to maximise load spreading (by virtue of the late greedy
	 * scheduling -- each real engine takes the next available request
	 * upon idling).
	 */
	struct i915_request *request;

	/*
	 * We keep a rbtree of available virtual engines inside each physical
	 * engine, sorted by priority. Here we preallocate the nodes we need
	 * for the virtual engine, indexed by physical_engine->id.
	 */
	struct ve_node {
		struct rb_node rb;
		int prio;
	} nodes[I915_NUM_ENGINES];

	/*
	 * Keep track of bonded pairs -- restrictions upon our selection
	 * of physical engines any particular request may be submitted to.
	 * If we receive a submit-fence from a master engine, we will only
	 * use one of sibling_mask physical engines.
	 */
	struct ve_bond {
		const struct intel_engine_cs *master;
		intel_engine_mask_t sibling_mask;
	} *bonds;
	unsigned int num_bonds;

	/* And finally, which physical engines this virtual engine maps onto. */
	unsigned int num_siblings;
	struct intel_engine_cs *siblings[];
};

static struct virtual_engine *to_virtual_engine(struct intel_engine_cs *engine)
{
	GEM_BUG_ON(!intel_engine_is_virtual(engine));
	return container_of(engine, struct virtual_engine, base);
}

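/*
 * Walk backwards along the timeline from @rq and return the oldest request
 * that has not yet completed -- the point from which execution must resume.
 * If @error is set, every incomplete request walked over is marked with that
 * error and its payload skipped.
 */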
static struct i915_request *
__active_request(const struct intel_timeline * const tl,
		 struct i915_request *rq,
		 int error)
{
	struct i915_request *active = rq;

	list_for_each_entry_from_reverse(rq, &tl->requests, link) {
		if (__i915_request_is_complete(rq))
			break;

		if (error) {
			i915_request_set_error_once(rq, error);
			__i915_request_skip(rq);
		}
		active = rq;
	}

	return active;
}

static struct i915_request *
active_request(const struct intel_timeline * const tl, struct i915_request *rq)
{
	return __active_request(tl, rq, 0);
}

static void ring_set_paused(const struct intel_engine_cs *engine, int state)
{
	/*
	 * We inspect HWS_PREEMPT with a semaphore inside
	 * engine->emit_fini_breadcrumb. If the dword is true,
	 * the ring is paused as the semaphore will busywait
	 * until the dword is false.
	 */
	engine->status_page.addr[I915_GEM_HWS_PREEMPT] = state;
	if (state)
		wmb();
}

static struct i915_priolist *to_priolist(struct rb_node *rb)
{
	return rb_entry(rb, struct i915_priolist, node);
}

static int rq_prio(const struct i915_request *rq)
{
	return READ_ONCE(rq->sched.attr.priority);
}

static int effective_prio(const struct i915_request *rq)
{
	int prio = rq_prio(rq);

	/*
	 * If this request is special and must not be interrupted at any
	 * cost, so be it. Note we are only checking the most recent request
	 * in the context and so may be masking an earlier vip request. It
	 * is hoped that under the conditions where nopreempt is used, this
	 * will not matter (i.e. all requests to that context will be
	 * nopreempt for as long as desired).
	 */
	if (i915_request_has_nopreempt(rq))
		prio = I915_PRIORITY_UNPREEMPTABLE;

	return prio;
}

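/* Highest priority waiting in the priolist queue, or INT_MIN if it is empty. */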
static int queue_prio(const struct intel_engine_execlists *execlists)
{
	struct rb_node *rb;

	rb = rb_first_cached(&execlists->queue);
	if (!rb)
		return INT_MIN;

	return to_priolist(rb)->priority;
}

static int virtual_prio(const struct intel_engine_execlists *el)
{
	struct rb_node *rb = rb_first_cached(&el->virtual);

	return rb ? rb_entry(rb, struct ve_node, rb)->prio : INT_MIN;
}

static bool need_preempt(const struct intel_engine_cs *engine,
			 const struct i915_request *rq)
{
	int last_prio;

	if (!intel_engine_has_semaphores(engine))
		return false;

	/*
	 * Check if the current priority hint merits a preemption attempt.
	 *
	 * We record the highest value priority we saw during rescheduling
	 * prior to this dequeue, therefore we know that if it is strictly
	 * less than the current tail of ELSP[0], we do not need to force
	 * a preempt-to-idle cycle.
	 *
	 * However, the priority hint is a mere hint that we may need to
	 * preempt. If that hint is stale or we may be trying to preempt
	 * ourselves, ignore the request.
	 *
	 * More naturally we would write
	 *      prio >= max(0, last);
	 * except that we wish to prevent triggering preemption at the same
	 * priority level: the task that is running should remain running
	 * to preserve FIFO ordering of dependencies.
	 */
	last_prio = max(effective_prio(rq), I915_PRIORITY_NORMAL - 1);
	if (engine->execlists.queue_priority_hint <= last_prio)
		return false;

	/*
	 * Check against the first request in ELSP[1], it will, thanks to the
	 * power of PI, be the highest priority of that context.
	 */
	if (!list_is_last(&rq->sched.link, &engine->active.requests) &&
	    rq_prio(list_next_entry(rq, sched.link)) > last_prio)
		return true;

	/*
	 * If the inflight context did not trigger the preemption, then maybe
	 * it was the set of queued requests? Pick the highest priority in
	 * the queue (the first active priolist) and see if it deserves to be
	 * running instead of ELSP[0].
	 *
	 * The highest priority request in the queue can not be either
	 * ELSP[0] or ELSP[1] as, thanks again to PI, if it was the same
	 * context, its priority would not exceed ELSP[0] aka last_prio.
	 */
	return max(virtual_prio(&engine->execlists),
		   queue_prio(&engine->execlists)) > last_prio;
}

__maybe_unused static bool
assert_priority_queue(const struct i915_request *prev,
		      const struct i915_request *next)
{
	/*
	 * Without preemption, the prev may refer to the still active element
	 * which we refuse to let go.
	 *
	 * Even with preemption, there are times when we think it is better not
	 * to preempt and leave an ostensibly lower priority request in flight.
	 */
	if (i915_request_is_active(prev))
		return true;

	return rq_prio(prev) >= rq_prio(next);
}

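/*
 * Pull every incomplete request off the engine's active list (in reverse
 * submission order), unsubmit it and return it to the scheduler's priority
 * lists so that it can be resubmitted later. Returns the oldest incomplete
 * request, i.e. the point from which execution will restart. Caller must
 * hold engine->active.lock.
 */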
static struct i915_request *
__unwind_incomplete_requests(struct intel_engine_cs *engine)
{
	struct i915_request *rq, *rn, *active = NULL;
	struct list_head *pl;
	int prio = I915_PRIORITY_INVALID;

	lockdep_assert_held(&engine->active.lock);

	list_for_each_entry_safe_reverse(rq, rn,
					 &engine->active.requests,
					 sched.link) {
		if (__i915_request_is_complete(rq)) {
			list_del_init(&rq->sched.link);
			continue;
		}

		__i915_request_unsubmit(rq);

		GEM_BUG_ON(rq_prio(rq) == I915_PRIORITY_INVALID);
		if (rq_prio(rq) != prio) {
			prio = rq_prio(rq);
			pl = i915_sched_lookup_priolist(engine, prio);
		}
		GEM_BUG_ON(RB_EMPTY_ROOT(&engine->execlists.queue.rb_root));

		list_move(&rq->sched.link, pl);
		set_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);

		/* Check in case we rollback so far we wrap [size/2] */
		if (intel_ring_direction(rq->ring,
					 rq->tail,
					 rq->ring->tail + 8) > 0)
			rq->context->lrc.desc |= CTX_DESC_FORCE_RESTORE;

		active = rq;
	}

	return active;
}

struct i915_request *
execlists_unwind_incomplete_requests(struct intel_engine_execlists *execlists)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);

	return __unwind_incomplete_requests(engine);
}

static void
execlists_context_status_change(struct i915_request *rq, unsigned long status)
{
	/*
	 * Only used when GVT-g is enabled now. When GVT-g is disabled,
	 * the compiler should eliminate this function as dead-code.
	 */
	if (!IS_ENABLED(CONFIG_DRM_I915_GVT))
		return;

	atomic_notifier_call_chain(&rq->engine->context_status_notifier,
				   status, rq);
}

static void reset_active(struct i915_request *rq,
			 struct intel_engine_cs *engine)
{
	struct intel_context * const ce = rq->context;
	u32 head;

	/*
	 * The executing context has been cancelled. We want to prevent
	 * further execution along this context and propagate the error on
	 * to anything depending on its results.
	 *
	 * In __i915_request_submit(), we apply the -EIO and remove the
	 * requests' payloads for any banned requests. But first, we must
	 * rewind the context back to the start of the incomplete request so
	 * that we do not jump back into the middle of the batch.
	 *
	 * We preserve the breadcrumbs and semaphores of the incomplete
	 * requests so that inter-timeline dependencies (i.e. other timelines)
	 * remain correctly ordered. And we defer to __i915_request_submit()
	 * so that all asynchronous waits are correctly handled.
	 */
	ENGINE_TRACE(engine, "{ reset rq=%llx:%lld }\n",
		     rq->fence.context, rq->fence.seqno);

	/* On resubmission of the active request, payload will be scrubbed */
	if (__i915_request_is_complete(rq))
		head = rq->tail;
	else
		head = __active_request(ce->timeline, rq, -EIO)->head;
	head = intel_ring_wrap(ce->ring, head);

	/* Scrub the context image to prevent replaying the previous batch */
	lrc_init_regs(ce, engine, true);

	/* We've switched away, so this should be a no-op, but intent matters */
	ce->lrc.lrca = lrc_update_regs(ce, engine, head);
}

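/* An errored request that has already started executing must not be replayed as-is. */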
static bool bad_request(const struct i915_request *rq)
{
	return rq->fence.error && i915_request_started(rq);
}

static struct intel_engine_cs *
__execlists_schedule_in(struct i915_request *rq)
{
	struct intel_engine_cs * const engine = rq->engine;
	struct intel_context * const ce = rq->context;

	intel_context_get(ce);

	if (unlikely(intel_context_is_closed(ce) &&
		     !intel_engine_has_heartbeat(engine)))
		intel_context_set_banned(ce);

	if (unlikely(intel_context_is_banned(ce) || bad_request(rq)))
		reset_active(rq, engine);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		lrc_check_regs(ce, engine, "before");

	if (ce->tag) {
		/* Use a fixed tag for OA and friends */
		GEM_BUG_ON(ce->tag <= BITS_PER_LONG);
		ce->lrc.ccid = ce->tag;
	} else {
		/* We don't need a strict matching tag, just different values */
		unsigned int tag = __ffs(engine->context_tag);

		GEM_BUG_ON(tag >= BITS_PER_LONG);
		__clear_bit(tag, &engine->context_tag);
		ce->lrc.ccid = (1 + tag) << (GEN11_SW_CTX_ID_SHIFT - 32);

		BUILD_BUG_ON(BITS_PER_LONG > GEN12_MAX_CONTEXT_HW_ID);
	}

	ce->lrc.ccid |= engine->execlists.ccid;

	__intel_gt_pm_get(engine->gt);
	if (engine->fw_domain && !engine->fw_active++)
		intel_uncore_forcewake_get(engine->uncore, engine->fw_domain);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_IN);
	intel_engine_context_in(engine);

	CE_TRACE(ce, "schedule-in, ccid:%x\n", ce->lrc.ccid);

	return engine;
}

static void execlists_schedule_in(struct i915_request *rq, int idx)
{
	struct intel_context * const ce = rq->context;
	struct intel_engine_cs *old;

	GEM_BUG_ON(!intel_engine_pm_is_awake(rq->engine));
	trace_i915_request_in(rq, idx);

	old = ce->inflight;
	if (!old)
		old = __execlists_schedule_in(rq);
	WRITE_ONCE(ce->inflight, ptr_inc(old));

	GEM_BUG_ON(intel_context_inflight(ce) != rq->engine);
}

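/*
 * Hand a request that can no longer run on this physical engine back to its
 * virtual engine so that it can be redistributed to another sibling.
 */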
static void
resubmit_virtual_request(struct i915_request *rq, struct virtual_engine *ve)
{
	struct intel_engine_cs *engine = rq->engine;

	spin_lock_irq(&engine->active.lock);

	clear_bit(I915_FENCE_FLAG_PQUEUE, &rq->fence.flags);
	WRITE_ONCE(rq->engine, &ve->base);
	ve->base.submit_request(rq);

	spin_unlock_irq(&engine->active.lock);
}

static void kick_siblings(struct i915_request *rq, struct intel_context *ce)
{
	struct virtual_engine *ve = container_of(ce, typeof(*ve), context);
	struct intel_engine_cs *engine = rq->engine;

	/*
	 * After this point, the rq may be transferred to a new sibling, so
	 * before we clear ce->inflight make sure that the context has been
	 * removed from the b->signalers and furthermore we need to make sure
	 * that the concurrent iterator in signal_irq_work is no longer
	 * following ce->signal_link.
	 */
	if (!list_empty(&ce->signals))
		intel_context_remove_breadcrumbs(ce, engine->breadcrumbs);

	/*
	 * This engine is now too busy to run this virtual request, so
	 * see if we can find an alternative engine for it to execute on.
	 * Once a request has become bonded to this engine, we treat it the
	 * same as any other native request.
	 */
	if (i915_request_in_priority_queue(rq) &&
	    rq->execution_mask != engine->mask)
		resubmit_virtual_request(rq, ve);

	if (READ_ONCE(ve->request))
		tasklet_hi_schedule(&ve->base.execlists.tasklet);
}

static void __execlists_schedule_out(struct i915_request * const rq,
				     struct intel_context * const ce)
{
	struct intel_engine_cs * const engine = rq->engine;
	unsigned int ccid;

	/*
	 * NB process_csb() is not under the engine->active.lock and hence
	 * schedule_out can race with schedule_in meaning that we should
	 * refrain from doing non-trivial work here.
	 */

	CE_TRACE(ce, "schedule-out, ccid:%x\n", ce->lrc.ccid);
	GEM_BUG_ON(ce->inflight != engine);

	if (IS_ENABLED(CONFIG_DRM_I915_DEBUG_GEM))
		lrc_check_regs(ce, engine, "after");

	/*
	 * If we have just completed this context, the engine may now be
	 * idle and we want to re-enter powersaving.
	 */
	if (intel_timeline_is_last(ce->timeline, rq) &&
	    __i915_request_is_complete(rq))
		intel_engine_add_retire(engine, ce->timeline);

	ccid = ce->lrc.ccid;
	ccid >>= GEN11_SW_CTX_ID_SHIFT - 32;
	ccid &= GEN12_MAX_CONTEXT_HW_ID;
	if (ccid < BITS_PER_LONG) {
		GEM_BUG_ON(ccid == 0);
		GEM_BUG_ON(test_bit(ccid - 1, &engine->context_tag));
		__set_bit(ccid - 1, &engine->context_tag);
	}

	lrc_update_runtime(ce);
	intel_engine_context_out(engine);
	execlists_context_status_change(rq, INTEL_CONTEXT_SCHEDULE_OUT);
	if (engine->fw_domain && !--engine->fw_active)
		intel_uncore_forcewake_put(engine->uncore, engine->fw_domain);
	intel_gt_pm_put_async(engine->gt);

	/*
	 * If this is part of a virtual engine, its next request may
	 * have been blocked waiting for access to the active context.
	 * We have to kick all the siblings again in case we need to
	 * switch (e.g. the next request is not runnable on this
	 * engine). Hopefully, we will already have submitted the next
	 * request before the tasklet runs and do not need to rebuild
	 * each virtual tree and kick everyone again.
	 */
	if (ce->engine != engine)
		kick_siblings(rq, ce);

	WRITE_ONCE(ce->inflight, NULL);
	intel_context_put(ce);
}

static inline void execlists_schedule_out(struct i915_request *rq)
{
	struct intel_context * const ce = rq->context;

	trace_i915_request_out(rq);

	GEM_BUG_ON(!ce->inflight);
	ce->inflight = ptr_dec(ce->inflight);
	if (!__intel_context_inflight_count(ce->inflight))
		__execlists_schedule_out(rq, ce);

	i915_request_put(rq);
}

static u64 execlists_update_context(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	u64 desc = ce->lrc.desc;
	u32 tail, prev;

	/*
	 * WaIdleLiteRestore:bdw,skl
	 *
	 * We should never submit the context with the same RING_TAIL twice
	 * just in case we submit an empty ring, which confuses the HW.
	 *
	 * We append a couple of NOOPs (gen8_emit_wa_tail) after the end of
	 * the normal request to be able to always advance the RING_TAIL on
	 * subsequent resubmissions (for lite restore). Should that fail us,
	 * and we try and submit the same tail again, force the context
	 * reload.
	 *
	 * If we need to return to a preempted context, we need to skip the
	 * lite-restore and force it to reload the RING_TAIL. Otherwise, the
	 * HW has a tendency to ignore us rewinding the TAIL to the end of
	 * an earlier request.
	 */
	GEM_BUG_ON(ce->lrc_reg_state[CTX_RING_TAIL] != rq->ring->tail);
	prev = rq->ring->tail;
	tail = intel_ring_set_tail(rq->ring, rq->tail);
	if (unlikely(intel_ring_direction(rq->ring, tail, prev) <= 0))
		desc |= CTX_DESC_FORCE_RESTORE;
	ce->lrc_reg_state[CTX_RING_TAIL] = tail;
	rq->tail = rq->wa_tail;

	/*
	 * Make sure the context image is complete before we submit it to HW.
	 *
	 * Ostensibly, writes (including the WCB) should be flushed prior to
	 * an uncached write such as our mmio register access, the empirical
	 * evidence (esp. on Braswell) suggests that the WC write into memory
	 * may not be visible to the HW prior to the completion of the UC
	 * register write and that we may begin execution from the context
	 * before its image is complete leading to invalid PD chasing.
	 */
	wmb();

	ce->lrc.desc &= ~CTX_DESC_FORCE_RESTORE;
	return desc;
}

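/*
 * Post one context descriptor to the submission port: with a control
 * register (ELSQ) each port has its own pair of dwords, otherwise the
 * descriptor is written upper dword first into the single ELSP register.
 */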
static void write_desc(struct intel_engine_execlists *execlists, u64 desc, u32 port)
{
	if (execlists->ctrl_reg) {
		writel(lower_32_bits(desc), execlists->submit_reg + port * 2);
		writel(upper_32_bits(desc), execlists->submit_reg + port * 2 + 1);
	} else {
		writel(upper_32_bits(desc), execlists->submit_reg);
		writel(lower_32_bits(desc), execlists->submit_reg);
	}
}

static __maybe_unused char *
dump_port(char *buf, int buflen, const char *prefix, struct i915_request *rq)
{
	if (!rq)
		return "";

	snprintf(buf, buflen, "%sccid:%x %llx:%lld%s prio %d",
		 prefix,
		 rq->context->lrc.ccid,
		 rq->fence.context, rq->fence.seqno,
		 __i915_request_is_complete(rq) ? "!" :
		 __i915_request_has_started(rq) ? "*" :
		 "",
		 rq_prio(rq));

	return buf;
}

static __maybe_unused noinline void
trace_ports(const struct intel_engine_execlists *execlists,
	    const char *msg,
	    struct i915_request * const *ports)
{
	const struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);
	char __maybe_unused p0[40], p1[40];

	if (!ports[0])
		return;

	ENGINE_TRACE(engine, "%s { %s%s }\n", msg,
		     dump_port(p0, sizeof(p0), "", ports[0]),
		     dump_port(p1, sizeof(p1), ", ", ports[1]));
}

static bool
reset_in_progress(const struct intel_engine_execlists *execlists)
{
	return unlikely(!__tasklet_is_enabled(&execlists->tasklet));
}

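/*
 * Sanity check the pending[] array before submission: every port must hold
 * a unique, pinned and still active context, a sentinel must be the last
 * request, and a virtual request may only occupy the first port.
 */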
static __maybe_unused noinline bool
assert_pending_valid(const struct intel_engine_execlists *execlists,
		     const char *msg)
{
	struct intel_engine_cs *engine =
		container_of(execlists, typeof(*engine), execlists);
	struct i915_request * const *port, *rq, *prev = NULL;
	struct intel_context *ce = NULL;
	u32 ccid = -1;

	trace_ports(execlists, msg, execlists->pending);

	/* We may be messing around with the lists during reset, lalala */
	if (reset_in_progress(execlists))
		return true;

	if (!execlists->pending[0]) {
		GEM_TRACE_ERR("%s: Nothing pending for promotion!\n",
			      engine->name);
		return false;
	}

	if (execlists->pending[execlists_num_ports(execlists)]) {
		GEM_TRACE_ERR("%s: Excess pending[%d] for promotion!\n",
			      engine->name, execlists_num_ports(execlists));
		return false;
	}

	for (port = execlists->pending; (rq = *port); port++) {
		unsigned long flags;
		bool ok = true;

		GEM_BUG_ON(!kref_read(&rq->fence.refcount));
		GEM_BUG_ON(!i915_request_is_active(rq));

		if (ce == rq->context) {
			GEM_TRACE_ERR("%s: Dup context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ce = rq->context;

		if (ccid == ce->lrc.ccid) {
			GEM_TRACE_ERR("%s: Dup ccid:%x context:%llx in pending[%zd]\n",
				      engine->name,
				      ccid, ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		ccid = ce->lrc.ccid;

		/*
		 * Sentinels are supposed to be the last request so they flush
		 * the current execution off the HW. Check that they are the only
		 * request in the pending submission.
		 *
		 * NB: Due to the async nature of preempt-to-busy and request
		 * cancellation we need to handle the case where a request
		 * becomes a sentinel in parallel to CSB processing.
		 */
		if (prev && i915_request_has_sentinel(prev) &&
		    !READ_ONCE(prev->fence.error)) {
			GEM_TRACE_ERR("%s: context:%llx after sentinel in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}
		prev = rq;

		/*
		 * We want virtual requests to only be in the first slot so
		 * that they are never stuck behind a hog and can be immediately
		 * transferred onto the next idle engine.
		 */
		if (rq->execution_mask != engine->mask &&
		    port != execlists->pending) {
			GEM_TRACE_ERR("%s: virtual engine:%llx not in prime position[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			return false;
		}

		/* Hold tightly onto the lock to prevent concurrent retires! */
		if (!spin_trylock_irqsave(&rq->lock, flags))
			continue;

		if (__i915_request_is_complete(rq))
			goto unlock;

		if (i915_active_is_idle(&ce->active) &&
		    !intel_context_is_barrier(ce)) {
			GEM_TRACE_ERR("%s: Inactive context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->state)) {
			GEM_TRACE_ERR("%s: Unpinned context:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

		if (!i915_vma_is_pinned(ce->ring->vma)) {
			GEM_TRACE_ERR("%s: Unpinned ring:%llx in pending[%zd]\n",
				      engine->name,
				      ce->timeline->fence_context,
				      port - execlists->pending);
			ok = false;
			goto unlock;
		}

unlock:
		spin_unlock_irqrestore(&rq->lock, flags);
		if (!ok)
			return false;
	}

	return ce;
}

static void execlists_submit_ports(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *execlists = &engine->execlists;
	unsigned int n;

	GEM_BUG_ON(!assert_pending_valid(execlists, "submit"));

	/*
	 * We can skip acquiring intel_runtime_pm_get() here as it was taken
	 * on our behalf by the request (see i915_gem_mark_busy()) and it will
	 * not be relinquished until the device is idle (see
	 * i915_gem_idle_work_handler()). As a precaution, we make sure
	 * that all ELSP are drained i.e. we have processed the CSB,
	 * before allowing ourselves to idle and calling intel_runtime_pm_put().
	 */
	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));

	/*
	 * ELSQ note: the submit queue is not cleared after being submitted
	 * to the HW so we need to make sure we always clean it up. This is
	 * currently ensured by the fact that we always write the same number
	 * of elsq entries, keep this in mind before changing the loop below.
	 */
	for (n = execlists_num_ports(execlists); n--; ) {
		struct i915_request *rq = execlists->pending[n];

		write_desc(execlists,
			   rq ? execlists_update_context(rq) : 0,
			   n);
	}

	/* we need to manually load the submit queue */
	if (execlists->ctrl_reg)
		writel(EL_CTRL_LOAD, execlists->ctrl_reg);
}

static bool ctx_single_port_submission(const struct intel_context *ce)
{
	return (IS_ENABLED(CONFIG_DRM_I915_GVT) &&
		intel_context_force_single_submission(ce));
}

static bool can_merge_ctx(const struct intel_context *prev,
			  const struct intel_context *next)
{
	if (prev != next)
		return false;

	if (ctx_single_port_submission(prev))
		return false;

	return true;
}

static unsigned long i915_request_flags(const struct i915_request *rq)
{
	return READ_ONCE(rq->fence.flags);
}

static bool can_merge_rq(const struct i915_request *prev,
			 const struct i915_request *next)
{
	GEM_BUG_ON(prev == next);
	GEM_BUG_ON(!assert_priority_queue(prev, next));

	/*
	 * We do not submit known completed requests. Therefore if the next
	 * request is already completed, we can pretend to merge it in
	 * with the previous context (and we will skip updating the ELSP
	 * and tracking). Thus hopefully keeping the ELSP full with active
	 * contexts, despite the best efforts of preempt-to-busy to confuse
	 * us.
	 */
	if (__i915_request_is_complete(next))
		return true;

	if (unlikely((i915_request_flags(prev) | i915_request_flags(next)) &
		     (BIT(I915_FENCE_FLAG_NOPREEMPT) |
		      BIT(I915_FENCE_FLAG_SENTINEL))))
		return false;

	if (!can_merge_ctx(prev->context, next->context))
		return false;

	GEM_BUG_ON(i915_seqno_passed(prev->fence.seqno, next->fence.seqno));
	return true;
}

static bool virtual_matches(const struct virtual_engine *ve,
			    const struct i915_request *rq,
			    const struct intel_engine_cs *engine)
{
	const struct intel_engine_cs *inflight;

	if (!rq)
		return false;

	if (!(rq->execution_mask & engine->mask)) /* We peeked too soon! */
		return false;

	/*
	 * We track when the HW has completed saving the context image
	 * (i.e. when we have seen the final CS event switching out of
	 * the context) and must not overwrite the context image before
	 * then. This restricts us to only using the active engine
	 * while the previous virtualized request is inflight (so
	 * we reuse the register offsets). This is a very small
	 * hysteresis on the greedy selection algorithm.
	 */
	inflight = intel_context_inflight(&ve->context);
	if (inflight && inflight != engine)
		return false;

	return true;
}

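/*
 * Return the highest priority virtual engine that has a request which can
 * run on @engine, pruning stale or mismatched entries from the rbtree as we
 * walk it.
 */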
static struct virtual_engine *
first_virtual_engine(struct intel_engine_cs *engine)
{
	struct intel_engine_execlists *el = &engine->execlists;
	struct rb_node *rb = rb_first_cached(&el->virtual);

	while (rb) {
		struct virtual_engine *ve =
			rb_entry(rb, typeof(*ve), nodes[engine->id].rb);
		struct i915_request *rq = READ_ONCE(ve->request);

		/* lazily cleanup after another engine handled rq */
		if (!rq || !virtual_matches(ve, rq, engine)) {
			rb_erase_cached(rb, &el->virtual);
			RB_CLEAR_NODE(rb);
			rb = rb_first_cached(&el->virtual);
			continue;
		}

		return ve;
	}

	return NULL;
}

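/*
 * Prepare the virtual context to run on @engine when that is not the sibling
 * it last ran on: rewrite the engine-specific register offsets if the HW
 * lacks relative MMIO, and rotate @engine to the front of the sibling list
 * so its register state is preferentially reused.
 */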
static void virtual_xfer_context(struct virtual_engine *ve,
				 struct intel_engine_cs *engine)
{
	unsigned int n;

	if (likely(engine == ve->siblings[0]))
		return;

	GEM_BUG_ON(READ_ONCE(ve->context.inflight));
	if (!intel_engine_has_relative_mmio(engine))
		lrc_update_offsets(&ve->context, engine);

	/*
	 * Move the bound engine to the top of the list for
	 * future execution. We then kick this tasklet first
	 * before checking others, so that we preferentially
	 * reuse this set of bound registers.
	 */
	for (n = 1; n < ve->num_siblings; n++) {
		if (ve->siblings[n] == engine) {
			swap(ve->siblings[n], ve->siblings[0]);
			break;
		}
	}
}

static void defer_request(struct i915_request *rq, struct list_head * const pl)
{
	LIST_HEAD(list);

	/*
	 * We want to move the interrupted request to the back of
	 * the round-robin list (i.e. its priority level), but
	 * in doing so, we must then move all requests that were in
	 * flight and were waiting for the interrupted request to
	 * be run after it again.
	 */
	do {
		struct i915_dependency *p;

		GEM_BUG_ON(i915_request_is_active(rq));
		list_move_tail(&rq->sched.link, pl);

		for_each_waiter(p, rq) {
			struct i915_request *w =
				container_of(p->waiter, typeof(*w), sched);

			if (p->flags & I915_DEPENDENCY_WEAK)
				continue;

			/* Leave semaphores spinning on the other engines */
			if (w->engine != rq->engine)
				continue;

			/* No waiter should start before its signaler */
			GEM_BUG_ON(i915_request_has_initial_breadcrumb(w) &&
				   __i915_request_has_started(w) &&
				   !__i915_request_is_complete(rq));