// SPDX-License-Identifier: GPL-2.0-only
/*
 * Generic entry points for the idle threads and
 * implementation of the idle task scheduling class.
 *
 * (NOTE: these are not related to SCHED_IDLE batch scheduled
 *        tasks which are handled in sched/fair.c )
 */
#include "sched.h"

#include <trace/events/power.h>

/* Linker adds these: start and end of __cpuidle functions */
extern char __cpuidle_text_start[], __cpuidle_text_end[];

/**
 * sched_idle_set_state - Record idle state for the current CPU.
 * @idle_state: State to record.
 */
void sched_idle_set_state(struct cpuidle_state *idle_state)
{
	idle_set_state(this_rq(), idle_state);
}

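/*
 * When non-zero, the idle loop uses cpu_idle_poll() instead of entering a
 * cpuidle state.  Adjusted as a nesting counter by cpu_idle_poll_ctrl() and
 * set directly by the "nohlt"/"hlt" boot options below.
 */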
static int __read_mostly cpu_idle_force_poll;

void cpu_idle_poll_ctrl(bool enable)
{
	if (enable) {
		cpu_idle_force_poll++;
	} else {
		cpu_idle_force_poll--;
		WARN_ON_ONCE(cpu_idle_force_poll < 0);
	}
}

#ifdef CONFIG_GENERIC_IDLE_POLL_SETUP
static int __init cpu_idle_poll_setup(char *__unused)
{
	cpu_idle_force_poll = 1;

	return 1;
}
__setup("nohlt", cpu_idle_poll_setup);

static int __init cpu_idle_nopoll_setup(char *__unused)
{
	cpu_idle_force_poll = 0;

	return 1;
}
__setup("hlt", cpu_idle_nopoll_setup);
#endif

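/*
 * Polling idle loop: spin with interrupts enabled until a reschedule is
 * needed, or until neither forced polling nor a pending tick broadcast
 * keeps us here.
 */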
static noinline int __cpuidle cpu_idle_poll(void)
{
	trace_cpu_idle(0, smp_processor_id());
	stop_critical_timings();
	rcu_idle_enter();
	local_irq_enable();

	while (!tif_need_resched() &&
	       (cpu_idle_force_poll || tick_check_broadcast_expired()))
		cpu_relax();

	rcu_idle_exit();
	start_critical_timings();
	trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());

	return 1;
}

/* Weak implementations for optional arch specific functions */
void __weak arch_cpu_idle_prepare(void) { }
void __weak arch_cpu_idle_enter(void) { }
void __weak arch_cpu_idle_exit(void) { }
void __weak arch_cpu_idle_dead(void) { }
void __weak arch_cpu_idle(void)
{
	cpu_idle_force_poll = 1;
	raw_local_irq_enable();
}

/**
 * default_idle_call - Default CPU idle routine.
 *
 * Used when the cpuidle framework is not available.
 */
void __cpuidle default_idle_call(void)
{
	if (current_clr_polling_and_test()) {
		local_irq_enable();
	} else {

		trace_cpu_idle(1, smp_processor_id());
		stop_critical_timings();

		/*
		 * arch_cpu_idle() is supposed to enable IRQs, however
		 * we can't do that because of RCU and tracing.
		 *
		 * Trace IRQs enable here, then switch off RCU, and have
		 * arch_cpu_idle() use raw_local_irq_enable(). Note that
		 * rcu_idle_enter() relies on lockdep IRQ state, so switch that
		 * last -- this is very similar to the entry code.
		 */
		trace_hardirqs_on_prepare();
		lockdep_hardirqs_on_prepare(_THIS_IP_);
		rcu_idle_enter();
		lockdep_hardirqs_on(_THIS_IP_);

		arch_cpu_idle();

		/*
		 * OK, so IRQs are enabled here, but RCU needs them disabled to
		 * turn itself back on.. funny thing is that disabling IRQs
		 * will cause tracing, which needs RCU. Jump through hoops to
		 * make it 'work'.
		 */
		raw_local_irq_disable();
		lockdep_hardirqs_off(_THIS_IP_);
		rcu_idle_exit();
		lockdep_hardirqs_on(_THIS_IP_);
		raw_local_irq_enable();

		start_critical_timings();
		trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
	}
}

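/*
 * Enter suspend-to-idle via cpuidle, unless a reschedule is already pending.
 */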
static int call_cpuidle_s2idle(struct cpuidle_driver *drv,
			       struct cpuidle_device *dev)
{
	if (current_clr_polling_and_test())
		return -EBUSY;

	return cpuidle_enter_s2idle(drv, dev);
}

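/*
 * Enter the cpuidle state selected by the governor, bailing out with -EBUSY
 * if a reschedule became pending while we were still polling.
 */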
static int call_cpuidle(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		      int next_state)
{
	/*
	 * The idle task must be scheduled, so it is pointless to go idle.
	 * Just record a zero idle residency and return.
	 */
	if (current_clr_polling_and_test()) {
		dev->last_residency_ns = 0;
		local_irq_enable();
		return -EBUSY;
	}

	/*
	 * Enter the idle state previously returned by the governor decision.
	 * This function will block until an interrupt occurs and will take
	 * care of re-enabling the local interrupts.
	 */
	return cpuidle_enter(drv, dev, next_state);
}

/**
 * cpuidle_idle_call - the main idle function
 *
 * NOTE: no locks or semaphores should be used here
 *
 * On architectures that support TIF_POLLING_NRFLAG, this function is called
 * with polling set, and it returns with polling set.  If it ever stops
 * polling, it must clear the polling bit.
 */
static void cpuidle_idle_call(void)
{
	struct cpuidle_device *dev = cpuidle_get_device();
	struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
	int next_state, entered_state;

	/*
	 * Check if the idle task must be rescheduled. If it is the
	 * case, exit the function after re-enabling the local irq.
	 */
	if (need_resched()) {
		local_irq_enable();
		return;
	}

	/*
	 * The RCU framework needs to be told that we are entering an idle
	 * section, so no more rcu read side critical sections and one more
	 * step to the grace period
	 */

	if (cpuidle_not_available(drv, dev)) {
		tick_nohz_idle_stop_tick();

		default_idle_call();
		goto exit_idle;
	}

	/*
	 * Suspend-to-idle ("s2idle") is a system state in which all user space
	 * has been frozen, all I/O devices have been suspended and the only
	 * activity happens here and in interrupts (if any). In that case bypass
	 * the cpuidle governor and go straight for the deepest idle state
	 * available.  Possibly also suspend the local tick and the entire
	 * timekeeping to prevent timer interrupts from kicking us out of idle
	 * until a proper wakeup interrupt happens.
	 */

	if (idle_should_enter_s2idle() || dev->forced_idle_latency_limit_ns) {
		u64 max_latency_ns;

		if (idle_should_enter_s2idle()) {

			entered_state = call_cpuidle_s2idle(drv, dev);
			if (entered_state > 0)
				goto exit_idle;

			max_latency_ns = U64_MAX;
		} else {
			max_latency_ns = dev->forced_idle_latency_limit_ns;
		}

		tick_nohz_idle_stop_tick();

		next_state = cpuidle_find_deepest_state(drv, dev, max_latency_ns);
		call_cpuidle(drv, dev, next_state);
	} else {
		bool stop_tick = true;

		/*
		 * Ask the cpuidle framework to choose a convenient idle state.
		 */
		next_state = cpuidle_select(drv, dev, &stop_tick);

		if (stop_tick || tick_nohz_tick_stopped())
			tick_nohz_idle_stop_tick();
		else
			tick_nohz_idle_retain_tick();

		entered_state = call_cpuidle(drv, dev, next_state);
		/*
		 * Give the governor an opportunity to reflect on the outcome
		 */
		cpuidle_reflect(dev, entered_state);
	}

exit_idle:
	__current_set_polling();

	/*
	 * It is up to the idle functions to reenable local interrupts
	 */
	if (WARN_ON_ONCE(irqs_disabled()))
		local_irq_enable();
}

/*
 * Generic idle loop implementation
 *
 * Called with polling cleared.
 */
static void do_idle(void)
{
	int cpu = smp_processor_id();

	/*
	 * Check if we need to update blocked load
	 */
	nohz_run_idle_balance(cpu);

	/*
	 * If the arch has a polling bit, we maintain an invariant:
	 *
	 * Our polling bit is clear if we're not scheduled (i.e. if rq->curr !=
	 * rq->idle). This means that, if rq->idle has the polling bit set,
	 * then setting need_resched is guaranteed to cause the CPU to
	 * reschedule.
	 */

	__current_set_polling();
	tick_nohz_idle_enter();

	while (!need_resched()) {
		rmb();

		local_irq_disable();

		if (cpu_is_offline(cpu)) {
			tick_nohz_idle_stop_tick();
			cpuhp_report_idle_dead();
			arch_cpu_idle_dead();
		}

		arch_cpu_idle_enter();
		rcu_nocb_flush_deferred_wakeup();

		/*
		 * In poll mode we reenable interrupts and spin. Also if we
		 * detected in the wakeup from idle path that the tick
		 * broadcast device expired for us, we don't want to go deep
		 * idle as we know that the IPI is going to arrive right away.
		 */
		if (cpu_idle_force_poll || tick_check_broadcast_expired()) {
			tick_nohz_idle_restart_tick();
			cpu_idle_poll();
		} else {
			cpuidle_idle_call();
		}
		arch_cpu_idle_exit();
	}

	/*
	 * Since we fell out of the loop above, we know TIF_NEED_RESCHED must
	 * be set, propagate it into PREEMPT_NEED_RESCHED.
	 *
	 * This is required because for polling idle loops we will not have had
	 * an IPI to fold the state for us.
	 */
	preempt_set_need_resched();
	tick_nohz_idle_exit();
	__current_clr_polling();

	/*
	 * We promise to call sched_ttwu_pending() and reschedule if
	 * need_resched() is set while polling is set. That means that clearing
	 * polling needs to be visible before doing these things.
	 */
	smp_mb__after_atomic();

	/*
	 * RCU relies on this call to be done outside of an RCU read-side
	 * critical section.
	 */
	flush_smp_call_function_from_idle();
	schedule_idle();

	if (unlikely(klp_patch_pending(current)))
		klp_update_patch_state(current);
}

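/* Return true if @pc lies within the __cpuidle text section. */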
bool cpu_in_idle(unsigned long pc)
{
	return pc >= (unsigned long)__cpuidle_text_start &&
		pc < (unsigned long)__cpuidle_text_end;
}

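/*
 * Timer used by play_idle_precise() to bound idle injection: when it fires,
 * mark the injection as done and ask the idle loop to reschedule.
 */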
struct idle_timer {
	struct hrtimer timer;
	int done;
};

static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *timer)
{
	struct idle_timer *it = container_of(timer, struct idle_timer, timer);

	WRITE_ONCE(it->done, 1);
	set_tsk_need_resched(current);

	return HRTIMER_NORESTART;
}

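/*
 * Make the current CPU behave as if it were idle for @duration_ns, using the
 * deepest idle state whose exit latency does not exceed @latency_ns.
 */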
void play_idle_precise(u64 duration_ns, u64 latency_ns)
{
	struct idle_timer it;

	/*
	 * Only FIFO tasks can disable the tick since they don't need the forced
	 * preemption.
	 */
	WARN_ON_ONCE(current->policy != SCHED_FIFO);
	WARN_ON_ONCE(current->nr_cpus_allowed != 1);
	WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
	WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
	WARN_ON_ONCE(!duration_ns);
	WARN_ON_ONCE(current->mm);

	rcu_sleep_check();
	preempt_disable();
	current->flags |= PF_IDLE;
	cpuidle_use_deepest_state(latency_ns);

	it.done = 0;
	hrtimer_init_on_stack(&it.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_HARD);
	it.timer.function = idle_inject_timer_fn;
	hrtimer_start(&it.timer, ns_to_ktime(duration_ns),
		      HRTIMER_MODE_REL_PINNED_HARD);

	while (!READ_ONCE(it.done))
		do_idle();

	cpuidle_use_deepest_state(0);
	current->flags &= ~PF_IDLE;

	preempt_fold_need_resched();
	preempt_enable();
}
EXPORT_SYMBOL_GPL(play_idle_precise);
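/*
 * Usage sketch (illustrative, not part of this file): an idle-injection
 * kthread affined to one CPU could force roughly one millisecond of idle
 * time per period with:
 *
 *	play_idle_precise(1 * NSEC_PER_MSEC, U64_MAX);
 *
 * The caller must be a SCHED_FIFO kernel thread bound to a single CPU, as
 * enforced by the WARN_ON_ONCE() checks above.
 */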

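/* Per-CPU entry point into the idle loop once a CPU is brought up. */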
void cpu_startup_entry(enum cpuhp_state state)
{
	arch_cpu_idle_prepare();
	cpuhp_online_idle(state);
	while (1)
		do_idle();
}

/*
 * idle-task scheduling class.
 */

#ifdef CONFIG_SMP
static int
select_task_rq_idle(struct task_struct *p, int cpu, int flags)
{
	return task_cpu(p); /* IDLE tasks are never migrated */
}

static int
balance_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
	return WARN_ON_ONCE(1);
}
#endif

/*
 * Idle tasks are unconditionally rescheduled:
 */
static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int flags)
{
	resched_curr(rq);
}

static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
{
}

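/* Bookkeeping when a runqueue switches to its idle task. */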
static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first)
{
	update_idle_core(rq);
	schedstat_inc(rq->sched_goidle);
	queue_core_balance(rq);
}

#ifdef CONFIG_SMP
static struct task_struct *pick_task_idle(struct rq *rq)
{
	return rq->idle;
}
#endif

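/* Picking from the idle class always yields this runqueue's idle task. */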
struct task_struct *pick_next_task_idle(struct rq *rq)
{
	struct task_struct *next = rq->idle;

	set_next_task_idle(rq, next, true);

	return next;
}

/*
 * It is not legal to sleep in the idle task - print a warning
 * message if some code attempts to do it:
 */
static void
dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags)
{
	raw_spin_rq_unlock_irq(rq);
	printk(KERN_ERR "bad: scheduling from the idle thread!\n");
	dump_stack();
	raw_spin_rq_lock_irq(rq);
}

/*
 * scheduler tick hitting a task of our scheduling class.
 *
 * NOTE: This function can be called remotely by the tick offload that
 * goes along full dynticks. Therefore no local assumption can be made
 * and everything must be accessed through the @rq and @curr passed in
 * parameters.
 */
static void task_tick_idle(struct rq *rq, struct task_struct *curr, int queued)
{
}

static void switched_to_idle(struct rq *rq, struct task_struct *p)
{
	BUG();
}

static void
prio_changed_idle(struct rq *rq, struct task_struct *p, int oldprio)
{
	BUG();
}

static void update_curr_idle(struct rq *rq)
{
}

/*
 * Simple, special scheduling class for the per-CPU idle tasks:
 */
DEFINE_SCHED_CLASS(idle) = {

	/* no enqueue/yield_task for idle tasks */

	/* dequeue is not valid, we print a debug message there: */
	.dequeue_task		= dequeue_task_idle,

	.check_preempt_curr	= check_preempt_curr_idle,

	.pick_next_task		= pick_next_task_idle,
	.put_prev_task		= put_prev_task_idle,
	.set_next_task          = set_next_task_idle,

#ifdef CONFIG_SMP
	.balance		= balance_idle,
	.pick_task		= pick_task_idle,
	.select_task_rq		= select_task_rq_idle,
	.set_cpus_allowed	= set_cpus_allowed_common,
#endif

	.task_tick		= task_tick_idle,

	.prio_changed		= prio_changed_idle,
	.switched_to		= switched_to_idle,
	.update_curr		= update_curr_idle,
};