/*
 *  kernel/sched/core.c
 *
 *  Kernel scheduler and related syscalls
 *
 *  Copyright (C) 1991-2002  Linus Torvalds
 *
 *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
 *		make semaphores SMP safe
 *  1998-11-19	Implemented schedule_timeout() and related stuff
 *		by Andrea Arcangeli
 *  2002-01-04	New ultra-scalable O(1) scheduler by Ingo Molnar:
 *		hybrid priority-list and round-robin design with
 *		an array-switch method of distributing timeslices
 *		and per-CPU runqueues.  Cleanups and useful suggestions
 *		by Davide Libenzi, preemptible kernel bits by Robert Love.
 *  2003-09-03	Interactivity tuning by Con Kolivas.
 *  2004-04-02	Scheduler domains code by Nick Piggin
 *  2007-04-15  Work begun on replacing all interactivity tuning with a
 *              fair scheduling design by Con Kolivas.
 *  2007-05-05  Load balancing (smp-nice) and other improvements
 *              by Peter Williams
 *  2007-05-06  Interactivity improvements to CFS by Mike Galbraith
 *  2007-07-01  Group scheduling enhancements by Srivatsa Vaddagiri
 *  2007-11-29  RT balancing improvements by Steven Rostedt, Gregory Haskins,
 *              Thomas Gleixner, Mike Kravetz
 */

#include <linux/kasan.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/nmi.h>
#include <linux/init.h>
#include <linux/uaccess.h>
#include <linux/highmem.h>
#include <linux/mmu_context.h>
#include <linux/interrupt.h>
#include <linux/capability.h>
#include <linux/completion.h>
#include <linux/kernel_stat.h>
#include <linux/debug_locks.h>
#include <linux/perf_event.h>
#include <linux/security.h>
#include <linux/notifier.h>
#include <linux/profile.h>
#include <linux/freezer.h>
#include <linux/vmalloc.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/pid_namespace.h>
#include <linux/smp.h>
#include <linux/threads.h>
#include <linux/timer.h>
#include <linux/rcupdate.h>
#include <linux/cpu.h>
#include <linux/cpuset.h>
#include <linux/percpu.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <linux/sysctl.h>
#include <linux/syscalls.h>
#include <linux/times.h>
#include <linux/tsacct_kern.h>
#include <linux/kprobes.h>
#include <linux/delayacct.h>
#include <linux/unistd.h>
#include <linux/pagemap.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
#include <linux/slab.h>
#include <linux/init_task.h>
#include <linux/context_tracking.h>
#include <linux/compiler.h>
#include <linux/frame.h>
#include <linux/prefetch.h>

#include <asm/switch_to.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
#include <asm/mutex.h>
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#endif

#include "sched.h"
#include "../workqueue_internal.h"
#include "../smpboot.h"

#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>

DEFINE_MUTEX(sched_domains_mutex);
DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);

static void update_rq_clock_task(struct rq *rq, s64 delta);

void update_rq_clock(struct rq *rq)
{
	s64 delta;

	lockdep_assert_held(&rq->lock);

	if (rq->clock_skip_update & RQCF_ACT_SKIP)
		return;

	delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
	if (delta < 0)
		return;
	rq->clock += delta;
	update_rq_clock_task(rq, delta);
}

/*
 * Debugging: various feature bits
 */

#define SCHED_FEAT(name, enabled)	\
	(1UL << __SCHED_FEAT_##name) * enabled |

const_debug unsigned int sysctl_sched_features =
#include "features.h"
	0;

#undef SCHED_FEAT
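
/*
 * Sketch of how the SCHED_FEAT() trick above expands (illustration only):
 * a features.h line such as
 *
 *	SCHED_FEAT(GENTLE_FAIR_SLEEPERS, true)
 *
 * contributes
 *
 *	(1UL << __SCHED_FEAT_GENTLE_FAIR_SLEEPERS) * true |
 *
 * to the initializer of sysctl_sched_features above, so the value starts
 * out as a bitmask of the features whose default is enabled.
 */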

/*
 * Number of tasks to iterate in a single balance run.
 * Limited because this is done with IRQs disabled.
 */
const_debug unsigned int sysctl_sched_nr_migrate = 32;

/*
 * period over which we average the RT time consumption, measured
 * in ms.
 *
 * default: 1s
 */
const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;

/*
 * period over which we measure -rt task cpu usage in us.
 * default: 1s
 */
unsigned int sysctl_sched_rt_period = 1000000;

__read_mostly int scheduler_running;

/*
 * part of the period that we allow rt tasks to run in us.
 * default: 0.95s
 */
int sysctl_sched_rt_runtime = 950000;

/* cpus with isolated domains */
cpumask_var_t cpu_isolated_map;

/*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
static struct rq *this_rq_lock(void)
	__acquires(rq->lock)
{
	struct rq *rq;

	local_irq_disable();
	rq = this_rq();
	raw_spin_lock(&rq->lock);

	return rq;
}

/*
 * __task_rq_lock - lock the rq @p resides on.
 */
struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(rq->lock)
{
	struct rq *rq;

	lockdep_assert_held(&p->pi_lock);

	for (;;) {
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rf->cookie = lockdep_pin_lock(&rq->lock);
			return rq;
		}
		raw_spin_unlock(&rq->lock);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}

/*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 */
struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
	__acquires(p->pi_lock)
	__acquires(rq->lock)
{
	struct rq *rq;

	for (;;) {
		raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
		rq = task_rq(p);
		raw_spin_lock(&rq->lock);
		/*
		 *	move_queued_task()		task_rq_lock()
		 *
		 *	ACQUIRE (rq->lock)
		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
		 *	[S] ->cpu = new_cpu		[L] task_rq()
		 *					[L] ->on_rq
		 *	RELEASE (rq->lock)
		 *
		 * If we observe the old cpu in task_rq_lock, the acquire of
		 * the old rq->lock will fully serialize against the stores.
		 *
		 * If we observe the new cpu in task_rq_lock, the acquire will
		 * pair with the WMB to ensure we must then also see migrating.
		 */
		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
			rf->cookie = lockdep_pin_lock(&rq->lock);
			return rq;
		}
		raw_spin_unlock(&rq->lock);
		raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);

		while (unlikely(task_on_rq_migrating(p)))
			cpu_relax();
	}
}
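
/*
 * Illustrative sketch of typical usage (assuming the task_rq_unlock()
 * helper from kernel/sched/sched.h):
 *
 *	struct rq_flags rf;
 *	struct rq *rq = task_rq_lock(p, &rf);
 *
 *	... operate on @p's runqueue with p->pi_lock and rq->lock held ...
 *
 *	task_rq_unlock(rq, p, &rf);
 */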

#ifdef CONFIG_SCHED_HRTICK
/*
 * Use HR-timers to deliver accurate preemption points.
 */

static void hrtick_clear(struct rq *rq)
{
	if (hrtimer_active(&rq->hrtick_timer))
		hrtimer_cancel(&rq->hrtick_timer);
}

/*
 * High-resolution timer tick.
 * Runs from hardirq context with interrupts disabled.
 */
static enum hrtimer_restart hrtick(struct hrtimer *timer)
{
	struct rq *rq = container_of(timer, struct rq, hrtick_timer);

	WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());

	raw_spin_lock(&rq->lock);
	update_rq_clock(rq);
	rq->curr->sched_class->task_tick(rq, rq->curr, 1);
	raw_spin_unlock(&rq->lock);

	return HRTIMER_NORESTART;
}

#ifdef CONFIG_SMP

static void __hrtick_restart(struct rq *rq)
{
	struct hrtimer *timer = &rq->hrtick_timer;

	hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
}

/*
 * called from hardirq (IPI) context
 */
static void __hrtick_start(void *arg)
{
	struct rq *rq = arg;

	raw_spin_lock(&rq->lock);
	__hrtick_restart(rq);
	rq->hrtick_csd_pending = 0;
	raw_spin_unlock(&rq->lock);
}

/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	struct hrtimer *timer = &rq->hrtick_timer;
	ktime_t time;
	s64 delta;

	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense and can cause timer DoS.
	 */
	delta = max_t(s64, delay, 10000LL);
	time = ktime_add_ns(timer->base->get_time(), delta);

	hrtimer_set_expires(timer, time);

	if (rq == this_rq()) {
		__hrtick_restart(rq);
	} else if (!rq->hrtick_csd_pending) {
		smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
		rq->hrtick_csd_pending = 1;
	}
}

#else
/*
 * Called to set the hrtick timer state.
 *
 * called with rq->lock held and irqs disabled
 */
void hrtick_start(struct rq *rq, u64 delay)
{
	/*
	 * Don't schedule slices shorter than 10000ns, that just
	 * doesn't make sense. Rely on vruntime for fairness.
	 */
	delay = max_t(u64, delay, 10000LL);
	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
		      HRTIMER_MODE_REL_PINNED);
}
#endif /* CONFIG_SMP */

static void init_rq_hrtick(struct rq *rq)
{
#ifdef CONFIG_SMP
	rq->hrtick_csd_pending = 0;

	rq->hrtick_csd.flags = 0;
	rq->hrtick_csd.func = __hrtick_start;
	rq->hrtick_csd.info = rq;
#endif

	hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	rq->hrtick_timer.function = hrtick;
}
#else	/* CONFIG_SCHED_HRTICK */
static inline void hrtick_clear(struct rq *rq)
{
}

static inline void init_rq_hrtick(struct rq *rq)
{
}
#endif	/* CONFIG_SCHED_HRTICK */

/*
 * cmpxchg based fetch_or, macro so it works for different integer types
 */
#define fetch_or(ptr, mask)						\
	({								\
		typeof(ptr) _ptr = (ptr);				\
		typeof(mask) _mask = (mask);				\
		typeof(*_ptr) _old, _val = *_ptr;			\
									\
		for (;;) {						\
			_old = cmpxchg(_ptr, _val, _val | _mask);	\
			if (_old == _val)				\
				break;					\
			_val = _old;					\
		}							\
	_old;								\
})
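
/*
 * Illustrative sketch of the semantics (not a caller from this file):
 * fetch_or() atomically ORs @mask into *@ptr and returns the value that
 * was there before, e.g.
 *
 *	unsigned long flags = 0;
 *	unsigned long old = fetch_or(&flags, 1UL);
 *
 * leaves flags == 1 and old == 0; set_nr_and_not_polling() below relies on
 * exactly this to learn whether TIF_POLLING_NRFLAG was already set.
 */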

#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
/*
 * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 * this avoids any races wrt polling state changes and thereby avoids
 * spurious IPIs.
 */
static bool set_nr_and_not_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
}

/*
 * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 *
 * If this returns true, then the idle task promises to call
 * sched_ttwu_pending() and reschedule soon.
 */
static bool set_nr_if_polling(struct task_struct *p)
{
	struct thread_info *ti = task_thread_info(p);
	typeof(ti->flags) old, val = READ_ONCE(ti->flags);

	for (;;) {
		if (!(val & _TIF_POLLING_NRFLAG))
			return false;
		if (val & _TIF_NEED_RESCHED)
			return true;
		old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
		if (old == val)
			break;
		val = old;
	}
	return true;
}

#else
static bool set_nr_and_not_polling(struct task_struct *p)
{
	set_tsk_need_resched(p);
	return true;
}

#ifdef CONFIG_SMP
static bool set_nr_if_polling(struct task_struct *p)
{
	return false;
}
#endif
#endif

void wake_q_add(struct wake_q_head *head, struct task_struct *task)
{
	struct wake_q_node *node = &task->wake_q;

	/*
	 * Atomically grab the task, if ->wake_q is !nil already it means
	 * it's already queued (either by us or someone else) and will get the
	 * wakeup due to that.
	 *
	 * This cmpxchg() implies a full barrier, which pairs with the write
	 * barrier implied by the wakeup in wake_up_q().
	 */
	if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
		return;

	get_task_struct(task);

	/*
	 * The head is context local, there can be no concurrency.
	 */
	*head->lastp = node;
	head->lastp = &node->next;
}

void wake_up_q(struct wake_q_head *head)
{
	struct wake_q_node *node = head->first;

	while (node != WAKE_Q_TAIL) {
		struct task_struct *task;

		task = container_of(node, struct task_struct, wake_q);
		BUG_ON(!task);
		/* task can safely be re-inserted now */
		node = node->next;
		task->wake_q.next = NULL;

		/*
		 * wake_up_process() implies a wmb() to pair with the queueing
		 * in wake_q_add() so as not to miss wakeups.
		 */
		wake_up_process(task);
		put_task_struct(task);
	}
}
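
/*
 * Illustrative sketch of typical wake_q usage (assuming the WAKE_Q() helper
 * from <linux/sched.h>; the lock and task names here are placeholders):
 *
 *	WAKE_Q(wake_q);
 *
 *	raw_spin_lock(&some_lock);
 *	wake_q_add(&wake_q, some_task);
 *	raw_spin_unlock(&some_lock);
 *
 *	wake_up_q(&wake_q);
 *
 * so the actual wakeups happen only after the lock has been dropped.
 */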

/*
 * resched_curr - mark rq's current task 'to be rescheduled now'.
 *
 * On UP this means the setting of the need_resched flag, on SMP it
 * might also involve a cross-CPU call to trigger the scheduler on
 * the target CPU.
 */
void resched_curr(struct rq *rq)
{
	struct task_struct *curr = rq->curr;
	int cpu;

	lockdep_assert_held(&rq->lock);

	if (test_tsk_need_resched(curr))
		return;

	cpu = cpu_of(rq);

	if (cpu == smp_processor_id()) {
		set_tsk_need_resched(curr);
		set_preempt_need_resched();
		return;
	}

	if (set_nr_and_not_polling(curr))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

void resched_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);
	unsigned long flags;

	if (!raw_spin_trylock_irqsave(&rq->lock, flags))
		return;
	resched_curr(rq);
	raw_spin_unlock_irqrestore(&rq->lock, flags);
}

#ifdef CONFIG_SMP
#ifdef CONFIG_NO_HZ_COMMON
/*
 * In the semi idle case, use the nearest busy cpu for migrating timers
 * from an idle cpu.  This is good for power-savings.
 *
 * We don't do similar optimization for completely idle system, as
 * selecting an idle cpu will add more delays to the timers than intended
 * (as that cpu's timer base may not be uptodate wrt jiffies etc).
 */
int get_nohz_timer_target(void)
{
	int i, cpu = smp_processor_id();
	struct sched_domain *sd;

	if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
		return cpu;

	rcu_read_lock();
	for_each_domain(cpu, sd) {
		for_each_cpu(i, sched_domain_span(sd)) {
			if (cpu == i)
				continue;

			if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
				cpu = i;
				goto unlock;
			}
		}
	}

	if (!is_housekeeping_cpu(cpu))
		cpu = housekeeping_any_cpu();
unlock:
	rcu_read_unlock();
	return cpu;
}
/*
 * When add_timer_on() enqueues a timer into the timer wheel of an
 * idle CPU then this timer might expire before the next timer event
 * which is scheduled to wake up that CPU. In case of a completely
 * idle system the next event might even be infinite time into the
 * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 * leaves the inner idle loop so the newly added timer is taken into
 * account when the CPU goes back to idle and evaluates the timer
 * wheel for the next timer event.
 */
static void wake_up_idle_cpu(int cpu)
{
	struct rq *rq = cpu_rq(cpu);

	if (cpu == smp_processor_id())
		return;

	if (set_nr_and_not_polling(rq->idle))
		smp_send_reschedule(cpu);
	else
		trace_sched_wake_idle_without_ipi(cpu);
}

static bool wake_up_full_nohz_cpu(int cpu)
{
	/*
	 * We just need the target to call irq_exit() and re-evaluate
	 * the next tick. The nohz full kick at least implies that.
	 * If needed we can still optimize that later with an
	 * empty IRQ.
	 */
	if (cpu_is_offline(cpu))
		return true;  /* Don't try to wake offline CPUs. */
	if (tick_nohz_full_cpu(cpu)) {
		if (cpu != smp_processor_id() ||
		    tick_nohz_tick_stopped())
			tick_nohz_full_kick_cpu(cpu);
		return true;
	}

	return false;
}

/*
 * Wake up the specified CPU.  If the CPU is going offline, it is the
 * caller's responsibility to deal with the lost wakeup, for example,
 * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 */
void wake_up_nohz_cpu(int cpu)
{
	if (!wake_up_full_nohz_cpu(cpu))
		wake_up_idle_cpu(cpu);
}

static inline bool got_nohz_idle_kick(void)
{
	int cpu = smp_processor_id();

	if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
		return false;

	if (idle_cpu(cpu) && !need_resched())
		return true;

	/*
	 * We can't run Idle Load Balance on this CPU for this time so we
	 * cancel it and clear NOHZ_BALANCE_KICK
	 */
	clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
	return false;
}

#else /* CONFIG_NO_HZ_COMMON */

static inline bool got_nohz_idle_kick(void)
{
	return false;
}

#endif /* CONFIG_NO_HZ_COMMON */

#ifdef CONFIG_NO_HZ_FULL
bool sched_can_stop_tick(struct rq *rq)
{
	int fifo_nr_running;

	/* Deadline tasks, even if single, need the tick */
	if (rq->dl.dl_nr_running)
		return false;

	/*
	 * If there is more than one RR task, we need the tick to effect the
	 * actual RR behaviour.
	 */
	if (rq->rt.rr_nr_running) {
		if (rq->rt.rr_nr_running == 1)
			return true;
		else
			return false;
	}

	/*
	 * If there are no RR tasks, but FIFO tasks, we can skip the tick, no
	 * forced preemption between FIFO tasks.
	 */
	fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
	if (fifo_nr_running)
		return true;

	/*
	 * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
	 * if there's more than one we need the tick for involuntary
	 * preemption.
	 */
	if (rq->nr_running > 1)
		return false;

	return true;
}
#endif /* CONFIG_NO_HZ_FULL */

void sched_avg_update(struct rq *rq)
{
	s64 period = sched_avg_period();

	while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
		/*
		 * Inline assembly required to prevent the compiler
		 * optimising this loop into a divmod call.
		 * See __iter_div_u64_rem() for another example of this.
		 */
		asm("" : "+rm" (rq->age_stamp));
		rq->age_stamp += period;
		rq->rt_avg /= 2;
	}
}

#endif /* CONFIG_SMP */

#if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
			(defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
/*
 * Iterate task_group tree rooted at *from, calling @down when first entering a
 * node and @up when leaving it for the final time.
 *
 * Caller must hold rcu_lock or sufficient equivalent.
 */
int walk_tg_tree_from(struct task_group *from,
			     tg_visitor down, tg_visitor up, void *data)
{
	struct task_group *parent, *child;
	int ret;

	parent = from;

down:
	ret = (*down)(parent, data);
	if (ret)
		goto out;
	list_for_each_entry_rcu(child, &parent->children, siblings) {
		parent = child;
		goto down;

up:
		continue;
	}
	ret = (*up)(parent, data);
	if (ret || parent == from)
		goto out;

	child = parent;
	parent = parent->parent;
	if (parent)
		goto up;
out:
	return ret;
}
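
/*
 * Illustrative note (sketch): callers typically go through the
 * walk_tg_tree() wrapper in kernel/sched/sched.h, which starts the walk at
 * root_task_group; @down and @up are tg_visitor callbacks, and tg_nop()
 * below can be passed for the direction that needs no work.
 */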

int tg_nop(struct task_group *tg, void *data)
{
	return 0;
}
#endif

static void set_load_weight(struct task_struct *p)
{
	int prio = p->static_prio - MAX_RT_PRIO;
	struct load_weight *load = &p->se.load;

	/*
	 * SCHED_IDLE tasks get minimal weight:
	 */
	if (idle_policy(p->policy)) {
		load->weight = scale_load(WEIGHT_IDLEPRIO);
		load->inv_weight = WMULT_IDLEPRIO;
		return;
	}

	load->weight = scale_load(sched_prio_to_weight[prio]);
	load->inv_weight = sched_prio_to_wmult[prio];
}
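
/*
 * Worked example (illustration only): a nice-0 task has static_prio
 * MAX_RT_PRIO + 20 = 120, so prio above is 20 and the task gets
 * sched_prio_to_weight[20] == 1024, the reference load weight that the
 * other nice levels are scaled against.
 */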

static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & ENQUEUE_RESTORE))
		sched_info_queued(rq, p);
	p->sched_class->enqueue_task(rq, p, flags);
}

static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
{
	update_rq_clock(rq);
	if (!(flags & DEQUEUE_SAVE))
		sched_info_dequeued(rq, p);
	p->sched_class->dequeue_task(rq, p, flags);
}

void activate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible--;

	enqueue_task(rq, p, flags);
}

void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
{
	if (task_contributes_to_load(p))
		rq->nr_uninterruptible++;

	dequeue_task(rq, p, flags);
}

static void update_rq_clock_task(struct rq *rq, s64 delta)
{
/*
 * In theory, the compiler should just see 0 here, and optimize out the call
 * to sched_rt_avg_update. But I don't trust it...
 */
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	s64 steal = 0, irq_delta = 0;
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
	irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;

	/*
	 * Since irq_time is only updated on {soft,}irq_exit, we might run into
	 * this case when a previous update_rq_clock() happened inside a
	 * {soft,}irq region.
	 *
	 * When this happens, we stop ->clock_task and only update the
	 * prev_irq_time stamp to account for the part that fit, so that a next
	 * update will consume the rest. This ensures ->clock_task is
	 * monotonic.
	 *
	 * It does however cause some slight mis-attribution of {soft,}irq
	 * time, a more accurate solution would be to update the irq_time using
	 * the current rq->clock timestamp, except that would require using
	 * atomic ops.
	 */
	if (irq_delta > delta)
		irq_delta = delta;

	rq->prev_irq_time += irq_delta;
	delta -= irq_delta;
#endif
#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
	if (static_key_false((&paravirt_steal_rq_enabled))) {
		steal = paravirt_steal_clock(cpu_of(rq));
		steal -= rq->prev_steal_time_rq;

		if (unlikely(steal > delta))
			steal = delta;

		rq->prev_steal_time_rq += steal;
		delta -= steal;
	}
#endif

	rq->clock_task += delta;

#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
	if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
		sched_rt_avg_update(rq, irq_delta + steal);
#endif
}

void sched_set_stop_task(int cpu, struct task_struct *stop)
{
	struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
	struct task_struct *old_stop = cpu_rq(cpu)->stop;

	if (stop) {
		/*
		 * Make it appear like a SCHED_FIFO task, it's something
		 * userspace knows about and won't get confused about.
		 *
		 * Also, it will make PI more or less work without too
		 * much confusion -- but then, stop work should not
		 * rely on PI working anyway.
		 */
		sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);

		stop->sched_class = &stop_sched_class;
	}

	cpu_rq(cpu)->stop = stop;

	if (old_stop) {
		/*
		 * Reset it back to a normal scheduling class so that
		 * it can die in pieces.
		 */
		old_stop->sched_class = &rt_sched_class;
	}
}

/*
 * __normal_prio - return the priority that is based on the static prio
 */
static inline int __normal_prio(struct task_struct *p)
{
	return p->static_prio;
}

/*
 * Calculate the expected normal priority: i.e. priority
 * without taking RT-inheritance into account. Might be
 * boosted by interactivity modifiers. Changes upon fork,
 * setprio syscalls, and whenever the interactivity
 * estimator recalculates.
 */
static inline int normal_prio(struct task_struct *p)
{
	int prio;

	if (task_has_dl_policy(p))
		prio = MAX_DL_PRIO-1;
	else if (task_has_rt_policy(p))
		prio = MAX_RT_PRIO-1 - p->rt_priority;
	else
		prio = __normal_prio(p);
	return prio;
}

/*
 * Calculate the current priority, i.e. the priority
 * taken into account by the scheduler. This value might
 * be boosted by RT tasks, or might be boosted by
 * interactivity modifiers. Will be RT if the task got
 * RT-boosted. If not then it returns p->normal_prio.
 */
static int effective_prio(struct task_struct *p)
{
	p->normal_prio = normal_prio(p);
	/*
	 * If we are RT tasks or we were boosted to RT priority,
	 * keep the priority unchanged. Otherwise, update priority
	 * to the normal priority:
	 */
	if (!rt_prio(p->prio))
		return p->normal_prio;
	return p->prio;
}
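
/*
 * Worked example (illustration only): a SCHED_FIFO task with rt_priority 50
 * gets normal_prio = MAX_RT_PRIO-1 - 50 = 49, while a non-RT task simply
 * keeps its static_prio (e.g. 120 for nice 0); effective_prio() only
 * deviates from this when the task has been PI-boosted into the RT range.
 */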

/**
 * task_curr - is this task currently executing on a CPU?
 * @p: the task in question.
 *
 * Return: 1 if the task is currently executing. 0 otherwise.
 */
inline int task_curr(const struct task_struct *p)
{
	return cpu_curr(task_cpu(p)) == p;
}

/*
 * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 * use the balance_callback list if you want balancing.
 *
 * this means any call to check_class_changed() must be followed by a call to
 * balance_callback().
 */
static inline void check_class_changed(struct rq *rq, struct task_struct *p,
				       const struct sched_class *prev_class,
				       int oldprio)
{
	if (prev_class != p->sched_class) {
		if (prev_class->switched_from)
			prev_class->switched_from(rq, p);

		p->sched_class->switched_to(rq, p);
	} else if (oldprio != p->prio || dl_task(p))
		p->sched_class->prio_changed(rq, p, oldprio);
}

void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
{
	const struct sched_class *class;

	if (p->sched_class == rq->curr->sched_class) {
		rq->curr->sched_class->check_preempt_curr(rq, p, flags);
	} else {
		for_each_class(class) {
			if (class == rq->curr->sched_class)
				break;
			if (class == p->sched_class) {
				resched_curr(rq);
				break;
			}
		}
	}

	/*
	 * A queue event has occurred, and we're going to schedule.  In
	 * this case, we can save a useless back to back clock update.
	 */
	if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
		rq_clock_skip_update(rq, true);
}

#ifdef CONFIG_SMP
/*
 * This is how migration works:
 *
 * 1) we invoke migration_cpu_stop() on the target CPU using
 *    stop_one_cpu().
 * 2) stopper starts to run (implicitly forcing the migrated thread
 *    off the CPU)
 * 3) it checks whether the migrated task is still in the wrong runqueue.
 * 4) if it's in the wrong runqueue then the migration thread removes
 *    it and puts it into the right queue.
 * 5) stopper completes and stop_one_cpu() returns and the migration
 *    is done.
 */
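
/*
 * Illustrative sketch of how this is kicked off (mirroring the callers
 * further down in this file):
 *
 *	struct migration_arg arg = { p, dest_cpu };
 *
 *	stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
 */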

/*
 * move_queued_task - move a queued task to new rq.
 *
 * Returns (locked) new rq. Old rq's lock is released.
 */
static struct rq *move_queued_task(struct rq *rq, struct task_struct *p, int new_cpu)
{
	lockdep_assert_held(&rq->lock);

	p->on_rq = TASK_ON_RQ_MIGRATING;
	dequeue_task(rq, p, 0);
	set_task_cpu(p, new_cpu);
	raw_spin_unlock(&rq->lock);

	rq = cpu_rq(new_cpu);

	raw_spin_lock(&rq->lock);
	BUG_ON(task_cpu(p) != new_cpu);
	enqueue_task(rq, p, 0);
	p->on_rq = TASK_ON_RQ_QUEUED;
	check_preempt_curr(rq, p, 0);

	return rq;
}

struct migration_arg {
	struct task_struct *task;
	int dest_cpu;
};

/*
 * Move (not current) task off this cpu, onto dest cpu. We're doing
 * this because either it can't run here any more (set_cpus_allowed()
 * away from this CPU, or CPU going down), or because we're
 * attempting to rebalance this task on exec (sched_exec).
 *
 * So we race with normal scheduler movements, but that's OK, as long
 * as the task is no longer on this CPU.
 */
static struct rq *__migrate_task(struct rq *rq, struct task_struct *p, int dest_cpu)
{
	if (unlikely(!cpu_active(dest_cpu)))
		return rq;

	/* Affinity changed (again). */
	if (!cpumask_test_cpu(dest_cpu, tsk_cpus_allowed(p)))
		return rq;

	rq = move_queued_task(rq, p, dest_cpu);

	return rq;
}

/*
 * migration_cpu_stop - this will be executed by a highprio stopper thread
 * and performs thread migration by bumping thread off CPU then
 * 'pushing' onto another runqueue.
 */
static int migration_cpu_stop(void *data)
{
	struct migration_arg *arg = data;
	struct task_struct *p = arg->task;
	struct rq *rq = this_rq();

	/*
	 * The original target cpu might have gone down and we might
	 * be on another cpu but it doesn't matter.
	 */
	local_irq_disable();
	/*
	 * We need to explicitly wake pending tasks before running
	 * __migrate_task() such that we will not miss enforcing cpus_allowed
	 * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
	 */
	sched_ttwu_pending();

	raw_spin_lock(&p->pi_lock);
	raw_spin_lock(&rq->lock);
	/*
	 * If task_rq(p) != rq, it cannot be migrated here, because we're
	 * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
	 * we're holding p->pi_lock.
	 */
	if (task_rq(p) == rq) {
		if (task_on_rq_queued(p))
			rq = __migrate_task(rq, p, arg->dest_cpu);
		else
			p->wake_cpu = arg->dest_cpu;
	}
	raw_spin_unlock(&rq->lock);
	raw_spin_unlock(&p->pi_lock);

	local_irq_enable();
	return 0;
}

/*
 * sched_class::set_cpus_allowed must do the below, but is not required to
 * actually call this function.
 */
void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
{
	cpumask_copy(&p->cpus_allowed, new_mask);
	p->nr_cpus_allowed = cpumask_weight(new_mask);
}

void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
{
	struct rq *rq = task_rq(p);
	bool queued, running;

	lockdep_assert_held(&p->pi_lock);

	queued = task_on_rq_queued(p);
	running = task_current(rq, p);

	if (queued) {
		/*
		 * Because __kthread_bind() calls this on blocked tasks without
		 * holding rq->lock.
		 */
		lockdep_assert_held(&rq->lock);
		dequeue_task(rq, p, DEQUEUE_SAVE);
	}
	if (running)
		put_prev_task(rq, p);

	p->sched_class->set_cpus_allowed(p, new_mask);

	if (queued)
		enqueue_task(rq, p, ENQUEUE_RESTORE);
	if (running)
		set_curr_task(rq, p);
}

/*
 * Change a given task's CPU affinity. Migrate the thread to a
 * proper CPU and schedule it away if the CPU it's executing on
 * is removed from the allowed bitmask.
 *
 * NOTE: the caller must have a valid reference to the task, the
 * task must not exit() & deallocate itself prematurely. The
 * call is not atomic; no spinlocks may be held.
 */
static int __set_cpus_allowed_ptr(struct task_struct *p,
				  const struct cpumask *new_mask, bool check)
{
	const struct cpumask *cpu_valid_mask = cpu_active_mask;
	unsigned int dest_cpu;
	struct rq_flags rf;
	struct rq *rq;
	int ret = 0;

	rq = task_rq_lock(p, &rf);
Peter Zijlstra's avatar
Peter Zijlstra committed
1144