rt.c

  1. /*
  2. * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
  3. * policies)
  4. */
  5. #include "sched.h"
  6. #include <linux/slab.h>
  7. #include <linux/irq_work.h>
  8. int sched_rr_timeslice = RR_TIMESLICE;
  9. static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  10. struct rt_bandwidth def_rt_bandwidth;
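/*
 * Replenishment timer for RT bandwidth: it fires once per rt_period,
 * hands each elapsed period to do_sched_rt_period_timer() so throttled
 * runqueues can be refilled and re-enqueued, and stops itself once
 * every runqueue it serves has gone idle.
 */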
  11. static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  12. {
  13. struct rt_bandwidth *rt_b =
  14. container_of(timer, struct rt_bandwidth, rt_period_timer);
  15. int idle = 0;
  16. int overrun;
  17. raw_spin_lock(&rt_b->rt_runtime_lock);
  18. for (;;) {
  19. overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  20. if (!overrun)
  21. break;
  22. raw_spin_unlock(&rt_b->rt_runtime_lock);
  23. idle = do_sched_rt_period_timer(rt_b, overrun);
  24. raw_spin_lock(&rt_b->rt_runtime_lock);
  25. }
  26. if (idle)
  27. rt_b->rt_period_active = 0;
  28. raw_spin_unlock(&rt_b->rt_runtime_lock);
  29. return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  30. }
  31. void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  32. {
  33. rt_b->rt_period = ns_to_ktime(period);
  34. rt_b->rt_runtime = runtime;
  35. raw_spin_lock_init(&rt_b->rt_runtime_lock);
  36. hrtimer_init(&rt_b->rt_period_timer,
  37. CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  38. rt_b->rt_period_timer.function = sched_rt_period_timer;
  39. }
  40. static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  41. {
  42. if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  43. return;
  44. raw_spin_lock(&rt_b->rt_runtime_lock);
  45. if (!rt_b->rt_period_active) {
  46. rt_b->rt_period_active = 1;
  47. hrtimer_forward_now(&rt_b->rt_period_timer, rt_b->rt_period);
  48. hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  49. }
  50. raw_spin_unlock(&rt_b->rt_runtime_lock);
  51. }
  52. void init_rt_rq(struct rt_rq *rt_rq)
  53. {
  54. struct rt_prio_array *array;
  55. int i;
  56. array = &rt_rq->active;
  57. for (i = 0; i < MAX_RT_PRIO; i++) {
  58. INIT_LIST_HEAD(array->queue + i);
  59. __clear_bit(i, array->bitmap);
  60. }
  61. /* delimiter for bitsearch: */
  62. __set_bit(MAX_RT_PRIO, array->bitmap);
  63. #if defined CONFIG_SMP
  64. rt_rq->highest_prio.curr = MAX_RT_PRIO;
  65. rt_rq->highest_prio.next = MAX_RT_PRIO;
  66. rt_rq->rt_nr_migratory = 0;
  67. rt_rq->overloaded = 0;
  68. plist_head_init(&rt_rq->pushable_tasks);
  69. #endif /* CONFIG_SMP */
  70. /* We start in dequeued state, because no RT tasks are queued */
  71. rt_rq->rt_queued = 0;
  72. rt_rq->rt_time = 0;
  73. rt_rq->rt_throttled = 0;
  74. rt_rq->rt_runtime = 0;
  75. raw_spin_lock_init(&rt_rq->rt_runtime_lock);
  76. }
  77. #ifdef CONFIG_RT_GROUP_SCHED
  78. static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
  79. {
  80. hrtimer_cancel(&rt_b->rt_period_timer);
  81. }
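/*
 * With RT group scheduling, an rt_se either represents a single task
 * (my_q == NULL) or an entire group runqueue attached below its parent.
 */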
  82. #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
  83. static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  84. {
  85. #ifdef CONFIG_SCHED_DEBUG
  86. WARN_ON_ONCE(!rt_entity_is_task(rt_se));
  87. #endif
  88. return container_of(rt_se, struct task_struct, rt);
  89. }
  90. static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  91. {
  92. return rt_rq->rq;
  93. }
  94. static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  95. {
  96. return rt_se->rt_rq;
  97. }
  98. static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
  99. {
  100. struct rt_rq *rt_rq = rt_se->rt_rq;
  101. return rt_rq->rq;
  102. }
  103. void free_rt_sched_group(struct task_group *tg)
  104. {
  105. int i;
  106. if (tg->rt_se)
  107. destroy_rt_bandwidth(&tg->rt_bandwidth);
  108. for_each_possible_cpu(i) {
  109. if (tg->rt_rq)
  110. kfree(tg->rt_rq[i]);
  111. if (tg->rt_se)
  112. kfree(tg->rt_se[i]);
  113. }
  114. kfree(tg->rt_rq);
  115. kfree(tg->rt_se);
  116. }
  117. void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
  118. struct sched_rt_entity *rt_se, int cpu,
  119. struct sched_rt_entity *parent)
  120. {
  121. struct rq *rq = cpu_rq(cpu);
  122. rt_rq->highest_prio.curr = MAX_RT_PRIO;
  123. rt_rq->rt_nr_boosted = 0;
  124. rt_rq->rq = rq;
  125. rt_rq->tg = tg;
  126. tg->rt_rq[cpu] = rt_rq;
  127. tg->rt_se[cpu] = rt_se;
  128. if (!rt_se)
  129. return;
  130. if (!parent)
  131. rt_se->rt_rq = &rq->rt;
  132. else
  133. rt_se->rt_rq = parent->my_q;
  134. rt_se->my_q = rt_rq;
  135. rt_se->parent = parent;
  136. INIT_LIST_HEAD(&rt_se->run_list);
  137. }
  138. int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  139. {
  140. struct rt_rq *rt_rq;
  141. struct sched_rt_entity *rt_se;
  142. int i;
  143. tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
  144. if (!tg->rt_rq)
  145. goto err;
  146. tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
  147. if (!tg->rt_se)
  148. goto err;
  149. init_rt_bandwidth(&tg->rt_bandwidth,
  150. ktime_to_ns(def_rt_bandwidth.rt_period), 0);
  151. for_each_possible_cpu(i) {
  152. rt_rq = kzalloc_node(sizeof(struct rt_rq),
  153. GFP_KERNEL, cpu_to_node(i));
  154. if (!rt_rq)
  155. goto err;
  156. rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
  157. GFP_KERNEL, cpu_to_node(i));
  158. if (!rt_se)
  159. goto err_free_rq;
  160. init_rt_rq(rt_rq);
  161. rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
  162. init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
  163. }
  164. return 1;
  165. err_free_rq:
  166. kfree(rt_rq);
  167. err:
  168. return 0;
  169. }
  170. #else /* CONFIG_RT_GROUP_SCHED */
  171. #define rt_entity_is_task(rt_se) (1)
  172. static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
  173. {
  174. return container_of(rt_se, struct task_struct, rt);
  175. }
  176. static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
  177. {
  178. return container_of(rt_rq, struct rq, rt);
  179. }
  180. static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
  181. {
  182. struct task_struct *p = rt_task_of(rt_se);
  183. return task_rq(p);
  184. }
  185. static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
  186. {
  187. struct rq *rq = rq_of_rt_se(rt_se);
  188. return &rq->rt;
  189. }
  190. void free_rt_sched_group(struct task_group *tg) { }
  191. int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
  192. {
  193. return 1;
  194. }
  195. #endif /* CONFIG_RT_GROUP_SCHED */
  196. #ifdef CONFIG_SMP
  197. static void pull_rt_task(struct rq *this_rq);
  198. static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
  199. {
  200. /* Try to pull RT tasks here if we lower this rq's prio */
  201. return rq->rt.highest_prio.curr > prev->prio;
  202. }
  203. static inline int rt_overloaded(struct rq *rq)
  204. {
  205. return atomic_read(&rq->rd->rto_count);
  206. }
  207. static inline void rt_set_overload(struct rq *rq)
  208. {
  209. if (!rq->online)
  210. return;
  211. cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
  212. /*
  213. * Make sure the mask is visible before we set
  214. * the overload count. That is checked to determine
  215. * if we should look at the mask. It would be a shame
  216. * if we looked at the mask, but the mask was not
  217. * updated yet.
  218. *
  219. * Matched by the barrier in pull_rt_task().
  220. */
  221. smp_wmb();
  222. atomic_inc(&rq->rd->rto_count);
  223. }
  224. static inline void rt_clear_overload(struct rq *rq)
  225. {
  226. if (!rq->online)
  227. return;
  228. /* the order here really doesn't matter */
  229. atomic_dec(&rq->rd->rto_count);
  230. cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
  231. }
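/*
 * A runqueue is considered "RT overloaded" when it has more than one
 * RT task queued and at least one of them may migrate; such runqueues
 * advertise themselves in the root domain so other CPUs can pull work.
 */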
  232. static void update_rt_migration(struct rt_rq *rt_rq)
  233. {
  234. if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
  235. if (!rt_rq->overloaded) {
  236. rt_set_overload(rq_of_rt_rq(rt_rq));
  237. rt_rq->overloaded = 1;
  238. }
  239. } else if (rt_rq->overloaded) {
  240. rt_clear_overload(rq_of_rt_rq(rt_rq));
  241. rt_rq->overloaded = 0;
  242. }
  243. }
  244. static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  245. {
  246. struct task_struct *p;
  247. if (!rt_entity_is_task(rt_se))
  248. return;
  249. p = rt_task_of(rt_se);
  250. rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  251. rt_rq->rt_nr_total++;
  252. if (p->nr_cpus_allowed > 1)
  253. rt_rq->rt_nr_migratory++;
  254. update_rt_migration(rt_rq);
  255. }
  256. static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  257. {
  258. struct task_struct *p;
  259. if (!rt_entity_is_task(rt_se))
  260. return;
  261. p = rt_task_of(rt_se);
  262. rt_rq = &rq_of_rt_rq(rt_rq)->rt;
  263. rt_rq->rt_nr_total--;
  264. if (p->nr_cpus_allowed > 1)
  265. rt_rq->rt_nr_migratory--;
  266. update_rt_migration(rt_rq);
  267. }
  268. static inline int has_pushable_tasks(struct rq *rq)
  269. {
  270. return !plist_head_empty(&rq->rt.pushable_tasks);
  271. }
  272. static DEFINE_PER_CPU(struct callback_head, rt_push_head);
  273. static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
  274. static void push_rt_tasks(struct rq *);
  275. static void pull_rt_task(struct rq *);
  276. static inline void queue_push_tasks(struct rq *rq)
  277. {
  278. if (!has_pushable_tasks(rq))
  279. return;
  280. queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
  281. }
  282. static inline void queue_pull_task(struct rq *rq)
  283. {
  284. queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
  285. }
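/*
 * Pushable tasks are kept in a per-rq, priority-ordered plist;
 * rt.highest_prio.next caches the best priority among them.
 */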
  286. static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
  287. {
  288. plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
  289. plist_node_init(&p->pushable_tasks, p->prio);
  290. plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
  291. /* Update the highest prio pushable task */
  292. if (p->prio < rq->rt.highest_prio.next)
  293. rq->rt.highest_prio.next = p->prio;
  294. }
  295. static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
  296. {
  297. plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
  298. /* Update the new highest prio pushable task */
  299. if (has_pushable_tasks(rq)) {
  300. p = plist_first_entry(&rq->rt.pushable_tasks,
  301. struct task_struct, pushable_tasks);
  302. rq->rt.highest_prio.next = p->prio;
  303. } else
  304. rq->rt.highest_prio.next = MAX_RT_PRIO;
  305. }
  306. #else
  307. static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
  308. {
  309. }
  310. static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
  311. {
  312. }
  313. static inline
  314. void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  315. {
  316. }
  317. static inline
  318. void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  319. {
  320. }
  321. static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
  322. {
  323. return false;
  324. }
  325. static inline void pull_rt_task(struct rq *this_rq)
  326. {
  327. }
  328. static inline void queue_push_tasks(struct rq *rq)
  329. {
  330. }
  331. #endif /* CONFIG_SMP */
  332. static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
  333. static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
  334. static inline int on_rt_rq(struct sched_rt_entity *rt_se)
  335. {
  336. return !list_empty(&rt_se->run_list);
  337. }
  338. #ifdef CONFIG_RT_GROUP_SCHED
  339. static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  340. {
  341. if (!rt_rq->tg)
  342. return RUNTIME_INF;
  343. return rt_rq->rt_runtime;
  344. }
  345. static inline u64 sched_rt_period(struct rt_rq *rt_rq)
  346. {
  347. return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
  348. }
  349. typedef struct task_group *rt_rq_iter_t;
  350. static inline struct task_group *next_task_group(struct task_group *tg)
  351. {
  352. do {
  353. tg = list_entry_rcu(tg->list.next,
  354. typeof(struct task_group), list);
  355. } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
  356. if (&tg->list == &task_groups)
  357. tg = NULL;
  358. return tg;
  359. }
  360. #define for_each_rt_rq(rt_rq, iter, rq) \
  361. for (iter = container_of(&task_groups, typeof(*iter), list); \
  362. (iter = next_task_group(iter)) && \
  363. (rt_rq = iter->rt_rq[cpu_of(rq)]);)
  364. #define for_each_sched_rt_entity(rt_se) \
  365. for (; rt_se; rt_se = rt_se->parent)
  366. static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
  367. {
  368. return rt_se->my_q;
  369. }
  370. static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
  371. static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
  372. static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  373. {
  374. struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
  375. struct rq *rq = rq_of_rt_rq(rt_rq);
  376. struct sched_rt_entity *rt_se;
  377. int cpu = cpu_of(rq);
  378. rt_se = rt_rq->tg->rt_se[cpu];
  379. if (rt_rq->rt_nr_running) {
  380. if (!rt_se)
  381. enqueue_top_rt_rq(rt_rq);
  382. else if (!on_rt_rq(rt_se))
  383. enqueue_rt_entity(rt_se, false);
  384. if (rt_rq->highest_prio.curr < curr->prio)
  385. resched_curr(rq);
  386. }
  387. }
  388. static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
  389. {
  390. struct sched_rt_entity *rt_se;
  391. int cpu = cpu_of(rq_of_rt_rq(rt_rq));
  392. rt_se = rt_rq->tg->rt_se[cpu];
  393. if (!rt_se)
  394. dequeue_top_rt_rq(rt_rq);
  395. else if (on_rt_rq(rt_se))
  396. dequeue_rt_entity(rt_se);
  397. }
  398. static inline int rt_rq_throttled(struct rt_rq *rt_rq)
  399. {
  400. return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
  401. }
  402. static int rt_se_boosted(struct sched_rt_entity *rt_se)
  403. {
  404. struct rt_rq *rt_rq = group_rt_rq(rt_se);
  405. struct task_struct *p;
  406. if (rt_rq)
  407. return !!rt_rq->rt_nr_boosted;
  408. p = rt_task_of(rt_se);
  409. return p->prio != p->normal_prio;
  410. }
  411. #ifdef CONFIG_SMP
  412. static inline const struct cpumask *sched_rt_period_mask(void)
  413. {
  414. return this_rq()->rd->span;
  415. }
  416. #else
  417. static inline const struct cpumask *sched_rt_period_mask(void)
  418. {
  419. return cpu_online_mask;
  420. }
  421. #endif
  422. static inline
  423. struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
  424. {
  425. return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
  426. }
  427. static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
  428. {
  429. return &rt_rq->tg->rt_bandwidth;
  430. }
  431. #else /* !CONFIG_RT_GROUP_SCHED */
  432. static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
  433. {
  434. return rt_rq->rt_runtime;
  435. }
  436. static inline u64 sched_rt_period(struct rt_rq *rt_rq)
  437. {
  438. return ktime_to_ns(def_rt_bandwidth.rt_period);
  439. }
  440. typedef struct rt_rq *rt_rq_iter_t;
  441. #define for_each_rt_rq(rt_rq, iter, rq) \
  442. for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
  443. #define for_each_sched_rt_entity(rt_se) \
  444. for (; rt_se; rt_se = NULL)
  445. static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
  446. {
  447. return NULL;
  448. }
  449. static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
  450. {
  451. struct rq *rq = rq_of_rt_rq(rt_rq);
  452. if (!rt_rq->rt_nr_running)
  453. return;
  454. enqueue_top_rt_rq(rt_rq);
  455. resched_curr(rq);
  456. }
  457. static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
  458. {
  459. dequeue_top_rt_rq(rt_rq);
  460. }
  461. static inline int rt_rq_throttled(struct rt_rq *rt_rq)
  462. {
  463. return rt_rq->rt_throttled;
  464. }
  465. static inline const struct cpumask *sched_rt_period_mask(void)
  466. {
  467. return cpu_online_mask;
  468. }
  469. static inline
  470. struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
  471. {
  472. return &cpu_rq(cpu)->rt;
  473. }
  474. static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
  475. {
  476. return &def_rt_bandwidth;
  477. }
  478. #endif /* CONFIG_RT_GROUP_SCHED */
  479. bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
  480. {
  481. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  482. return (hrtimer_active(&rt_b->rt_period_timer) ||
  483. rt_rq->rt_time < rt_b->rt_runtime);
  484. }
  485. #ifdef CONFIG_SMP
  486. /*
  487. * We ran out of runtime, see if we can borrow some from our neighbours.
  488. */
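/*
 * Illustrative numbers: with rt_period = 1s on a 4-CPU root domain
 * (weight = 4), a neighbour with 300ms of unused runtime gives up at
 * most 300ms / 4 = 75ms per pass, and borrowing stops once our own
 * rt_runtime reaches one full period.
 */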
  489. static void do_balance_runtime(struct rt_rq *rt_rq)
  490. {
  491. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  492. struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
  493. int i, weight;
  494. u64 rt_period;
  495. weight = cpumask_weight(rd->span);
  496. raw_spin_lock(&rt_b->rt_runtime_lock);
  497. rt_period = ktime_to_ns(rt_b->rt_period);
  498. for_each_cpu(i, rd->span) {
  499. struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
  500. s64 diff;
  501. if (iter == rt_rq)
  502. continue;
  503. raw_spin_lock(&iter->rt_runtime_lock);
  504. /*
  505. * Either all rqs have inf runtime and there's nothing to steal
  506. * or __disable_runtime() below sets a specific rq to inf to
  507. * indicate it's been disabled and disallow stealing.
  508. */
  509. if (iter->rt_runtime == RUNTIME_INF)
  510. goto next;
  511. /*
  512. * From runqueues with spare time, take 1/n part of their
  513. * spare time, but no more than our period.
  514. */
  515. diff = iter->rt_runtime - iter->rt_time;
  516. if (diff > 0) {
  517. diff = div_u64((u64)diff, weight);
  518. if (rt_rq->rt_runtime + diff > rt_period)
  519. diff = rt_period - rt_rq->rt_runtime;
  520. iter->rt_runtime -= diff;
  521. rt_rq->rt_runtime += diff;
  522. if (rt_rq->rt_runtime == rt_period) {
  523. raw_spin_unlock(&iter->rt_runtime_lock);
  524. break;
  525. }
  526. }
  527. next:
  528. raw_spin_unlock(&iter->rt_runtime_lock);
  529. }
  530. raw_spin_unlock(&rt_b->rt_runtime_lock);
  531. }
  532. /*
  533. * Ensure this RQ takes back all the runtime it lent to its neighbours.
  534. */
  535. static void __disable_runtime(struct rq *rq)
  536. {
  537. struct root_domain *rd = rq->rd;
  538. rt_rq_iter_t iter;
  539. struct rt_rq *rt_rq;
  540. if (unlikely(!scheduler_running))
  541. return;
  542. for_each_rt_rq(rt_rq, iter, rq) {
  543. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  544. s64 want;
  545. int i;
  546. raw_spin_lock(&rt_b->rt_runtime_lock);
  547. raw_spin_lock(&rt_rq->rt_runtime_lock);
  548. /*
  549. * Either we're all inf and nobody needs to borrow, or we're
  550. * already disabled and thus have nothing to do, or we have
  551. * exactly the right amount of runtime to take out.
  552. */
  553. if (rt_rq->rt_runtime == RUNTIME_INF ||
  554. rt_rq->rt_runtime == rt_b->rt_runtime)
  555. goto balanced;
  556. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  557. /*
  558. * Calculate the difference between what we started out with
  559. * and what we currently have; that's the amount of runtime
  560. * we lent and now have to reclaim.
  561. */
  562. want = rt_b->rt_runtime - rt_rq->rt_runtime;
  563. /*
  564. * Greedy reclaim, take back as much as we can.
  565. */
  566. for_each_cpu(i, rd->span) {
  567. struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
  568. s64 diff;
  569. /*
  570. * Can't reclaim from ourselves or disabled runqueues.
  571. */
  572. if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
  573. continue;
  574. raw_spin_lock(&iter->rt_runtime_lock);
  575. if (want > 0) {
  576. diff = min_t(s64, iter->rt_runtime, want);
  577. iter->rt_runtime -= diff;
  578. want -= diff;
  579. } else {
  580. iter->rt_runtime -= want;
  581. want -= want;
  582. }
  583. raw_spin_unlock(&iter->rt_runtime_lock);
  584. if (!want)
  585. break;
  586. }
  587. raw_spin_lock(&rt_rq->rt_runtime_lock);
  588. /*
  589. * We cannot be left wanting - that would mean some runtime
  590. * leaked out of the system.
  591. */
  592. BUG_ON(want);
  593. balanced:
  594. /*
  595. * Disable all the borrow logic by pretending we have inf
  596. * runtime - in which case borrowing doesn't make sense.
  597. */
  598. rt_rq->rt_runtime = RUNTIME_INF;
  599. rt_rq->rt_throttled = 0;
  600. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  601. raw_spin_unlock(&rt_b->rt_runtime_lock);
  602. /* Make rt_rq available for pick_next_task() */
  603. sched_rt_rq_enqueue(rt_rq);
  604. }
  605. }
  606. static void __enable_runtime(struct rq *rq)
  607. {
  608. rt_rq_iter_t iter;
  609. struct rt_rq *rt_rq;
  610. if (unlikely(!scheduler_running))
  611. return;
  612. /*
  613. * Reset each runqueue's bandwidth settings
  614. */
  615. for_each_rt_rq(rt_rq, iter, rq) {
  616. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  617. raw_spin_lock(&rt_b->rt_runtime_lock);
  618. raw_spin_lock(&rt_rq->rt_runtime_lock);
  619. rt_rq->rt_runtime = rt_b->rt_runtime;
  620. rt_rq->rt_time = 0;
  621. rt_rq->rt_throttled = 0;
  622. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  623. raw_spin_unlock(&rt_b->rt_runtime_lock);
  624. }
  625. }
  626. static void balance_runtime(struct rt_rq *rt_rq)
  627. {
  628. if (!sched_feat(RT_RUNTIME_SHARE))
  629. return;
  630. if (rt_rq->rt_time > rt_rq->rt_runtime) {
  631. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  632. do_balance_runtime(rt_rq);
  633. raw_spin_lock(&rt_rq->rt_runtime_lock);
  634. }
  635. }
  636. #else /* !CONFIG_SMP */
  637. static inline void balance_runtime(struct rt_rq *rt_rq) {}
  638. #endif /* CONFIG_SMP */
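/*
 * Per-period bandwidth refresh: for every CPU in the period mask,
 * decay rt_time by overrun * runtime, unthrottle and re-enqueue
 * runqueues that have paid off their debt, and report whether all of
 * them have gone idle so the period timer can stop.
 */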
  639. static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
  640. {
  641. int i, idle = 1, throttled = 0;
  642. const struct cpumask *span;
  643. span = sched_rt_period_mask();
  644. #ifdef CONFIG_RT_GROUP_SCHED
  645. /*
  646. * FIXME: isolated CPUs should really leave the root task group,
  647. * whether they are isolcpus or were isolated via cpusets, lest
  648. * the timer run on a CPU which does not service all runqueues,
  649. * potentially leaving other CPUs indefinitely throttled. If
  650. * isolation is really required, the user will turn the throttle
  651. * off to kill the perturbations it causes anyway. Meanwhile,
  652. * this maintains functionality for boot and/or troubleshooting.
  653. */
  654. if (rt_b == &root_task_group.rt_bandwidth)
  655. span = cpu_online_mask;
  656. #endif
  657. for_each_cpu(i, span) {
  658. int enqueue = 0;
  659. struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
  660. struct rq *rq = rq_of_rt_rq(rt_rq);
  661. raw_spin_lock(&rq->lock);
  662. update_rq_clock(rq);
  663. if (rt_rq->rt_time) {
  664. u64 runtime;
  665. raw_spin_lock(&rt_rq->rt_runtime_lock);
  666. if (rt_rq->rt_throttled)
  667. balance_runtime(rt_rq);
  668. runtime = rt_rq->rt_runtime;
  669. rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
  670. if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
  671. rt_rq->rt_throttled = 0;
  672. enqueue = 1;
  673. /*
  674. * When we're idle and a woken (rt) task is
  675. * throttled, check_preempt_curr() will set
  676. * skip_update and the time between the wakeup
  677. * and this unthrottle will get accounted as
  678. * 'runtime'.
  679. */
  680. if (rt_rq->rt_nr_running && rq->curr == rq->idle)
  681. rq_clock_skip_update(rq, false);
  682. }
  683. if (rt_rq->rt_time || rt_rq->rt_nr_running)
  684. idle = 0;
  685. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  686. } else if (rt_rq->rt_nr_running) {
  687. idle = 0;
  688. if (!rt_rq_throttled(rt_rq))
  689. enqueue = 1;
  690. }
  691. if (rt_rq->rt_throttled)
  692. throttled = 1;
  693. if (enqueue)
  694. sched_rt_rq_enqueue(rt_rq);
  695. raw_spin_unlock(&rq->lock);
  696. }
  697. if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
  698. return 1;
  699. return idle;
  700. }
  701. static inline int rt_se_prio(struct sched_rt_entity *rt_se)
  702. {
  703. #ifdef CONFIG_RT_GROUP_SCHED
  704. struct rt_rq *rt_rq = group_rt_rq(rt_se);
  705. if (rt_rq)
  706. return rt_rq->highest_prio.curr;
  707. #endif
  708. return rt_task_of(rt_se)->prio;
  709. }
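/*
 * Check whether this rt_rq has used up its runtime budget for the
 * current period; if so (after trying to borrow from neighbours),
 * mark it throttled and dequeue it until the period timer
 * replenishes it.
 */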
  710. static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
  711. {
  712. u64 runtime = sched_rt_runtime(rt_rq);
  713. if (rt_rq->rt_throttled)
  714. return rt_rq_throttled(rt_rq);
  715. if (runtime >= sched_rt_period(rt_rq))
  716. return 0;
  717. balance_runtime(rt_rq);
  718. runtime = sched_rt_runtime(rt_rq);
  719. if (runtime == RUNTIME_INF)
  720. return 0;
  721. if (rt_rq->rt_time > runtime) {
  722. struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
  723. /*
  724. * Don't actually throttle groups that have no runtime assigned
  725. * but accrue some time due to boosting.
  726. */
  727. if (likely(rt_b->rt_runtime)) {
  728. rt_rq->rt_throttled = 1;
  729. printk_deferred_once("sched: RT throttling activated\n");
  730. } else {
  731. /*
  732. * In case we did anyway, make it go away;
  733. * replenishment is a joke, since it will replenish us
  734. * with exactly 0 ns.
  735. */
  736. rt_rq->rt_time = 0;
  737. }
  738. if (rt_rq_throttled(rt_rq)) {
  739. sched_rt_rq_dequeue(rt_rq);
  740. return 1;
  741. }
  742. }
  743. return 0;
  744. }
  745. /*
  746. * Update the current task's runtime statistics. Skip current tasks that
  747. * are not in our scheduling class.
  748. */
  749. static void update_curr_rt(struct rq *rq)
  750. {
  751. struct task_struct *curr = rq->curr;
  752. struct sched_rt_entity *rt_se = &curr->rt;
  753. u64 delta_exec;
  754. if (curr->sched_class != &rt_sched_class)
  755. return;
  756. delta_exec = rq_clock_task(rq) - curr->se.exec_start;
  757. if (unlikely((s64)delta_exec <= 0))
  758. return;
  759. schedstat_set(curr->se.statistics.exec_max,
  760. max(curr->se.statistics.exec_max, delta_exec));
  761. curr->se.sum_exec_runtime += delta_exec;
  762. account_group_exec_runtime(curr, delta_exec);
  763. curr->se.exec_start = rq_clock_task(rq);
  764. cpuacct_charge(curr, delta_exec);
  765. sched_rt_avg_update(rq, delta_exec);
  766. if (!rt_bandwidth_enabled())
  767. return;
  768. for_each_sched_rt_entity(rt_se) {
  769. struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  770. if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
  771. raw_spin_lock(&rt_rq->rt_runtime_lock);
  772. rt_rq->rt_time += delta_exec;
  773. if (sched_rt_runtime_exceeded(rt_rq))
  774. resched_curr(rq);
  775. raw_spin_unlock(&rt_rq->rt_runtime_lock);
  776. }
  777. }
  778. }
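/*
 * enqueue_top_rt_rq()/dequeue_top_rt_rq() propagate the task count of
 * the top-level rt_rq into rq->nr_running; rt_queued records whether
 * that contribution is currently accounted for.
 */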
  779. static void
  780. dequeue_top_rt_rq(struct rt_rq *rt_rq)
  781. {
  782. struct rq *rq = rq_of_rt_rq(rt_rq);
  783. BUG_ON(&rq->rt != rt_rq);
  784. if (!rt_rq->rt_queued)
  785. return;
  786. BUG_ON(!rq->nr_running);
  787. sub_nr_running(rq, rt_rq->rt_nr_running);
  788. rt_rq->rt_queued = 0;
  789. }
  790. static void
  791. enqueue_top_rt_rq(struct rt_rq *rt_rq)
  792. {
  793. struct rq *rq = rq_of_rt_rq(rt_rq);
  794. BUG_ON(&rq->rt != rt_rq);
  795. if (rt_rq->rt_queued)
  796. return;
  797. if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
  798. return;
  799. add_nr_running(rq, rt_rq->rt_nr_running);
  800. rt_rq->rt_queued = 1;
  801. }
  802. #if defined CONFIG_SMP
  803. static void
  804. inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
  805. {
  806. struct rq *rq = rq_of_rt_rq(rt_rq);
  807. #ifdef CONFIG_RT_GROUP_SCHED
  808. /*
  809. * Change rq's cpupri only if rt_rq is the top queue.
  810. */
  811. if (&rq->rt != rt_rq)
  812. return;
  813. #endif
  814. if (rq->online && prio < prev_prio)
  815. cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
  816. }
  817. static void
  818. dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
  819. {
  820. struct rq *rq = rq_of_rt_rq(rt_rq);
  821. #ifdef CONFIG_RT_GROUP_SCHED
  822. /*
  823. * Change rq's cpupri only if rt_rq is the top queue.
  824. */
  825. if (&rq->rt != rt_rq)
  826. return;
  827. #endif
  828. if (rq->online && rt_rq->highest_prio.curr != prev_prio)
  829. cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
  830. }
  831. #else /* CONFIG_SMP */
  832. static inline
  833. void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
  834. static inline
  835. void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
  836. #endif /* CONFIG_SMP */
  837. #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
  838. static void
  839. inc_rt_prio(struct rt_rq *rt_rq, int prio)
  840. {
  841. int prev_prio = rt_rq->highest_prio.curr;
  842. if (prio < prev_prio)
  843. rt_rq->highest_prio.curr = prio;
  844. inc_rt_prio_smp(rt_rq, prio, prev_prio);
  845. }
  846. static void
  847. dec_rt_prio(struct rt_rq *rt_rq, int prio)
  848. {
  849. int prev_prio = rt_rq->highest_prio.curr;
  850. if (rt_rq->rt_nr_running) {
  851. WARN_ON(prio < prev_prio);
  852. /*
  853. * This may have been our highest task, and therefore
  854. * we may have some recomputation to do
  855. */
  856. if (prio == prev_prio) {
  857. struct rt_prio_array *array = &rt_rq->active;
  858. rt_rq->highest_prio.curr =
  859. sched_find_first_bit(array->bitmap);
  860. }
  861. } else
  862. rt_rq->highest_prio.curr = MAX_RT_PRIO;
  863. dec_rt_prio_smp(rt_rq, prio, prev_prio);
  864. }
  865. #else
  866. static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
  867. static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
  868. #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
  869. #ifdef CONFIG_RT_GROUP_SCHED
  870. static void
  871. inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  872. {
  873. if (rt_se_boosted(rt_se))
  874. rt_rq->rt_nr_boosted++;
  875. if (rt_rq->tg)
  876. start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
  877. }
  878. static void
  879. dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  880. {
  881. if (rt_se_boosted(rt_se))
  882. rt_rq->rt_nr_boosted--;
  883. WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
  884. }
  885. #else /* CONFIG_RT_GROUP_SCHED */
  886. static void
  887. inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  888. {
  889. start_rt_bandwidth(&def_rt_bandwidth);
  890. }
  891. static inline
  892. void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
  893. #endif /* CONFIG_RT_GROUP_SCHED */
  894. static inline
  895. unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
  896. {
  897. struct rt_rq *group_rq = group_rt_rq(rt_se);
  898. if (group_rq)
  899. return group_rq->rt_nr_running;
  900. else
  901. return 1;
  902. }
  903. static inline
  904. void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  905. {
  906. int prio = rt_se_prio(rt_se);
  907. WARN_ON(!rt_prio(prio));
  908. rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
  909. inc_rt_prio(rt_rq, prio);
  910. inc_rt_migration(rt_se, rt_rq);
  911. inc_rt_group(rt_se, rt_rq);
  912. }
  913. static inline
  914. void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
  915. {
  916. WARN_ON(!rt_prio(rt_se_prio(rt_se)));
  917. WARN_ON(!rt_rq->rt_nr_running);
  918. rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
  919. dec_rt_prio(rt_rq, rt_se_prio(rt_se));
  920. dec_rt_migration(rt_se, rt_rq);
  921. dec_rt_group(rt_se, rt_rq);
  922. }
  923. static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
  924. {
  925. struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  926. struct rt_prio_array *array = &rt_rq->active;
  927. struct rt_rq *group_rq = group_rt_rq(rt_se);
  928. struct list_head *queue = array->queue + rt_se_prio(rt_se);
  929. /*
  930. * Don't enqueue the group if it's throttled, or when empty.
  931. * The latter is a consequence of the former when a child group
  932. * gets throttled and the current group doesn't have any other
  933. * active members.
  934. */
  935. if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
  936. return;
  937. if (head)
  938. list_add(&rt_se->run_list, queue);
  939. else
  940. list_add_tail(&rt_se->run_list, queue);
  941. __set_bit(rt_se_prio(rt_se), array->bitmap);
  942. inc_rt_tasks(rt_se, rt_rq);
  943. }
  944. static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
  945. {
  946. struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
  947. struct rt_prio_array *array = &rt_rq->active;
  948. list_del_init(&rt_se->run_list);
  949. if (list_empty(array->queue + rt_se_prio(rt_se)))
  950. __clear_bit(rt_se_prio(rt_se), array->bitmap);
  951. dec_rt_tasks(rt_se, rt_rq);
  952. }
  953. /*
  954. * Because the prio of an upper entry depends on the lower
  955. * entries, we must remove entries top - down.
  956. */
  957. static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
  958. {
  959. struct sched_rt_entity *back = NULL;
  960. for_each_sched_rt_entity(rt_se) {
  961. rt_se->back = back;
  962. back = rt_se;
  963. }
  964. dequeue_top_rt_rq(rt_rq_of_se(back));
  965. for (rt_se = back; rt_se; rt_se = rt_se->back) {
  966. if (on_rt_rq(rt_se))
  967. __dequeue_rt_entity(rt_se);
  968. }
  969. }
  970. static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
  971. {
  972. struct rq *rq = rq_of_rt_se(rt_se);
  973. dequeue_rt_stack(rt_se);
  974. for_each_sched_rt_entity(rt_se)
  975. __enqueue_rt_entity(rt_se, head);
  976. enqueue_top_rt_rq(&rq->rt);
  977. }
  978. static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
  979. {
  980. struct rq *rq = rq_of_rt_se(rt_se);
  981. dequeue_rt_stack(rt_se);
  982. for_each_sched_rt_entity(rt_se) {
  983. struct rt_rq *rt_rq = group_rt_rq(rt_se);
  984. if (rt_rq && rt_rq->rt_nr_running)
  985. __enqueue_rt_entity(rt_se, false);
  986. }
  987. enqueue_top_rt_rq(&rq->rt);
  988. }
  989. /*
  990. * Adding/removing a task to/from a priority array:
  991. */
  992. static void
  993. enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
  994. {
  995. struct sched_rt_entity *rt_se = &p->rt;
  996. if (flags & ENQUEUE_WAKEUP)
  997. rt_se->timeout = 0;
  998. enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
  999. if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
  1000. enqueue_pushable_task(rq, p);
  1001. }
  1002. static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
  1003. {
  1004. struct sched_rt_entity *rt_se = &p->rt;
  1005. update_curr_rt(rq);
  1006. dequeue_rt_entity(rt_se);
  1007. dequeue_pushable_task(rq, p);
  1008. }
  1009. /*
  1010. * Put task to the head or the end of the run list without the overhead of
  1011. * dequeue followed by enqueue.
  1012. */
  1013. static void
  1014. requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
  1015. {
  1016. if (on_rt_rq(rt_se)) {
  1017. struct rt_prio_array *array = &rt_rq->active;
  1018. struct list_head *queue = array->queue + rt_se_prio(rt_se);
  1019. if (head)
  1020. list_move(&rt_se->run_list, queue);
  1021. else
  1022. list_move_tail(&rt_se->run_list, queue);
  1023. }
  1024. }
  1025. static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
  1026. {
  1027. struct sched_rt_entity *rt_se = &p->rt;
  1028. struct rt_rq *rt_rq;
  1029. for_each_sched_rt_entity(rt_se) {
  1030. rt_rq = rt_rq_of_se(rt_se);
  1031. requeue_rt_entity(rt_rq, rt_se, head);
  1032. }
  1033. }
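/*
 * sched_yield() for an RT task: requeue current at the tail of its
 * priority list so tasks of equal priority get a turn.
 */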
  1034. static void yield_task_rt(struct rq *rq)
  1035. {
  1036. requeue_task_rt(rq, rq->curr, 0);
  1037. }
  1038. #ifdef CONFIG_SMP
  1039. static int find_lowest_rq(struct task_struct *task);
  1040. static int
  1041. select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
  1042. {
  1043. struct task_struct *curr;
  1044. struct rq *rq;
  1045. /* For anything but wake ups, just return the task_cpu */
  1046. if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
  1047. goto out;
  1048. rq = cpu_rq(cpu);
  1049. rcu_read_lock();
  1050. curr = READ_ONCE(rq->curr); /* unlocked access */
  1051. /*
  1052. * If the current task on @p's runqueue is an RT task, then
  1053. * try to see if we can wake this RT task up on another
  1054. * runqueue. Otherwise simply start this RT task
  1055. * on its current runqueue.
  1056. *
  1057. * We want to avoid overloading runqueues. If the woken
  1058. * task is a higher priority, then it will stay on this CPU
  1059. * and the lower prio task should be moved to another CPU.
  1060. * Even though this will probably make the lower prio task
  1061. * lose its cache, we do not want to bounce a higher task
  1062. * around just because it gave up its CPU, perhaps for a
  1063. * lock?
  1064. *
  1065. * For equal prio tasks, we just let the scheduler sort it out.
  1066. *
  1067. * Otherwise, just let it ride on the affined RQ and the
  1068. * post-schedule router will push the preempted task away
  1069. *
  1070. * This test is optimistic, if we get it wrong the load-balancer
  1071. * will have to sort it out.
  1072. */
  1073. if (curr && unlikely(rt_task(curr)) &&
  1074. (curr->nr_cpus_allowed < 2 ||
  1075. curr->prio <= p->prio)) {
  1076. int target = find_lowest_rq(p);
  1077. /*
  1078. * Don't bother moving it if the destination CPU is
  1079. * not running a lower priority task.
  1080. */
  1081. if (target != -1 &&
  1082. p->prio < cpu_rq(target)->rt.highest_prio.curr)
  1083. cpu = target;
  1084. }
  1085. rcu_read_unlock();
  1086. out:
  1087. return cpu;
  1088. }
  1089. static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
  1090. {
  1091. /*
  1092. * Current can't be migrated, useless to reschedule,
  1093. * let's hope p can move out.
  1094. */
  1095. if (rq->curr->nr_cpus_allowed == 1 ||
  1096. !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
  1097. return;
  1098. /*
  1099. * p is migratable, so let's not schedule it and
  1100. * see if it is pushed or pulled somewhere else.
  1101. */
  1102. if (p->nr_cpus_allowed != 1
  1103. && cpupri_find(&rq->rd->cpupri, p, NULL))
  1104. return;
  1105. /*
  1106. * There appear to be other CPUs that can accept
  1107. * current and none to run 'p', so let's reschedule
  1108. * to try and push current away:
  1109. */
  1110. requeue_task_rt(rq, p, 1);
  1111. resched_curr(rq);
  1112. }
  1113. #endif /* CONFIG_SMP */
  1114. /*
  1115. * Preempt the current task with a newly woken task if needed:
  1116. */
  1117. static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
  1118. {
  1119. if (p->prio < rq->curr->prio) {
  1120. resched_curr(rq);
  1121. return;
  1122. }
  1123. #ifdef CONFIG_SMP
  1124. /*
  1125. * If:
  1126. *
  1127. * - the newly woken task is of equal priority to the current task
  1128. * - the newly woken task is non-migratable while current is migratable
  1129. * - current will be preempted on the next reschedule
  1130. *
  1131. * we should check to see if current can readily move to a different
  1132. * cpu. If so, we will reschedule to allow the push logic to try
  1133. * to move current somewhere else, making room for our non-migratable
  1134. * task.
  1135. */
  1136. if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
  1137. check_preempt_equal_prio(rq, p);
  1138. #endif
  1139. }
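/*
 * O(1) pick: find the first set bit in the priority bitmap and take
 * the entity at the head of that priority's queue.
 */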
  1140. static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
  1141. struct rt_rq *rt_rq)
  1142. {
  1143. struct rt_prio_array *array = &rt_rq->active;
  1144. struct sched_rt_entity *next = NULL;
  1145. struct list_head *queue;
  1146. int idx;
  1147. idx = sched_find_first_bit(array->bitmap);
  1148. BUG_ON(idx >= MAX_RT_PRIO);
  1149. queue = array->queue + idx;
  1150. next = list_entry(queue->next, struct sched_rt_entity, run_list);
  1151. return next;
  1152. }
  1153. static struct task_struct *_pick_next_task_rt(struct rq *rq)
  1154. {
  1155. struct sched_rt_entity *rt_se;
  1156. struct task_struct *p;
  1157. struct rt_rq *rt_rq = &rq->rt;
  1158. do {
  1159. rt_se = pick_next_rt_entity(rq, rt_rq);
  1160. BUG_ON(!rt_se);
  1161. rt_rq = group_rt_rq(rt_se);
  1162. } while (rt_rq);
  1163. p = rt_task_of(rt_se);
  1164. p->se.exec_start = rq_clock_task(rq);
  1165. return p;
  1166. }
  1167. static struct task_struct *
  1168. pick_next_task_rt(struct rq *rq, struct task_struct *prev)
  1169. {
  1170. struct task_struct *p;
  1171. struct rt_rq *rt_rq = &rq->rt;
  1172. if (need_pull_rt_task(rq, prev)) {
  1173. /*
  1174. * This is OK, because current is on_cpu, which avoids it being
  1175. * picked for load-balance and preemption/IRQs are still
  1176. * disabled avoiding further scheduler activity on it and we're
  1177. * being very careful to re-start the picking loop.
  1178. */
  1179. lockdep_unpin_lock(&rq->lock);
  1180. pull_rt_task(rq);
  1181. lockdep_pin_lock(&rq->lock);
  1182. /*
  1183. * pull_rt_task() can drop (and re-acquire) rq->lock; this
  1184. * means a dl or stop task can slip in, in which case we need
  1185. * to re-start task selection.
  1186. */
  1187. if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
  1188. rq->dl.dl_nr_running))
  1189. return RETRY_TASK;
  1190. }
  1191. /*
  1192. * We may dequeue prev's rt_rq in put_prev_task().
  1193. * So, we update time before rt_nr_running check.
  1194. */
  1195. if (prev->sched_class == &rt_sched_class)
  1196. update_curr_rt(rq);
  1197. if (!rt_rq->rt_queued)
  1198. return NULL;
  1199. put_prev_task(rq, prev);
  1200. p = _pick_next_task_rt(rq);
  1201. /* The running task is never eligible for pushing */
  1202. dequeue_pushable_task(rq, p);
  1203. queue_push_tasks(rq);
  1204. return p;
  1205. }
  1206. static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
  1207. {
  1208. update_curr_rt(rq);
  1209. /*
  1210. * The previous task needs to be made eligible for pushing
  1211. * if it is still active
  1212. */
  1213. if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
  1214. enqueue_pushable_task(rq, p);
  1215. }
  1216. #ifdef CONFIG_SMP
  1217. /* Only try algorithms three times */
  1218. #define RT_MAX_TRIES 3
  1219. static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
  1220. {
  1221. if (!task_running(rq, p) &&
  1222. cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
  1223. return 1;
  1224. return 0;
  1225. }
  1226. /*
  1227. * Return the highest-priority pushable task on this rq that can run on
  1228. * the given cpu, or NULL if there is none.
  1229. */
  1230. static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
  1231. {
  1232. struct plist_head *head = &rq->rt.pushable_tasks;
  1233. struct task_struct *p;
  1234. if (!has_pushable_tasks(rq))
  1235. return NULL;
  1236. plist_for_each_entry(p, head, pushable_tasks) {
  1237. if (pick_rt_task(rq, p, cpu))
  1238. return p;
  1239. }
  1240. return NULL;
  1241. }
  1242. static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
  1243. static int find_lowest_rq(struct task_struct *task)
  1244. {
  1245. struct sched_domain *sd;
  1246. struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
  1247. int this_cpu = smp_processor_id();
  1248. int cpu = task_cpu(task);
  1249. /* Make sure the mask is initialized first */
  1250. if (unlikely(!lowest_mask))
  1251. return -1;
  1252. if (task->nr_cpus_allowed == 1)
  1253. return -1; /* No other targets possible */
  1254. if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
  1255. return -1; /* No targets found */
  1256. /*
  1257. * At this point we have built a mask of cpus representing the
  1258. * lowest priority tasks in the system. Now we want to elect
  1259. * the best one based on our affinity and topology.
  1260. *
  1261. * We prioritize the last cpu that the task executed on since
  1262. * it is most likely cache-hot in that location.
  1263. */
  1264. if (cpumask_test_cpu(cpu, lowest_mask))
  1265. return cpu;
  1266. /*
  1267. * Otherwise, we consult the sched_domains span maps to figure
  1268. * out which cpu is logically closest to our hot cache data.
  1269. */
  1270. if (!cpumask_test_cpu(this_cpu, lowest_mask))
  1271. this_cpu = -1; /* Skip this_cpu opt if not among lowest */
  1272. rcu_read_lock();
  1273. for_each_domain(cpu, sd) {
  1274. if (sd->flags & SD_WAKE_AFFINE) {
  1275. int best_cpu;
  1276. /*
  1277. * "this_cpu" is cheaper to preempt than a
  1278. * remote processor.
  1279. */
  1280. if (this_cpu != -1 &&
  1281. cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
  1282. rcu_read_unlock();
  1283. return this_cpu;
  1284. }
  1285. best_cpu = cpumask_first_and(lowest_mask,
  1286. sched_domain_span(sd));
  1287. if (best_cpu < nr_cpu_ids) {
  1288. rcu_read_unlock();
  1289. return best_cpu;
  1290. }
  1291. }
  1292. }
  1293. rcu_read_unlock();
  1294. /*
  1295. * And finally, if there were no matches within the domains
  1296. * just give the caller *something* to work with from the compatible
  1297. * locations.
  1298. */
  1299. if (this_cpu != -1)
  1300. return this_cpu;
  1301. cpu = cpumask_any(lowest_mask);
  1302. if (cpu < nr_cpu_ids)
  1303. return cpu;
  1304. return -1;
  1305. }
  1306. /* Will lock the rq it finds */
  1307. static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
  1308. {
  1309. struct rq *lowest_rq = NULL;
  1310. int tries;
  1311. int cpu;
  1312. for (tries = 0; tries < RT_MAX_TRIES; tries++) {
  1313. cpu = find_lowest_rq(task);
  1314. if ((cpu == -1) || (cpu == rq->cpu))
  1315. break;
  1316. lowest_rq = cpu_rq(cpu);
  1317. if (lowest_rq->rt.highest_prio.curr <= task->prio) {
  1318. /*
  1319. * Target rq has tasks of equal or higher priority,
  1320. * retrying does not release any lock and is unlikely
  1321. * to yield a different result.
  1322. */
  1323. lowest_rq = NULL;
  1324. break;
  1325. }
  1326. /* if the prio of this runqueue changed, try again */
  1327. if (double_lock_balance(rq, lowest_rq)) {
  1328. /*
  1329. * We had to unlock the run queue. In
  1330. * the meantime, the task could have
  1331. * migrated already or had its affinity changed.
  1332. * Also make sure that it wasn't scheduled on its rq.
  1333. */
  1334. if (unlikely(task_rq(task) != rq ||
  1335. !cpumask_test_cpu(lowest_rq->cpu,
  1336. tsk_cpus_allowed(task)) ||
  1337. task_running(rq, task) ||
  1338. !task_on_rq_queued(task))) {
  1339. double_unlock_balance(rq, lowest_rq);
  1340. lowest_rq = NULL;
  1341. break;
  1342. }
  1343. }
  1344. /* If this rq is still suitable use it. */
  1345. if (lowest_rq->rt.highest_prio.curr > task->prio)
  1346. break;
  1347. /* try again */
  1348. double_unlock_balance(rq, lowest_rq);
  1349. lowest_rq = NULL;
  1350. }
  1351. return lowest_rq;
  1352. }
  1353. static struct task_struct *pick_next_pushable_task(struct rq *rq)
  1354. {
  1355. struct task_struct *p;
  1356. if (!has_pushable_tasks(rq))
  1357. return NULL;
  1358. p = plist_first_entry(&rq->rt.pushable_tasks,
  1359. struct task_struct, pushable_tasks);
  1360. BUG_ON(rq->cpu != task_cpu(p));
  1361. BUG_ON(task_current(rq, p));
  1362. BUG_ON(p->nr_cpus_allowed <= 1);
  1363. BUG_ON(!task_on_rq_queued(p));
  1364. BUG_ON(!rt_task(p));
  1365. return p;
  1366. }
  1367. /*
  1368. * If the current CPU has more than one RT task, see if the non
  1369. * running task can migrate over to a CPU that is running a task
  1370. * of lesser priority.
  1371. */
  1372. static int push_rt_task(struct rq *rq)
  1373. {
  1374. struct task_struct *next_task;
  1375. struct rq *lowest_rq;
  1376. int ret = 0;
  1377. if (!rq->rt.overloaded)
  1378. return 0;
  1379. next_task = pick_next_pushable_task(rq);
  1380. if (!next_task)
  1381. return 0;
  1382. retry:
  1383. if (unlikely(next_task == rq->curr)) {
  1384. WARN_ON(1);
  1385. return 0;
  1386. }
  1387. /*
  1388. * It's possible that the next_task slipped in with a
  1389. * higher priority than current. If that's the case
  1390. * just reschedule current.
  1391. */
  1392. if (unlikely(next_task->prio < rq->curr->prio)) {
  1393. resched_curr(rq);
  1394. return 0;
  1395. }
  1396. /* We might release rq lock */
  1397. get_task_struct(next_task);
  1398. /* find_lock_lowest_rq locks the rq if found */
  1399. lowest_rq = find_lock_lowest_rq(next_task, rq);
  1400. if (!lowest_rq) {
  1401. struct task_struct *task;
  1402. /*
  1403. * find_lock_lowest_rq releases rq->lock
  1404. * so it is possible that next_task has migrated.
  1405. *
  1406. * We need to make sure that the task is still on the same
  1407. * run-queue and is also still the next task eligible for
  1408. * pushing.
  1409. */
  1410. task = pick_next_pushable_task(rq);
  1411. if (task_cpu(next_task) == rq->cpu && task == next_task) {
  1412. /*
  1413. * The task hasn't migrated, and is still the next
  1414. * eligible task, but we failed to find a run-queue
  1415. * to push it to. Do not retry in this case, since
  1416. * other cpus will pull from us when ready.
  1417. */
  1418. goto out;
  1419. }
  1420. if (!task)
  1421. /* No more tasks, just exit */
  1422. goto out;
  1423. /*
  1424. * Something has shifted, try again.
  1425. */
  1426. put_task_struct(next_task);
  1427. next_task = task;
  1428. goto retry;
  1429. }
  1430. deactivate_task(rq, next_task, 0);
  1431. set_task_cpu(next_task, lowest_rq->cpu);
  1432. activate_task(lowest_rq, next_task, 0);
  1433. ret = 1;
  1434. resched_curr(lowest_rq);
  1435. double_unlock_balance(rq, lowest_rq);
  1436. out:
  1437. put_task_struct(next_task);
  1438. return ret;
  1439. }
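/*
 * Illustrative sketch (not part of rt.c): the get_task_struct() /
 * put_task_struct() pair above pins next_task for the window in which
 * find_lock_lowest_rq() may drop rq->lock.  The general pattern, with a
 * hypothetical do_slow_work() helper, is: take a reference while still
 * protected, drop the lock, then revalidate after retaking it.
 */
#if 0
static void example_pin_across_unlock(struct task_struct *p, struct rq *rq)
{
	get_task_struct(p);		/* p cannot be freed from now on   */
	raw_spin_unlock(&rq->lock);	/* ... even though it may migrate  */
	do_slow_work(p);		/* hypothetical helper             */
	raw_spin_lock(&rq->lock);	/* retake the lock, re-check p     */
	put_task_struct(p);
}
#endif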
static void push_rt_tasks(struct rq *rq)
{
	/* push_rt_task() will return true if it moved an RT task */
	while (push_rt_task(rq))
		;
}
#ifdef HAVE_RT_PUSH_IPI

/*
 * When a high priority task schedules out from a CPU and a lower priority
 * task is scheduled in, a check is made to see if there are any RT tasks
 * on other CPUs that are waiting to run because a higher priority RT task
 * is currently running on their CPU. In this case, the CPU with multiple RT
 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
 * up that may be able to run one of its non-running queued RT tasks.
 *
 * All CPUs with overloaded RT tasks need to be notified as there is currently
 * no way to know which of these CPUs has the highest priority task waiting
 * to run. Instead of trying to take a spinlock on each of these CPUs,
 * which has been shown to cause large latencies on machines with many CPUs,
 * an IPI is sent to the CPUs so that they push off their overloaded RT tasks
 * themselves.
 *
 * Just sending an IPI to each of the CPUs is also an issue, as on large
 * count CPU machines, this can cause an IPI storm on a CPU, especially
 * if it's the only CPU with multiple RT tasks queued, and a large number
 * of CPUs schedule a lower priority task at the same time.
 *
 * Each root domain has its own irq work function that can iterate over
 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
 * tasks must be checked whether one or many CPUs are lowering
 * their priority, there is a single irq work iterator that will try to
 * push off the RT tasks that are waiting to run.
 *
 * When a CPU schedules a lower priority task, it will kick off the
 * irq work iterator that will jump to each CPU with overloaded RT tasks.
 * As it only takes the first CPU that schedules a lower priority task
 * to start the process, the rto_start variable is incremented and if
 * the atomic result is one, then that CPU will try to take the rto_lock.
 * This prevents high contention on the lock as the process handles all
 * CPUs scheduling lower priority tasks.
 *
 * All CPUs that are scheduling a lower priority task will increment the
 * rto_loop_next variable. This will make sure that the irq work iterator
 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
 * priority task, even if the iterator is in the middle of a scan. Incrementing
 * rto_loop_next will cause the iterator to perform another full scan;
 * see the simplified sketch below.
 */
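/*
 * Illustrative sketch (not part of rt.c): rto_loop / rto_loop_next form a
 * generation counter.  Any CPU may bump the "next" generation locklessly;
 * the single iterator keeps rescanning until the generation it completed
 * matches the latest one requested.  The acquire read pairs with the
 * writer-side barrier so a new scan also sees the rto_mask updates made
 * before the bump.  Simplified model with a hypothetical helper:
 */
#if 0
static void example_generation_scan(atomic_t *next_gen)
{
	int gen;

	do {
		gen = atomic_read_acquire(next_gen);	/* latest request */
		scan_all_overloaded_cpus();		/* hypothetical   */
	} while (atomic_read_acquire(next_gen) != gen);	/* raced? rescan  */
}
#endif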
static int rto_next_cpu(struct root_domain *rd)
{
	int next;
	int cpu;

	/*
	 * When starting the IPI RT pushing, the rto_cpu is set to -1,
	 * rto_next_cpu() will simply return the first CPU found in
	 * the rto_mask.
	 *
	 * If rto_next_cpu() is called with rto_cpu being a valid CPU, it
	 * will return the next CPU found in the rto_mask.
	 *
	 * If there are no more CPUs left in the rto_mask, then a check is made
	 * against rto_loop and rto_loop_next. rto_loop is only updated with
	 * the rto_lock held, but any CPU may increment the rto_loop_next
	 * without any locking.
	 */
	for (;;) {

		/* When rto_cpu is -1 this acts like cpumask_first() */
		cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);

		rd->rto_cpu = cpu;

		if (cpu < nr_cpu_ids)
			return cpu;

		rd->rto_cpu = -1;

		/*
		 * ACQUIRE ensures we see the @rto_mask changes
		 * made prior to the @next value observed.
		 *
		 * Matches WMB in rt_set_overload().
		 */
		next = atomic_read_acquire(&rd->rto_loop_next);

		if (rd->rto_loop == next)
			break;

		rd->rto_loop = next;
	}

	return -1;
}
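/*
 * Illustrative sketch (not part of rt.c): cpumask_next(n, mask) returns the
 * first set CPU strictly after @n, so passing -1 behaves like
 * cpumask_first().  That is why rd->rto_cpu is reset to -1 between passes.
 */
#if 0
static void example_walk_mask(const struct cpumask *mask)
{
	int cpu = -1;

	while ((cpu = cpumask_next(cpu, mask)) < nr_cpu_ids)
		pr_info("next overloaded CPU: %d\n", cpu);
}
#endif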
static inline bool rto_start_trylock(atomic_t *v)
{
	/* Only the first caller to flip 0 -> 1 gets to start the loop */
	return !atomic_cmpxchg_acquire(v, 0, 1);
}

static inline void rto_start_unlock(atomic_t *v)
{
	atomic_set_release(v, 0);
}
static void tell_cpu_to_push(struct rq *rq)
{
	int cpu = -1;

	/* Keep the loop going if the IPI is currently active */
	atomic_inc(&rq->rd->rto_loop_next);

	/* Only one CPU can initiate a loop at a time */
	if (!rto_start_trylock(&rq->rd->rto_loop_start))
		return;

	raw_spin_lock(&rq->rd->rto_lock);

	/*
	 * The rto_cpu is updated under the lock; if it has a valid CPU
	 * then the IPI is still running and will continue due to the
	 * update to loop_next, and nothing needs to be done here.
	 * Otherwise it is finishing up and an IPI needs to be sent.
	 */
	if (rq->rd->rto_cpu < 0)
		cpu = rto_next_cpu(rq->rd);

	raw_spin_unlock(&rq->rd->rto_lock);

	rto_start_unlock(&rq->rd->rto_loop_start);

	if (cpu >= 0) {
		/* Make sure the rd does not get freed while pushing */
		sched_get_rd(rq->rd);
		irq_work_queue_on(&rq->rd->rto_push_work, cpu);
	}
}
/* Called from hardirq context */
void rto_push_irq_work_func(struct irq_work *work)
{
	struct root_domain *rd =
		container_of(work, struct root_domain, rto_push_work);
	struct rq *rq;
	int cpu;

	rq = this_rq();

	/*
	 * We do not need to grab the lock to check for has_pushable_tasks.
	 * When it gets updated, a check is made if a push is possible.
	 */
	if (has_pushable_tasks(rq)) {
		raw_spin_lock(&rq->lock);
		push_rt_tasks(rq);
		raw_spin_unlock(&rq->lock);
	}

	raw_spin_lock(&rd->rto_lock);

	/* Pass the IPI to the next rt overloaded queue */
	cpu = rto_next_cpu(rd);

	raw_spin_unlock(&rd->rto_lock);

	if (cpu < 0) {
		sched_put_rd(rd);
		return;
	}

	/* Try the next RT overloaded CPU */
	irq_work_queue_on(&rd->rto_push_work, cpu);
}
#endif /* HAVE_RT_PUSH_IPI */
static void pull_rt_task(struct rq *this_rq)
{
	int this_cpu = this_rq->cpu, cpu;
	bool resched = false;
	struct task_struct *p;
	struct rq *src_rq;
	int rt_overload_count = rt_overloaded(this_rq);

	if (likely(!rt_overload_count))
		return;

	/*
	 * Match the barrier from rt_set_overload(); this guarantees that if we
	 * see overloaded we must also see the rto_mask bit.
	 */
	smp_rmb();

	/* If we are the only overloaded CPU do nothing */
	if (rt_overload_count == 1 &&
	    cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
		return;

#ifdef HAVE_RT_PUSH_IPI
	if (sched_feat(RT_PUSH_IPI)) {
		tell_cpu_to_push(this_rq);
		return;
	}
#endif

	for_each_cpu(cpu, this_rq->rd->rto_mask) {
		if (this_cpu == cpu)
			continue;

		src_rq = cpu_rq(cpu);

		/*
		 * Don't bother taking the src_rq->lock if the next highest
		 * task is known to be lower-priority than our current task.
		 * This may look racy, but if this value is about to go
		 * logically higher, the src_rq will push this task away.
		 * And if it's going logically lower, we do not care.
		 */
		if (src_rq->rt.highest_prio.next >=
		    this_rq->rt.highest_prio.curr)
			continue;

		/*
		 * We can potentially drop this_rq's lock in
		 * double_lock_balance, and another CPU could
		 * alter this_rq
		 */
		double_lock_balance(this_rq, src_rq);

		/*
		 * We can pull only a task that is pushable on its
		 * own rq, and no others.
		 */
		p = pick_highest_pushable_task(src_rq, this_cpu);

		/*
		 * Do we have an RT task that preempts
		 * the to-be-scheduled task?
		 */
		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
			WARN_ON(p == src_rq->curr);
			WARN_ON(!task_on_rq_queued(p));

			/*
			 * There's a chance that p is higher in priority
			 * than what's currently running on its CPU.
			 * This is just because p is waking up and hasn't
			 * had a chance to schedule yet. We only pull
			 * p if it is lower in priority than the
			 * current task on the run queue.
			 */
			if (p->prio < src_rq->curr->prio)
				goto skip;

			resched = true;

			deactivate_task(src_rq, p, 0);
			set_task_cpu(p, this_cpu);
			activate_task(this_rq, p, 0);
			/*
			 * We continue with the search, just in
			 * case there's an even higher prio task
			 * in another runqueue. (low likelihood
			 * but possible)
			 */
		}
skip:
		double_unlock_balance(this_rq, src_rq);
	}

	if (resched)
		resched_curr(this_rq);
}
/*
 * If we are not running and we are not going to reschedule soon, we should
 * try to push tasks away now.
 */
static void task_woken_rt(struct rq *rq, struct task_struct *p)
{
	if (!task_running(rq, p) &&
	    !test_tsk_need_resched(rq->curr) &&
	    p->nr_cpus_allowed > 1 &&
	    (dl_task(rq->curr) || rt_task(rq->curr)) &&
	    (rq->curr->nr_cpus_allowed < 2 ||
	     rq->curr->prio <= p->prio))
		push_rt_tasks(rq);
}
/* Assumes rq->lock is held */
static void rq_online_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_set_overload(rq);

	__enable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
}

/* Assumes rq->lock is held */
static void rq_offline_rt(struct rq *rq)
{
	if (rq->rt.overloaded)
		rt_clear_overload(rq);

	__disable_runtime(rq);

	cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
}
/*
 * When switching away from the rt queue, we bring ourselves to a position
 * that we might want to pull RT tasks from other runqueues.
 */
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If there are other RT tasks then we will reschedule
	 * and the scheduling of the other RT tasks will handle
	 * the balancing. But if we are the last RT task
	 * we may need to handle the pulling of RT tasks
	 * now.
	 */
	if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
		return;

	queue_pull_task(rq);
}
void __init init_sched_rt_class(void)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
					GFP_KERNEL, cpu_to_node(i));
	}
}
#endif /* CONFIG_SMP */
/*
 * When switching a task to RT, we may overload the runqueue
 * with RT tasks. In this case we try to push them off to
 * other runqueues.
 */
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
	/*
	 * If we are already running, then there's nothing
	 * that needs to be done. But if we are not running
	 * we may need to preempt the current running task.
	 * If that current running task is also an RT task
	 * then see if we can move to another run queue.
	 */
	if (task_on_rq_queued(p) && rq->curr != p) {
#ifdef CONFIG_SMP
		if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
			queue_push_tasks(rq);
#endif /* CONFIG_SMP */
		if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
			resched_curr(rq);
	}
}
/*
 * Priority of the task has changed. This may cause
 * us to initiate a push or pull.
 */
static void
prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
{
	if (!task_on_rq_queued(p))
		return;

	if (rq->curr == p) {
#ifdef CONFIG_SMP
		/*
		 * If our priority decreases while running, we
		 * may need to pull tasks to this runqueue.
		 */
		if (oldprio < p->prio)
			queue_pull_task(rq);

		/*
		 * If there's a higher priority task waiting to run
		 * then reschedule.
		 */
		if (p->prio > rq->rt.highest_prio.curr)
			resched_curr(rq);
#else
		/* For UP simply resched on drop of prio */
		if (oldprio < p->prio)
			resched_curr(rq);
#endif /* CONFIG_SMP */
	} else {
		/*
		 * This task is not running, but if its priority is
		 * higher than that of the current running task
		 * then reschedule.
		 */
		if (p->prio < rq->curr->prio)
			resched_curr(rq);
	}
}
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	if (soft != RLIM_INFINITY) {
		unsigned long next;

		if (p->rt.watchdog_stamp != jiffies) {
			p->rt.timeout++;
			p->rt.watchdog_stamp = jiffies;
		}

		next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
		if (p->rt.timeout > next)
			p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
	}
}
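/*
 * Illustrative worked example (not part of rt.c): RLIMIT_RTTIME is given in
 * microseconds while p->rt.timeout counts scheduler ticks, hence the
 * DIV_ROUND_UP(usec, USEC_PER_SEC / HZ) conversion above.  Assuming HZ == 250
 * (4000 us per tick):
 *
 *	soft = 950000 us  ->  next = DIV_ROUND_UP(950000, 4000) = 238 ticks
 *
 * Once the task has accumulated more than 238 ticks of runtime, sched_exp is
 * armed and the posix CPU timer path is expected to deliver the RLIMIT_RTTIME
 * signals.
 */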
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
	struct sched_rt_entity *rt_se = &p->rt;

	update_curr_rt(rq);

	watchdog(rq, p);

	/*
	 * RR tasks need a special form of timeslice management.
	 * FIFO tasks have no timeslices.
	 */
	if (p->policy != SCHED_RR)
		return;

	if (--p->rt.time_slice)
		return;

	p->rt.time_slice = sched_rr_timeslice;

	/*
	 * Requeue to the end of queue if we (and all of our ancestors) are not
	 * the only element on the queue
	 */
	for_each_sched_rt_entity(rt_se) {
		if (rt_se->run_list.prev != rt_se->run_list.next) {
			requeue_task_rt(rq, p, 0);
			resched_curr(rq);
			return;
		}
	}
}
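/*
 * Illustrative sketch (not part of rt.c): on a circular doubly linked list,
 * a node that is the only entry has both of its links pointing at the list
 * head, so prev == next.  That is what the run_list.prev != run_list.next
 * test above exploits to decide whether requeueing to the tail would change
 * anything.  Standalone model with a minimal list implementation:
 */
#if 0
#include <assert.h>

struct node { struct node *prev, *next; };

static void list_init(struct node *head)
{
	head->prev = head->next = head;
}

static void list_add_tail(struct node *n, struct node *head)
{
	n->prev = head->prev;
	n->next = head;
	head->prev->next = n;
	head->prev = n;
}

int main(void)
{
	struct node head, a, b;

	list_init(&head);
	list_add_tail(&a, &head);
	assert(a.prev == a.next);	/* only entry: requeue is a no-op  */
	list_add_tail(&b, &head);
	assert(a.prev != a.next);	/* a sibling exists: requeue helps */
	return 0;
}
#endif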
static void set_curr_task_rt(struct rq *rq)
{
	struct task_struct *p = rq->curr;

	p->se.exec_start = rq_clock_task(rq);

	/* The running task is never eligible for pushing */
	dequeue_pushable_task(rq, p);
}

static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
{
	/*
	 * Time slice is 0 for SCHED_FIFO tasks
	 */
	if (task->policy == SCHED_RR)
		return sched_rr_timeslice;
	else
		return 0;
}
const struct sched_class rt_sched_class = {
	.next			= &fair_sched_class,
	.enqueue_task		= enqueue_task_rt,
	.dequeue_task		= dequeue_task_rt,
	.yield_task		= yield_task_rt,

	.check_preempt_curr	= check_preempt_curr_rt,

	.pick_next_task		= pick_next_task_rt,
	.put_prev_task		= put_prev_task_rt,

#ifdef CONFIG_SMP
	.select_task_rq		= select_task_rq_rt,

	.set_cpus_allowed	= set_cpus_allowed_common,
	.rq_online		= rq_online_rt,
	.rq_offline		= rq_offline_rt,
	.task_woken		= task_woken_rt,
	.switched_from		= switched_from_rt,
#endif

	.set_curr_task		= set_curr_task_rt,
	.task_tick		= task_tick_rt,

	.get_rr_interval	= get_rr_interval_rt,

	.prio_changed		= prio_changed_rt,
	.switched_to		= switched_to_rt,

	.update_curr		= update_curr_rt,
};
#ifdef CONFIG_SCHED_DEBUG
extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);

void print_rt_stats(struct seq_file *m, int cpu)
{
	rt_rq_iter_t iter;
	struct rt_rq *rt_rq;

	rcu_read_lock();
	for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
		print_rt_rq(m, cpu, rt_rq);
	rcu_read_unlock();
}
#endif /* CONFIG_SCHED_DEBUG */