CFS进程调度.docx

资源描述

CFS进程调度.docx

《CFS进程调度.docx》由会员分享，可在线阅读，更多相关《CFS进程调度.docx（40页珍藏版）》请在冰豆网上搜索。

CFS进程调度.docx

CFS进程调度

分类：

arm-linux 2011-12-02 10:14 1253人阅读 评论（0） 收藏 举报

struct statistics class thread up optimization

[cpp] viewplaincopy

1.一、概述

2.linux 2.6.23中采用了一个全新的调度策略CFS（Completely Fair Scheduler）来处理非实时进程。

4.二、主要数据结构

5.1.为了和原先的实时策略更好的融合，linux在实现CFS之余，还将内核的调度策略模块化，添加了新的结构体sched_class用于管理不同的调度器。

7.2.CFS没有用传统的调度器中时间片的概念，而是使用了新的结构体sched_entity来跟踪一个进程的运行时间。

8. se也可表示组调度，在此不做分析，所以这里se代表一个进程。

9.struct sched_entity {

10. struct load_weight load; /* for load-balancing */ //se的权重

11. struct rb_node run_node; //在红黑树上的节点

12. unsigned int on_rq; //该se是否在rq上

13.

14. u64 exec_start; //当前cfs_rq的时间，用于计算时间差

15. u64 sum_exec_runtime; //进程总共运行的时间，real-run time

16. u64 vruntime; //进程的virtual-run time

17. u64 prev_sum_exec_runtime; //进程本次被调度运行时sum_exec_runtime的值，用于计算本次已运行的时间

18.

19. u64 nr_migrations;

20.};

21.

22.3.rq

23.

24.三、tick中断处理

25.每一次进入tick中断后，会进入scheduler_tick函数来进行scheduler相关的处理：

26.void scheduler_tick（void）

27.{

28. int cpu = smp_processor_id（）;

29. struct rq *rq = cpu_rq（cpu）;

30. struct task_struct *curr = rq->curr;

31.

32. raw_spin_lock（&rq->lock）;

33. /*

34. 更新运行队列的时间，rq->clock和rq->clock_task，可以看出在真正计算

35. 时间的时候用的是clock_task

36. */

37. update_rq_clock（rq）;

38.

39. curr->sched_class->task_tick（rq, curr, 0）;

40. raw_spin_unlock（&rq->lock）;

41.

42. perf_event_task_tick（）;

43.}

44.

45.由于添加了模块化的架构，如果采用的是CFS，将会跳转到函数task_tick_fair进而到函数entity_tick。

46.这个函数的作用是更新进程的时间数据，再判断是否被其他进程抢占。

47.static void

48.entity_tick（struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued）

49.{

50. /*

51. * Update run-time statistics of the 'current'.

52. */

53. update_curr（cfs_rq）;

54.

55. if （cfs_rq->nr_running > 1）

56. check_preempt_tick（cfs_rq, curr）;

57.}

58.

59.正如函数注释所说，update_curr用来更新run-time的统计数据，系统会根据这些数据最终决定调度哪个进程。

60.static void update_curr（struct cfs_rq *cfs_rq）

61.{

62. struct sched_entity *curr = cfs_rq->curr;

63. u64 now = rq_of（cfs_rq）->clock_task;

64. unsigned long delta_exec;

65.

66. if （unlikely（!curr））

67. return;

68.

69. /*

70. * Get the amount of time the current task was running

71. * since the last time we changed load （this cannot

72. * overflow on 32 bits）:

73. */

74. /*

75. delta_exec为自从上次变动以来的时间。

76. */

77. delta_exec = （unsigned long）（now - curr->exec_start）;

78. if （!delta_exec）

79. return;

80.

81. __update_curr（cfs_rq, curr, delta_exec）;

82. curr->exec_start = now;

83.}

84.

85./*

86. * Update the current task's runtime statistics. Skip current tasks that

87. * are not in our scheduling class.

88. */

89.static inline void

90.__update_curr（struct cfs_rq *cfs_rq, struct sched_entity *curr,

91. unsigned long delta_exec）

92.{

93. unsigned long delta_exec_weighted;

94.

95. //更新进程的真实运行时间

96. curr->sum_exec_runtime += delta_exec;

97.

98. /* calc_delta_fair用来将真实时间转化为虚拟时间。

99. 进程的优先级不同，它在系统中的地位（也就是权重）也不同

100. 进程的优先级越高，它的虚拟时间走的越慢。

101. */

102. delta_exec_weighted = calc_delta_fair（delta_exec, curr）;

103.

104. //更新进程的虚拟运行时间

105. curr->vruntime += delta_exec_weighted;

106. update_min_vruntime（cfs_rq）;

107.}

108.

109.每个进程在其产生（fork）的时候，都会根据其父亲的优先级产生它的优先级和权重（sched_fork函数）。

110./*

111. * delta /= w

112. */

113.static inline unsigned long

114.calc_delta_fair（unsigned long delta, struct sched_entity *se）

115.{

116. //如果该进程拥有nice为0的权重，这时它的虚拟时钟和真实时钟是一样速度的。

117. if （unlikely（se->load.weight != NICE_0_LOAD））

118. delta = calc_delta_mine（delta, NICE_0_LOAD, &se->load）;

119.

120. return delta;

121.}

122.

123.从注释来看calc_delta_mine的计算公式为delta *= weight / lw，也就是说进程的权重越大，时钟走的越慢，而且是线性的。

124.

125.min_vruntime是cfs的rq中的一个成员，是cfs时间的基准，在cfs中起着至关重要的作用。

126.自cfs产生以来，这部分的代码改动也是很频繁的。

127.static void update_min_vruntime（struct cfs_rq *cfs_rq）

128.{

129. u64 vruntime = cfs_rq->min_vruntime;

130.

131. /*

132. 由于当前运行的进程是不在红黑树上的，所以关于cfs_rq->min_vruntime的更新

133. 必须要考虑当前的进程，以免产生不公平，这是以前的调度器所疏忽的。

134. 如果有当前进程，就以当前进程作为基准计算

135. */

136. if （cfs_rq->curr）

137. vruntime = cfs_rq->curr->vruntime;

138.

139. if （cfs_rq->rb_leftmost） {

140. struct sched_entity *se = rb_entry（cfs_rq->rb_leftmost,

141. struct sched_entity,

142. run_node）;

143.

144. if （!cfs_rq->curr）

145. /*

146. 如果没有当前进程，这个在什么时候出现？

147. 其他策略的进程在运行时？

148. 就不用考虑它了，就是最小的运行时间

149. */

150. vruntime = se->vruntime;

151. else

152. /*

153. 如果有当前进程，还要考虑这个最小的运行时间

154. */

155. vruntime = min_vruntime（vruntime, se->vruntime）;

156. }

157.

158. //最后，更新cfs_rq->min_vruntime，这个值是单调增加的。

159. cfs_rq->min_vruntime = max_vruntime（cfs_rq->min_vruntime, vruntime）;

160.}

161.

162.分析完update_curr函数，我们回到entity_tick函数，接着往下看。

下面两行的作用是判断当前的进程是否需要被抢占。

能够发生抢占首要条件是nr_running大于1。

163./*

164. * Preempt the current task with a newly woken task if needed:

165. */

166.static void

167.check_preempt_tick（struct cfs_rq *cfs_rq, struct sched_entity *curr）

168.{

169. unsigned long ideal_runtime, delta_exec;

170. struct sched_entity *se;

171. s64 delta;

172.

173. //计算curr进程的理想运行时间

174. ideal_runtime = sched_slice（cfs_rq, curr）;

175. //计算该进程实际运行时间

176. delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;

177. //如果实际运行的时间超出了它应该运行的时间，则set该进程的TIF_NEED_RESCHED标志位

178. if （delta_exec > ideal_runtime） {

179. resched_task（rq_of（cfs_rq）->curr）;

180. //如注释，如果当前进程运行了足够时间，就取消它在buddy中的优先权

181. /*

182. * The current task ran long enough, ensure it doesn't get

183. * re-elected due to buddy favours.

184. */

185. clear_buddies（cfs_rq, curr）;

186. return;

187. }

188.

189. /*

190. 这里是第二个抢占条件：

最小虚拟运行时间的进程和当前进程的虚拟运行时间进行比较，

191. 如果后者比前者大了 ideal_runtime，就需要进行调度。

192. 为什么要用虚拟时间和sched_slice产生的真实时间进行比较呢？

193. 大了ideal_runtime又能代表什么呢？

194. */

195. /*

196. * Ensure that a task that missed wakeup preemption by a

197. * narrow margin doesn't have to wait for a full slice.

198. * This also mitigates buddy induced latencies under load.

199. */

200. if （delta_exec < sysctl_sched_min_granularity）

201. return;

202.

203. se = __pick_first_entity（cfs_rq）;

204. delta = curr->vruntime - se->vruntime;

205.

206. if （delta < 0）

207. return;

208.

209. if （delta > ideal_runtime）

210. resched_task（rq_of（cfs_rq）->curr）;

211.}

212.

213.sched_slice函数用来计算一个进程时间基准（wall time），一个进程的理论运行时间是和整个cfs_rq中的进程数量和权重有关系的。

214.不考虑调度组的话，函数等价于：

215./*

216. * We calculate the wall-time slice from the period by taking a part

217. * proportional to the weight.

218. *

219. * s = p*P[w/rw]

220. */

221.static u64 sched_slice（struct cfs_rq *cfs_rq, struct sched_entity *se）

222.{

223. /*

224. __sched_period这个函数得到的是每一个进程运行一次的时间总和，公式为：

225. p = （nr <= nl） ? l : mg * nr

226. l ：系统常数，为调度延时，就是系统所有进程运行一次的时间总和

227. nl ：系统常数，系统活动进程的上限

228. nr ：当前的进程数

229. mg ：系统常数，最小的调度时间间隔

230. */

231. u64 slice = __sched_period（cfs_rq->nr_running + !se->on_rq）;

232. struct load_weight *load;

233. struct load_weight lw;

234.

235. load = &cfs_rq->load;

236.

237. //这里和上面都考虑了一种特殊情况：进程刚刚创建时还不在运行队列中，为了计算合理，就把它的权重临时加上去。

238. if （unlikely（!se->on_rq）） {

239. lw = cfs_rq->load;

240.

241. update_load_add（&lw, se->load.weight）;

242. load = &lw;

243. }

244. //根据权重得到属于自己的那份时间

245. slice = calc_delta_mine（slice, se->load.weight, load）;

246.

247. return slice;

248.}

249.

250.四、新进程中的调度

251.1.sched_fork出现在copy_process中。

252./*

253. * fork（）/clone（）-time setup:

254. */

255.void sched_fork（struct task_struct *p）

256.{

257. unsigned long flags;

258. int cpu = get_cpu（）;

259.

260. //初始化task_struct中调度器相关的成员。

261. __sched_fork（p）;

262. /*

263. * We mark the process as running here. This guarantees that

264. * nobody will actually run it, and a signal or other external

265. * event cannot wake it up and insert it on the runqueue either.

266. */

267. p->state = TASK_RUNNING;

268.

269. //确保临时的优先级的提升不会继承到新的进程中

270. /*

271. * Make sure we do not leak PI boosting priority to the child.

272. */

273. p->prio = current->normal_prio;

274.

275. //如果设置了sched_reset_on_fork，会恢复默认调度策略

276. /*

277. * Revert to default priority/policy on fork if requested.

278. */

279. if （unlikely（p->sched_reset_on_fork）） {

280. if （task_has_rt_policy（p）） {

281. p->policy = SCHED_NORMAL;

282. p->static_prio = NICE_TO_PRIO（0）;

283. p->rt_priority = 0;

284. } else if （PRIO_TO_NICE（p->static_prio） < 0）

285. p->static_prio = NICE_T

展开阅读全文