How the Linux Kernel Preemption Patch Works (with Code)

A CPU running in the kernel is not unpreemptible everywhere: the kernel contains gaps in which preemption is safe. The basic idea of the kernel preemption patch is to treat the code regions that may run concurrently under SMP as regions where kernel preemption is also allowed.

The Linux 2.4 kernel had already refined its kernel-thread synchronization machinery for multi-CPU systems, explicitly marking every non-parallelizable instruction block with spinlocks and rwlocks, so the patch follows naturally from that work. Concretely, a preempt_count field is added to the process's task structure as a kernel preemption lock; it is raised and lowered together with spinlock and rwlock acquire/release. When preempt_count is 0, kernel preemption (rescheduling inside the kernel) is allowed. The preemption entry point into the scheduler is preempt_schedule(), which marks the current process with the TASK_PREEMPTED state and then calls schedule(); while a process is in the TASK_PREEMPTED state, schedule() does not remove it from the run queue.
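
The mechanism can be summarized in a simplified C sketch before looking at the patch itself. This is only an illustration of the idea described above; the names preempt_disable()/preempt_enable() are placeholders used here for clarity, while the actual patch (shown below) spells the same operations ctx_sw_off()/ctx_sw_on() and manipulates preempt_count with atomic operations:

/* Simplified sketch of the preemption-lock idea (illustrative only). */
#define preempt_disable()	do { current->preempt_count++; } while (0)
#define preempt_enable() \
	do { \
		if (--current->preempt_count == 0 && current->need_resched) \
			preempt_schedule();	/* a preemption point */ \
	} while (0)

/* Every spinlock critical section becomes a non-preemptible region. */
#define spin_lock(lock)		do { preempt_disable(); _raw_spin_lock(lock); } while (0)
#define spin_unlock(lock)	do { _raw_spin_unlock(lock); preempt_enable(); } while (0)

/* Entered with need_resched set; TASK_PREEMPTED keeps the task on the run queue. */
void preempt_schedule(void)
{
	current->preempt_count++;		/* no recursive preemption inside schedule() */
	current->state |= TASK_PREEMPTED;
	schedule();				/* does not dequeue a TASK_PREEMPTED task */
	current->state &= ~TASK_PREEMPTED;
	current->preempt_count--;
}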

Below is an outline of the main code of the kernel preemption patch:

arch/i386/kernel/entry.S:

preempt_count = 4	# reuse the flags slot of task_struct as preempt_count
			# (the flags field itself has been moved elsewhere)

ret_from_exception:	# return from an exception
#ifdef CONFIG_SMP
	GET_CURRENT(%ebx)
	movl processor(%ebx),%eax
	shll $CONFIG_X86_L1_CACHE_SHIFT,%eax
	movl SYMBOL_NAME(irq_stat)(,%eax),%ecx		# softirq_active
	testl SYMBOL_NAME(irq_stat)+4(,%eax),%ecx	# softirq_mask
#else
	movl SYMBOL_NAME(irq_stat),%ecx			# softirq_active
	testl SYMBOL_NAME(irq_stat)+4,%ecx		# softirq_mask
#endif
	jne   handle_softirq
#ifdef CONFIG_PREEMPT
	cli
	incl preempt_count(%ebx)	# the exception entry path has no instruction that
					# disables kernel preemption, so bump the count
					# here to match ret_from_intr below
#endif

ENTRY(ret_from_intr)		# return from a hardware interrupt
	GET_CURRENT(%ebx)
#ifdef CONFIG_PREEMPT
	cli
	decl preempt_count(%ebx)	# restore the kernel preemption count
#endif
	movl EFLAGS(%esp),%eax		# mix EFLAGS and CS
	movb CS(%esp),%al
	testl $(VM_MASK | 3),%eax	# return to VM86 mode or non-supervisor?
	jne ret_with_reschedule
#ifdef CONFIG_PREEMPT
	cmpl $0,preempt_count(%ebx)
	jnz restore_all			# a non-zero preempt_count means preemption is disabled
	cmpl $0,need_resched(%ebx)
	jz restore_all
	movl SYMBOL_NAME(irq_stat)+irq_stat_local_bh_count CPU_INDX,%ecx
	addl SYMBOL_NAME(irq_stat)+irq_stat_local_irq_count CPU_INDX,%ecx
	jnz restore_all
	incl preempt_count(%ebx)
	sti
	call SYMBOL_NAME(preempt_schedule)
	jmp ret_from_intr		# the newly scheduled process also returns via
					# ret_from_intr, restoring the preemption count
					# before the final return
#else
	jmp restore_all
#endif

	ALIGN
handle_softirq:
#ifdef CONFIG_PREEMPT
	cli
	GET_CURRENT(%ebx)
	incl preempt_count(%ebx)
	sti
#endif
	call SYMBOL_NAME(do_softirq)
	jmp ret_from_intr

	ALIGN
reschedule:
	call SYMBOL_NAME(schedule)	# test
	jmp ret_from_sys_call

include/asm/hw_irq.h:

	...
#ifdef CONFIG_PREEMPT
#define BUMP_CONTEX_SWITCH_LOCK \
	GET_CURRENT \
	"incl 4(%ebx)\n\t"
#else
#define BUMP_CONTEX_SWITCH_LOCK
#endif

#define SAVE_ALL /* save the interrupted context at hardware interrupt entry */ \
	"cld\n\t" \
	"pushl %es\n\t" \
	"pushl %ds\n\t" \
	"pushl %eax\n\t" \
	"pushl %ebp\n\t" \
	"pushl %edi\n\t" \
	"pushl %esi\n\t" \
	"pushl %edx\n\t" \
	"pushl %ecx\n\t" \
	"pushl %ebx\n\t" \
	"movl $" STR(__KERNEL_DS) ",%edx\n\t" \
	"movl %edx,%ds\n\t" \
	"movl %edx,%es\n\t" \
	BUMP_CONTEX_SWITCH_LOCK	/* the hardware interrupt entry disables kernel preemption */

include/linux/spinlock.h:

#ifdef CONFIG_PREEMPT
#define switch_lock_count()	current->preempt_count
#define in_ctx_sw_off()		(switch_lock_count().counter)	/* is the current process's
								   preemption count non-zero? */
#define atomic_ptr_in_ctx_sw_off()	(&switch_lock_count())

#define ctx_sw_off() /* disable kernel preemption */ \
	do { \
		atomic_inc(atomic_ptr_in_ctx_sw_off()); /* bump the current process's preemption count */ \
	} while (0)

#define ctx_sw_on_no_preempt() /* allow kernel preemption */ \
	do { \
		atomic_dec(atomic_ptr_in_ctx_sw_off()); /* drop the current process's preemption count */ \
	} while (0)

#define ctx_sw_on() /* allow kernel preemption and preempt if needed */ \
	do { \
		if (atomic_dec_and_test(atomic_ptr_in_ctx_sw_off()) && \
				current->need_resched) \
			preempt_schedule(); \
	} while (0)

#define spin_lock(lock) \
	do { \
		ctx_sw_off(); /* disable preemption when entering the spinlock */ \
		_raw_spin_lock(lock); \
	} while (0)

#define spin_trylock(lock)	({ctx_sw_off(); _raw_spin_trylock(lock) ? \
				1 : ({ctx_sw_on(); 0;});})	/* try the lock; if it was
								   already held, re-enable
								   preemption and return 0 */

#define spin_unlock(lock) \
	do { \
		_raw_spin_unlock(lock); \
		ctx_sw_on(); /* re-enable preemption (and preempt if needed) when leaving the spinlock */ \
	} while (0)

#define read_lock(lock)		({ctx_sw_off(); _raw_read_lock(lock);})
#define read_unlock(lock)	({_raw_read_unlock(lock); ctx_sw_on();})
#define write_lock(lock)	({ctx_sw_off(); _raw_write_lock(lock);})
#define write_unlock(lock)	({_raw_write_unlock(lock); ctx_sw_on();})
#define write_trylock(lock)	({ctx_sw_off(); _raw_write_trylock(lock) ? \
				1 : ({ctx_sw_on(); 0;});})
	...
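
Before moving on to the remaining files, the CONFIG_PREEMPT branch of ret_from_intr above can be read as the following C-like pseudocode. This is only a reading aid, not code from the patch; the helper names are placeholders for what the assembly checks directly in irq_stat and the task structure:

/* Pseudocode for the preemption check on return from a hardware interrupt. */
if (returning_to_kernel_mode) {			/* user/VM86 returns take ret_with_reschedule */
	if (current->preempt_count == 0 &&	/* preemption is not locked out       */
	    current->need_resched != 0 &&	/* a reschedule request is pending    */
	    local_irq_count(cpu) + local_bh_count(cpu) == 0) {	/* not nested in irq/bh */
		current->preempt_count++;	/* block recursive preemption         */
		local_irq_enable();		/* the "sti" in the assembly          */
		preempt_schedule();		/* switch away right now              */
		/* the preempted task later resumes through ret_from_intr, which
		   decrements preempt_count again before the final return */
	}
}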

include/asm/softirq.h:

#define cpu_bh_disable(cpu)	do { ctx_sw_off(); local_bh_count(cpu)++; barrier(); } while (0)
#define cpu_bh_enable(cpu)	do { barrier(); local_bh_count(cpu)--; ctx_sw_on(); } while (0)

kernel/sched.c:

#ifdef CONFIG_PREEMPT
asmlinkage void preempt_schedule(void)
{
	while (current->need_resched) {
		ctx_sw_off();
		current->state |= TASK_PREEMPTED;
		schedule();
		current->state &= ~TASK_PREEMPTED;
		ctx_sw_on_no_preempt();
	}
}
#endif

asmlinkage void schedule(void)
{
	struct schedule_data * sched_data;
	struct task_struct *prev, *next, *p;
	struct list_head *tmp;
	int this_cpu, c;

#ifdef CONFIG_PREEMPT
	ctx_sw_off();
#endif
	if (!current->active_mm) BUG();
need_resched_back:
	prev = current;
	this_cpu = prev->processor;

	if (in_interrupt())
		goto scheduling_in_interrupt;

	release_kernel_lock(prev, this_cpu);

	/* Do "administrative" work here while we don't hold any locks */
	if (softirq_active(this_cpu) & softirq_mask(this_cpu))
		goto handle_softirq;
handle_softirq_back:

	/*
	 * 'sched_data' is protected by the fact that we can run
	 * only one process per CPU.
	 */
	sched_data = & aligned_data[this_cpu].schedule_data;

	spin_lock_irq(&runqueue_lock);

	/* move an exhausted RR process to be last.. */
	if (prev->policy == SCHED_RR)
		goto move_rr_last;
move_rr_back:

	switch (prev->state) {
		case TASK_INTERRUPTIBLE:
			if (signal_pending(prev)) {
				prev->state = TASK_RUNNING;
				break;
			}
		default:
#ifdef CONFIG_PREEMPT
			if (prev->state & TASK_PREEMPTED)
				break;	/* a kernel-preempted task stays on the run queue */
#endif
			del_from_runqueue(prev);
#ifdef CONFIG_PREEMPT
		case TASK_PREEMPTED:
#endif
		case TASK_RUNNING:;
	}
	prev->need_resched = 0;

	/*
	 * this is the scheduler proper:
	 */

repeat_schedule:
	/*
	 * Default process to select..
	 */
	next = idle_task(this_cpu);
	c = -1000;
	if (task_on_runqueue(prev))
		goto still_running;

still_running_back:
	list_for_each(tmp, &runqueue_head) {
		p = list_entry(tmp, struct task_struct, run_list);
		if (can_schedule(p, this_cpu)) {
			int weight = goodness(p, this_cpu, prev->active_mm);
			if (weight > c)
				c = weight, next = p;
		}
	}

	/* Do we need to re-calculate counters? */
	if (!c)
		goto recalculate;
	/*
	 * from this point on nothing can prevent us from
	 * switching to the next task, save this fact in
	 * sched_data.
	 */
	sched_data->curr = next;
#ifdef CONFIG_SMP
	next->has_cpu = 1;
	next->processor = this_cpu;
#endif
	spin_unlock_irq(&runqueue_lock);

	if (prev == next)
		goto same_process;

#ifdef CONFIG_SMP
	/*
	 * maintain the per-process 'last schedule' value.
	 * (this has to be recalculated even if we reschedule to
	 * the same process) Currently this is only used on SMP,
	 * and it's approximate, so we do not have to maintain
	 * it while holding the runqueue spinlock.
	 */
	sched_data->last_schedule = get_cycles();

	/*
	 * We drop the scheduler lock early (it's a global spinlock),
	 * thus we have to lock the previous process from getting
	 * rescheduled during switch_to().
	 */
#endif /* CONFIG_SMP */

	kstat.context_swtch++;
	/*
	 * there are 3 processes which are affected by a context switch:
	 *
	 * prev == .... ==> (last => next)
	 *
	 * It's the 'much more previous' 'prev' that is on next's stack,
	 * but prev is set to (the just run) 'last' process by switch_to().
	 * This might sound slightly confusing but makes tons of sense.
	 */
	prepare_to_switch();
	{
		struct mm_struct *mm = next->mm;
		struct mm_struct *oldmm = prev->active_mm;
		if (!mm) {
			if (next->active_mm) BUG();
			next->active_mm = oldmm;
			atomic_inc(&oldmm->mm_count);
			enter_lazy_tlb(oldmm, next, this_cpu);
		} else {
			if (next->active_mm != mm) BUG();
			switch_mm(oldmm, mm, next, this_cpu);
		}

		if (!prev->mm) {
			prev->active_mm = NULL;
			mmdrop(oldmm);
		}
	}

	/*
	 * This just switches the register state and the
	 * stack.
	 */
	switch_to(prev, next, prev);
	__schedule_tail(prev);

same_process:
	reacquire_kernel_lock(current);
	if (current->need_resched)
		goto need_resched_back;

#ifdef CONFIG_PREEMPT
	ctx_sw_on_no_preempt();
#endif
	return;

recalculate:
	{
		struct task_struct *p;
		spin_unlock_irq(&runqueue_lock);
		read_lock(&tasklist_lock);
		for_each_task(p)
			p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice);
		read_unlock(&tasklist_lock);
		spin_lock_irq(&runqueue_lock);
	}
	goto repeat_schedule;

still_running:
	c = goodness(prev, this_cpu, prev->active_mm);
	next = prev;
	goto still_running_back;

handle_softirq:
	do_softirq();
	goto handle_softirq_back;

move_rr_last:
	if (!prev->counter) {
		prev->counter = NICE_TO_TICKS(prev->nice);
		move_last_runqueue(prev);
	}
	goto move_rr_back;

scheduling_in_interrupt:
	printk("Scheduling in interrupt\n");
	BUG();
	return;
}

void schedule_tail(struct task_struct *prev)
{
	__schedule_tail(prev);
#ifdef CONFIG_PREEMPT
	ctx_sw_on();
#endif
}
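
As a usage illustration, consider an ordinary critical section under this patch. The function and lock below are hypothetical and not part of the patch; they only show that, with the redefined spin_lock()/spin_unlock(), the locked region is automatically non-preemptible, and a pending reschedule is honored when the lock is released (ctx_sw_on() calls preempt_schedule() if need_resched is set):

static spinlock_t example_lock = SPIN_LOCK_UNLOCKED;	/* hypothetical example lock */
static int shared_counter;				/* hypothetical shared data  */

void example_update(void)
{
	spin_lock(&example_lock);	/* ctx_sw_off(): preempt_count++, preemption disabled */
	shared_counter++;		/* protected from other CPUs and from kernel preemption */
	spin_unlock(&example_lock);	/* ctx_sw_on(): preempt_count--; if it reaches 0 and
					   need_resched is set, preempt_schedule() runs here */
}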
