[PATCH] new tlb flush code
Manfred Spraul
manfreds at colorfullife.com
Fri Jan 28 23:02:51 CST 2000
Below is my new tlb flush code:
* it fixes the race between switch_mm() and switch_to()
* smp_invalidate_interrupt() optimization: flush_tlb_page() now flushes only
the one affected page on the other cpus instead of the complete tlb
(a stand-alone sketch of this logic follows the list).
* the patch is against 2.3.40
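
To make that concrete, here is a small stand-alone model (illustrative only,
not part of the patch) of the decision the invalidate IPI makes; the mock_*
functions are placeholders for local_flush_tlb(), __flush_tlb_one() and
leave_mm(), and the names are mine, not the kernel's:

/* User-space model of smp_invalidate_interrupt()'s core logic. */
#include <stdio.h>

#define TLBSTATE_OK   1
#define TLBSTATE_LAZY 2
#define TLBSTATE_OLD  3
#define FLUSH_ALL     0xFFFFffff

struct mm;                                   /* opaque stand-in for mm_struct */
struct tlb_state { struct mm *active_mm; int state; };

static void mock_flush_all(void)             { printf("flush whole tlb\n"); }
static void mock_flush_one(unsigned long va) { printf("flush page %#lx\n", va); }
static void mock_leave_mm(struct tlb_state *ts)
{
	ts->state = TLBSTATE_OLD;            /* drop out of the lazy mm */
	printf("leave lazy mm\n");
}

/* Only react if the flushed mm is the one this cpu has loaded; flush a
 * single page when a va is given, everything for FLUSH_ALL, and let a
 * lazy cpu simply leave the mm instead of flushing. */
static void invalidate_ipi(struct tlb_state *ts,
			   struct mm *flush_mm, unsigned long flush_va)
{
	if (flush_mm != ts->active_mm)
		return;
	if (ts->state == TLBSTATE_OK) {
		if (flush_va == FLUSH_ALL)
			mock_flush_all();
		else
			mock_flush_one(flush_va);
	} else {
		mock_leave_mm(ts);
	}
}

int main(void)
{
	struct mm *mm = (struct mm *)0x1000;     /* dummy identity */
	struct tlb_state ts = { mm, TLBSTATE_OK };

	invalidate_ipi(&ts, mm, 0xbffff000);     /* single-page flush */
	invalidate_ipi(&ts, mm, FLUSH_ALL);      /* full flush */
	ts.state = TLBSTATE_LAZY;
	invalidate_ipi(&ts, mm, FLUSH_ALL);      /* lazy cpu leaves the mm */
	return 0;
}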
The patch is i386 only; all other architectures must add an
enter_lazy_tlb() macro (a sketch of the two simple variants follows this
list):
* It's usually empty on UP
* If the cpu doesn't support mm->context, then you could copy the
implementation from i386.
* If the cpu supports multiple active_mm's in the tlb, then you could
use the "negative version" of the i386 design:
- instead of storing "active_mm" in a special array, you store "mm" in
that array.
- flush_tlb_xy() resets "mm->cpu_vm_mask".
- the tlb flush interrupt adds itself to "mm->cpu_vm_mask" if that mm is
active.
- I don't know if flush_tlb_all() can clear "mm->cpu_vm_mask".
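
To make the porting requirement concrete, here is roughly what the two simple
variants look like, reduced to a stand-alone sketch (the types are trimmed to
the bare minimum, the _up/_smp suffixes only exist to show both variants side
by side, and the "negative version" for cpus with multiple active mms is not
shown):

/* Sketch of the two simple enter_lazy_tlb() variants, stand-alone. */
#include <assert.h>
#include <stdio.h>

#define NR_CPUS       2
#define TLBSTATE_OK   1
#define TLBSTATE_LAZY 2
#define TLBSTATE_OLD  3

struct mm_struct;                        /* opaque here */
struct task_struct;                      /* opaque here */
struct tlb_state { struct mm_struct *active_mm; int state; };
static struct tlb_state cpu_tlbstate[NR_CPUS];

/* UP: lazy tlb switching needs no bookkeeping at all. */
static inline void enter_lazy_tlb_up(struct mm_struct *mm,
				     struct task_struct *tsk, unsigned cpu)
{
	(void)mm; (void)tsk; (void)cpu;
}

/* i386/SMP, as in this patch: remember that this cpu still has the mm
 * loaded but no longer cares whether its user mappings stay current. */
static inline void enter_lazy_tlb_smp(struct mm_struct *mm,
				      struct task_struct *tsk, unsigned cpu)
{
	(void)mm; (void)tsk;
	if (cpu_tlbstate[cpu].state == TLBSTATE_OK)
		cpu_tlbstate[cpu].state = TLBSTATE_LAZY;
}

int main(void)
{
	enter_lazy_tlb_up(NULL, NULL, 0);        /* no-op on UP */

	cpu_tlbstate[0].state = TLBSTATE_OK;
	enter_lazy_tlb_smp(NULL, NULL, 0);
	assert(cpu_tlbstate[0].state == TLBSTATE_LAZY);

	/* A cpu already marked OLD stays OLD: switch_mm() must flush later. */
	cpu_tlbstate[1].state = TLBSTATE_OLD;
	enter_lazy_tlb_smp(NULL, NULL, 1);
	assert(cpu_tlbstate[1].state == TLBSTATE_OLD);

	printf("lazy tlb state transitions behave as described\n");
	return 0;
}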
It's tested on my K6/200 and my Dual-PII, and it's stable.
--
Manfred
------------ next part ------------
// $Header$
// Kernel Version:
// VERSION = 2
// PATCHLEVEL = 3
// SUBLEVEL = 40
// EXTRAVERSION =
--- 2.3/arch/i386/kernel/smp.c Fri Jan 21 12:59:23 2000
+++ build-2.3/arch/i386/kernel/smp.c Fri Jan 28 20:21:46 2000
@@ -103,8 +103,7 @@
/* The 'big kernel lock' */
spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
-volatile unsigned long smp_invalidate_needed; /* immediate flush required */
-unsigned int cpu_tlbbad[NR_CPUS]; /* flush before returning to user space */
+struct tlb_state cpu_tlbstate[NR_CPUS];
/*
* the following functions deal with sending IPIs between CPUs.
@@ -282,74 +281,140 @@
}
/*
- * This is fraught with deadlocks. Probably the situation is not that
- * bad as in the early days of SMP, so we might ease some of the
- * paranoia here.
+ * Smarter SMP flushing macros.
+ * c/o Linus Torvalds.
+ *
+ * These mean you can really definitely utterly forget about
+ * writing to user space from interrupts. (It's not allowed anyway).
+ *
+ * Optimizations Manfred Spraul <manfreds at colorfullife.com>
*/
-static void flush_tlb_others(unsigned int cpumask)
+#define TLB_PARANOIA 1
+
+static volatile unsigned long flush_cpumask;
+static struct mm_struct * flush_mm;
+static unsigned long flush_va;
+#define FLUSH_ALL 0xFFFFffff
+
+static void inline leave_mm(unsigned long cpu)
{
- int cpu = smp_processor_id();
- int stuck;
- unsigned long flags;
+#ifdef TLB_PARANOIA
+ if(cpu_tlbstate[cpu].state == TLBSTATE_OK)
+ BUG();
+#endif
+ clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
+ cpu_tlbstate[cpu].state = TLBSTATE_OLD;
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * 1) set_bit(cpu, &new_mm->cpu_vm_mask);
+ * 2) update cpu_tlbstate
+ * [now the cpu can accept tlb flush request for the new mm]
+ * 3) change cr3 (if required, or flush local tlb,...)
+ * 4) clear_bit(cpu, &old_mm->cpu_vm_mask);
+ * 5) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ * runs in kernel space, the cpu could load tlb entries for user space
+ * pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ * We cannot call mmdrop() because we are in interrupt context,
+ * instead update cpu_tlbstate.
+ */
+
+asmlinkage void smp_invalidate_interrupt(void)
+{
+ unsigned long cpu = smp_processor_id();
+
+ if (flush_mm == cpu_tlbstate[cpu].active_mm) {
+ if (cpu_tlbstate[cpu].state == TLBSTATE_OK) {
+ if(flush_va == FLUSH_ALL)
+ local_flush_tlb();
+ else
+ __flush_tlb_one(flush_va);
+ } else {
+ leave_mm(cpu);
+ }
+ }
+ ack_APIC_irq();
+ clear_bit(cpu, &flush_cpumask);
+}
+
+static void flush_tlb_others(unsigned long cpumask, struct mm_struct *mm, unsigned long va)
+{
+#ifdef TLB_PARANOIA
+ if(in_interrupt()) {
+ printk(KERN_EMERG "tlb flush from interrupt: %d,%d\n",
+ local_bh_count[smp_processor_id()],
+ local_irq_count[smp_processor_id()]);
+ }
+ if(cpumask & (1<<smp_processor_id())) {
+ printk(KERN_EMERG "flush_tlb_others: bad cpumask!\n");
+ cpumask &= ~(1<<smp_processor_id());
+ local_flush_tlb();
+ }
+ {
+ int flags;
+
+ save_flags(flags);
+ if(flags != 1) {
+static int limit=10;
+ if(limit > 0) {
+ limit--;
+ printk(KERN_EMERG "flush_tlb_others: possible lock-up, broken!(%d)\n",
+ flags);
+/* show_stack(NULL);*/
+ }
+ sti();
+ }
+ }
+#endif
+ cpumask &= cpu_online_map;
/*
* it's important that we do not generate any APIC traffic
* until the AP CPUs have booted up!
*/
- cpumask &= cpu_online_map;
if (cpumask) {
- atomic_set_mask(cpumask, &smp_invalidate_needed);
-
- /*
- * Processors spinning on some lock with IRQs disabled
- * will see this IRQ late. The smp_invalidate_needed
- * map will ensure they don't do a spurious flush tlb
- * or miss one.
- */
-
- __save_flags(flags);
- __cli();
-
+static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+ spin_lock(&lock);
+
+ flush_mm = mm;
+ flush_va = va;
+ atomic_set_mask(cpumask, &flush_cpumask);
send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
- /*
- * Spin waiting for completion
- */
-
- stuck = 50000000;
- while (smp_invalidate_needed) {
- /*
- * Take care of "crossing" invalidates
+ while (flush_cpumask) {
+ /* FIXME: lockup-detection, print backtrace on
+ * lock-up
*/
- if (test_bit(cpu, &smp_invalidate_needed))
- do_flush_tlb_local();
-
- --stuck;
- if (!stuck) {
- printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
- break;
- }
}
- __restore_flags(flags);
+ flush_mm = flush_va = 0;
+ spin_unlock(&lock);
}
}
-
-/*
- * Smarter SMP flushing macros.
- * c/o Linus Torvalds.
- *
- * These mean you can really definitely utterly forget about
- * writing to user space from interrupts. (Its not allowed anyway).
- */
+
void flush_tlb_current_task(void)
{
unsigned long vm_mask = 1 << smp_processor_id();
struct mm_struct *mm = current->mm;
unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask;
- mm->cpu_vm_mask = vm_mask;
- flush_tlb_others(cpu_mask);
local_flush_tlb();
+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
}
void flush_tlb_mm(struct mm_struct * mm)
@@ -357,12 +422,14 @@
unsigned long vm_mask = 1 << smp_processor_id();
unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask;
- mm->cpu_vm_mask = 0;
if (current->active_mm == mm) {
- mm->cpu_vm_mask = vm_mask;
- local_flush_tlb();
+ if(current->mm)
+ local_flush_tlb();
+ else
+ leave_mm(smp_processor_id());
}
- flush_tlb_others(cpu_mask);
+
+ flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
}
void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
@@ -371,23 +438,22 @@
struct mm_struct *mm = vma->vm_mm;
unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask;
- mm->cpu_vm_mask = 0;
if (current->active_mm == mm) {
- __flush_tlb_one(va);
- mm->cpu_vm_mask = vm_mask;
+ if(current->mm)
+ __flush_tlb_one(va);
+ else
+ leave_mm(smp_processor_id());
}
- flush_tlb_others(cpu_mask);
+
+ flush_tlb_others(cpu_mask, mm, va);
}
static inline void do_flush_tlb_all_local(void)
{
+ unsigned long cpu = smp_processor_id();
__flush_tlb_all();
- if (!current->mm && current->active_mm) {
- unsigned long cpu = smp_processor_id();
-
- clear_bit(cpu, &current->active_mm->cpu_vm_mask);
- cpu_tlbbad[cpu] = 1;
- }
+ if (cpu_tlbstate[cpu].state == TLBSTATE_LAZY)
+ leave_mm(cpu);
}
static void flush_tlb_all_ipi(void* info)
@@ -512,23 +578,6 @@
asmlinkage void smp_reschedule_interrupt(void)
{
ack_APIC_irq();
-}
-
-/*
- * Invalidate call-back.
- *
- * Mark the CPU as a VM user if there is a active
- * thread holding on to an mm at this time. This
- * allows us to optimize CPU cross-calls even in the
- * presense of lazy TLB handling.
- */
-asmlinkage void smp_invalidate_interrupt(void)
-{
- if (test_bit(smp_processor_id(), &smp_invalidate_needed))
- do_flush_tlb_local();
-
- ack_APIC_irq();
-
}
asmlinkage void smp_call_function_interrupt(void)
--- 2.3/arch/i386/kernel/setup.c Fri Jan 21 12:59:23 2000
+++ build-2.3/arch/i386/kernel/setup.c Fri Jan 28 20:53:14 2000
@@ -75,7 +75,7 @@
#include <asm/e820.h>
#include <asm/dma.h>
#include <asm/mpspec.h>
-
+#include <asm/mmu_context.h>
/*
* Machine setup..
*/
@@ -1543,6 +1543,10 @@
*/
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;
+ if(current->mm)
+ BUG();
+ enter_lazy_tlb(&init_mm, current, nr);
+
t->esp0 = current->thread.esp0;
set_tss_desc(nr,t);
gdt_table[__TSS(nr)].b &= 0xfffffdff;
--- 2.3/arch/i386/kernel/irq.c Fri Jan 21 12:59:23 2000
+++ build-2.3/arch/i386/kernel/irq.c Fri Jan 28 20:21:46 2000
@@ -192,20 +192,6 @@
atomic_t global_bh_count;
atomic_t global_bh_lock;
-/*
- * "global_cli()" is a special case, in that it can hold the
- * interrupts disabled for a longish time, and also because
- * we may be doing TLB invalidates when holding the global
- * IRQ lock for historical reasons. Thus we may need to check
- * SMP invalidate events specially by hand here (but not in
- * any normal spinlocks)
- */
-static inline void check_smp_invalidate(int cpu)
-{
- if (test_bit(cpu, &smp_invalidate_needed))
- do_flush_tlb_local();
-}
-
static void show(char * str)
{
int i;
@@ -294,7 +280,6 @@
__sti();
SYNC_OTHER_CORES(cpu);
__cli();
- check_smp_invalidate(cpu);
if (atomic_read(&global_irq_count))
continue;
if (global_irq_lock)
@@ -346,7 +331,6 @@
/* Uhhuh.. Somebody else got it. Wait.. */
do {
do {
- check_smp_invalidate(cpu);
} while (test_bit(0,&global_irq_lock));
} while (test_and_set_bit(0,&global_irq_lock));
}
--- 2.3/kernel/sched.c Fri Jan 21 12:59:26 2000
+++ build-2.3/kernel/sched.c Fri Jan 28 20:52:50 2000
@@ -581,6 +581,7 @@
if (next->active_mm) BUG();
next->active_mm = oldmm;
atomic_inc(&oldmm->mm_count);
+ enter_lazy_tlb(oldmm, next, this_cpu);
} else {
if (next->active_mm != mm) BUG();
switch_mm(oldmm, mm, next, this_cpu);
@@ -1184,5 +1185,6 @@
* The boot idle thread does lazy MMU switching as well:
*/
atomic_inc(&init_mm.mm_count);
+ enter_lazy_tlb(&init_mm, current, cpu);
}
--- 2.3/kernel/exit.c Tue Dec 7 10:43:36 1999
+++ build-2.3/kernel/exit.c Fri Jan 28 20:21:47 2000
@@ -247,6 +247,7 @@
current->mm = NULL;
/* active_mm is still 'mm' */
atomic_inc(&mm->mm_count);
+ enter_lazy_tlb(mm, current, smp_processor_id());
return mm;
}
@@ -275,6 +276,7 @@
mm_release();
if (mm != tsk->active_mm) BUG();
tsk->mm = NULL;
+ enter_lazy_tlb(mm, current, smp_processor_id());
mmput(mm);
}
}
--- 2.3/include/asm-i386/pgalloc.h Fri Jan 21 12:59:26 2000
+++ build-2.3/include/asm-i386/pgalloc.h Fri Jan 28 20:21:47 2000
@@ -220,11 +220,6 @@
#else
-/*
- * We aren't very clever about this yet - SMP could certainly
- * avoid some global flushes..
- */
-
#include <asm/smp.h>
#define local_flush_tlb() \
@@ -242,22 +237,17 @@
flush_tlb_mm(mm);
}
-extern volatile unsigned long smp_invalidate_needed;
-extern unsigned int cpu_tlbbad[NR_CPUS];
+#define TLBSTATE_OK 1
+#define TLBSTATE_LAZY 2
+#define TLBSTATE_OLD 3
-static inline void do_flush_tlb_local(void)
+struct tlb_state
{
- unsigned long cpu = smp_processor_id();
- struct mm_struct *mm = current->mm;
+ struct mm_struct *active_mm;
+ int state;
+};
+extern struct tlb_state cpu_tlbstate[NR_CPUS];
- clear_bit(cpu, &smp_invalidate_needed);
- if (mm) {
- set_bit(cpu, &mm->cpu_vm_mask);
- local_flush_tlb();
- } else {
- cpu_tlbbad[cpu] = 1;
- }
-}
#endif
--- 2.3/include/asm-i386/mmu_context.h Tue Dec 7 10:49:04 1999
+++ build-2.3/include/asm-i386/mmu_context.h Fri Jan 28 20:51:12 2000
@@ -12,30 +12,46 @@
#define init_new_context(tsk,mm) do { } while (0)
#ifdef __SMP__
-extern unsigned int cpu_tlbbad[NR_CPUS];
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
+{
+ if(cpu_tlbstate[cpu].state == TLBSTATE_OK)
+ cpu_tlbstate[cpu].state = TLBSTATE_LAZY;
+}
+#else
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
+{
+}
#endif
static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
{
+ set_bit(cpu, &next->cpu_vm_mask);
if (prev != next) {
/*
* Re-load LDT if necessary
*/
if (prev->segments != next->segments)
load_LDT(next);
-
+#ifdef CONFIG_SMP
+ cpu_tlbstate[cpu].state = TLBSTATE_OK;
+ cpu_tlbstate[cpu].active_mm = next;
+#endif
/* Re-load page tables */
asm volatile("movl %0,%%cr3": :"r" (__pa(next->pgd)));
clear_bit(cpu, &prev->cpu_vm_mask);
}
#ifdef __SMP__
else {
- if(cpu_tlbbad[cpu])
+ int old_state = cpu_tlbstate[cpu].state;
+ cpu_tlbstate[cpu].state = TLBSTATE_OK;
+ if(cpu_tlbstate[cpu].active_mm != next)
+ BUG();
+ if(old_state == TLBSTATE_OLD)
local_flush_tlb();
}
- cpu_tlbbad[cpu] = 0;
+
#endif
- set_bit(cpu, &next->cpu_vm_mask);
}
#define activate_mm(prev, next) \