[PATCH] new tlb flush code

Manfred Spraul manfreds at colorfullife.com
Fri Jan 28 23:02:51 CST 2000


Below is my new tlb flush code:

* It fixes the race between switch_mm() and switch_to().
* smp_invalidate_interrupt() optimization: flush_tlb_page() now flushes only
the affected page instead of the complete tlb (see the sketch after this
list).
* The patch is against 2.3.40.
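
For reference, a condensed sketch of that single-page path, using only names
introduced by the patch below (cpu_tlbstate, flush_mm, flush_va, FLUSH_ALL,
leave_mm); it is trimmed for illustration, the authoritative version is in
the arch/i386/kernel/smp.c hunk:

/*
 * Flush IPI handler (condensed from the smp.c hunk below): flush one
 * page if flush_va names one, flush the whole tlb for FLUSH_ALL, and on
 * a cpu that only holds the mm lazily, leave the mm instead of flushing.
 */
asmlinkage void smp_invalidate_interrupt(void)
{
	unsigned long cpu = smp_processor_id();

	if (flush_mm == cpu_tlbstate[cpu].active_mm) {
		if (cpu_tlbstate[cpu].state == TLBSTATE_OK) {
			if (flush_va == FLUSH_ALL)
				local_flush_tlb();		/* full flush */
			else
				__flush_tlb_one(flush_va);	/* one page */
		} else {
			leave_mm(cpu);	/* lazy tlb: drop the mm */
		}
	}
	ack_APIC_irq();
	clear_bit(cpu, &flush_cpumask);
}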

The patch is i386-only; all other architectures must add an
enter_lazy_tlb() macro (a sketch of the expected interface follows this
list):

* It's usually empty on UP
* If the cpu doesn't support mm->context, then you could copy the
implementation from i386.
* If the cpu supports multiple active_mm's in the tlb, then you could
use the "negative version" of the i386 design:
- instead of storing "active_mm" in a special array, you store "mm" in
that array.
- the flush_tlb_*() functions clear "mm->cpu_vm_mask".
- the tlb flush interrupt sets its cpu's bit in "mm->cpu_vm_mask" if that
mm is active.
- I don't know whether flush_tlb_all() can clear "mm->cpu_vm_mask".
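
To make the expected interface concrete, this is the shape of the inline a
port needs; the SMP body is the i386 implementation added by this patch (see
the mmu_context.h hunk below), the UP variant is empty:

#ifdef __SMP__
static inline void enter_lazy_tlb(struct mm_struct *mm,
				  struct task_struct *tsk, unsigned cpu)
{
	/*
	 * The cpu keeps active_mm's page tables loaded, but from now on
	 * a flush IPI may call leave_mm() and mark the state OLD instead
	 * of flushing.
	 */
	if (cpu_tlbstate[cpu].state == TLBSTATE_OK)
		cpu_tlbstate[cpu].state = TLBSTATE_LAZY;
}
#else
/* UP: nothing to do. */
static inline void enter_lazy_tlb(struct mm_struct *mm,
				  struct task_struct *tsk, unsigned cpu)
{
}
#endif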

It's tested on my K6/200 and my Dual-PII, and it's stable.

--
	Manfred
------------ next part ------------
// $Header$
// Kernel Version:
//  VERSION = 2
//  PATCHLEVEL = 3
//  SUBLEVEL = 40
//  EXTRAVERSION =
--- 2.3/arch/i386/kernel/smp.c	Fri Jan 21 12:59:23 2000
+++ build-2.3/arch/i386/kernel/smp.c	Fri Jan 28 20:21:46 2000
@@ -103,8 +103,7 @@
 /* The 'big kernel lock' */
 spinlock_t kernel_flag = SPIN_LOCK_UNLOCKED;
 
-volatile unsigned long smp_invalidate_needed; /* immediate flush required */
-unsigned int cpu_tlbbad[NR_CPUS]; /* flush before returning to user space */
+struct tlb_state cpu_tlbstate[NR_CPUS];
 
 /*
  * the following functions deal with sending IPIs between CPUs.
@@ -282,74 +281,140 @@
 }
 
 /*
- * This is fraught with deadlocks. Probably the situation is not that
- * bad as in the early days of SMP, so we might ease some of the
- * paranoia here.
+ *	Smarter SMP flushing macros. 
+ *		c/o Linus Torvalds.
+ *
+ *	These mean you can really definitely utterly forget about
+ *	writing to user space from interrupts. (Its not allowed anyway).
+ *
+ *	Optimizations Manfred Spraul <manfreds@colorfullife.com>
  */
-static void flush_tlb_others(unsigned int cpumask)
+#define TLB_PARANOIA 1
+
+static volatile unsigned long flush_cpumask;
+static struct mm_struct * flush_mm;
+static unsigned long flush_va;
+#define FLUSH_ALL	0xFFFFffff
+
+static void inline leave_mm(unsigned long cpu)
 {
-	int cpu = smp_processor_id();
-	int stuck;
-	unsigned long flags;
+#ifdef TLB_PARANOIA
+	if(cpu_tlbstate[cpu].state == TLBSTATE_OK)
+		BUG();
+#endif
+	clear_bit(cpu, &cpu_tlbstate[cpu].active_mm->cpu_vm_mask);
+	cpu_tlbstate[cpu].state = TLBSTATE_OLD;
+}
+
+/*
+ *
+ * The flush IPI assumes that a thread switch happens in this order:
+ * 1) set_bit(cpu, &new_mm->cpu_vm_mask);
+ * 2) update cpu_tlbstate
+ * [now the cpu can accept tlb flush request for the new mm]
+ * 3) change cr3 (if required, or flush local tlb,...)
+ * 4) clear_bit(cpu, &old_mm->cpu_vm_mask);
+ * 5) switch %%esp, ie current
+ *
+ * The interrupt must handle 2 special cases:
+ * - cr3 is changed before %%esp, ie. it cannot use current->{active_,}mm.
+ * - the cpu performs speculative tlb reads, i.e. even if the cpu only
+ *   runs in kernel space, the cpu could load tlb entries for user space
+ *   pages.
+ *
+ * The good news is that cpu_tlbstate is local to each cpu, no
+ * write/read ordering problems.
+ */
+
+/*
+ * TLB flush IPI:
+ *
+ * 1) Flush the tlb entries if the cpu uses the mm that's being flushed.
+ * 2) Leave the mm if we are in the lazy tlb mode.
+ * We cannot call mmdrop() because we are in interrupt context, 
+ * instead update cpu_tlbstate.
+ */
+
+asmlinkage void smp_invalidate_interrupt(void)
+{
+	unsigned long cpu = smp_processor_id();
+
+	if (flush_mm == cpu_tlbstate[cpu].active_mm) {
+		if (cpu_tlbstate[cpu].state == TLBSTATE_OK) {
+			if(flush_va == FLUSH_ALL)
+				local_flush_tlb();
+			 else
+			 	__flush_tlb_one(flush_va);
+		} else {
+			leave_mm(cpu);
+		}
+	}
+	ack_APIC_irq();
+	clear_bit(cpu, &flush_cpumask);
+}
+
+static void flush_tlb_others(unsigned long cpumask, struct mm_struct *mm, unsigned long va)
+{
+#ifdef TLB_PARANOIA
+	if(in_interrupt()) {
+		printk(KERN_EMERG "tlb flush from interrupt: %d,%d",
+			local_bh_count[smp_processor_id()],
+			local_irq_count[smp_processor_id()]);
+	}
+	if(cpumask & (1<<smp_processor_id())) {
+		printk(KERN_EMERG "flush_tlb_others: bad cpumask!");
+		cpumask &= ~(1<<smp_processor_id());	
+		local_flush_tlb();
+	}
+	{
+		int flags;
+
+		save_flags(flags);
+		if(flags != 1) {
+static int limit=10;
+			if(limit > 0) {
+				limit--;
+				printk(KERN_EMERG "flush_tlb_others: possible lock-up, broken!(%d)",
+						flags);
+/*				show_stack(NULL);*/
+			}
+			sti();
+		}
+	}
+#endif
+	cpumask &= cpu_online_map;
 
 	/*
 	 * it's important that we do not generate any APIC traffic
 	 * until the AP CPUs have booted up!
 	 */
-	cpumask &= cpu_online_map;
 	if (cpumask) {
-		atomic_set_mask(cpumask, &smp_invalidate_needed);
-
-		/*
-		 * Processors spinning on some lock with IRQs disabled
-		 * will see this IRQ late. The smp_invalidate_needed
-		 * map will ensure they don't do a spurious flush tlb
-		 * or miss one.
-		 */
-	
-		__save_flags(flags);
-		__cli();
-
+static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+		spin_lock(&lock);
+		
+		flush_mm = mm;
+		flush_va = va;
+		atomic_set_mask(cpumask, &flush_cpumask);
 		send_IPI_allbutself(INVALIDATE_TLB_VECTOR);
 
-		/*
-		 * Spin waiting for completion
-		 */
-
-		stuck = 50000000;
-		while (smp_invalidate_needed) {
-			/*
-			 * Take care of "crossing" invalidates
+		while (flush_cpumask) {
+			/* FIXME: lockup-detection, print backtrace on
+			 * lock-up
 			 */
-			if (test_bit(cpu, &smp_invalidate_needed))
-				do_flush_tlb_local();
-
-			--stuck;
-			if (!stuck) {
-				printk("stuck on TLB IPI wait (CPU#%d)\n",cpu);
-				break;
-			}
 		}
-		__restore_flags(flags);
+		flush_mm = flush_va = 0;
+		spin_unlock(&lock);
 	}
 }
-
-/*
- *	Smarter SMP flushing macros. 
- *		c/o Linus Torvalds.
- *
- *	These mean you can really definitely utterly forget about
- *	writing to user space from interrupts. (Its not allowed anyway).
- */	
+	
 void flush_tlb_current_task(void)
 {
 	unsigned long vm_mask = 1 << smp_processor_id();
 	struct mm_struct *mm = current->mm;
 	unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask;
 
-	mm->cpu_vm_mask = vm_mask;
-	flush_tlb_others(cpu_mask);
 	local_flush_tlb();
+	flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
 }
 
 void flush_tlb_mm(struct mm_struct * mm)
@@ -357,12 +422,14 @@
 	unsigned long vm_mask = 1 << smp_processor_id();
 	unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask;
 
-	mm->cpu_vm_mask = 0;
 	if (current->active_mm == mm) {
-		mm->cpu_vm_mask = vm_mask;
-		local_flush_tlb();
+		if(current->mm)
+			local_flush_tlb();
+		 else
+			leave_mm(smp_processor_id());
 	}
-	flush_tlb_others(cpu_mask);
+		
+	flush_tlb_others(cpu_mask, mm, FLUSH_ALL);
 }
 
 void flush_tlb_page(struct vm_area_struct * vma, unsigned long va)
@@ -371,23 +438,22 @@
 	struct mm_struct *mm = vma->vm_mm;
 	unsigned long cpu_mask = mm->cpu_vm_mask & ~vm_mask;
 
-	mm->cpu_vm_mask = 0;
 	if (current->active_mm == mm) {
-		__flush_tlb_one(va);
-		mm->cpu_vm_mask = vm_mask;
+		if(current->mm)
+			__flush_tlb_one(va);
+		 else
+		 	leave_mm(smp_processor_id());
 	}
-	flush_tlb_others(cpu_mask);
+
+	flush_tlb_others(cpu_mask, mm, va);
 }
 
 static inline void do_flush_tlb_all_local(void)
 {
+	unsigned long cpu = smp_processor_id();
 	__flush_tlb_all();
-	if (!current->mm && current->active_mm) {
-		unsigned long cpu = smp_processor_id();
-
-		clear_bit(cpu, &current->active_mm->cpu_vm_mask);
-		cpu_tlbbad[cpu] = 1;
-	}
+	if (cpu_tlbstate[cpu].state == TLBSTATE_LAZY)
+		leave_mm(cpu);
 }
 
 static void flush_tlb_all_ipi(void* info)
@@ -512,23 +578,6 @@
 asmlinkage void smp_reschedule_interrupt(void)
 {
 	ack_APIC_irq();
-}
-
-/*
- * Invalidate call-back.
- *
- * Mark the CPU as a VM user if there is a active
- * thread holding on to an mm at this time. This
- * allows us to optimize CPU cross-calls even in the
- * presense of lazy TLB handling.
- */
-asmlinkage void smp_invalidate_interrupt(void)
-{
-	if (test_bit(smp_processor_id(), &smp_invalidate_needed))
-		do_flush_tlb_local();
-
-	ack_APIC_irq();
-
 }
 
 asmlinkage void smp_call_function_interrupt(void)
--- 2.3/arch/i386/kernel/setup.c	Fri Jan 21 12:59:23 2000
+++ build-2.3/arch/i386/kernel/setup.c	Fri Jan 28 20:53:14 2000
@@ -75,7 +75,7 @@
 #include <asm/e820.h>
 #include <asm/dma.h>
 #include <asm/mpspec.h>
-
+#include <asm/mmu_context.h>
 /*
  * Machine setup..
  */
@@ -1543,6 +1543,10 @@
 	 */
 	atomic_inc(&init_mm.mm_count);
 	current->active_mm = &init_mm;
+	if(current->mm)
+		BUG();
+	enter_lazy_tlb(&init_mm, current, nr);
+
 	t->esp0 = current->thread.esp0;
 	set_tss_desc(nr,t);
 	gdt_table[__TSS(nr)].b &= 0xfffffdff;
--- 2.3/arch/i386/kernel/irq.c	Fri Jan 21 12:59:23 2000
+++ build-2.3/arch/i386/kernel/irq.c	Fri Jan 28 20:21:46 2000
@@ -192,20 +192,6 @@
 atomic_t global_bh_count;
 atomic_t global_bh_lock;
 
-/*
- * "global_cli()" is a special case, in that it can hold the
- * interrupts disabled for a longish time, and also because
- * we may be doing TLB invalidates when holding the global
- * IRQ lock for historical reasons. Thus we may need to check
- * SMP invalidate events specially by hand here (but not in
- * any normal spinlocks)
- */
-static inline void check_smp_invalidate(int cpu)
-{
-	if (test_bit(cpu, &smp_invalidate_needed))
-		do_flush_tlb_local();
-}
-
 static void show(char * str)
 {
 	int i;
@@ -294,7 +280,6 @@
 			__sti();
 			SYNC_OTHER_CORES(cpu);
 			__cli();
-			check_smp_invalidate(cpu);
 			if (atomic_read(&global_irq_count))
 				continue;
 			if (global_irq_lock)
@@ -346,7 +331,6 @@
 		/* Uhhuh.. Somebody else got it. Wait.. */
 		do {
 			do {
-				check_smp_invalidate(cpu);
 			} while (test_bit(0,&global_irq_lock));
 		} while (test_and_set_bit(0,&global_irq_lock));		
 	}
--- 2.3/kernel/sched.c	Fri Jan 21 12:59:26 2000
+++ build-2.3/kernel/sched.c	Fri Jan 28 20:52:50 2000
@@ -581,6 +581,7 @@
 			if (next->active_mm) BUG();
 			next->active_mm = oldmm;
 			atomic_inc(&oldmm->mm_count);
+			enter_lazy_tlb(oldmm, next, this_cpu);
 		} else {
 			if (next->active_mm != mm) BUG();
 			switch_mm(oldmm, mm, next, this_cpu);
@@ -1184,5 +1185,6 @@
 	 * The boot idle thread does lazy MMU switching as well:
 	 */
 	atomic_inc(&init_mm.mm_count);
+	enter_lazy_tlb(&init_mm, current, cpu);
 }
 
--- 2.3/kernel/exit.c	Tue Dec  7 10:43:36 1999
+++ build-2.3/kernel/exit.c	Fri Jan 28 20:21:47 2000
@@ -247,6 +247,7 @@
 	current->mm = NULL;
 	/* active_mm is still 'mm' */
 	atomic_inc(&mm->mm_count);
+	enter_lazy_tlb(mm, current, smp_processor_id());
 	return mm;
 }
 
@@ -275,6 +276,7 @@
 		mm_release();
 		if (mm != tsk->active_mm) BUG();
 		tsk->mm = NULL;
+		enter_lazy_tlb(mm, current, smp_processor_id());
 		mmput(mm);
 	}
 }
--- 2.3/include/asm-i386/pgalloc.h	Fri Jan 21 12:59:26 2000
+++ build-2.3/include/asm-i386/pgalloc.h	Fri Jan 28 20:21:47 2000
@@ -220,11 +220,6 @@
 
 #else
 
-/*
- * We aren't very clever about this yet -  SMP could certainly
- * avoid some global flushes..
- */
-
 #include <asm/smp.h>
 
 #define local_flush_tlb() \
@@ -242,22 +237,17 @@
 	flush_tlb_mm(mm);
 }
 
-extern volatile unsigned long smp_invalidate_needed;
-extern unsigned int cpu_tlbbad[NR_CPUS];
+#define TLBSTATE_OK	1
+#define TLBSTATE_LAZY	2
+#define TLBSTATE_OLD	3
 
-static inline void do_flush_tlb_local(void)
+struct tlb_state
 {
-	unsigned long cpu = smp_processor_id();
-	struct mm_struct *mm = current->mm;
+	struct mm_struct *active_mm;
+	int state;
+};
+extern struct tlb_state cpu_tlbstate[NR_CPUS];
 
-	clear_bit(cpu, &smp_invalidate_needed);
-	if (mm) {
-		set_bit(cpu, &mm->cpu_vm_mask);
-		local_flush_tlb();
-	} else {
-		cpu_tlbbad[cpu] = 1;
-	}
-}
 
 #endif
 
--- 2.3/include/asm-i386/mmu_context.h	Tue Dec  7 10:49:04 1999
+++ build-2.3/include/asm-i386/mmu_context.h	Fri Jan 28 20:51:12 2000
@@ -12,30 +12,46 @@
 #define init_new_context(tsk,mm)	do { } while (0)
 
 #ifdef __SMP__
-extern unsigned int cpu_tlbbad[NR_CPUS];
+
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
+{
+	if(cpu_tlbstate[cpu].state == TLBSTATE_OK)
+		cpu_tlbstate[cpu].state = TLBSTATE_LAZY;	
+}
+#else
+static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk, unsigned cpu)
+{
+}
 #endif
 
 static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk, unsigned cpu)
 {
+	set_bit(cpu, &next->cpu_vm_mask);
 	if (prev != next) {
 		/*
 		 * Re-load LDT if necessary
 		 */
 		if (prev->segments != next->segments)
 			load_LDT(next);
-
+#ifdef CONFIG_SMP
+		cpu_tlbstate[cpu].state = TLBSTATE_OK;
+		cpu_tlbstate[cpu].active_mm = next;
+#endif
 		/* Re-load page tables */
 		asm volatile("movl %0,%%cr3": :"r" (__pa(next->pgd)));
 		clear_bit(cpu, &prev->cpu_vm_mask);
 	}
 #ifdef __SMP__
 	else {
-		if(cpu_tlbbad[cpu])
+		int old_state = cpu_tlbstate[cpu].state;
+		cpu_tlbstate[cpu].state = TLBSTATE_OK;
+		if(cpu_tlbstate[cpu].active_mm != next)
+			BUG();
+		if(old_state == TLBSTATE_OLD)
 			local_flush_tlb();
 	}
-	cpu_tlbbad[cpu] = 0;
+
 #endif
-	set_bit(cpu, &next->cpu_vm_mask);
 }
 
 #define activate_mm(prev, next) \

