Index: linux/Documentation/DocBook/Makefile
===================================================================
--- linux.orig/Documentation/DocBook/Makefile
+++ linux/Documentation/DocBook/Makefile
@@ -10,7 +10,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mc
 	    kernel-hacking.xml kernel-locking.xml deviceiobook.xml \
 	    procfs-guide.xml writing_usb_driver.xml \
 	    sis900.xml kernel-api.xml journal-api.xml lsm.xml usb.xml \
-	    gadget.xml libata.xml mtdnand.xml librs.xml
+	    gadget.xml libata.xml mtdnand.xml librs.xml genericirq.xml
 
 ###
 # The build process is as follows (targets):
Index: linux/Documentation/DocBook/genericirq.tmpl
===================================================================
--- /dev/null
+++ linux/Documentation/DocBook/genericirq.tmpl
@@ -0,0 +1,560 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+	"http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Generic-IRQ-Guide">
+ <bookinfo>
+  <title>Linux generic IRQ handling</title>
+
+  <authorgroup>
+   <author>
+    <firstname>Thomas</firstname>
+    <surname>Gleixner</surname>
+    <affiliation>
+     <address>
+      <email>tglx@linutronix.de</email>
+     </address>
+    </affiliation>
+   </author>
+   <author>
+    <firstname>Ingo</firstname>
+    <surname>Molnar</surname>
+    <affiliation>
+     <address>
+      <email>mingo@elte.hu</email>
+     </address>
+    </affiliation>
+   </author>
+  </authorgroup>
+
+  <copyright>
+   <year>2005</year>
+   <holder>Thomas Gleixner</holder>
+  </copyright>
+  <copyright>
+   <year>2005</year>
+   <holder>Ingo Molnar</holder>
+  </copyright>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License version 2 as published by the Free Software Foundation.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+<toc></toc>
+
+  <chapter id="intro">
+    <title>Introduction</title>
+    <para>
+	The generic interrupt handling layer is designed to provide a
+	complete abstraction of interrupt handling for device drivers
+	and is able to handle all different types of interrupt controller
+	hardware. Device drivers use generic API functions to request, enable,
+	disable and free interrupts. The drivers do not have to know anything
+	about interrupt hardware, so they can be used on different hardware
+	platforms without code changes.
+    </para>
+    <para>
+  	This documentation is provided for developers who want to implement
+	architecture interrupt support based on the Generic IRQ handling layer.
+    </para>
+  </chapter>
+
+  <chapter id="rationale">
+    <title>Rationale</title>
+	<para>
+	The original implementation of interrupt handling in Linux is using
+	the __do_IRQ() super-handler, which must be able to deal with every
+	type of interrupt logic. This is achieved by an 'interrupt type'
+	structure and runtime flags to handle special cases.
+	Furthermore the superhandler assumed a certain type of interrupt
+	handling hardware and turned out to be not capable of handling all
+	kinds of interrupt controller hardware which can be found through
+	the architectures. The all in one approach also adds unnecessary
+	complexity for every user.
+	</para>
+	<para>
+	Originally, Russell King identified different types of handlers to
+	build a quite universal set for the ARM interrupt handler
+	implementation in Linux 2.5/2.6. He distinguished between:
+	<itemizedlist>
+	  <listitem><para>Level type</para></listitem>
+	  <listitem><para>Edge type</para></listitem>
+	  <listitem><para>Simple type</para></listitem>
+	</itemizedlist>
+	In the SMP world of the __do_IRQ() super-handler another type
+	was identified:
+	<itemizedlist>
+	  <listitem><para>Per CPU type</para></listitem>
+	</itemizedlist>
+	</para>
+	<para>
+	This split implementation of handlers allows to optimize the flow
+	of the interrupt handling for each specific interrupt type.
+	This reduces complexity in that particular code path and allows
+	the optimized handling of a given type.
+	</para>
+	<para>
+	The original general implementation uses interrupt_type structures
+	to differentiate the flow control in the super-handler. This
+	leads to a mix of flow logic and code related to hardware details.
+	Russell King's ARM implementation which replaced the type by a chip
+	abstraction did the mix the other way around.
+	</para>
+	<para>
+	The natural conclusion was a clean separation of the 'type flow'
+	and the	'chip'. Analysing a couple of architecture implementations
+	reveals that many of them can use a generic set of 'type flow'
+	implementations and only need to add the chip level specific code.
+	The separation is also valuable for the (sub)architectures,
+	which need specific quirks in the type flow itself, because it
+	provides a more transparent design.
+	</para>
+	<para>
+	Each interrupt type implementation has assigned its own flow
+	handler, which should be normally one of the generic
+	implementations. The flow handler implementation makes it
+	simple to provide demultiplexing handlers which can be found in
+	embedded platforms on various architectures.
+	</para>
+	<para>
+	The separation makes the generic interrupt handling more flexible
+	and extensible. A (sub)architecture can use a generic type flow
+	implementation for e.g. 'level type' interrupts and add a
+	(sub)architecture specific 'edge type' implementation.
+	</para>
+	<para>
+	To make the transition to the new model easier and prevent the
+	breakage of existing implementations the __do_IRQ() super-handler
+	is still available. This leads to a kind of duality for the time
+	being. Over time the new model should achieve a homogeneous
+	implementation scheme over all architectures with enhanced
+	maintainability and cleanliness.
+	</para>
+  </chapter>
+  <chapter id="bugs">
+    <title>Known Bugs And Assumptions</title>
+    <para>
+	None (hopefully).
+    </para>
+  </chapter>
+
+  <chapter id="Abstraction">
+    <title>Abstraction layers</title>
+    <para>
+	There are three main levels of abstraction in the interrupt code:
+	<orderedlist>
+	  <listitem><para>Highlevel driver API</para></listitem>
+	  <listitem><para>Abstract interrupt type</para></listitem>
+	  <listitem><para>Chiplevel hardware encapsulation</para></listitem>
+	</orderedlist>
+    </para>
+    <para>
+	The separation of interrupt type and chip level functionality
+	provides the most flexible design. This implementation can handle
+	all kinds of interrupt hardware and the necessary workarounds for
+	the interrupt types without the need of redundant implementations.
+	The separation also handles edge and level type interrupts
+	on the same hardware chip.
+    </para>
+    <sect1>
+	<title>Interrupt control flow</title>
+	<para>
+	Each interrupt is described by an interrupt description structure
+	irq_desc. The interrupt is referenced by an 'unsigned int' numeric
+	value which selects the corresponding interrupt description structure
+	in the description structures array.
+	The description structure contains status information and pointers
+	to the interrupt type structure and the interrupt chip structure
+	which are assigned to this interrupt.
+	</para>
+	<para>
+	Whenever an interrupt triggers, the lowlevel arch code calls into
+	the generic interrupt code by calling desc->handler->handle_irq().
+	This highlevel IRQ handling function only uses other
+	desc->handler primitives which describe the control flow operation
+	necessary for the interrupt type. These operations are calling
+	the chip primitives referenced by the assigned chip description
+	structure.
+	</para>
+    </sect1>
+    <sect1>
+	<title>Highlevel Driver API</title>
+	<para>
+	  The highlevel Driver API consists of the following functions:
+	  <itemizedlist>
+	  <listitem><para>request_irq()</para></listitem>
+	  <listitem><para>free_irq()</para></listitem>
+	  <listitem><para>disable_irq()</para></listitem>
+	  <listitem><para>enable_irq()</para></listitem>
+	  <listitem><para>disable_irq_nosync() (SMP only)</para></listitem>
+	  <listitem><para>synchronize_irq() (SMP only)</para></listitem>
+	  <listitem><para>set_irq_type()</para></listitem>
+	  <listitem><para>set_irq_wake()</para></listitem>
+	  <listitem><para>set_irq_data()</para></listitem>
+	  <listitem><para>set_irq_chip()</para></listitem>
+	  <listitem><para>set_irq_chip_data()</para></listitem>
+          </itemizedlist>
+	  See the autogenerated function documentation for details.
+	</para>
+    </sect1>
+    <sect1>
+	<title>Abstract interrupt type</title>
+	<para>
+	  The 'interrupt type' (struct irq_type) abstraction mainly consists of
+	  methods which implement the 'interrupt handling flow'. The generic
+	  layer provides a set of pre-defined types:
+	  <itemizedlist>
+	  <listitem><para>default_level_type</para></listitem>
+	  <listitem><para>default_edge_type</para></listitem>
+	  <listitem><para>default_simple_type</para></listitem>
+	  <listitem><para>default_percpu_type</para></listitem>
+	  </itemizedlist>
+	  The default type implementations use the generic type handlers.
+	  <itemizedlist>
+	  <listitem><para>handle_level_type</para></listitem>
+	  <listitem><para>handle_edge_type</para></listitem>
+	  <listitem><para>handle_simple_type</para></listitem>
+	  <listitem><para>handle_percpu_type</para></listitem>
+	  </itemizedlist>
+	  The interrupt types (either predefined or architecture specific) are
+	  assigned to specific interrupts by the architecture either during
+	  bootup or during device initialization.
+	</para>
+	<sect2>
+	<title>Default type implementations</title>
+	    <sect3>
+	 	<title>Helper functions</title>
+		<para>
+		The helper functions call the chip primitives and
+		are used by the default type implementations.
+		Following helper functions are implemented (simplified excerpt):
+		<programlisting>
+default_enable(irq)
+{
+	desc->chip->unmask(irq);
+}
+
+default_disable(irq)
+{
+	desc->chip->mask(irq);
+}
+
+default_ack(irq)
+{
+	chip->ack(irq);
+}
+
+default_mask_ack(irq)
+{
+	if (chip->mask_ack) {
+		chip->mask_ack(irq);
+	} else {
+		chip->mask(irq);
+		chip->ack(irq);
+	}
+}
+
+noop(irq)
+{
+}
+
+default_set_type(irq, type)
+{
+	if (desc->chip->set_type) {
+		if (desc->chip->set_type(irq, type))
+			return NULL;
+	}
+
+	return default_handler for type;
+}
+		</programlisting>
+	        </para>
+	    </sect3>
+	    <sect3>
+	 	<title>Default Level IRQ type</title>
+		<para>
+		The default Level IRQ type implements the functions
+		<simplelist type="horiz" columns="2">
+		<member>enable</member><member>default_enable</member>
+		<member>disable</member><member>default_disable</member>
+		<member>start</member><member>default_mask_ack</member>
+		<member>end</member><member>default_enable</member>
+		<member>handle_irq</member><member>handle_level_irq</member>
+		<member>set_type</member><member>default_set_type</member>
+		</simplelist>
+	        </para>
+	    </sect3>
+	    <sect3>
+	 	<title>Default Edge IRQ type</title>
+		<para>
+		The default Edge IRQ type implements the functions
+		<simplelist type="horiz" columns="2">
+		<member>enable</member><member>default_enable</member>
+		<member>disable</member><member>default_disable</member>
+		<member>start</member><member>default_ack</member>
+		<member>hold</member><member>default_mask_ack</member>
+		<member>end</member><member>noop</member>
+		<member>handle_irq</member><member>handle_edge_irq</member>
+		<member>set_type</member><member>default_set_type</member>
+		</simplelist>
+	        </para>
+	    </sect3>
+	    <sect3>
+	 	<title>Default simple IRQ type</title>
+		<para>
+		The default simple IRQ type implements the functions
+		<simplelist type="horiz" columns="2">
+		<member>enable</member><member>noop</member>
+		<member>disable</member><member>noop</member>
+		<member>handle_irq</member><member>handle_simple_irq</member>
+		</simplelist>
+	        </para>
+	    </sect3>
+	    <sect3>
+	 	<title>Default per CPU IRQ type</title>
+		<para>
+		The default per CPU IRQ type implements the functions
+		<simplelist type="horiz" columns="2">
+		<member>enable</member><member>default_enable</member>
+		<member>disable</member><member>default_disable</member>
+		<member>start</member><member>default_ack</member>
+		<member>end</member><member>default_enable</member>
+		<member>handle_irq</member><member>handle_percpu_irq</member>
+		</simplelist>
+	        </para>
+	    </sect3>
+	</sect2>
+	<sect2>
+	<title>Default type handler implementations</title>
+	    <sect3>
+	 	<title>Default Level IRQ type handler</title>
+		<para>
+		handle_level_type provides a generic implementation
+		for level type interrupts.
+		</para>
+		<para>
+		Following control flow is implemented (simplified excerpt):
+		<programlisting>
+desc->handler->start();
+handle_IRQ_event(desc->action);
+desc->handler->end();
+		</programlisting>
+		</para>
+   	    </sect3>
+	    <sect3>
+	 	<title>Default Edge IRQ type handler</title>
+		<para>
+		handle_edge_type provides a generic implementation
+		for edge type interrupts.
+		</para>
+		<para>
+		Following control flow is implemented (simplified excerpt):
+		<programlisting>
+if (desc->status &amp; running) {
+	desc->handler->hold();
+	desc->status |= pending | masked;
+	return;
+}
+desc->handler->start();
+desc->status |= running;
+do {
+	if (desc->status &amp; masked)
+		desc->handler->enable();
+	desc->status &amp;= ~pending;
+	handle_IRQ_event(desc->action);
+} while (desc->status &amp; pending);
+desc->status &amp;= ~running;
+desc->handler->end();
+		</programlisting>
+		</para>
+   	    </sect3>
+	    <sect3>
+	 	<title>Default simple IRQ type handler</title>
+		<para>
+		handle_simple_type provides a generic implementation
+		for simple type interrupts.
+		</para>
+		<para>
+		Note: The simple type handler does not call any
+		handler/chip primitives.
+		</para>
+		<para>
+		Following control flow is implemented (simplified excerpt):
+		<programlisting>
+handle_IRQ_event(desc->action);
+		</programlisting>
+		</para>
+   	    </sect3>
+	    <sect3>
+	 	<title>Default per CPU type handler</title>
+		<para>
+		handle_percpu_type provides a generic implementation
+		for per CPU type interrupts.
+		</para>
+		<para>
+		Per CPU interrupts are only available on SMP and
+		the handler provides a simplified version without
+		locking.
+		</para>
+		<para>
+		Following control flow is implemented (simplified excerpt):
+		<programlisting>
+desc->handler->start();
+handle_IRQ_event(desc->action);
+desc->handler->end();
+		</programlisting>
+		</para>
+   	    </sect3>
+	</sect2>
+	<sect2>
+	<title>Architecture specific type implementation</title>
+	<para>
+	  If an architecture needs to implement its own type structures, then
+	  the following primitives have to be implemented:
+	  <itemizedlist>
+	  <listitem><para>handle_irq() - The handle_irq function pointer should preferably point to
+	  one of the generic type handler functions</para></listitem>
+	  <listitem><para>startup() - Optional</para></listitem>
+	  <listitem><para>shutdown() - Optional</para></listitem>
+	  <listitem><para>enable()</para></listitem>
+	  <listitem><para>disable()</para></listitem>
+	  <listitem><para>start()</para></listitem>
+	  <listitem><para>hold() - For edge type interrupts only</para></listitem>
+	  <listitem><para>end()</para></listitem>
+	  <listitem><para>set_type - Optional</para></listitem>
+	  <listitem><para>set_affinity - SMP only</para></listitem>
+	  </itemizedlist>
+	</para>
+	</sect2>
+	<sect2>
+	<title>Quirks and optimizations</title>
+	<para>
+	The generic functions are intended for 'clean' architectures and chips,
+	which have no platform-specific IRQ handling quirks. If an architecture
+	needs to implement quirks on the 'flow' level then it can do so by
+	overriding the irqtype. This is also done for compatibility reasons, as
+	most architectures use irqtypes only at the moment.
+	</para>
+	<para>
+	An architecture could implement all of its IRQ logic via pushing
+	chip handling details into the irqtype's ->start()/->end()/->hold()
+	functions. This is only recommended when the underlying primitives
+	are pure chip primitives without additional quirks. The direct pointer
+	to the chip functions reduces the indirection level by one.
+	</para>
+	</sect2>
+    </sect1>
+    <sect1>
+	<title>Chiplevel hardware encapsulation</title>
+	<para>
+	The chip level hardware description structure irq_chip
+	contains all the direct chip relevant functions, which
+	can be utilized by the irq_type implementations.
+	  <itemizedlist>
+	  <listitem><para>ack()</para></listitem>
+	  <listitem><para>mask_ack() - Optional, recommended for performance</para></listitem>
+	  <listitem><para>mask()</para></listitem>
+	  <listitem><para>unmask()</para></listitem>
+	  <listitem><para>retrigger() - Optional</para></listitem>
+	  <listitem><para>set_type() - Optional</para></listitem>
+	  <listitem><para>set_wake() - Optional</para></listitem>
+	  </itemizedlist>
+	These primitives are strictly intended to mean what they say: ack means
+	ACK, masking means masking of an IRQ line, etc. It is up to the flow
+	handler(s) to use these basic units of lowlevel functionality.
+	</para>
+    </sect1>
+  </chapter>
+
+  <chapter id="doirq">
+     <title>__do_IRQ entry point</title>
+     <para>
+ 	The original implementation __do_IRQ() is an alternative entry
+	point for all types of interrupts.
+     </para>
+     <para>
+	This handler turned out to be not suitable for all
+	interrupt hardware and was therefore reimplemented with split
+	functionality for edge/level/simple/percpu interrupts. This is not
+	only a functional optimization. It also shortens code paths for
+	interrupts.
+      </para>
+      <para>
+	To make use of the split implementation, replace the call to
+	__do_IRQ by a call to desc->handler->handle_irq() and associate
+        the appropriate handler function to desc->handler->handle_irq().
+	In most cases the generic type and handler implementations should
+	be sufficient.
+     </para>
+  </chapter>
+
+  <chapter id="locking">
+     <title>Locking on SMP</title>
+     <para>
+	The locking of chip registers is up to the architecture that
+	defines the chip primitives. There is a chip->lock field that can be used
+	for serialization, but the generic layer does not touch it. The per-irq
+	structure is protected via desc->lock, by the generic layer.
+     </para>
+  </chapter>
+  <chapter id="structs">
+     <title>Structures</title>
+     <para>
+     This chapter contains the autogenerated documentation of the structures which are
+     used in the generic IRQ layer.
+     </para>
+!Iinclude/linux/irq.h
+  </chapter>
+
+  <chapter id="pubfunctions">
+     <title>Public Functions Provided</title>
+     <para>
+     This chapter contains the autogenerated documentation of the kernel API functions
+      which are exported.
+     </para>
+!Ekernel/irq/manage.c
+  </chapter>
+
+  <chapter id="intfunctions">
+     <title>Internal Functions Provided</title>
+     <para>
+     This chapter contains the autogenerated documentation of the internal functions.
+     </para>
+!Ikernel/irq/handle.c
+  </chapter>
+
+  <chapter id="credits">
+     <title>Credits</title>
+	<para>
+		The following people have contributed to this document:
+		<orderedlist>
+			<listitem><para>Thomas Gleixner<email>tglx@linutronix.de</email></para></listitem>
+			<listitem><para>Ingo Molnar<email>mingo@elte.hu</email></para></listitem>
+		</orderedlist>
+	</para>
+  </chapter>
+</book>
Index: linux/Documentation/DocBook/kernel-api.tmpl
===================================================================
--- linux.orig/Documentation/DocBook/kernel-api.tmpl
+++ linux/Documentation/DocBook/kernel-api.tmpl
@@ -54,6 +54,11 @@
 !Ekernel/sched.c
 !Ekernel/timer.c
      </sect1>
+     <sect1><title>High-precision timers</title>
+!Iinclude/linux/ktime.h
+!Iinclude/linux/ktimer.h
+!Ekernel/ktimers.c
+     </sect1>
      <sect1><title>Internal Functions</title>
 !Ikernel/exit.c
 !Ikernel/signal.c
Index: linux/Documentation/RCU/proc.txt
===================================================================
--- /dev/null
+++ linux/Documentation/RCU/proc.txt
@@ -0,0 +1,119 @@
+/proc Filesystem Entries for RCU
+
+
+CONFIG_RCU_STATS
+
+The CONFIG_RCU_STATS config option is available only in conjunction with
+CONFIG_PREEMPT_RCU.  It makes four /proc entries available, namely: rcuctrs,
+rcuptrs, rcugp, and rcustats.
+
+/proc/rcuctrs
+
+	CPU last cur
+	  0    1   1
+	  1    1   1
+	  2    1   1
+	  3    0   2
+	ggp = 230725
+
+This displays the number of processes that started RCU read-side critical
+sections on each CPU.  In absence of preemption, the "last" and "cur"
+counts for a given CPU will always sum to one.  Therefore, in the example
+output above, each CPU has started one RCU read-side critical section
+that was later preempted.  The "last" column counts RCU read-side critical
+sections that started prior to the last counter flip, while the "cur"
+column counts critical sections that started after the last counter flip.
+
+The "ggp" count is a count of the number of counter flips since boot.
+Since this is shown as an odd number, the "cur" counts are stored in
+the zero-th element of each of the per-CPU arrays, and the "last" counts
+are stored in the first element of each of the per-CPU arrays.
+
+
+/proc/rcuptrs
+
+	nl=c04c7160/c04c7960 nt=c04c72d0
+	 wl=c04c7168/c04c794c wt=c04c72bc dl=c04c7170/00000000 dt=c04c7170
+
+This displays the head and tail of each of CONFIG_PREEMPT_RCU's three
+callback lists.  This will soon change to display this on a per-CPU
+basis, since each CPU will soon have its own set of callback lists.
+In the example above, the "next" list header is located at hex address
+0xc04c7160, the first element on the list at hex address 0xc04c7960,
+and the last element on the list at hex address 0xc04c72d0.  The "wl="
+and "wt=" output is similar for the "wait" list, and the "dl=" and "dt="
+output for the "done" list.  The "done" list is normally emptied very
+quickly after being filled, so will usually be empty as shown above.
+Note that the tail pointer points into the list header in this case.
+
+Callbacks are placed in the "next" list by call_rcu(), moved to the
+"wait" list after the next counter flip, and moved to the "done" list
+on the counter flip after that.  Once on the "done" list, the callbacks
+are invoked.
+
+
+/proc/rcugp
+
+	oldggp=241419  newggp=241421
+
+This entry invokes synchronize_rcu() and prints out the number of counter
+flips since boot before and after the synchronize_rcu().  These two
+numbers will always differ by at least two.  Unless RCU is broken.  ;-)
+
+
+/proc/rcustats
+
+	ggp=242416 lgp=242416 sr=0 rcc=396233
+	na=2090938 nl=9 wa=2090929 wl=9 dl=0 dr=2090920 di=2090920
+	rtf1=22230730 rtf2=20139162 rtf3=242416 rtfe1=2085911 rtfe2=5657 rtfe3=19896746
+
+The quantities printed are as follows:
+
+o	"ggp=": The number of flips since boot.
+
+o	"lgp=": The number of flips sensed by the local structure since
+	boot.  This will soon be per-CPU.
+
+o	"sr=": The number of explicit calls to synchronize_rcu().
+	Except that this is currently broken, so always reads as zero.
+	It is likely to be removed...
+
+o	"rcc=": The number of calls to rcu_check_callbacks().
+
+o	"na=": The number of callbacks that call_rcu() has registered
+	since boot.
+
+o	"nl=": The number of callbacks currently on the "next" list.
+
+o	"wa=": The number of callbacks that have moved to the "wait"
+	list since boot.
+
+o	"wl=": The number of callbacks currently on the "wait" list.
+
+o	"da=": The number of callbacks that have been moved to the
+	"done" list since boot.
+
+o	"dl=": The number of callbacks currently on the "done" list.
+
+o	"dr=": The number of callbacks that have been removed from the
+	"done" list since boot.
+
+o	"di=": The number of callbacks that have been invoked after being
+	removed from the "done" list.
+
+o	"rtf1=": The number of attempts to flip the counters.
+
+o	"rtf2=": The number of attempts to flip the counters that successfully
+	acquired the fliplock.
+
+o	"rtf3=": The number of successful counter flips.
+
+o	"rtfe1=": The number of attempts to flip the counters that failed
+	due to the lock being held by someone else.
+
+o	"rtfe2=": The number of attempts to flip the counters that were
+	abandoned due to someone else doing the job for us.
+
+o	"rtfe3=": The number of attempts to flip the counters that failed
+	due to some task still being in an RCU read-side critical section
+	starting from before the last successful counter flip.
Index: linux/Documentation/RCU/torture.txt
===================================================================
--- /dev/null
+++ linux/Documentation/RCU/torture.txt
@@ -0,0 +1,127 @@
+RCU Torture Test Operation
+
+
+CONFIG_RCU_TORTURE_TEST
+
+The CONFIG_RCU_TORTURE_TEST config option is available for all RCU
+implementations.  It creates an rcutorture kernel module that can
+be loaded to run a torture test.  The test periodically outputs
+status messages via printk(), which can be examined via the dmesg
+command (perhaps grepping for "rcutorture").  The test is started
+when the module is loaded, and stops when the module is unloaded.
+
+However, actually setting this config option to "y" results in the system
+running the test immediately upon boot, and ending only when the system
+is taken down.  Normally, one will instead want to build the system
+with CONFIG_RCU_TORTURE_TEST=m and to use modprobe and rmmod to control
+the test, perhaps using a script similar to the one shown at the end of
+this document.  Note that you will need CONFIG_MODULE_UNLOAD in order
+to be able to end the test.
+
+
+MODULE PARAMETERS
+
+This module has the following parameters:
+
+nreaders	This is the number of RCU reading threads supported.
+		The default is twice the number of CPUs.  Why twice?
+		To properly exercise RCU implementations with preemptible
+		read-side critical sections.
+
+stat_interval	The number of seconds between output of torture
+		statistics (via printk()).  Regardless of the interval,
+		statistics are printed when the module is unloaded.
+		Setting the interval to zero causes the statistics to
+		be printed -only- when the module is unloaded, and this
+		is the default.
+
+verbose		Enable debug printk()s.  Default is disabled.
+
+
+OUTPUT
+
+The statistics output is as follows:
+
+	rcutorture: --- Start of test: nreaders=16 stat_interval=0 verbose=0
+	rcutorture: rtc: 0000000000000000 ver: 1916 tfle: 0 rta: 1916 rtaf: 0 rtf: 1915 rtmbe: 0
+	rcutorture: Reader Pipe:  1466408 9747 0 0 0 0 0 0 0 0 0
+	rcutorture: Reader Batch:  1464477 11678 0 0 0 0 0 0 0 0
+	rcutorture: Free-Block Circulation:  1915 1915 1915 1915 1915 1915 1915 1915 1915 1915 0
+	rcutorture: --- End of test: SUCCESS
+
+The command "dmesg | grep rcutorture:" will extract this information on
+most systems.  On more esoteric configurations, it may be necessary to
+use other commands to access the output of the printk()s used by
+the RCU torture test.  The printk()s use KERN_ALERT, so they should
+be evident.  ;-)
+
+The entries are as follows:
+
+o	"ggp": The number of counter flips (or batches) since boot.
+
+o	"rtc": The hexadecimal address of the structure currently visible
+	to readers.
+
+o	"ver": The number of times since boot that the rcutw writer task
+	has changed the structure visible to readers.
+
+o	"tfle": If non-zero, indicates that the "torture freelist"
+	containing structure to be placed into the "rtc" area is empty.
+	This condition is important, since it can fool you into thinking
+	that RCU is working when it is not.  :-/
+
+o	"rta": Number of structures allocated from the torture freelist.
+
+o	"rtaf": Number of allocations from the torture freelist that have
+	failed due to the list being empty.
+
+o	"rtf": Number of frees into the torture freelist.
+
+o	"rtmbe": Number of memory-barrier failures detected (which would
+	indicate problems with either the test itself or the underlying
+	memory-barrier primitives for the CPU architecture on which the
+	failure occurred).
+
+o	"Reader Pipe": Histogram of "ages" of structures seen by readers.
+	If any entries past the first two are non-zero, RCU is broken.
+	And rcutorture prints the error flag string "!!!" to make sure
+	you notice.  The age of a newly allocated structure is zero,
+	it becomes one when removed from reader visibility, and is
+	incremented once per grace period subsequently -- and is freed
+	after passing through (RCU_TORTURE_PIPE_LEN-2) grace periods.
+
+	The output displayed above was taken from a correctly working
+	RCU.  If you want to see what it looks like when broken, break
+	it yourself.  ;-)
+
+o	"Reader Batch": Another histogram of "ages" of structures seen
+	by readers, but in terms of counter flips (or batches) rather
+	than in terms of grace periods.  The legal number of non-zero
+	entries is again two.  The reason for this separate view is
+	that it is easier to get the third entry to show up in the
+	"Reader Batch" list than in the "Reader Pipe" list.
+
+o	"Free-Block Circulation": Shows the number of torture structures
+	that have reached a given point in the pipeline.  The first element
+	should closely correspond to the number of structures allocated,
+	the second to the number that have been removed from reader view,
+	and all but the last remaining to the corresponding number of
+	passes through a grace period.  The last entry should be zero,
+	as it is only incremented if a torture structure's counter
+	somehow gets incremented farther than it should.
+
+
+USAGE
+
+The following script may be used to torture RCU:
+
+	#!/bin/sh
+
+	modprobe rcutorture
+	sleep 100
+	rmmod rcutorture
+	dmesg | grep rcutorture:
+
+The output can be manually inspected for the error flag of "!!!".
+One could of course create a more elaborate script that automatically
+checked for such errors.
Index: linux/Documentation/kernel-parameters.txt
===================================================================
--- linux.orig/Documentation/kernel-parameters.txt
+++ linux/Documentation/kernel-parameters.txt
@@ -52,6 +52,7 @@ restrictions referred to are that the re
 	MTD	MTD support is enabled.
 	NET	Appropriate network support is enabled.
 	NUMA	NUMA support is enabled.
+	GENERIC_TIME The generic timeofday code is enabled.
 	NFS	Appropriate NFS support is enabled.
 	OSS	OSS sound support is enabled.
 	PARIDE	The ParIDE subsystem is enabled.
@@ -329,10 +330,11 @@ running once the system is up.
 			Value can be changed at runtime via
 				/selinux/checkreqprot.
 
- 	clock=		[BUGS=IA-32,HW] gettimeofday timesource override.
-			Forces specified timesource (if avaliable) to be used
-			when calculating gettimeofday(). If specicified
-			timesource is not avalible, it defaults to PIT.
+	clock=		[BUGS=IA-32, HW] gettimeofday clocksource override.
+			[Deprecated]
+			Forces specified clocksource (if available) to be used
+			when calculating gettimeofday(). If specified
+			clocksource is not available, it defaults to PIT.
 			Format: { pit | tsc | cyclone | pmtmr }
 
 	hpet=		[IA-32,HPET] option to disable HPET and use PIT.
@@ -1479,6 +1481,10 @@ running once the system is up.
 
 	time		Show timing data prefixed to each printk message line
 
+	clocksource=	[GENERIC_TIME] Override the default clocksource
+			Override the default clocksource and use the clocksource
+			with the name specified.
+
 	tipar.timeout=	[HW,PPT]
 			Set communications timeout in tenths of a second
 			(default 15).
Index: linux/Documentation/ktimers.txt
===================================================================
--- /dev/null
+++ linux/Documentation/ktimers.txt
@@ -0,0 +1,239 @@
+
+ktimers - subsystem for high-precision kernel timers
+----------------------------------------------------
+
+This patch introduces a new subsystem for high-precision kernel timers.
+
+Why two timer subsystems? After a lot of back and forth trying to
+integrate high-precision and high-resolution features into the existing
+timer framework, and after testing various such high-resolution timer
+implementations in practice, we came to the conclusion that the timer
+wheel code is fundamentally not suitable for such an approach. We
+initially didn't believe this ('there must be a way to solve this'), and
+we spent a considerable effort trying to integrate things into the timer
+wheel, but we failed. There are several reasons why such integration is
+impossible:
+
+- the forced handling of low-resolution and high-resolution timers in
+  the same way leads to a lot of compromises, macro magic and #ifdef
+  mess. The timers.c code is very "tightly coded" around jiffies and
+  32-bitness assumptions, and has been honed and micro-optimized for a
+  narrow use case for many years - and thus even small extensions to it
+  frequently break the wheel concept, leading to even worse
+  compromises.
+
+- the unpredictable [O(N)] overhead of cascading leads to delays which
+  necessitate a more complex handling of high resolution timers, which
+  decreases robustness. Such a design still led to rather large timing
+  inaccuracies. Cascading is a fundamental property of the timer wheel
+  concept, it cannot be 'designed out' without inevitably degrading
+  other portions of the timers.c code in an unacceptable way.
+
+- the implementation of the current posix-timer subsystem on top of
+  the timer wheel has already introduced a quite complex handling of
+  the required readjusting of absolute CLOCK_REALTIME timers at
+  settimeofday or NTP time - showing the rigidity of the timer wheel
+  data structure.
+
+- the timer wheel code is most optimal for use cases which can be
+  identified as "timeouts". Such timeouts are usually set up to cover
+  error conditions in various I/O paths, such as networking and block
+  I/O. The vast majority of those timers never expire and are rarely
+  recascaded because the expected correct event arrives in time so they
+  can be removed from the timer wheel before any further processing of
+  them becomes necessary. Thus the users of these timeouts can accept
+  the granularity and precision tradeoffs of the timer wheel, and
+  largely expect the timer subsystem to have near-zero overhead. Timing
+  for them is not a core purpose, it's mostly a necessary evil to
+  guarantee the processing of requests, which should be as cheap and
+  unintrusive as possible.
+
+The primary users of precision timers are user-space applications that
+utilize nanosleep, posix-timers and itimer interfaces. Also, in-kernel
+users like drivers and subsystems with a requirement for precise timed
+events can benefit from the availability of a separate high-precision
+timer subsystem as well.
+
+The ktimer subsystem is easily extended with high-resolution
+capabilities, and patches for that exist and are maturing quickly. The
+increasing demand for realtime and multimedia applications along with
+other potential users for precise timers gives another reason to
+separate the "timeout" and "precise timer" subsystems.
+
+Another potential benefit is that such separation allows for future
+optimizations of the existing timer wheel implementation for the low
+resolution and low precision use cases - once the precision-sensitive
+APIs are separated from the timer wheel and are migrated over to
+ktimers. E.g. we could decrease the frequency of the timeout subsystem
+from 250 Hz to 100 Hz (or even smaller).
+
+ktimer subsystem implementation details
+---------------------------------------
+
+the basic design considerations were:
+
+- simplicity
+- robust, extensible abstractions
+- data structure not bound to jiffies or any other granularity
+- simplification of existing, timing related kernel code
+
+From our previous experience with various approaches of high-resolution
+timers another basic requirement was the immediate enqueueing and
+ordering of timers at activation time. After looking at several possible
+solutions such as radix trees and hashes, the red black tree was chosen
+as the basic data structure. Rbtrees are available as a library in the
+kernel and are used in various performance-critical areas of e.g. memory
+management and file systems. The rbtree is solely used for the time
+sorted ordering, while a separate list is used to give the expiry code
+fast access to the queued timers, without having to walk the rbtree.
+(This separate list is also useful for high-resolution timers where we
+need separate pending and expired queues while keeping the time-order
+intact.)
+
+The time-ordered enqueueing is not purely for the purposes of the
+high-resolution timers extension though, it also simplifies the handling
+of absolute timers based on CLOCK_REALTIME. The existing implementation
+needed to keep an extra list of all armed absolute CLOCK_REALTIME timers
+along with complex locking. In case of settimeofday and NTP, all the
+timers (!) had to be dequeued, the time-changing code had to fix them up
+one by one, and all of them had to be enqueued again. The time-ordered
+enqueueing and the storage of the expiry time in absolute time units
+removes all this complex and poorly scaling code from the posix-timer
+implementation - the clock can simply be set without having to touch the
+rbtree. This also makes the handling of posix-timers simpler in general.
+
+The locking and per-CPU behavior of ktimers was mostly taken from the
+existing timer wheel code, as it is mature and well suited. Sharing code
+was not really a win, due to the different data structures. Also, the
+ktimer functions now have clearer behavior and clearer names - such as
+ktimer_try_to_cancel() and ktimer_cancel() [which are roughly equivalent
+to del_timer() and del_timer_sync()] - and there's no direct 1:1 mapping
+between them on the algorithmic level.
+
+The internal representation of time values (ktime_t) is implemented via
+macros and inline functions, and can be switched between a "hybrid
+union" type and a plain "scalar" 64bit nanoseconds representation (at
+compile time). The hybrid union type exists to optimize time conversions
+on 32bit CPUs. This build-time-selectable ktime_t storage format was
+implemented to avoid the performance impact of 64-bit multiplications
+and divisions on 32bit CPUs. Such operations are frequently necessary to
+convert between the storage formats provided by kernel and userspace
+interfaces and the internal time format. (See include/linux/ktime.h for
+further details.)
+
+ktimers - rounding of timer values
+----------------------------------
+
+Why do we need rounding at all ?
+
+Firstly, the POSIX specification requires rounding to the resolution -
+whatever that means. The POSIX specification is quite imprecise on the
+details of rounding though, so a practical interpretation had to be
+found.
+
+The first question is which resolution value should be returned to the
+user by the clock_getres() interface.
+
+The simplest case is when the hardware is capable of 1 nsec resolution:
+in that case we can fulfill all wishes and there is no rounding :-)
+
+Another simple case is when the clock hardware has a limited resolution
+that the kernel wants to fully offer to user-space: in this case that
+limited resolution is returned to userspace.
+
+The hairy case is when the underlying hardware is capable of finer
+grained resolution, but the kernel is not willing to offer that
+resolution. Why would the kernel want to do that? Because e.g. the
+system could easily be DoS-ed with high-frequency timer interrupts. Or
+the kernel might want to cluster high-res timer interrupts into groups
+for performance reasons, so that extremely high interrupt rates are
+avoided. So the kernel needs some leeway in deciding the 'effective'
+resolution that it is willing to expose to userspace.
+
+In this case, the clock_getres() decision is easy: we want to return the
+'effective' resolution, not the 'theoretical' resolution. Thus an
+application programmer gets correct information about what granularity
+and accuracy to expect from the system.
+
+What is much less obvious in both the 'hardware is low-res' and 'kernel
+wants to offer low-res' cases is the actual behavior of timers, and
+where and how to round time values to the 'effective' resolution of the
+clock.
+
+For this we first need to see what types of expiries there exist for
+ktimers, and how rounding affects them. Ktimers have the following
+variants:
+
+- relative one-shot timers
+- absolute one-shot timers
+- relative interval timers
+- absolute interval timers
+
+Interval timers can be led back to one-shot timers: they are a series of
+one-shot timers with the same interval. Relative one-shot timers can be
+handled identically to absolute one-shot timers after adding the
+relative expiry time to the current time of the respective clock.
+
+We picked to handle two cases of rounding:
+
+- the rounding of the absolute value of the first expiry time
+- the rounding of the timer interval
+
+An alternative implementation would be to not round the interval and to
+implicitly round at every timer event, but it's not clear what the
+advantages would be from doing that. There are a couple of
+disadvantages:
+
+- the technique seems to contradict the standard's requirement that
+  'time values ... be rounded' (which the interval clearly is).
+
+- other OSs implement the rounding in the way we implemented it.
+
+- also, there is an application surprise factor, the 'do not round
+  intervals' technique can lead to the following sample sequence of
+  events:
+
+    Interval:   1.7ms
+    Resolution: 1ms
+
+    Event timeline:
+
+     2ms - 4ms - 6ms - 7ms - 9ms - 11ms - 12ms - 14ms - 16ms - 17ms ...
+
+  this 2,2,1,2,2,1...msec 'unpredictable and uneven' relative distance
+  of events could surprise applications.
+
+(as a sidenote, current POSIX APIs could be extended with a method of
+periodic timers to have an 'average' frequency, where there is no
+rounding of the interval. No such API exists at the moment.)
+
+ktimers - testing and verification
+----------------------------------
+
+We used the high-resolution timer subsystem on top of ktimers to verify
+the ktimer implementation details in practice, and we also ran the posix
+timer tests in order to ensure specification compliance.
+
+The ktimer patch converts the following kernel functionality to use
+ktimers:
+
+ - nanosleep
+ - itimers
+ - posix-timers
+
+The conversion of nanosleep and posix-timers enabled the unification of
+nanosleep and clock_nanosleep.
+
+The code was successfully compiled for the following platforms:
+
+ i386, x86_64, ARM, PPC, PPC64, IA64
+
+The code was run-tested on the following platforms:
+
+ i386(UP/SMP), x86_64(UP/SMP), ARM, PPC
+
+ktimers were also integrated into the -rt tree, along with a
+ktimers-based high-resolution timer implementation, so the ktimers code
+got a healthy amount of testing and use in practice.
+
+	Thomas Gleixner, Ingo Molnar
Index: linux/Documentation/timekeeping.txt
===================================================================
--- /dev/null
+++ linux/Documentation/timekeeping.txt
@@ -0,0 +1,246 @@
+How timekeeping works with CONFIG_GENERIC_TIME
+========================================================================
+
+The generic timekeeping code maintains and allows access to the system's understanding of how much time has passed from a certain point. However, in order to measure the passing of time, the generic timekeeping code relies on the clocksource abstraction. A clocksource abstracts a free running counter whose value increases at a known frequency.
+
+In the generic timekeeping code, we use a pointer to a selected clocksource to measure the passing of time.
+
+struct clocksource *clock
+
+The clocksource has some limitations however. Since it's likely of fixed width, it will not increment forever and will overflow. In order to still properly keep time, we must occasionally accumulate an interval of time. In the generic timekeeping code, we accumulate the amount of time since the system booted into the value system_time, which keeps nanosecond resolution in a ktime_t storage.
+
+ktime_t system_time
+
+Since it's likely your system has not been running continually since midnight on the 1st of January in 1970, we must provide an offset from that time in accordance with conventions. This only occasionally changed (via settimeofday()) offset is the wall_time_offset value, which is also stored as a ktime_t.
+
+ktime_t wall_time_offset
+
+
+Since we accumulate time in intervals, we need a base cycle value that we can use to generate an offset from the time value kept in system_time. We store this value in cycle_last.
+
+cycle_t cycle_last;
+
+
+Further since all clocks drift somewhat from each other, we use the adjustment values provided via adjtimex() to correct our clocksource frequency for each interval. This frequency adjustment value is stored in ntp_adj.
+
+long ntp_adj;
+
+Now that we've covered the core global variables for timekeeping, let's look at how we maintain these values.
+
+As stated above, we want to avoid the clocksource from overflowing on us, so we accumulate a time interval periodically. This periodic accumulation function is called timeofday_periodic_hook().  In simplified pseudo code, it logically is presented as:
+
+timeofday_periodic_hook():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	system_time += nsec
+	cycle_last = cycle_now
+
+	/* do other stuff */
+
+You can see we read the cycle value from the clocksource, calculate a cycle delta for the interval since we last called timeofday_periodic_hook(), convert that cycle delta to a nanosecond interval (for now ignore ntp_adj), add it to the system time and finally set our cycle_last value to cycle_now for the next interval. Using this simple algorithm we can correctly measure and record the passing of time.
+
+But just storing this info isn't very useful, we also want to make it available to be used elsewhere. So how do we provide a notion of how much time has passed in between calls to timeofday_periodic_hook()?
+
+First, let's create a function that calculates the time since the last call to timeofday_periodic_hook().
+
+get_nsec_offset():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	return nsec
+
+Here you can see, we read the clocksource, calculate a cycle interval, and convert that to a nanosecond interval. Just like how it is done in timeofday_periodic_hook!
+
+Now let's use this function to provide the number of nanoseconds that the system has been running:
+
+do_monotonic_clock():
+	return system_time + get_nsec_offset()
+
+Here we trivially add the nanosecond offset since the last timeofday_periodic_hook() to the value of system_time which was stored at the last timeofday_periodic_hook().
+
+Note that since we use the same method to calculate time intervals, assuming each function is atomic and the clocksource functions as it should, time cannot go backward!
+
+Now to get the time of day using the standard convention:
+
+do_gettimeofday():
+	return do_monotonic_clock() + wall_time_offset
+
+We simply add the wall_time_offset, and we have the number of nanoseconds since 1970 began!
+
+
+Of course, in real life, things are not so static. We have to handle a number of dynamic values that may change and affect timekeeping. In order to do these safely, we must only change values in-between intervals. This means the periodic_hook call must handle these changes.
+
+Since clocksources can be changed while the system is running, we need to check for and possibly switch to using new clocksources in the periodic_hook call. Further, clocksources may change their frequency. Since this must be done only at a safe point, we use the update_callback function pointer (for more details, see "How to write a clocksource driver" below), this too must be done in-between intervals in the periodic_hook call. Finally, since the ntp adjustment made in the cyc2ns conversion is not static, we need to update the ntp state machine and calculate a new adjustment value.
+
+This adds some extra pseudo code to the timeofday_periodic_hook function:
+
+timeofday_periodic_hook():
+	cycle_now = read_clocksource(clock)
+	cycle_delta = (cycle_now - cycle_last) & clock->mask
+	nsec = cyc2ns(clock, cycle_delta, ntp_adj)
+	system_time += nsec
+	cycle_last = cycle_now
+
+	next = get_next_clocksource()
+	if (next != clock):
+		cycle_last = read_clocksource(next)
+		clock = next
+
+	if (clock->update_callback):
+		clock->update_callback()
+
+	ntp_advance(nsec)
+	ppm = ntp_get_ppm_adjustment()
+	ntp_adj = ppm_to_mult_adj(clock, ppm)
+
+
+Unfortunately, the actual timeofday_periodic_hook code is not as simple as this pseudo code. For performance concerns, much has been done to pre-calculate values and use them repeatedly. Thus be aware that the code in timeofday.c is more complex, however the functional logic is the same.
+
+
+How to port an architecture to GENERIC_TIME
+========================================================================
+Porting an architecture to the GENERIC_TIME timekeeping code consists of moving a little bit of code around then deleting a fair amount. It is my hope that this will reduce the arch specific maintenance work around timekeeping.
+
+Porting an arch usually requires the following steps.
+
+1. Define CONFIG_GENERIC_TIME in the arch's Kconfig
+2. Implementing the following functions
+	nsec_t read_persistent_clock(void)
+	void sync_persistent_clock(struct timespec ts)
+3. Removing all of the arch specific timekeeping code
+	do_gettimeofday()
+	do_settimeofday()
+	etc
+4. Implementing clocksource drivers
+	See "How to write a clocksource driver" for more details
+
+The exceptions to the above are:
+
+5.  If the arch has no continuous clocksource
+	A) Implement 1-3 in the above list.
+	B) Define CONFIG_IS_TICK_BASED in the arch's Kconfig
+	C) Implement the "long arch_getoffset(void)" function
+
+6. If the arch supports vsyscall gettimeofday (see x86_64 for reference)
+	A) Implement 1-4 in the above list
+	B) Define GENERIC_TIME_VSYSCALL
+	C) Implement arch_update_vsyscall_gtod()
+	D) Implement vsyscall gettimeofday (similar to __get_realtime_clock_ts)
+	E) Implement vread functions for supported clocksources
+
+
+
+How to write a clocksource driver.
+========================================================================
+First, a quick summary of what a clocksource driver provides.
+
+Simply put, a clocksource is an abstraction of a free running increasing counter. The abstraction provides the minimal amount of info for that counter to be usable for timekeeping. Those required values are:
+	1. Its name
+	2. A rating value for selection priority
+	3. A read function pointer
+	4. A mask value for correct twos-complement subtraction
+	5. A mult and shift pair that approximate the counter frequency
+		mult/(2^shift) ~= nanoseconds per cycle
+
+Additionally, there are other optionally set values that allow for advanced functionality. Those values are:
+	6. The update_callback function.
+	7. The is_continuous flag.
+	8. The vread function pointer
+	9. The vdata pointer value
+
+
+Now let's go over these values in detail.
+
+1. Name.
+	The clocksource's name should be unique since it is used for both identification as well as for manually overriding the default clocksource selection. The name length must be shorter than 32 characters in order for it to be properly overridden.
+
+2. Rating value
+	This rating value is used as a priority value for clocksource selection. It has no direct connection to quality or physical properties of the clocksource, but is to be set and manipulated to guarantee that the best (by no specific metric) clocksource that will provide correct timekeeping is automatically selected. Rating suggestions can be found in include/linux/clocksource.h
+
+3. Read function pointer
+	This pointer should point to a function that returns an unsigned increasing cycle value from the clocksource. The value should have a coverage from zero to the maximum cycle value the clocksource can provide. This does not have to be direct hardware value and can also be a software counter. An example of a software counter is the jiffies clocksource.
+
+4. The mask value
+	This value should be a bitmask covering all valid bits of the counter, i.e. one less than the smallest power of two that is greater than the maximum cycle value. This allows twos complement subtraction to work on overflow boundary conditions if the max value is less than (cycle_t)-1. So for example, if we have a 16 bit counter (ie: one that loops to zero after 0x0000FFFF), the mask would be 0xFFFF. So then when finding the cycle difference around an overflow, where now = 0x0013 and then = 0xFFEE, we can compute the cycle delta properly using the equation:
+	delta = (now - then)&mask
+	delta = (0x0013 - 0xFFEE) & 0xFFFF
+	delta = 0xFFFF0025 & 0xFFFF  /* note the unmasked negative value */
+	delta = 0x25
+
+5. The mult and shift pair
+	These 32bit values approximate the nanosecond per cycle frequency of the clocksource using the equation: mult/(2^shift). If you have a khz or hz frequency value, the mult value for a given shift value can be easily calculated using the clocksource_hz2mult() and clocksource_khz2mult() helper functions. When selecting a shift value, it is important to be careful. Larger shift values give a finer precision in the cycle to nanosecond conversion and allow for more exact NTP adjustments. However if you select too large a shift value, the resulting mult value might overflow a cycle_t * mult computation.
+
+
+So if you have a simple hardware counter that does not change frequency, filling in the above should be sufficient for a functional clocksource. But read on for details on implementing a more complex clocksource.
+
+6. The update_callback function pointer.
+	If this function pointer is non-NULL, it will be called every periodic hook when it is safe for the clocksource to change its state. This would be necessary in the case where the counter frequency changes, for example. One user of this  function pointer is the TSC clocksource. When the TSC frequency changes (which may occur if the cpu changes frequency) we need to notify the clocksource at a safe point where that state may change. Thus, if the TSC has changed frequency we set the new mult/shift values in the update_callback function.
+
+7. The is_continuous flag.
+	This flag variable (0 if false, 1 if true) denotes that the clocksource is continuous. This means that it is a purely hardware driven clocksource and is not dependent on any software code to run for it to increment properly. This denotation will be useful in the future when timer ticks may be disabled for long periods of time. Doing so using software clocksources, like the jiffies clocksource, would cause timekeeping problems.
+
+8. The vread function pointer.
+	This function pointer points to a user-space accessible function that reads the clocksource. This is used in userspace gettimeofday implementations to improve performance. See the x86-64 TSC clocksource implementation for an example.
+
+9. The vdata pointer.
+	This pointer is passed to the vread function pointer in a userspace gettimeofday implementation. Its usage is dependent on the vread implementation, but if the pointer points to data, that data must be readable from userspace.
+
+
+Now let's write a quick clocksource for an imaginary bit of hardware. Here are the specs:
+
+	A 32bit counter can be found at the MMIO address 0xFEEDF000. It runs at 100MHz. To enable it, the low bit of the address 0xFEEDF0F0 must be set to one.
+
+So let's start out an empty cool-counter.c file, and define the clocksource.
+
+#include <linux/clocksource.h>
+#include <linux/init.h>
+#include <asm/io.h>
+
+#define COOL_READ_PTR	0xFEEDF000
+#define COOL_START_PTR	0xFEEDF0F0
+
+static __iomem *cool_ptr = COOL_READ_PTR;
+
+struct clocksource clocksource_cool
+{
+	.name = "cool",
+	.rating = 200,		/* its a pretty decent clock */
+	.mask = 0xFFFFFFFF,	/* 32 bits */
+	.mult = 0,			/*to be computed */
+	.shift = 10,
+}
+
+
+Now let's write the read function:
+
+cycle_t cool_counter_read(void)
+{
+	cycle_t ret = readl(cool_ptr);
+	return ret;
+}
+
+Finally, let's write the init function:
+
+void cool_counter_init(void)
+{
+	__iomem *ptr = COOL_START_PTR;
+	u32 val;
+
+	/* start the counter */
+	val = readl(ptr);
+	val |= 0x1;
+	writel(val, ptr);
+
+	/* finish initializing the clocksource */
+	clocksource_cool.read = cool_counter_read;
+	clocksource_cool.mult = clocksource_khz2mult(100000,
+					clocksource_cool.shift);
+
+	/* register the clocksource */
+	register_clocksource(&clocksource_cool);
+}
+module_init(cool_counter_init);
+
+
+Now wasn't that easy!
Index: linux/Makefile
===================================================================
--- linux.orig/Makefile
+++ linux/Makefile
@@ -1,7 +1,7 @@
 VERSION = 2
 PATCHLEVEL = 6
 SUBLEVEL = 14
-EXTRAVERSION =
+EXTRAVERSION = -rt22
 NAME=Affluent Albatross
 
 # *DOCUMENTATION*
@@ -517,10 +517,14 @@ CFLAGS		+= $(call add-align,CONFIG_CC_AL
 CFLAGS		+= $(call add-align,CONFIG_CC_ALIGN_LOOPS,-loops)
 CFLAGS		+= $(call add-align,CONFIG_CC_ALIGN_JUMPS,-jumps)
 
-ifdef CONFIG_FRAME_POINTER
-CFLAGS		+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+ifdef CONFIG_MCOUNT
+CFLAGS                += -pg -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
 else
-CFLAGS		+= -fomit-frame-pointer
+  ifdef CONFIG_FRAME_POINTER
+    CFLAGS		+= -fno-omit-frame-pointer $(call cc-option,-fno-optimize-sibling-calls,)
+  else
+    CFLAGS		+= -fomit-frame-pointer
+  endif
 endif
 
 ifdef CONFIG_DEBUG_INFO
Index: linux/arch/alpha/kernel/time.c
===================================================================
--- linux.orig/arch/alpha/kernel/time.c
+++ linux/arch/alpha/kernel/time.c
@@ -55,10 +55,6 @@
 #include "proto.h"
 #include "irq_impl.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;	/* kernel/timer.c */
 
 static int set_rtc_mmss(unsigned long);
Index: linux/arch/arm/Kconfig
===================================================================
--- linux.orig/arch/arm/Kconfig
+++ linux/arch/arm/Kconfig
@@ -50,6 +50,10 @@ config UID16
 	bool
 	default y
 
+config GENERIC_HARDIRQS
+	bool
+	default y
+
 config RWSEM_GENERIC_SPINLOCK
 	bool
 	default y
@@ -339,18 +343,7 @@ config NR_CPUS
 	depends on SMP
 	default "4"
 
-config PREEMPT
-	bool "Preemptible Kernel (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
-	  Say Y here if you are building a kernel for a desktop, embedded
-	  or real-time system.  Say N if you are unsure.
+source kernel/Kconfig.preempt
 
 config NO_IDLE_HZ
 	bool "Dynamic tick timer"
Index: linux/arch/arm/boot/compressed/head.S
===================================================================
--- linux.orig/arch/arm/boot/compressed/head.S
+++ linux/arch/arm/boot/compressed/head.S
@@ -718,6 +718,19 @@ memdump:	mov	r12, r0
 		mov	pc, r10
 #endif
 
+#ifdef CONFIG_MCOUNT
+/* CONFIG_MCOUNT causes boot header to be built with -pg requiring this
+ * trampoline
+ */
+                .text
+                .align 0
+                .type mcount %function
+                .global mcount
+mcount:
+		mov pc, lr	@ just return
+#endif
+
+
 reloc_end:
 
 		.align
Index: linux/arch/arm/boot/compressed/misc.c
===================================================================
--- linux.orig/arch/arm/boot/compressed/misc.c
+++ linux/arch/arm/boot/compressed/misc.c
@@ -199,6 +199,7 @@ static ulg free_mem_ptr_end;
 
 #define HEAP_SIZE 0x2000
 
+#define ZLIB_INFLATE_NO_INFLATE_LOCK
 #include "../../../../lib/inflate.c"
 
 #ifndef STANDALONE_DEBUG
Index: linux/arch/arm/common/dmabounce.c
===================================================================
--- linux.orig/arch/arm/common/dmabounce.c
+++ linux/arch/arm/common/dmabounce.c
@@ -403,11 +403,11 @@ dma_map_single(struct device *dev, void 
 
 	BUG_ON(dir == DMA_NONE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	dma_addr = map_single(dev, ptr, size, dir);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return dma_addr;
 }
@@ -430,11 +430,11 @@ dma_unmap_single(struct device *dev, dma
 
 	BUG_ON(dir == DMA_NONE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	unmap_single(dev, dma_addr, size, dir);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 int
@@ -449,7 +449,7 @@ dma_map_sg(struct device *dev, struct sc
 
 	BUG_ON(dir == DMA_NONE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for (i = 0; i < nents; i++, sg++) {
 		struct page *page = sg->page;
@@ -461,7 +461,7 @@ dma_map_sg(struct device *dev, struct sc
 			map_single(dev, ptr, length, dir);
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return nents;
 }
@@ -478,7 +478,7 @@ dma_unmap_sg(struct device *dev, struct 
 
 	BUG_ON(dir == DMA_NONE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for (i = 0; i < nents; i++, sg++) {
 		dma_addr_t dma_addr = sg->dma_address;
@@ -487,7 +487,7 @@ dma_unmap_sg(struct device *dev, struct 
 		unmap_single(dev, dma_addr, length, dir);
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void
@@ -499,11 +499,11 @@ dma_sync_single_for_cpu(struct device *d
 	dev_dbg(dev, "%s(ptr=%p,size=%d,dir=%x)\n",
 		__func__, (void *) dma_addr, size, dir);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	sync_single(dev, dma_addr, size, dir);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void
@@ -515,11 +515,11 @@ dma_sync_single_for_device(struct device
 	dev_dbg(dev, "%s(ptr=%p,size=%d,dir=%x)\n",
 		__func__, (void *) dma_addr, size, dir);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	sync_single(dev, dma_addr, size, dir);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void
@@ -534,7 +534,7 @@ dma_sync_sg_for_cpu(struct device *dev, 
 
 	BUG_ON(dir == DMA_NONE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for (i = 0; i < nents; i++, sg++) {
 		dma_addr_t dma_addr = sg->dma_address;
@@ -543,7 +543,7 @@ dma_sync_sg_for_cpu(struct device *dev, 
 		sync_single(dev, dma_addr, length, dir);
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void
@@ -558,7 +558,7 @@ dma_sync_sg_for_device(struct device *de
 
 	BUG_ON(dir == DMA_NONE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for (i = 0; i < nents; i++, sg++) {
 		dma_addr_t dma_addr = sg->dma_address;
@@ -567,7 +567,7 @@ dma_sync_sg_for_device(struct device *de
 		sync_single(dev, dma_addr, length, dir);
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 int
Index: linux/arch/arm/common/locomo.c
===================================================================
--- linux.orig/arch/arm/common/locomo.c
+++ linux/arch/arm/common/locomo.c
@@ -425,6 +425,12 @@ static struct irqchip locomo_spi_chip = 
 	.unmask	= locomo_spi_unmask_irq,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(locomo_handler);
+static DEFINE_IRQ_CHAINED_TYPE(locomo_key_handler);
+static DEFINE_IRQ_CHAINED_TYPE(locomo_gpio_handler);
+static DEFINE_IRQ_CHAINED_TYPE(locomo_lt_handler);
+static DEFINE_IRQ_CHAINED_TYPE(locomo_spi_handler);
+
 static void locomo_setup_irq(struct locomo *lchip)
 {
 	int irq;
Index: linux/arch/arm/common/sa1111.c
===================================================================
--- linux.orig/arch/arm/common/sa1111.c
+++ linux/arch/arm/common/sa1111.c
@@ -159,11 +159,11 @@ sa1111_irq_handler(unsigned int irq, str
 
 	for (i = IRQ_SA1111_START; stat0; i++, stat0 >>= 1)
 		if (stat0 & 1)
-			do_edge_IRQ(i, irq_desc + i, regs);
+			handle_edge_irq(i, irq_desc + i, regs);
 
 	for (i = IRQ_SA1111_START + 32; stat1; i++, stat1 >>= 1)
 		if (stat1 & 1)
-			do_edge_IRQ(i, irq_desc + i, regs);
+			handle_edge_irq(i, irq_desc + i, regs);
 
 	/* For level-based interrupts */
 	desc->chip->unmask(irq);
@@ -368,6 +368,8 @@ static struct irqchip sa1111_high_chip =
 	.set_wake	= sa1111_wake_highirq,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(sa1111_irq_handler);
+
 static void sa1111_setup_irq(struct sa1111 *sachip)
 {
 	void __iomem *irqbase = sachip->base + SA1111_INTC;
Index: linux/arch/arm/common/time-acorn.c
===================================================================
--- linux.orig/arch/arm/common/time-acorn.c
+++ linux/arch/arm/common/time-acorn.c
@@ -16,6 +16,7 @@
 #include <linux/timex.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <asm/hardware.h>
 #include <asm/io.h>
@@ -76,7 +77,7 @@ ioc_timer_interrupt(int irq, void *dev_i
 
 static struct irqaction ioc_timer_irq = {
 	.name		= "timer",
-	.flags		= SA_INTERRUPT,
+	.flags		= SA_INTERRUPT | SA_NODELAY,
 	.handler	= ioc_timer_interrupt
 };
 
Index: linux/arch/arm/kernel/calls.S
===================================================================
--- linux.orig/arch/arm/kernel/calls.S
+++ linux/arch/arm/kernel/calls.S
@@ -7,11 +7,8 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  *
- *  This file is included twice in entry-common.S
+ *  NR_syscalls now defined in include/asm-arm/unistd.h - tglx
  */
-#ifndef NR_syscalls
-#define NR_syscalls 328
-#else
 
 __syscall_start:
 /* 0 */		.long	sys_restart_syscall
@@ -341,4 +338,3 @@ __syscall_end:
 		.rept	NR_syscalls - (__syscall_end - __syscall_start) / 4
 			.long	sys_ni_syscall
 		.endr
-#endif
Index: linux/arch/arm/kernel/dma.c
===================================================================
--- linux.orig/arch/arm/kernel/dma.c
+++ linux/arch/arm/kernel/dma.c
@@ -22,7 +22,7 @@
 
 #include <asm/mach/dma.h>
 
-DEFINE_SPINLOCK(dma_spin_lock);
+DEFINE_RAW_SPINLOCK(dma_spin_lock);
 
 #if MAX_DMA_CHANNELS > 0
 
Index: linux/arch/arm/kernel/ecard.c
===================================================================
--- linux.orig/arch/arm/kernel/ecard.c
+++ linux/arch/arm/kernel/ecard.c
@@ -619,7 +619,7 @@ ecard_irqexp_handler(unsigned int irq, s
 		ecard_t *ec = slot_to_ecard(slot);
 
 		if (ec->claimed) {
-			struct irqdesc *d = irqdesc + ec->irq;
+			struct irqdesc *d = irq_desc + ec->irq;
 			/*
 			 * this ugly code is so that we can operate a
 			 * prioritorising system:
@@ -1052,6 +1052,9 @@ ecard_probe(int slot, card_type_t type)
 	return rc;
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(ecard_irqexp_handler);
+static DEFINE_IRQ_CHAINED_TYPE(ecard_irq_handler);
+
 /*
  * Initialise the expansion card system.
  * Locate all hardware - interrupt management and
@@ -1081,8 +1084,10 @@ static int __init ecard_init(void)
 
 	irqhw = ecard_probeirqhw();
 
-	set_irq_chained_handler(IRQ_EXPANSIONCARD,
-				irqhw ? ecard_irqexp_handler : ecard_irq_handler);
+	if (irqhw)
+		set_irq_chained_handler(IRQ_EXPANSIONCARD, ecard_irqexp_handler);
+	else
+		set_irq_chained_handler(IRQ_EXPANSIONCARD, ecard_irq_handler);
 
 	ecard_proc_init();
 
Index: linux/arch/arm/kernel/entry-armv.S
===================================================================
--- linux.orig/arch/arm/kernel/entry-armv.S
+++ linux/arch/arm/kernel/entry-armv.S
@@ -184,7 +184,7 @@ __irq_svc:
 	irq_handler
 #ifdef CONFIG_PREEMPT
 	ldr	r0, [tsk, #TI_FLAGS]		@ get flags
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	blne	svc_preempt
 preempt_return:
 	ldr	r0, [tsk, #TI_PREEMPT]		@ read preempt value
@@ -211,7 +211,7 @@ svc_preempt:
 	str	r7, [tsk, #TI_PREEMPT]		@ expects preempt_count == 0
 1:	bl	preempt_schedule_irq		@ irq en/disable is done inside
 	ldr	r0, [tsk, #TI_FLAGS]		@ get new tasks TI_FLAGS
-	tst	r0, #_TIF_NEED_RESCHED
+	tst	r0, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	beq	preempt_return			@ go again
 	b	1b
 #endif
Index: linux/arch/arm/kernel/entry-common.S
===================================================================
--- linux.orig/arch/arm/kernel/entry-common.S
+++ linux/arch/arm/kernel/entry-common.S
@@ -3,6 +3,8 @@
  *
  *  Copyright (C) 2000 Russell King
  *
+ * LATENCY_TRACE/mcount support (C) 2005 Timesys john.cooper@timesys.com
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
@@ -41,7 +43,7 @@ ret_fast_syscall:
 fast_work_pending:
 	str	r0, [sp, #S_R0+S_OFF]!		@ returned r0
 work_pending:
-	tst	r1, #_TIF_NEED_RESCHED
+	tst	r1, #_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED
 	bne	work_resched
 	tst	r1, #_TIF_NOTIFY_RESUME | _TIF_SIGPENDING
 	beq	no_work_pending
@@ -52,7 +54,8 @@ work_pending:
 	b	no_work_pending
 
 work_resched:
-	bl	schedule
+	bl	__schedule
+
 /*
  * "slow" syscall return path.  "why" tells us if this was a real syscall.
  */
@@ -88,8 +91,6 @@ ENTRY(ret_from_fork)
 	b	ret_slow_syscall
 	
 
-#include "calls.S"
-
 /*=============================================================================
  * SWI handler
  *-----------------------------------------------------------------------------
@@ -288,3 +289,110 @@ sys_mmap2:
 		str	r5, [sp, #4]
 		b	do_mmap2
 #endif
+
+#ifdef CONFIG_FRAME_POINTER
+
+#ifdef CONFIG_MCOUNT
+/*
+ * At the point where we are in mcount() we maintain the
+ * frame of the prologue code and keep the call to mcount()
+ * out of the stack frame list:
+
+        saved pc          <---\     caller of instrumented routine
+        saved lr              |
+        ip/prev_sp            |
+        fp        -----^      |
+         :                    |
+                              |
+     -> saved pc              |     instrumented routine
+    |   saved lr              |
+    |   ip/prev_sp            |
+    |   fp           ---------/
+    |     :
+    |
+    |                             mcount
+    |	saved pc
+    |	saved lr
+    |	ip/prev sp
+     --	fp
+        r3
+        r2
+        r1
+   sp-> r0
+         :
+ */
+
+	.text
+	.align 0
+	.type mcount %function
+	.global mcount
+
+/* gcc -pg generated FUNCTION_PROLOGUE references mcount()
+ * and has already created the stack frame invocation for
+ * the routine we have been called to instrument. We create
+ * a complete frame nevertheless, as we want to use the same
+ * call to mcount() from c code.
+ */
+mcount:
+
+	ldr	ip, =mcount_enabled	@ leave early, if disabled
+	ldr	ip, [ip]		@ ip = current value of mcount_enabled
+	cmp	ip, #0
+	moveq	pc,lr			@ zero: return straight to our caller
+
+	mov	ip,  sp			@ keep the entry sp for the frame record
+	stmdb   sp!, {r0 - r3, fp, ip, lr, pc}	@ create stack frame
+
+	ldr	r1, [fp, #-4]		@ r1 = saved lr of the instrumented
+					@ routine, i.e. the return address
+					@ into the caller of that routine
+	mov	r0, lr			@ r0 = our own lr, i.e. the return
+					@ address into the instrumented routine
+
+	sub	fp, ip, #4		@ point fp at this frame
+
+	bl	__trace
+1:
+	ldmdb   fp, {r0 - r3, fp, sp, pc}	@ pop entry frame and return
+
+#endif
+
+/* ARM replacement for unsupported gcc __builtin_return_address(n)
+ * where 0 < n.  n == 0 is supported here as well.
+ *
+ * Walk up the stack frame until the desired frame is found or a NULL
+ * fp is encountered, return NULL in the latter case.
+ *
+ * Note: it is possible under code optimization for the stack invocation
+ * of an ancestor function (level N) to be removed before calling a
+ * descendant function (level N+1).  No easy means is available to deduce
+ * this scenario with the result being [for example] caller_addr(0) when
+ * called from level N+1 returning level N-1 rather than the expected
+ * level N.  This optimization issue appears isolated to the case of
+ * a call to a level N+1 routine made at the tail end of a level N
+ * routine -- the level N frame is deleted and a simple branch is made
+ * to the level N+1 routine.
+ */
+
+	.text
+	.align 0
+	.type arm_return_addr %function
+	.global arm_return_addr
+
+arm_return_addr:
+	mov	ip, r0		@ ip = n, frames still to walk up
+	mov	r0, fp		@ r0 = frame cursor, start at current fp
+3:
+	cmp	r0, #0
+	beq	1f		@ frame list hit end, bail
+	cmp	ip, #0
+	beq	2f		@ reached desired frame
+	ldr	r0, [r0, #-12]  @ else continue, get next fp
+	sub	ip, ip, #1	@ one frame closer to the target
+	b	 3b
+2:
+	ldr	r0, [r0, #-4]   @ get target return address
+1:
+	mov	pc, lr		@ r0 = return address, or 0 if list ended
+
+#endif
Index: linux/arch/arm/kernel/fiq.c
===================================================================
--- linux.orig/arch/arm/kernel/fiq.c
+++ linux/arch/arm/kernel/fiq.c
@@ -38,6 +38,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/seq_file.h>
 
 #include <asm/cacheflush.h>
@@ -88,7 +89,7 @@ void set_fiq_handler(void *start, unsign
  * disable irqs for the duration.  Note - these functions are almost
  * entirely coded in assembly.
  */
-void __attribute__((naked)) set_fiq_regs(struct pt_regs *regs)
+void notrace __attribute__((naked)) set_fiq_regs(struct pt_regs *regs)
 {
 	register unsigned long tmp;
 	asm volatile (
@@ -106,7 +107,7 @@ void __attribute__((naked)) set_fiq_regs
 	: "r" (&regs->ARM_r8), "I" (PSR_I_BIT | PSR_F_BIT | FIQ_MODE));
 }
 
-void __attribute__((naked)) get_fiq_regs(struct pt_regs *regs)
+void notrace __attribute__((naked)) get_fiq_regs(struct pt_regs *regs)
 {
 	register unsigned long tmp;
 	asm volatile (
Index: linux/arch/arm/kernel/init_task.c
===================================================================
--- linux.orig/arch/arm/kernel/init_task.c
+++ linux/arch/arm/kernel/init_task.c
@@ -12,8 +12,8 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
+static struct fs_struct init_fs = INIT_FS(init_fs);
+static struct files_struct init_files = INIT_FILES(init_files);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
Index: linux/arch/arm/kernel/irq.c
===================================================================
--- linux.orig/arch/arm/kernel/irq.c
+++ linux/arch/arm/kernel/irq.c
@@ -27,6 +27,7 @@
 #include <linux/signal.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/ptrace.h>
 #include <linux/slab.h>
 #include <linux/random.h>
@@ -38,193 +39,11 @@
 #include <linux/kallsyms.h>
 #include <linux/proc_fs.h>
 
-#include <asm/irq.h>
 #include <asm/system.h>
-#include <asm/mach/irq.h>
 #include <asm/mach/time.h>
 
-/*
- * Maximum IRQ count.  Currently, this is arbitary.  However, it should
- * not be set too low to prevent false triggering.  Conversely, if it
- * is set too high, then you could miss a stuck IRQ.
- *
- * Maybe we ought to set a timer and re-enable the IRQ at a later time?
- */
-#define MAX_IRQ_CNT	100000
-
-static int noirqdebug;
-static volatile unsigned long irq_err_count;
-static DEFINE_SPINLOCK(irq_controller_lock);
-static LIST_HEAD(irq_pending);
-
-struct irqdesc irq_desc[NR_IRQS];
 void (*init_arch_irq)(void) __initdata = NULL;
 
-/*
- * No architecture-specific irq_finish function defined in arm/arch/irqs.h.
- */
-#ifndef irq_finish
-#define irq_finish(irq) do { } while (0)
-#endif
-
-/*
- * Dummy mask/unmask handler
- */
-void dummy_mask_unmask_irq(unsigned int irq)
-{
-}
-
-irqreturn_t no_action(int irq, void *dev_id, struct pt_regs *regs)
-{
-	return IRQ_NONE;
-}
-
-void do_bad_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs)
-{
-	irq_err_count += 1;
-	printk(KERN_ERR "IRQ: spurious interrupt %d\n", irq);
-}
-
-static struct irqchip bad_chip = {
-	.ack	= dummy_mask_unmask_irq,
-	.mask	= dummy_mask_unmask_irq,
-	.unmask = dummy_mask_unmask_irq,
-};
-
-static struct irqdesc bad_irq_desc = {
-	.chip		= &bad_chip,
-	.handle		= do_bad_IRQ,
-	.pend		= LIST_HEAD_INIT(bad_irq_desc.pend),
-	.disable_depth	= 1,
-};
-
-#ifdef CONFIG_SMP
-void synchronize_irq(unsigned int irq)
-{
-	struct irqdesc *desc = irq_desc + irq;
-
-	while (desc->running)
-		barrier();
-}
-EXPORT_SYMBOL(synchronize_irq);
-
-#define smp_set_running(desc)	do { desc->running = 1; } while (0)
-#define smp_clear_running(desc)	do { desc->running = 0; } while (0)
-#else
-#define smp_set_running(desc)	do { } while (0)
-#define smp_clear_running(desc)	do { } while (0)
-#endif
-
-/**
- *	disable_irq_nosync - disable an irq without waiting
- *	@irq: Interrupt to disable
- *
- *	Disable the selected interrupt line.  Enables and disables
- *	are nested.  We do this lazily.
- *
- *	This function may be called from IRQ context.
- */
-void disable_irq_nosync(unsigned int irq)
-{
-	struct irqdesc *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	desc->disable_depth++;
-	list_del_init(&desc->pend);
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
-EXPORT_SYMBOL(disable_irq_nosync);
-
-/**
- *	disable_irq - disable an irq and wait for completion
- *	@irq: Interrupt to disable
- *
- *	Disable the selected interrupt line.  Enables and disables
- *	are nested.  This functions waits for any pending IRQ
- *	handlers for this interrupt to complete before returning.
- *	If you use this function while holding a resource the IRQ
- *	handler may need you will deadlock.
- *
- *	This function may be called - with care - from IRQ context.
- */
-void disable_irq(unsigned int irq)
-{
-	struct irqdesc *desc = irq_desc + irq;
-
-	disable_irq_nosync(irq);
-	if (desc->action)
-		synchronize_irq(irq);
-}
-EXPORT_SYMBOL(disable_irq);
-
-/**
- *	enable_irq - enable interrupt handling on an irq
- *	@irq: Interrupt to enable
- *
- *	Re-enables the processing of interrupts on this IRQ line.
- *	Note that this may call the interrupt handler, so you may
- *	get unexpected results if you hold IRQs disabled.
- *
- *	This function may be called from IRQ context.
- */
-void enable_irq(unsigned int irq)
-{
-	struct irqdesc *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	if (unlikely(!desc->disable_depth)) {
-		printk("enable_irq(%u) unbalanced from %p\n", irq,
-			__builtin_return_address(0));
-	} else if (!--desc->disable_depth) {
-		desc->probing = 0;
-		desc->chip->unmask(irq);
-
-		/*
-		 * If the interrupt is waiting to be processed,
-		 * try to re-run it.  We can't directly run it
-		 * from here since the caller might be in an
-		 * interrupt-protected region.
-		 */
-		if (desc->pending && list_empty(&desc->pend)) {
-			desc->pending = 0;
-			if (!desc->chip->retrigger ||
-			    desc->chip->retrigger(irq))
-				list_add(&desc->pend, &irq_pending);
-		}
-	}
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
-EXPORT_SYMBOL(enable_irq);
-
-/*
- * Enable wake on selected irq
- */
-void enable_irq_wake(unsigned int irq)
-{
-	struct irqdesc *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	if (desc->chip->set_wake)
-		desc->chip->set_wake(irq, 1);
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
-EXPORT_SYMBOL(enable_irq_wake);
-
-void disable_irq_wake(unsigned int irq)
-{
-	struct irqdesc *desc = irq_desc + irq;
-	unsigned long flags;
-
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	if (desc->chip->set_wake)
-		desc->chip->set_wake(irq, 0);
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
-EXPORT_SYMBOL(disable_irq_wake);
-
 int show_interrupts(struct seq_file *p, void *v)
 {
 	int i = *(loff_t *) v, cpu;
@@ -243,7 +62,7 @@ int show_interrupts(struct seq_file *p, 
 	}
 
 	if (i < NR_IRQS) {
-		spin_lock_irqsave(&irq_controller_lock, flags);
+		spin_lock_irqsave(&irq_desc[i].lock, flags);
 	    	action = irq_desc[i].action;
 		if (!action)
 			goto unlock;
@@ -257,7 +76,7 @@ int show_interrupts(struct seq_file *p, 
 
 		seq_putc(p, '\n');
 unlock:
-		spin_unlock_irqrestore(&irq_controller_lock, flags);
+		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
 	} else if (i == NR_IRQS) {
 #ifdef CONFIG_ARCH_ACORN
 		show_fiq_list(p, v);
@@ -265,374 +84,83 @@ unlock:
 #ifdef CONFIG_SMP
 		show_ipi_list(p);
 #endif
+#ifdef FIXME_TGLX
 		seq_printf(p, "Err: %10lu\n", irq_err_count);
-	}
-	return 0;
-}
-
-/*
- * IRQ lock detection.
- *
- * Hopefully, this should get us out of a few locked situations.
- * However, it may take a while for this to happen, since we need
- * a large number if IRQs to appear in the same jiffie with the
- * same instruction pointer (or within 2 instructions).
- */
-static int check_irq_lock(struct irqdesc *desc, int irq, struct pt_regs *regs)
-{
-	unsigned long instr_ptr = instruction_pointer(regs);
-
-	if (desc->lck_jif == jiffies &&
-	    desc->lck_pc >= instr_ptr && desc->lck_pc < instr_ptr + 8) {
-		desc->lck_cnt += 1;
-
-		if (desc->lck_cnt > MAX_IRQ_CNT) {
-			printk(KERN_ERR "IRQ LOCK: IRQ%d is locking the system, disabled\n", irq);
-			return 1;
-		}
-	} else {
-		desc->lck_cnt = 0;
-		desc->lck_pc  = instruction_pointer(regs);
-		desc->lck_jif = jiffies;
-	}
-	return 0;
-}
-
-static void
-report_bad_irq(unsigned int irq, struct pt_regs *regs, struct irqdesc *desc, int ret)
-{
-	static int count = 100;
-	struct irqaction *action;
-
-	if (!count || noirqdebug)
-		return;
-
-	count--;
-
-	if (ret != IRQ_HANDLED && ret != IRQ_NONE) {
-		printk("irq%u: bogus retval mask %x\n", irq, ret);
-	} else {
-		printk("irq%u: nobody cared\n", irq);
-	}
-	show_regs(regs);
-	dump_stack();
-	printk(KERN_ERR "handlers:");
-	action = desc->action;
-	do {
-		printk("\n" KERN_ERR "[<%p>]", action->handler);
-		print_symbol(" (%s)", (unsigned long)action->handler);
-		action = action->next;
-	} while (action);
-	printk("\n");
-}
-
-static int
-__do_irq(unsigned int irq, struct irqaction *action, struct pt_regs *regs)
-{
-	unsigned int status;
-	int ret, retval = 0;
-
-	spin_unlock(&irq_controller_lock);
-
-#ifdef CONFIG_NO_IDLE_HZ
-	if (!(action->flags & SA_TIMER) && system_timer->dyn_tick != NULL) {
-		write_seqlock(&xtime_lock);
-		if (system_timer->dyn_tick->state & DYN_TICK_ENABLED)
-			system_timer->dyn_tick->handler(irq, 0, regs);
-		write_sequnlock(&xtime_lock);
-	}
 #endif
-
-	if (!(action->flags & SA_INTERRUPT))
-		local_irq_enable();
-
-	status = 0;
-	do {
-		ret = action->handler(irq, action->dev_id, regs);
-		if (ret == IRQ_HANDLED)
-			status |= action->flags;
-		retval |= ret;
-		action = action->next;
-	} while (action);
-
-	if (status & SA_SAMPLE_RANDOM)
-		add_interrupt_randomness(irq);
-
-	spin_lock_irq(&irq_controller_lock);
-
-	return retval;
-}
-
-/*
- * This is for software-decoded IRQs.  The caller is expected to
- * handle the ack, clear, mask and unmask issues.
- */
-void
-do_simple_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs)
-{
-	struct irqaction *action;
-	const unsigned int cpu = smp_processor_id();
-
-	desc->triggered = 1;
-
-	kstat_cpu(cpu).irqs[irq]++;
-
-	smp_set_running(desc);
-
-	action = desc->action;
-	if (action) {
-		int ret = __do_irq(irq, action, regs);
-		if (ret != IRQ_HANDLED)
-			report_bad_irq(irq, regs, desc, ret);
-	}
-
-	smp_clear_running(desc);
-}
-
-/*
- * Most edge-triggered IRQ implementations seem to take a broken
- * approach to this.  Hence the complexity.
- */
-void
-do_edge_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs)
-{
-	const unsigned int cpu = smp_processor_id();
-
-	desc->triggered = 1;
-
-	/*
-	 * If we're currently running this IRQ, or its disabled,
-	 * we shouldn't process the IRQ.  Instead, turn on the
-	 * hardware masks.
-	 */
-	if (unlikely(desc->running || desc->disable_depth))
-		goto running;
-
-	/*
-	 * Acknowledge and clear the IRQ, but don't mask it.
-	 */
-	desc->chip->ack(irq);
-
-	/*
-	 * Mark the IRQ currently in progress.
-	 */
-	desc->running = 1;
-
-	kstat_cpu(cpu).irqs[irq]++;
-
-	do {
-		struct irqaction *action;
-
-		action = desc->action;
-		if (!action)
-			break;
-
-		if (desc->pending && !desc->disable_depth) {
-			desc->pending = 0;
-			desc->chip->unmask(irq);
-		}
-
-		__do_irq(irq, action, regs);
-	} while (desc->pending && !desc->disable_depth);
-
-	desc->running = 0;
-
-	/*
-	 * If we were disabled or freed, shut down the handler.
-	 */
-	if (likely(desc->action && !check_irq_lock(desc, irq, regs)))
-		return;
-
- running:
-	/*
-	 * We got another IRQ while this one was masked or
-	 * currently running.  Delay it.
-	 */
-	desc->pending = 1;
-	desc->chip->mask(irq);
-	desc->chip->ack(irq);
-}
-
-/*
- * Level-based IRQ handler.  Nice and simple.
- */
-void
-do_level_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs)
-{
-	struct irqaction *action;
-	const unsigned int cpu = smp_processor_id();
-
-	desc->triggered = 1;
-
-	/*
-	 * Acknowledge, clear _AND_ disable the interrupt.
-	 */
-	desc->chip->ack(irq);
-
-	if (likely(!desc->disable_depth)) {
-		kstat_cpu(cpu).irqs[irq]++;
-
-		smp_set_running(desc);
-
-		/*
-		 * Return with this interrupt masked if no action
-		 */
-		action = desc->action;
-		if (action) {
-			int ret = __do_irq(irq, desc->action, regs);
-
-			if (ret != IRQ_HANDLED)
-				report_bad_irq(irq, regs, desc, ret);
-
-			if (likely(!desc->disable_depth &&
-				   !check_irq_lock(desc, irq, regs)))
-				desc->chip->unmask(irq);
-		}
-
-		smp_clear_running(desc);
 	}
+	return 0;
 }
 
-static void do_pending_irqs(struct pt_regs *regs)
-{
-	struct list_head head, *l, *n;
-
-	do {
-		struct irqdesc *desc;
-
-		/*
-		 * First, take the pending interrupts off the list.
-		 * The act of calling the handlers may add some IRQs
-		 * back onto the list.
-		 */
-		head = irq_pending;
-		INIT_LIST_HEAD(&irq_pending);
-		head.next->prev = &head;
-		head.prev->next = &head;
-
-		/*
-		 * Now run each entry.  We must delete it from our
-		 * list before calling the handler.
-		 */
-		list_for_each_safe(l, n, &head) {
-			desc = list_entry(l, struct irqdesc, pend);
-			list_del_init(&desc->pend);
-			desc_handle_irq(desc - irq_desc, desc, regs);
-		}
-
-		/*
-		 * The list must be empty.
-		 */
-		BUG_ON(!list_empty(&head));
-	} while (!list_empty(&irq_pending));
-}
+/* Descriptor asm_do_IRQ() substitutes when irq >= NR_IRQS (bad interrupt) */
+static struct irq_desc bad_irq = {
+	.handler = &no_irq_type,
+	.lock = RAW_SPIN_LOCK_UNLOCKED
+};
 
 /*
- * do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
+ * asm_do_IRQ handles all hardware IRQ's.  Decoded IRQs should not
  * come via this function.  Instead, they should provide their
  * own 'handler'
  */
-asmlinkage void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
+asmlinkage notrace void asm_do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	struct irqdesc *desc = irq_desc + irq;
 
+	trace_special(instruction_pointer(regs), irq, 0);
+
 	/*
 	 * Some hardware gives randomly wrong interrupts.  Rather
 	 * than crashing, do something sensible.
 	 */
 	if (irq >= NR_IRQS)
-		desc = &bad_irq_desc;
+		desc = &bad_irq;
 
 	irq_enter();
-	spin_lock(&irq_controller_lock);
-	desc_handle_irq(irq, desc, regs);
-
-	/*
-	 * Now re-run any pending interrupts.
-	 */
-	if (!list_empty(&irq_pending))
-		do_pending_irqs(regs);
 
-	irq_finish(irq);
+	desc_handle_irq(irq, desc, regs);
 
-	spin_unlock(&irq_controller_lock);
 	irq_exit();
 }
 
-void __set_irq_handler(unsigned int irq, irq_handler_t handle, int is_chained)
+void __set_irq_handler(unsigned int irq, struct irq_type *type, int is_chained)
 {
 	struct irqdesc *desc;
 	unsigned long flags;
 
 	if (irq >= NR_IRQS) {
-		printk(KERN_ERR "Trying to install handler for IRQ%d\n", irq);
+		printk(KERN_ERR "Trying to install type control for IRQ%d\n", irq);
 		return;
 	}
 
-	if (handle == NULL)
-		handle = do_bad_IRQ;
-
 	desc = irq_desc + irq;
 
-	if (is_chained && desc->chip == &bad_chip)
-		printk(KERN_WARNING "Trying to install chained handler for IRQ%d\n", irq);
-
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	if (handle == do_bad_IRQ) {
-		desc->chip->mask(irq);
-		desc->chip->ack(irq);
-		desc->disable_depth = 1;
-	}
-	desc->handle = handle;
-	if (handle != do_bad_IRQ && is_chained) {
-		desc->valid = 0;
-		desc->probe_ok = 0;
-		desc->disable_depth = 0;
-		desc->chip->unmask(irq);
+	/* Uninstall (NULL or no_irq_type): mask + ack the line, leave it disabled */
+	if (type == NULL || type == &no_irq_type) {
+		spin_lock_irqsave(&desc->lock, flags);
+		if (desc->chip) {
+			desc->chip->mask(irq);
+			desc->chip->ack(irq);
+		}
+		desc->depth = 1;
+		spin_unlock_irqrestore(&desc->lock, flags);
 	}
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
-
-void set_irq_chip(unsigned int irq, struct irqchip *chip)
-{
-	struct irqdesc *desc;
-	unsigned long flags;
 
-	if (irq >= NR_IRQS) {
-		printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+	/* Install the irq_type; a non-zero return aborts with nothing enabled */
+	if (generic_set_irq_type(irq, type))
 		return;
-	}
-
-	if (chip == NULL)
-		chip = &bad_chip;
-
-	desc = irq_desc + irq;
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	desc->chip = chip;
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
 
-int set_irq_type(unsigned int irq, unsigned int type)
-{
-	struct irqdesc *desc;
-	unsigned long flags;
-	int ret = -ENXIO;
+	spin_lock_irqsave(&desc->lock, flags);
+	if (is_chained && (desc->handler == &no_irq_type || !desc->chip))
+		printk(KERN_WARNING "Trying to install chained interrupt type for IRQ%d\n", irq);
 
-	if (irq >= NR_IRQS) {
-		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
-		return -ENODEV;
-	}
-
-	desc = irq_desc + irq;
-	if (desc->chip->set_type) {
-		spin_lock_irqsave(&irq_controller_lock, flags);
-		ret = desc->chip->set_type(irq, type);
-		spin_unlock_irqrestore(&irq_controller_lock, flags);
+	if (type != NULL && type != &no_irq_type && is_chained) { /* never re-enable an uninstall */
+		desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
+		desc->depth = 0;
+		if (desc->chip)
+			desc->chip->unmask(irq);
 	}
-
-	return ret;
+	spin_unlock_irqrestore(&desc->lock, flags);
 }
-EXPORT_SYMBOL(set_irq_type);
 
 void set_irq_flags(unsigned int irq, unsigned int iflags)
 {
@@ -645,408 +173,28 @@ void set_irq_flags(unsigned int irq, uns
 	}
 
 	desc = irq_desc + irq;
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	desc->valid = (iflags & IRQF_VALID) != 0;
-	desc->probe_ok = (iflags & IRQF_PROBE) != 0;
-	desc->noautoenable = (iflags & IRQF_NOAUTOEN) != 0;
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-}
-
-int setup_irq(unsigned int irq, struct irqaction *new)
-{
-	int shared = 0;
-	struct irqaction *old, **p;
-	unsigned long flags;
-	struct irqdesc *desc;
-
-	/*
-	 * Some drivers like serial.c use request_irq() heavily,
-	 * so we have to be careful not to interfere with a
-	 * running system.
-	 */
-	if (new->flags & SA_SAMPLE_RANDOM) {
-		/*
-		 * This function might sleep, we want to call it first,
-		 * outside of the atomic block.
-		 * Yes, this might clear the entropy pool if the wrong
-		 * driver is attempted to be loaded, without actually
-		 * installing a new handler, but is this really a problem,
-		 * only the sysadmin is able to do this.
-		 */
-	        rand_initialize_irq(irq);
-	}
-
-	/*
-	 * The following block of code has to be executed atomically
-	 */
-	desc = irq_desc + irq;
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	p = &desc->action;
-	if ((old = *p) != NULL) {
-		/* Can't share interrupts unless both agree to */
-		if (!(old->flags & new->flags & SA_SHIRQ)) {
-			spin_unlock_irqrestore(&irq_controller_lock, flags);
-			return -EBUSY;
-		}
-
-		/* add new interrupt at end of irq queue */
-		do {
-			p = &old->next;
-			old = *p;
-		} while (old);
-		shared = 1;
-	}
-
-	*p = new;
-
-	if (!shared) {
- 		desc->probing = 0;
-		desc->running = 0;
-		desc->pending = 0;
-		desc->disable_depth = 1;
-		if (!desc->noautoenable) {
-			desc->disable_depth = 0;
-			desc->chip->unmask(irq);
-		}
-	}
-
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-	return 0;
-}
-
-/**
- *	request_irq - allocate an interrupt line
- *	@irq: Interrupt line to allocate
- *	@handler: Function to be called when the IRQ occurs
- *	@irqflags: Interrupt type flags
- *	@devname: An ascii name for the claiming device
- *	@dev_id: A cookie passed back to the handler function
- *
- *	This call allocates interrupt resources and enables the
- *	interrupt line and IRQ handling. From the point this
- *	call is made your handler function may be invoked. Since
- *	your handler function must clear any interrupt the board
- *	raises, you must take care both to initialise your hardware
- *	and to set up the interrupt handler in the right order.
- *
- *	Dev_id must be globally unique. Normally the address of the
- *	device data structure is used as the cookie. Since the handler
- *	receives this value it makes sense to use it.
- *
- *	If your interrupt is shared you must pass a non NULL dev_id
- *	as this is required when freeing the interrupt.
- *
- *	Flags:
- *
- *	SA_SHIRQ		Interrupt is shared
- *
- *	SA_INTERRUPT		Disable local interrupts while processing
- *
- *	SA_SAMPLE_RANDOM	The interrupt can be used for entropy
- *
- */
-int request_irq(unsigned int irq, irqreturn_t (*handler)(int, void *, struct pt_regs *),
-		 unsigned long irq_flags, const char * devname, void *dev_id)
-{
-	unsigned long retval;
-	struct irqaction *action;
-
-	if (irq >= NR_IRQS || !irq_desc[irq].valid || !handler ||
-	    (irq_flags & SA_SHIRQ && !dev_id))
-		return -EINVAL;
-
-	action = (struct irqaction *)kmalloc(sizeof(struct irqaction), GFP_KERNEL);
-	if (!action)
-		return -ENOMEM;
-
-	action->handler = handler;
-	action->flags = irq_flags;
-	cpus_clear(action->mask);
-	action->name = devname;
-	action->next = NULL;
-	action->dev_id = dev_id;
-
-	retval = setup_irq(irq, action);
-
-	if (retval)
-		kfree(action);
-	return retval;
-}
-
-EXPORT_SYMBOL(request_irq);
-
-/**
- *	free_irq - free an interrupt
- *	@irq: Interrupt line to free
- *	@dev_id: Device identity to free
- *
- *	Remove an interrupt handler. The handler is removed and if the
- *	interrupt line is no longer in use by any driver it is disabled.
- *	On a shared IRQ the caller must ensure the interrupt is disabled
- *	on the card it drives before calling this function.
- *
- *	This function must not be called from interrupt context.
- */
-void free_irq(unsigned int irq, void *dev_id)
-{
-	struct irqaction * action, **p;
-	unsigned long flags;
-
-	if (irq >= NR_IRQS || !irq_desc[irq].valid) {
-		printk(KERN_ERR "Trying to free IRQ%d\n",irq);
-		dump_stack();
-		return;
-	}
-
-	spin_lock_irqsave(&irq_controller_lock, flags);
-	for (p = &irq_desc[irq].action; (action = *p) != NULL; p = &action->next) {
-		if (action->dev_id != dev_id)
-			continue;
-
-	    	/* Found it - now free it */
-		*p = action->next;
-		break;
-	}
-	spin_unlock_irqrestore(&irq_controller_lock, flags);
-
-	if (!action) {
-		printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
-		dump_stack();
-	} else {
-		synchronize_irq(irq);
-		kfree(action);
-	}
-}
-
-EXPORT_SYMBOL(free_irq);
-
-static DECLARE_MUTEX(probe_sem);
-
-/* Start the interrupt probing.  Unlike other architectures,
- * we don't return a mask of interrupts from probe_irq_on,
- * but return the number of interrupts enabled for the probe.
- * The interrupts which have been enabled for probing is
- * instead recorded in the irq_desc structure.
- */
-unsigned long probe_irq_on(void)
-{
-	unsigned int i, irqs = 0;
-	unsigned long delay;
-
-	down(&probe_sem);
-
-	/*
-	 * first snaffle up any unassigned but
-	 * probe-able interrupts
-	 */
-	spin_lock_irq(&irq_controller_lock);
-	for (i = 0; i < NR_IRQS; i++) {
-		if (!irq_desc[i].probe_ok || irq_desc[i].action)
-			continue;
-
-		irq_desc[i].probing = 1;
-		irq_desc[i].triggered = 0;
-		if (irq_desc[i].chip->set_type)
-			irq_desc[i].chip->set_type(i, IRQT_PROBE);
-		irq_desc[i].chip->unmask(i);
-		irqs += 1;
-	}
-	spin_unlock_irq(&irq_controller_lock);
-
-	/*
-	 * wait for spurious interrupts to mask themselves out again
-	 */
-	for (delay = jiffies + HZ/10; time_before(jiffies, delay); )
-		/* min 100ms delay */;
-
-	/*
-	 * now filter out any obviously spurious interrupts
-	 */
-	spin_lock_irq(&irq_controller_lock);
-	for (i = 0; i < NR_IRQS; i++) {
-		if (irq_desc[i].probing && irq_desc[i].triggered) {
-			irq_desc[i].probing = 0;
-			irqs -= 1;
-		}
-	}
-	spin_unlock_irq(&irq_controller_lock);
-
-	return irqs;
-}
-
-EXPORT_SYMBOL(probe_irq_on);
-
-unsigned int probe_irq_mask(unsigned long irqs)
-{
-	unsigned int mask = 0, i;
-
-	spin_lock_irq(&irq_controller_lock);
-	for (i = 0; i < 16 && i < NR_IRQS; i++)
-		if (irq_desc[i].probing && irq_desc[i].triggered)
-			mask |= 1 << i;
-	spin_unlock_irq(&irq_controller_lock);
-
-	up(&probe_sem);
-
-	return mask;
-}
-EXPORT_SYMBOL(probe_irq_mask);
-
-/*
- * Possible return values:
- *  >= 0 - interrupt number
- *    -1 - no interrupt/many interrupts
- */
-int probe_irq_off(unsigned long irqs)
-{
-	unsigned int i;
-	int irq_found = NO_IRQ;
-
-	/*
-	 * look at the interrupts, and find exactly one
-	 * that we were probing has been triggered
-	 */
-	spin_lock_irq(&irq_controller_lock);
-	for (i = 0; i < NR_IRQS; i++) {
-		if (irq_desc[i].probing &&
-		    irq_desc[i].triggered) {
-			if (irq_found != NO_IRQ) {
-				irq_found = NO_IRQ;
-				goto out;
-			}
-			irq_found = i;
-		}
-	}
-
-	if (irq_found == -1)
-		irq_found = NO_IRQ;
-out:
-	spin_unlock_irq(&irq_controller_lock);
-
-	up(&probe_sem);
-
-	return irq_found;
-}
-
-EXPORT_SYMBOL(probe_irq_off);
-
-#ifdef CONFIG_SMP
-static void route_irq(struct irqdesc *desc, unsigned int irq, unsigned int cpu)
-{
-	pr_debug("IRQ%u: moving from cpu%u to cpu%u\n", irq, desc->cpu, cpu);
-
-	spin_lock_irq(&irq_controller_lock);
-	desc->cpu = cpu;
-	desc->chip->set_cpu(desc, irq, cpu);
-	spin_unlock_irq(&irq_controller_lock);
-}
-
-#ifdef CONFIG_PROC_FS
-static int
-irq_affinity_read_proc(char *page, char **start, off_t off, int count,
-		       int *eof, void *data)
-{
-	struct irqdesc *desc = irq_desc + ((int)data);
-	int len = cpumask_scnprintf(page, count, desc->affinity);
-
-	if (count - len < 2)
-		return -EINVAL;
-	page[len++] = '\n';
-	page[len] = '\0';
-
-	return len;
-}
-
-static int
-irq_affinity_write_proc(struct file *file, const char __user *buffer,
-			unsigned long count, void *data)
-{
-	unsigned int irq = (unsigned int)data;
-	struct irqdesc *desc = irq_desc + irq;
-	cpumask_t affinity, tmp;
-	int ret = -EIO;
-
-	if (!desc->chip->set_cpu)
-		goto out;
-
-	ret = cpumask_parse(buffer, count, affinity);
-	if (ret)
-		goto out;
-
-	cpus_and(tmp, affinity, cpu_online_map);
-	if (cpus_empty(tmp)) {
-		ret = -EINVAL;
-		goto out;
-	}
-
-	desc->affinity = affinity;
-	route_irq(desc, irq, first_cpu(tmp));
-	ret = count;
-
- out:
-	return ret;
-}
-#endif
-#endif
-
-void __init init_irq_proc(void)
-{
-#if defined(CONFIG_SMP) && defined(CONFIG_PROC_FS)
-	struct proc_dir_entry *dir;
-	int irq;
-
-	dir = proc_mkdir("irq", 0);
-	if (!dir)
-		return;
-
-	for (irq = 0; irq < NR_IRQS; irq++) {
-		struct proc_dir_entry *entry;
-		struct irqdesc *desc;
-		char name[16];
-
-		desc = irq_desc + irq;
-		memset(name, 0, sizeof(name));
-		snprintf(name, sizeof(name) - 1, "%u", irq);
-
-		desc->procdir = proc_mkdir(name, dir);
-		if (!desc->procdir)
-			continue;
-
-		entry = create_proc_entry("smp_affinity", 0600, desc->procdir);
-		if (entry) {
-			entry->nlink = 1;
-			entry->data = (void *)irq;
-			entry->read_proc = irq_affinity_read_proc;
-			entry->write_proc = irq_affinity_write_proc;
-		}
-	}
-#endif
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->status |= IRQ_NOREQUEST | IRQ_NOPROBE;
+	if (iflags & IRQF_VALID)
+		desc->status &= ~IRQ_NOREQUEST;
+	if (iflags & IRQF_PROBE)
+		desc->status &= ~IRQ_NOPROBE;
+	spin_unlock_irqrestore(&desc->lock, flags);
 }
 
 void __init init_IRQ(void)
 {
-	struct irqdesc *desc;
 	extern void init_dma(void);
 	int irq;
 
+	for (irq = 0; irq < NR_IRQS; irq++)
+		irq_desc[irq].status |= IRQ_NOREQUEST;
+
 #ifdef CONFIG_SMP
 	bad_irq_desc.affinity = CPU_MASK_ALL;
 	bad_irq_desc.cpu = smp_processor_id();
 #endif
 
-	for (irq = 0, desc = irq_desc; irq < NR_IRQS; irq++, desc++) {
-		*desc = bad_irq_desc;
-		INIT_LIST_HEAD(&desc->pend);
-	}
-
 	init_arch_irq();
 	init_dma();
 }
-
-static int __init noirqdebug_setup(char *str)
-{
-	noirqdebug = 1;
-	return 1;
-}
-
-__setup("noirqdebug", noirqdebug_setup);
Index: linux/arch/arm/kernel/process.c
===================================================================
--- linux.orig/arch/arm/kernel/process.c
+++ linux/arch/arm/kernel/process.c
@@ -85,12 +85,12 @@ EXPORT_SYMBOL(pm_power_off);
  */
 void default_idle(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	if (!need_resched() && !hlt_counter) {
 		timer_dyn_reprogram();
 		arch_idle();
 	}
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
@@ -112,8 +112,8 @@ void cpu_idle(void)
 		while (!need_resched())
 			idle();
 		leds_event(led_idle_end);
-		preempt_enable();
-		schedule();
+		__preempt_enable_no_resched();
+		__schedule();
 	}
 }
 
Index: linux/arch/arm/kernel/semaphore.c
===================================================================
--- linux.orig/arch/arm/kernel/semaphore.c
+++ linux/arch/arm/kernel/semaphore.c
@@ -49,14 +49,14 @@
  *    we cannot lose wakeup events.
  */
 
-void __up(struct semaphore *sem)
+fastcall void __attribute_used__ __compat_up(struct compat_semaphore *sem)
 {
 	wake_up(&sem->wait);
 }
 
 static DEFINE_SPINLOCK(semaphore_lock);
 
-void __sched __down(struct semaphore * sem)
+fastcall void __attribute_used__ __sched __compat_down(struct compat_semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -89,7 +89,7 @@ void __sched __down(struct semaphore * s
 	wake_up(&sem->wait);
 }
 
-int __sched __down_interruptible(struct semaphore * sem)
+fastcall int __attribute_used__ __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -148,7 +148,7 @@ int __sched __down_interruptible(struct 
  * single "cmpxchg" without failure cases,
  * but then it wouldn't work on a 386.
  */
-int __down_trylock(struct semaphore * sem)
+fastcall int __attribute_used__ __compat_down_trylock(struct compat_semaphore * sem)
 {
 	int sleepers;
 	unsigned long flags;
@@ -168,6 +168,11 @@ int __down_trylock(struct semaphore * se
 	return 1;
 }
 
+fastcall int compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+
 /*
  * The semaphore operations have a special calling sequence that
  * allow us to do a simpler in-line version of them. These routines
@@ -184,7 +189,7 @@ asm("	.section .sched.text,\"ax\",%progb
 __down_failed:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__down				\n\
+	bl	__compat_down			\n\
 	ldmfd	sp!, {r0 - r3, pc}		\n\
 						\n\
 	.align	5				\n\
@@ -192,7 +197,7 @@ __down_failed:					\n\
 __down_interruptible_failed:			\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__down_interruptible		\n\
+	bl	__compat_down_interruptible	\n\
 	mov	ip, r0				\n\
 	ldmfd	sp!, {r0 - r3, pc}		\n\
 						\n\
@@ -201,7 +206,7 @@ __down_interruptible_failed:			\n\
 __down_trylock_failed:				\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__down_trylock			\n\
+	bl	__compat_down_trylock		\n\
 	mov	ip, r0				\n\
 	ldmfd	sp!, {r0 - r3, pc}		\n\
 						\n\
@@ -210,7 +215,7 @@ __down_trylock_failed:				\n\
 __up_wakeup:					\n\
 	stmfd	sp!, {r0 - r3, lr}		\n\
 	mov	r0, ip				\n\
-	bl	__up				\n\
+	bl	__compat_up			\n\
 	ldmfd	sp!, {r0 - r3, pc}		\n\
 	");
 
Index: linux/arch/arm/kernel/signal.c
===================================================================
--- linux.orig/arch/arm/kernel/signal.c
+++ linux/arch/arm/kernel/signal.c
@@ -689,6 +689,14 @@ static int do_signal(sigset_t *oldset, s
 	siginfo_t info;
 	int signr;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
Index: linux/arch/arm/kernel/smp.c
===================================================================
--- linux.orig/arch/arm/kernel/smp.c
+++ linux/arch/arm/kernel/smp.c
@@ -56,6 +56,7 @@ struct ipi_data {
 	unsigned long bits;
 };
 
+/* FIXME */
 static DEFINE_PER_CPU(struct ipi_data, ipi_data) = {
 	.lock	= SPIN_LOCK_UNLOCKED,
 };
@@ -246,7 +247,7 @@ static void send_ipi_message(cpumask_t c
 	unsigned long flags;
 	unsigned int cpu;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for_each_cpu_mask(cpu, callmap) {
 		struct ipi_data *ipi = &per_cpu(ipi_data, cpu);
@@ -261,7 +262,7 @@ static void send_ipi_message(cpumask_t c
 	 */
 	smp_cross_call(callmap);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -394,7 +395,7 @@ static void ipi_call_function(unsigned i
 		cpu_clear(cpu, data->unfinished);
 }
 
-static DEFINE_SPINLOCK(stop_lock);
+static DEFINE_RAW_SPINLOCK(stop_lock);
 
 /*
  * ipi_cpu_stop - handle IPI from smp_send_stop()
@@ -409,7 +410,7 @@ static void ipi_cpu_stop(unsigned int cp
 	cpu_clear(cpu, cpu_online_map);
 
 	local_fiq_disable();
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	while (1)
 		cpu_relax();
Index: linux/arch/arm/kernel/time.c
===================================================================
--- linux.orig/arch/arm/kernel/time.c
+++ linux/arch/arm/kernel/time.c
@@ -36,10 +36,6 @@
 #include <asm/thread_info.h>
 #include <asm/mach/time.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /*
  * Our system timer.
  */
Index: linux/arch/arm/kernel/traps.c
===================================================================
--- linux.orig/arch/arm/kernel/traps.c
+++ linux/arch/arm/kernel/traps.c
@@ -177,6 +177,8 @@ void dump_stack(void)
 {
 #ifdef CONFIG_DEBUG_ERRORS
 	__backtrace();
+	print_traces(current);
+	show_held_locks(current);
 #endif
 }
 
@@ -198,7 +200,7 @@ void show_stack(struct task_struct *tsk,
 	barrier();
 }
 
-DEFINE_SPINLOCK(die_lock);
+DEFINE_RAW_SPINLOCK(die_lock);
 
 /*
  * This function is protected against re-entrancy.
@@ -244,7 +246,7 @@ void notify_die(const char *str, struct 
 }
 
 static LIST_HEAD(undef_hook);
-static DEFINE_SPINLOCK(undef_lock);
+static DEFINE_RAW_SPINLOCK(undef_lock);
 
 void register_undef_hook(struct undef_hook *hook)
 {
@@ -336,7 +338,7 @@ asmlinkage void bad_mode(struct pt_regs 
 		handler[reason], processor_modes[proc_mode]);
 
 	die("Oops - bad mode", regs, 0);
-	local_irq_disable();
+	raw_local_irq_disable();
 	panic("bad mode");
 }
 
Index: linux/arch/arm/mach-clps711x/p720t-leds.c
===================================================================
--- linux.orig/arch/arm/mach-clps711x/p720t-leds.c
+++ linux/arch/arm/mach-clps711x/p720t-leds.c
@@ -36,7 +36,7 @@ static void p720t_leds_event(led_event_t
 	unsigned long flags;
 	u32 pddr;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	switch(ledevt) {
 	case led_idle_start:
 		break;
@@ -53,7 +53,7 @@ static void p720t_leds_event(led_event_t
 		break;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static int __init leds_init(void)
Index: linux/arch/arm/mach-clps711x/time.c
===================================================================
--- linux.orig/arch/arm/mach-clps711x/time.c
+++ linux/arch/arm/mach-clps711x/time.c
@@ -19,6 +19,7 @@
 #include <linux/timex.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/sched.h>
 
 #include <asm/hardware.h>
Index: linux/arch/arm/mach-clps7500/core.c
===================================================================
--- linux.orig/arch/arm/mach-clps7500/core.c
+++ linux/arch/arm/mach-clps7500/core.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/list.h>
 #include <linux/sched.h>
 #include <linux/init.h>
Index: linux/arch/arm/mach-ebsa110/core.c
===================================================================
--- linux.orig/arch/arm/mach-ebsa110/core.c
+++ linux/arch/arm/mach-ebsa110/core.c
@@ -56,14 +56,14 @@ static void __init ebsa110_init_irq(void
 	unsigned long flags;
 	unsigned int irq;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	__raw_writeb(0xff, IRQ_MCLR);
 	__raw_writeb(0x55, IRQ_MSET);
 	__raw_writeb(0x00, IRQ_MSET);
 	if (__raw_readb(IRQ_MASK) != 0x55)
 		while (1);
 	__raw_writeb(0xff, IRQ_MCLR);	/* clear all interrupt enables */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	for (irq = 0; irq < NR_IRQS; irq++) {
 		set_irq_chip(irq, &ebsa110_irq_chip);
Index: linux/arch/arm/mach-footbridge/dc21285-timer.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/dc21285-timer.c
+++ linux/arch/arm/mach-footbridge/dc21285-timer.c
@@ -6,6 +6,7 @@
  */
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <asm/irq.h>
 
Index: linux/arch/arm/mach-footbridge/isa-irq.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/isa-irq.c
+++ linux/arch/arm/mach-footbridge/isa-irq.c
@@ -102,6 +102,17 @@ static struct irqaction irq_cascade = { 
 static struct resource pic1_resource = { "pic1", 0x20, 0x3f };
 static struct resource pic2_resource = { "pic2", 0xa0, 0xbf };
 
+static DEFINE_IRQ_CHAINED_TYPE(isa_irq_handler);
+
+static unsigned int startup_irq_disabled(unsigned int irq)
+{
+	return 0;
+}
+
+/* Interrupt type for irqs which must not be
+ * automatically enabled in request_irq */
+static struct irq_type level_type_nostart;
+
 void __init isa_init_irq(unsigned int host_irq)
 {
 	unsigned int irq;
@@ -159,9 +170,11 @@ void __init isa_init_irq(unsigned int ho
 		 * There appears to be a missing pull-up
 		 * resistor on this line.
 		 */
-		if (machine_is_netwinder())
-			set_irq_flags(_ISA_IRQ(11), IRQF_VALID |
-				      IRQF_PROBE | IRQF_NOAUTOEN);
+		if (machine_is_netwinder()) {
+			level_type_nostart = default_level_type;
+			level_type_nostart.startup = startup_irq_disabled;
+			set_irq_handler(_ISA_IRQ(11), &level_type_nostart);
+		}
 	}
 }
 
Index: linux/arch/arm/mach-footbridge/isa-timer.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/isa-timer.c
+++ linux/arch/arm/mach-footbridge/isa-timer.c
@@ -6,6 +6,7 @@
  */
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
Index: linux/arch/arm/mach-footbridge/netwinder-hw.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/netwinder-hw.c
+++ linux/arch/arm/mach-footbridge/netwinder-hw.c
@@ -68,7 +68,7 @@ static inline void wb977_ww(int reg, int
 /*
  * This is a lock for accessing ports GP1_IO_BASE and GP2_IO_BASE
  */
-DEFINE_SPINLOCK(gpio_lock);
+DEFINE_RAW_SPINLOCK(gpio_lock);
 
 static unsigned int current_gpio_op;
 static unsigned int current_gpio_io;
Index: linux/arch/arm/mach-footbridge/netwinder-leds.c
===================================================================
--- linux.orig/arch/arm/mach-footbridge/netwinder-leds.c
+++ linux/arch/arm/mach-footbridge/netwinder-leds.c
@@ -33,7 +33,7 @@ static char led_state;
 static char hw_led_state;
 
 static DEFINE_SPINLOCK(leds_lock);
-extern spinlock_t gpio_lock;
+extern raw_spinlock_t gpio_lock;
 
 static void netwinder_leds_event(led_event_t evt)
 {
Index: linux/arch/arm/mach-h720x/common.c
===================================================================
--- linux.orig/arch/arm/mach-h720x/common.c
+++ linux/arch/arm/mach-h720x/common.c
@@ -163,6 +163,11 @@ h720x_gpiod_demux_handler(unsigned int i
 	h720x_gpio_handler(mask, irq, desc, regs);
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioa_demux_handler);
+static DEFINE_IRQ_CHAINED_TYPE(h720x_gpiob_demux_handler);
+static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioc_demux_handler);
+static DEFINE_IRQ_CHAINED_TYPE(h720x_gpiod_demux_handler);
+
 #ifdef CONFIG_CPU_H7202
 static void
 h720x_gpioe_demux_handler(unsigned int irq_unused, struct irqdesc *desc,
@@ -175,6 +180,7 @@ h720x_gpioe_demux_handler(unsigned int i
 	IRQDBG("%s mask: 0x%08x irq: %d\n",__FUNCTION__,mask,irq);
 	h720x_gpio_handler(mask, irq, desc, regs);
 }
+static DEFINE_IRQ_CHAINED_TYPE(h720x_gpioe_demux_handler);
 #endif
 
 static struct irqchip h720x_global_chip = {
Index: linux/arch/arm/mach-h720x/cpu-h7202.c
===================================================================
--- linux.orig/arch/arm/mach-h720x/cpu-h7202.c
+++ linux/arch/arm/mach-h720x/cpu-h7202.c
@@ -175,6 +175,8 @@ static struct irqaction h7202_timer_irq 
 	.handler	= h7202_timer_interrupt,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(h7202_timerx_demux_handler);
+
 /*
  * Setup TIMER0 as system timer
  */
Index: linux/arch/arm/mach-imx/dma.c
===================================================================
--- linux.orig/arch/arm/mach-imx/dma.c
+++ linux/arch/arm/mach-imx/dma.c
@@ -43,7 +43,7 @@ imx_request_dma(char *name, imx_dma_prio
 	if (!name || !irq_handler)
 		return -EINVAL;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* try grabbing a DMA channel with the requested priority */
 	for (i = prio; i < prio + (prio == DMA_PRIO_LOW) ? 8 : 4; i++) {
@@ -75,7 +75,7 @@ imx_request_dma(char *name, imx_dma_prio
 		i = -ENODEV;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return i;
 }
 
@@ -91,10 +91,10 @@ imx_free_dma(int dma_ch)
 		return;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	DIMR &= ~(1 << dma_ch);
 	dma_channels[dma_ch].name = NULL;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static irqreturn_t
Index: linux/arch/arm/mach-imx/irq.c
===================================================================
--- linux.orig/arch/arm/mach-imx/irq.c
+++ linux/arch/arm/mach-imx/irq.c
@@ -217,6 +217,11 @@ static struct irqchip imx_gpio_chip = {
 	.set_type = imx_gpio_irq_type,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(imx_gpioa_demux_handler);
+static DEFINE_IRQ_CHAINED_TYPE(imx_gpiob_demux_handler);
+static DEFINE_IRQ_CHAINED_TYPE(imx_gpioc_demux_handler);
+static DEFINE_IRQ_CHAINED_TYPE(imx_gpiod_demux_handler);
+
 void __init
 imx_init_irq(void)
 {
Index: linux/arch/arm/mach-imx/leds-mx1ads.c
===================================================================
--- linux.orig/arch/arm/mach-imx/leds-mx1ads.c
+++ linux/arch/arm/mach-imx/leds-mx1ads.c
@@ -29,7 +29,7 @@ mx1ads_leds_event(led_event_t ledevt)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch (ledevt) {
 #ifdef CONFIG_LEDS_CPU
@@ -49,5 +49,5 @@ mx1ads_leds_event(led_event_t ledevt)
 	default:
 		break;
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-imx/time.c
===================================================================
--- linux.orig/arch/arm/mach-imx/time.c
+++ linux/arch/arm/mach-imx/time.c
@@ -13,6 +13,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/time.h>
 
 #include <asm/hardware.h>
Index: linux/arch/arm/mach-integrator/core.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/core.c
+++ linux/arch/arm/mach-integrator/core.c
@@ -13,6 +13,7 @@
 #include <linux/device.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/sched.h>
 #include <linux/smp.h>
 
@@ -117,7 +118,7 @@ arch_initcall(integrator_init);
 
 #define CM_CTRL	IO_ADDRESS(INTEGRATOR_HDR_BASE) + INTEGRATOR_HDR_CTRL_OFFSET
 
-static DEFINE_SPINLOCK(cm_lock);
+static DEFINE_RAW_SPINLOCK(cm_lock);
 
 /**
  * cm_control - update the CM_CTRL register.
Index: linux/arch/arm/mach-integrator/leds.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/leds.c
+++ linux/arch/arm/mach-integrator/leds.c
@@ -41,7 +41,7 @@ static void integrator_leds_event(led_ev
 	unsigned int update_alpha_leds;
 
 	// yup, change the LEDs
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	update_alpha_leds = 0;
 
 	switch(ledevt) {
@@ -76,7 +76,7 @@ static void integrator_leds_event(led_ev
 		while (__raw_readl(dbg_base + INTEGRATOR_DBG_ALPHA_OFFSET) & 1);
 		__raw_writel(saved_leds, dbg_base + INTEGRATOR_DBG_LEDS_OFFSET);
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static int __init leds_init(void)
Index: linux/arch/arm/mach-integrator/pci_v3.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/pci_v3.c
+++ linux/arch/arm/mach-integrator/pci_v3.c
@@ -163,7 +163,7 @@
  *	 7:2	register number
  *  
  */
-static DEFINE_SPINLOCK(v3_lock);
+static DEFINE_RAW_SPINLOCK(v3_lock);
 
 #define PCI_BUS_NONMEM_START	0x00000000
 #define PCI_BUS_NONMEM_SIZE	SZ_256M
Index: linux/arch/arm/mach-integrator/platsmp.c
===================================================================
--- linux.orig/arch/arm/mach-integrator/platsmp.c
+++ linux/arch/arm/mach-integrator/platsmp.c
@@ -31,7 +31,7 @@ extern void integrator_secondary_startup
 volatile int __cpuinitdata pen_release = -1;
 unsigned long __cpuinitdata phys_pen_release = 0;
 
-static DEFINE_SPINLOCK(boot_lock);
+static DEFINE_RAW_SPINLOCK(boot_lock);
 
 void __cpuinit platform_secondary_init(unsigned int cpu)
 {
Index: linux/arch/arm/mach-ixp2000/core.c
===================================================================
--- linux.orig/arch/arm/mach-ixp2000/core.c
+++ linux/arch/arm/mach-ixp2000/core.c
@@ -20,6 +20,7 @@
 #include <linux/spinlock.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/serial.h>
 #include <linux/tty.h>
 #include <linux/bitops.h>
@@ -286,9 +287,9 @@ void gpio_line_config(int line, int dire
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (direction == GPIO_OUT) {
-		irq_desc[line + IRQ_IXP2000_GPIO0].valid = 0;
+ 		set_irq_flags(line + IRQ_IXP2000_GPIO0, 0);
 
 		/* if it's an output, it ain't an interrupt anymore */
 		GPIO_IRQ_falling_edge &= ~(1 << line);
@@ -301,7 +302,7 @@ void gpio_line_config(int line, int dire
 	} else if (direction == GPIO_IN) {
 		ixp2000_reg_write(IXP2000_GPIO_PDCR, 1 << line);
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 
@@ -354,8 +355,7 @@ static int ixp2000_GPIO_irq_type(unsigne
 	/*
 	 * Finally, mark the corresponding IRQ as valid.
 	 */
-	irq_desc[irq].valid = 1;
-
+	set_irq_flags(irq, IRQF_VALID);
 	return 0;
 }
 
@@ -425,6 +425,8 @@ static struct irqchip ixp2000_irq_chip =
 	.unmask	= ixp2000_irq_unmask
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(ixp2000_GPIO_irq_handler);
+
 void __init ixp2000_init_irq(void)
 {
 	int irq;
Index: linux/arch/arm/mach-ixp2000/ixdp2x00.c
===================================================================
--- linux.orig/arch/arm/mach-ixp2000/ixdp2x00.c
+++ linux/arch/arm/mach-ixp2000/ixdp2x00.c
@@ -146,6 +146,8 @@ static struct irqchip ixdp2x00_cpld_irq_
 	.unmask	= ixdp2x00_irq_unmask
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(ixdp2x00_irq_handler);
+
 void ixdp2x00_init_irq(volatile unsigned long *stat_reg, volatile unsigned long *mask_reg, unsigned long nr_irqs)
 {
 	unsigned int irq;
@@ -168,7 +170,7 @@ void ixdp2x00_init_irq(volatile unsigned
 	}
 
 	/* Hook into PCI interrupt */
-	set_irq_chained_handler(IRQ_IXP2000_PCIB, &ixdp2x00_irq_handler);
+	set_irq_chained_handler(IRQ_IXP2000_PCIB, ixdp2x00_irq_handler);
 }
 
 /*************************************************************************
Index: linux/arch/arm/mach-ixp2000/ixdp2x01.c
===================================================================
--- linux.orig/arch/arm/mach-ixp2000/ixdp2x01.c
+++ linux/arch/arm/mach-ixp2000/ixdp2x01.c
@@ -95,6 +95,8 @@ static struct irqchip ixdp2x01_irq_chip 
 	.unmask	= ixdp2x01_irq_unmask
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(ixdp2x01_irq_handler);
+
 /*
  * We only do anything if we are the master NPU on the board.
  * The slave NPU only has the ethernet chip going directly to
@@ -127,7 +129,7 @@ void __init ixdp2x01_init_irq(void)
 	}
 
 	/* Hook into PCI interrupts */
-	set_irq_chained_handler(IRQ_IXP2000_PCIB, &ixdp2x01_irq_handler);
+	set_irq_chained_handler(IRQ_IXP2000_PCIB, ixdp2x01_irq_handler);
 }
 
 
Index: linux/arch/arm/mach-ixp2000/pci.c
===================================================================
--- linux.orig/arch/arm/mach-ixp2000/pci.c
+++ linux/arch/arm/mach-ixp2000/pci.c
@@ -145,7 +145,7 @@ int ixp2000_pci_abort_handler(unsigned l
 
 	pci_master_aborts = 1;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	temp = *(IXP2000_PCI_CONTROL);
 	if (temp & ((1 << 8) | (1 << 5))) {
 		ixp2000_reg_write(IXP2000_PCI_CONTROL, temp);
@@ -158,7 +158,7 @@ int ixp2000_pci_abort_handler(unsigned l
 			temp = *(IXP2000_PCI_CMDSTAT);
 		}
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	/*
 	 * If it was an imprecise abort, then we need to correct the
@@ -176,7 +176,7 @@ clear_master_aborts(void)
 	volatile u32 temp;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	temp = *(IXP2000_PCI_CONTROL);
 	if (temp & ((1 << 8) | (1 << 5))) {	
 		ixp2000_reg_write(IXP2000_PCI_CONTROL, temp);
@@ -189,7 +189,7 @@ clear_master_aborts(void)
 			temp = *(IXP2000_PCI_CMDSTAT);
 		}
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return 0;
 }
Index: linux/arch/arm/mach-ixp4xx/common-pci.c
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/common-pci.c
+++ linux/arch/arm/mach-ixp4xx/common-pci.c
@@ -53,7 +53,7 @@ unsigned long ixp4xx_pci_reg_base = 0;
  * these transactions are atomic or we will end up
  * with corrupt data on the bus or in a driver.
  */
-static DEFINE_SPINLOCK(ixp4xx_pci_lock);
+static DEFINE_RAW_SPINLOCK(ixp4xx_pci_lock);
 
 /*
  * Read from PCI config space
Index: linux/arch/arm/mach-ixp4xx/coyote-pci.c
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/coyote-pci.c
+++ linux/arch/arm/mach-ixp4xx/coyote-pci.c
@@ -17,6 +17,7 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/irq.h>
 
 #include <asm/mach-types.h>
 #include <asm/hardware.h>
Index: linux/arch/arm/mach-ixp4xx/ixdp425-pci.c
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/ixdp425-pci.c
+++ linux/arch/arm/mach-ixp4xx/ixdp425-pci.c
@@ -16,6 +16,7 @@
 
 #include <linux/kernel.h>
 #include <linux/config.h>
+#include <linux/irq.h>
 #include <linux/pci.h>
 #include <linux/init.h>
 #include <linux/delay.h>
Index: linux/arch/arm/mach-ixp4xx/ixdpg425-pci.c
===================================================================
--- linux.orig/arch/arm/mach-ixp4xx/ixdpg425-pci.c
+++ linux/arch/arm/mach-ixp4xx/ixdpg425-pci.c
@@ -16,10 +16,10 @@
 #include <linux/kernel.h>
 #include <linux/pci.h>
 #include <linux/init.h>
+#include <linux/irq.h>
 
 #include <asm/mach-types.h>
 #include <asm/hardware.h>
-#include <asm/irq.h>
 
 #include <asm/mach/pci.h>
 
Index: linux/arch/arm/mach-l7200/core.c
===================================================================
--- linux.orig/arch/arm/mach-l7200/core.c
+++ linux/arch/arm/mach-l7200/core.c
@@ -7,6 +7,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/irq.h>
 #include <linux/device.h>
 
 #include <asm/types.h>
Index: linux/arch/arm/mach-lh7a40x/arch-kev7a400.c
===================================================================
--- linux.orig/arch/arm/mach-lh7a40x/arch-kev7a400.c
+++ linux/arch/arm/mach-lh7a40x/arch-kev7a400.c
@@ -72,6 +72,8 @@ static void kev7a400_cpld_handler (unsig
 	}
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(kev7a400_cpld_handler);
+
 void __init lh7a40x_init_board_irq (void)
 {
 	int irq;
Index: linux/arch/arm/mach-lh7a40x/arch-lpd7a40x.c
===================================================================
--- linux.orig/arch/arm/mach-lh7a40x/arch-lpd7a40x.c
+++ linux/arch/arm/mach-lh7a40x/arch-lpd7a40x.c
@@ -12,6 +12,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <asm/hardware.h>
 #include <asm/setup.h>
@@ -173,6 +174,7 @@ static void lpd7a40x_cpld_handler (unsig
 	desc->chip->unmask (irq); /* Level-triggered need this */
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(lpd7a40x_cpld_handler);
 
 void __init lh7a40x_init_board_irq (void)
 {
Index: linux/arch/arm/mach-lh7a40x/irq-kev7a400.c
===================================================================
--- linux.orig/arch/arm/mach-lh7a40x/irq-kev7a400.c
+++ linux/arch/arm/mach-lh7a40x/irq-kev7a400.c
@@ -60,6 +60,8 @@ lh7a400_cpld_handler (unsigned int irq, 
 	}
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(kev7a400_cpld_handler);
+
   /* IRQ initialization */
 
 void __init
Index: linux/arch/arm/mach-lh7a40x/irq-lpd7a40x.c
===================================================================
--- linux.orig/arch/arm/mach-lh7a40x/irq-lpd7a40x.c
+++ linux/arch/arm/mach-lh7a40x/irq-lpd7a40x.c
@@ -71,6 +71,7 @@ static void lh7a40x_cpld_handler (unsign
 	desc->chip->unmask (irq); /* Level-triggered need this */
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(lh7a40x_cpld_handler);
 
   /* IRQ initialization */
 
Index: linux/arch/arm/mach-lh7a40x/time.c
===================================================================
--- linux.orig/arch/arm/mach-lh7a40x/time.c
+++ linux/arch/arm/mach-lh7a40x/time.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/time.h>
 
 #include <asm/hardware.h>
Index: linux/arch/arm/mach-omap1/board-osk.c
===================================================================
--- linux.orig/arch/arm/mach-omap1/board-osk.c
+++ linux/arch/arm/mach-omap1/board-osk.c
@@ -29,7 +29,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/device.h>
-#include <linux/interrupt.h>
+#include <linux/irq.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
Index: linux/arch/arm/mach-omap1/fpga.c
===================================================================
--- linux.orig/arch/arm/mach-omap1/fpga.c
+++ linux/arch/arm/mach-omap1/fpga.c
@@ -120,6 +120,8 @@ static struct irqchip omap_fpga_irq = {
 	.unmask		= fpga_unmask_irq,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(innovator_fpga_IRQ_demux);
+
 /*
  * All of the FPGA interrupt request inputs except for the touchscreen are
  * edge-sensitive; the touchscreen is level-sensitive.  The edge-sensitive
Index: linux/arch/arm/mach-omap1/leds-h2p2-debug.c
===================================================================
--- linux.orig/arch/arm/mach-omap1/leds-h2p2-debug.c
+++ linux/arch/arm/mach-omap1/leds-h2p2-debug.c
@@ -45,7 +45,7 @@ void h2p2_dbg_leds_event(led_event_t evt
 	static struct h2p2_dbg_fpga __iomem *fpga;
 	static u16 led_state, hw_led_state;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (!(led_state & LED_STATE_ENABLED) && evt != led_start)
 		goto done;
@@ -140,5 +140,5 @@ void h2p2_dbg_leds_event(led_event_t evt
 		__raw_writew(~hw_led_state, &fpga->leds);
 
 done:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-omap1/serial.c
===================================================================
--- linux.orig/arch/arm/mach-omap1/serial.c
+++ linux/arch/arm/mach-omap1/serial.c
@@ -12,6 +12,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/irq.h>
 #include <linux/delay.h>
 #include <linux/serial.h>
 #include <linux/tty.h>
Index: linux/arch/arm/mach-pxa/dma.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/dma.c
+++ linux/arch/arm/mach-pxa/dma.c
@@ -43,7 +43,7 @@ int pxa_request_dma (char *name, pxa_dma
 	if (!name || !irq_handler)
 		return -EINVAL;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* try grabbing a DMA channel with the requested priority */
 	for (i = prio; i < prio + PXA_DMA_NBCH(prio); i++) {
@@ -73,7 +73,7 @@ int pxa_request_dma (char *name, pxa_dma
 		i = -ENODEV;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return i;
 }
 
@@ -88,10 +88,10 @@ void pxa_free_dma (int dma_ch)
 		return;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	DCSR(dma_ch) = DCSR_STARTINTR|DCSR_ENDINTR|DCSR_BUSERR;
 	dma_channels[dma_ch].name = NULL;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static irqreturn_t dma_irq_handler(int irq, void *dev_id, struct pt_regs *regs)
Index: linux/arch/arm/mach-pxa/generic.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/generic.c
+++ linux/arch/arm/mach-pxa/generic.c
@@ -49,7 +49,7 @@ void pxa_gpio_mode(int gpio_mode)
 	int fn = (gpio_mode & GPIO_MD_MASK_FN) >> 8;
 	int gafr;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (gpio_mode & GPIO_DFLT_LOW)
 		GPCR(gpio) = GPIO_bit(gpio);
 	else if (gpio_mode & GPIO_DFLT_HIGH)
@@ -60,7 +60,7 @@ void pxa_gpio_mode(int gpio_mode)
 		GPDR(gpio) &= ~GPIO_bit(gpio);
 	gafr = GAFR(gpio) & ~(0x3 << (((gpio) & 0xf)*2));
 	GAFR(gpio) = gafr |  (fn  << (((gpio) & 0xf)*2));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(pxa_gpio_mode);
@@ -71,14 +71,14 @@ EXPORT_SYMBOL(pxa_gpio_mode);
 void pxa_set_cken(int clock, int enable)
 {
 	unsigned long flags;
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (enable)
 		CKEN |= clock;
 	else
 		CKEN &= ~clock;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(pxa_set_cken);
Index: linux/arch/arm/mach-pxa/idp.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/idp.c
+++ linux/arch/arm/mach-pxa/idp.c
@@ -18,6 +18,7 @@
 
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/device.h>
 #include <linux/fb.h>
 
Index: linux/arch/arm/mach-pxa/irq.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/irq.c
+++ linux/arch/arm/mach-pxa/irq.c
@@ -244,6 +244,7 @@ static struct irqchip pxa_muxed_gpio_chi
 	.set_type	= pxa_gpio_irq_type,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(pxa_gpio_demux_handler);
 
 void __init pxa_init_irq(void)
 {
Index: linux/arch/arm/mach-pxa/leds-idp.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/leds-idp.c
+++ linux/arch/arm/mach-pxa/leds-idp.c
@@ -34,7 +34,7 @@ void idp_leds_event(led_event_t evt)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch (evt) {
 	case led_start:
@@ -113,5 +113,5 @@ void idp_leds_event(led_event_t evt)
 	else
 		IDP_CPLD_LED_CONTROL |= IDP_LEDS_MASK;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-pxa/leds-lubbock.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/leds-lubbock.c
+++ linux/arch/arm/mach-pxa/leds-lubbock.c
@@ -48,7 +48,7 @@ void lubbock_leds_event(led_event_t evt)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch (evt) {
 	case led_start:
@@ -122,5 +122,5 @@ void lubbock_leds_event(led_event_t evt)
 	else
 		LUB_DISC_BLNK_LED |= 0xff;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-pxa/leds-mainstone.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/leds-mainstone.c
+++ linux/arch/arm/mach-pxa/leds-mainstone.c
@@ -43,7 +43,7 @@ void mainstone_leds_event(led_event_t ev
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch (evt) {
 	case led_start:
@@ -117,5 +117,5 @@ void mainstone_leds_event(led_event_t ev
 	else
 		MST_LEDCTRL |= 0xff;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-pxa/lubbock.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/lubbock.c
+++ linux/arch/arm/mach-pxa/lubbock.c
@@ -47,9 +47,9 @@ void lubbock_set_misc_wr(unsigned int ma
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	LUB_MISC_WR = (LUB_MISC_WR & ~mask) | (set & mask);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 EXPORT_SYMBOL(lubbock_set_misc_wr);
 
@@ -90,6 +90,8 @@ static void lubbock_irq_handler(unsigned
 	} while (pending);
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(lubbock_irq_handler);
+
 static void __init lubbock_init_irq(void)
 {
 	int irq;
Index: linux/arch/arm/mach-pxa/mainstone.c
===================================================================
--- linux.orig/arch/arm/mach-pxa/mainstone.c
+++ linux/arch/arm/mach-pxa/mainstone.c
@@ -78,6 +78,8 @@ static void mainstone_irq_handler(unsign
 	} while (pending);
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(mainstone_irq_handler);
+
 static void __init mainstone_init_irq(void)
 {
 	int irq;
Index: linux/arch/arm/mach-rpc/dma.c
===================================================================
--- linux.orig/arch/arm/mach-rpc/dma.c
+++ linux/arch/arm/mach-rpc/dma.c
@@ -171,11 +171,11 @@ static void iomd_disable_dma(dmach_t cha
 	unsigned long dma_base = dma->dma_base;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (dma->state != ~DMA_ST_AB)
 		disable_irq(dma->dma_irq);
 	iomd_writeb(0, dma_base + CR);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static int iomd_set_dma_speed(dmach_t channel, dma_t *dma, int cycle)
Index: linux/arch/arm/mach-rpc/irq.c
===================================================================
--- linux.orig/arch/arm/mach-rpc/irq.c
+++ linux/arch/arm/mach-rpc/irq.c
@@ -112,6 +112,15 @@ static struct irqchip iomd_fiq_chip = {
 	.unmask = iomd_unmask_irq_fiq,
 };
 
+static unsigned int startup_irq_disabled(unsigned int irq)
+{
+	return 0;
+}
+
+/* Interrupt type for irqs which must not be
+ * automatically enabled in request_irq */
+static struct irq_type level_type_nostart;
+
 void __init rpc_init_irq(void)
 {
 	unsigned int irq, flags;
@@ -121,16 +130,15 @@ void __init rpc_init_irq(void)
 	iomd_writeb(0, IOMD_FIQMASK);
 	iomd_writeb(0, IOMD_DMAMASK);
 
+	level_type_nostart = default_level_type;
+	level_type_nostart.startup = startup_irq_disabled;
+
 	for (irq = 0; irq < NR_IRQS; irq++) {
 		flags = IRQF_VALID;
 
 		if (irq <= 6 || (irq >= 9 && irq <= 15))
 			flags |= IRQF_PROBE;
 
-		if (irq == 21 || (irq >= 16 && irq <= 19) ||
-		    irq == IRQ_KEYBOARDTX)
-			flags |= IRQF_NOAUTOEN;
-
 		switch (irq) {
 		case 0 ... 7:
 			set_irq_chip(irq, &iomd_a_chip);
@@ -155,6 +163,10 @@ void __init rpc_init_irq(void)
 			set_irq_flags(irq, IRQF_VALID);
 			break;
 		}
+
+		if (irq == 21 || (irq >= 16 && irq <= 19) ||
+		    irq == IRQ_KEYBOARDTX)
+			set_irq_handler(irq, &level_type_nostart);
 	}
 
 	init_FIQ();
Index: linux/arch/arm/mach-s3c2410/bast-irq.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/bast-irq.c
+++ linux/arch/arm/mach-s3c2410/bast-irq.c
@@ -136,13 +136,15 @@ bast_irq_pc104_demux(unsigned int irq,
 		for (i = 0; stat != 0; i++, stat >>= 1) {
 			if (stat & 1) {
 				irqno = bast_pc104_irqs[i];
-
-				desc_handle_irq(irqno, irq_desc + irqno, regs);
+				desc = irq_desc + irqno;
+				desc_handle_irq(irqno, desc, regs);
 			}
 		}
 	}
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(bast_irq_pc104_demux);
+
 static __init int bast_irq_init(void)
 {
 	unsigned int i;
@@ -156,7 +158,7 @@ static __init int bast_irq_init(void)
 
 		set_irq_chained_handler(IRQ_ISA, bast_irq_pc104_demux);
 
-		/* reigster our IRQs */
+		/* register our IRQs */
 
 		for (i = 0; i < 4; i++) {
 			unsigned int irqno = bast_pc104_irqs[i];
Index: linux/arch/arm/mach-s3c2410/clock.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/clock.c
+++ linux/arch/arm/mach-s3c2410/clock.c
@@ -61,7 +61,7 @@ void inline s3c24xx_clk_enable(unsigned 
 	unsigned long clkcon;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	clkcon = __raw_readl(S3C2410_CLKCON);
 	clkcon &= ~clocks;
@@ -74,7 +74,7 @@ void inline s3c24xx_clk_enable(unsigned 
 
 	__raw_writel(clkcon, S3C2410_CLKCON);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* enable and disable calls for use with the clk struct */
Index: linux/arch/arm/mach-s3c2410/dma.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/dma.c
+++ linux/arch/arm/mach-s3c2410/dma.c
@@ -329,11 +329,11 @@ static int s3c2410_dma_start(s3c2410_dma
 
 	pr_debug("s3c2410_start_dma: channel=%d\n", chan->number);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (chan->state == S3C2410_DMA_RUNNING) {
 		pr_debug("s3c2410_start_dma: already running (%d)\n", chan->state);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 		return 0;
 	}
 
@@ -348,7 +348,7 @@ static int s3c2410_dma_start(s3c2410_dma
 			printk(KERN_ERR "dma%d: channel has nothing loaded\n",
 			       chan->number);
 			chan->state = S3C2410_DMA_IDLE;
-			local_irq_restore(flags);
+			raw_local_irq_restore(flags);
 			return -EINVAL;
 		}
 
@@ -385,7 +385,7 @@ static int s3c2410_dma_start(s3c2410_dma
 
 	dbg_showchan(chan);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
@@ -451,7 +451,7 @@ int s3c2410_dma_enqueue(unsigned int cha
 	buf->id    = id;
 	buf->magic = BUF_MAGIC;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (chan->curr == NULL) {
 		/* we've got nothing loaded... */
@@ -485,7 +485,7 @@ int s3c2410_dma_enqueue(unsigned int cha
 				       "timeout loading buffer\n",
 				       chan->number);
 				dbg_showchan(chan);
-				local_irq_restore(flags);
+				raw_local_irq_restore(flags);
 				return -EINVAL;
 			}
 		}
@@ -499,7 +499,7 @@ int s3c2410_dma_enqueue(unsigned int cha
 		}
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
@@ -661,9 +661,9 @@ s3c2410_dma_irq(int irq, void *devpw, st
 			return IRQ_HANDLED;
 		}
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		s3c2410_dma_loadbuffer(chan, chan->next);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	} else {
 		s3c2410_dma_lastxfer(chan);
 
@@ -698,14 +698,14 @@ int s3c2410_dma_request(unsigned int cha
 
 	check_channel(channel);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	dbg_showchan(chan);
 
 	if (chan->in_use) {
 		if (client != chan->client) {
 			printk(KERN_ERR "dma%d: already in use\n", channel);
-			local_irq_restore(flags);
+			raw_local_irq_restore(flags);
 			return -EBUSY;
 		} else {
 			printk(KERN_ERR "dma%d: client already has channel\n", channel);
@@ -724,7 +724,7 @@ int s3c2410_dma_request(unsigned int cha
 
 		if (err) {
 			chan->in_use = 0;
-			local_irq_restore(flags);
+			raw_local_irq_restore(flags);
 
 			printk(KERN_ERR "%s: cannot get IRQ %d for DMA %d\n",
 			       client->name, chan->irq, chan->number);
@@ -735,7 +735,7 @@ int s3c2410_dma_request(unsigned int cha
 		chan->irq_enabled = 1;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	/* need to setup */
 
@@ -764,7 +764,7 @@ int s3c2410_dma_free(dmach_t channel, s3
 
 	check_channel(channel);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 
 	if (chan->client != client) {
@@ -789,7 +789,7 @@ int s3c2410_dma_free(dmach_t channel, s3
 		free_irq(chan->irq, (void *)chan);
 	chan->irq_claimed = 0;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return 0;
 }
@@ -805,7 +805,7 @@ static int s3c2410_dma_dostop(s3c2410_dm
 
 	dbg_showchan(chan);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	s3c2410_dma_call_op(chan,  S3C2410_DMAOP_STOP);
 
@@ -823,7 +823,7 @@ static int s3c2410_dma_dostop(s3c2410_dm
 	chan->state      = S3C2410_DMA_IDLE;
 	chan->load_state = S3C2410_DMALOAD_NONE;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return 0;
 }
@@ -840,7 +840,7 @@ static int s3c2410_dma_flush(s3c2410_dma
 
 	pr_debug("%s:\n", __FUNCTION__);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (chan->state != S3C2410_DMA_IDLE) {
 		pr_debug("%s: stopping channel...\n", __FUNCTION__ );
@@ -865,7 +865,7 @@ static int s3c2410_dma_flush(s3c2410_dma
 		}
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return 0;
 }
Index: linux/arch/arm/mach-s3c2410/gpio.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/gpio.c
+++ linux/arch/arm/mach-s3c2410/gpio.c
@@ -58,7 +58,7 @@ void s3c2410_gpio_cfgpin(unsigned int pi
 		mask = 3 << S3C2410_GPIO_OFFSET(pin)*2;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	con  = __raw_readl(base + 0x00);
 	con &= ~mask;
@@ -66,7 +66,7 @@ void s3c2410_gpio_cfgpin(unsigned int pi
 
 	__raw_writel(con, base + 0x00);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(s3c2410_gpio_cfgpin);
@@ -97,14 +97,14 @@ void s3c2410_gpio_pullup(unsigned int pi
 	if (pin < S3C2410_GPIO_BANKB)
 		return;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	up = __raw_readl(base + 0x08);
 	up &= ~(1L << offs);
 	up |= to << offs;
 	__raw_writel(up, base + 0x08);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(s3c2410_gpio_pullup);
@@ -116,14 +116,14 @@ void s3c2410_gpio_setpin(unsigned int pi
 	unsigned long flags;
 	unsigned long dat;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	dat = __raw_readl(base + 0x04);
 	dat &= ~(1 << offs);
 	dat |= to << offs;
 	__raw_writel(dat, base + 0x04);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(s3c2410_gpio_setpin);
@@ -143,12 +143,12 @@ unsigned int s3c2410_modify_misccr(unsig
 	unsigned long flags;
 	unsigned long misccr;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	misccr = __raw_readl(S3C2410_MISCCR);
 	misccr &= ~clear;
 	misccr ^= change;
 	__raw_writel(misccr, S3C2410_MISCCR);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return misccr;
 }
@@ -189,7 +189,7 @@ int s3c2410_gpio_irqfilter(unsigned int 
 	pin -= S3C2410_GPG8_EINT16;
 	reg += pin & ~3;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* update filter width and clock source */
 
@@ -205,7 +205,7 @@ int s3c2410_gpio_irqfilter(unsigned int 
 	val |= on << ((pin * 4) + 3);
 	__raw_writel(val, S3C2410_EXTINT2);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return 0;
 }
Index: linux/arch/arm/mach-s3c2410/irq.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/irq.c
+++ linux/arch/arm/mach-s3c2410/irq.c
@@ -573,6 +573,11 @@ s3c_irq_demux_uart2(unsigned int irq,
 }
 
 
+static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart0);
+static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart1);
+static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_uart2);
+static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_adc);
+
 /* s3c24xx_init_irq
  *
  * Initialise S3C2410 IRQ system
Index: linux/arch/arm/mach-s3c2410/s3c2440-dsc.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/s3c2440-dsc.c
+++ linux/arch/arm/mach-s3c2410/s3c2440-dsc.c
@@ -45,14 +45,14 @@ int s3c2440_set_dsc(unsigned int pin, un
 	base = (pin & S3C2440_SELECT_DSC1) ? S3C2440_DSC1 : S3C2440_DSC0;
 	mask = 3 << S3C2440_DSC_GETSHIFT(pin);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	val = __raw_readl(base);
 	val &= ~mask;
 	val |= value & mask;
 	__raw_writel(val, base);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
Index: linux/arch/arm/mach-s3c2410/s3c2440-irq.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/s3c2440-irq.c
+++ linux/arch/arm/mach-s3c2410/s3c2440-irq.c
@@ -157,6 +157,9 @@ static struct irqchip s3c_irq_cam = {
 	.ack	    = s3c_irq_cam_ack,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_wdtac97);
+static DEFINE_IRQ_CHAINED_TYPE(s3c_irq_demux_cam);
+
 static int s3c2440_irq_add(struct sys_device *sysdev)
 {
 	unsigned int irqno;
Index: linux/arch/arm/mach-s3c2410/time.c
===================================================================
--- linux.orig/arch/arm/mach-s3c2410/time.c
+++ linux/arch/arm/mach-s3c2410/time.c
@@ -23,6 +23,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/err.h>
 
 #include <asm/system.h>
Index: linux/arch/arm/mach-sa1100/assabet.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/assabet.c
+++ linux/arch/arm/mach-sa1100/assabet.c
@@ -61,10 +61,10 @@ void ASSABET_BCR_frob(unsigned int mask,
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	BCR_value = (BCR_value & ~mask) | val;
 	ASSABET_BCR = BCR_value;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(ASSABET_BCR_frob);
Index: linux/arch/arm/mach-sa1100/badge4.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/badge4.c
+++ linux/arch/arm/mach-sa1100/badge4.c
@@ -227,7 +227,7 @@ void badge4_set_5V(unsigned subsystem, i
 	unsigned long flags;
 	unsigned old_5V_bitmap;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	old_5V_bitmap = badge4_5V_bitmap;
 
@@ -240,15 +240,22 @@ void badge4_set_5V(unsigned subsystem, i
 	/* detect on->off and off->on transitions */
 	if ((!old_5V_bitmap) && (badge4_5V_bitmap)) {
 		/* was off, now on */
-		printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__);
 		GPSR = BADGE4_GPIO_PCMEN5V;
 	} else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) {
 		/* was on, now off */
-		printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__);
 		GPCR = BADGE4_GPIO_PCMEN5V;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
+
+	/* report on->off and off->on transitions with interrupts restored */
+	if ((!old_5V_bitmap) && (badge4_5V_bitmap)) {
+		/* was off, now on */
+		printk(KERN_INFO "%s: enabling 5V supply rail\n", __FUNCTION__);
+	} else if ((old_5V_bitmap) && (!badge4_5V_bitmap)) {
+		/* was on, now off */
+		printk(KERN_INFO "%s: disabling 5V supply rail\n", __FUNCTION__);
+	}
 }
 EXPORT_SYMBOL(badge4_set_5V);
 
Index: linux/arch/arm/mach-sa1100/cerf.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/cerf.c
+++ linux/arch/arm/mach-sa1100/cerf.c
@@ -15,6 +15,7 @@
 #include <linux/kernel.h>
 #include <linux/tty.h>
 #include <linux/device.h>
+#include <linux/irq.h>
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
 
Index: linux/arch/arm/mach-sa1100/cpu-sa1110.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/cpu-sa1110.c
+++ linux/arch/arm/mach-sa1100/cpu-sa1110.c
@@ -282,7 +282,7 @@ static int sa1110_target(struct cpufreq_
 	 * This means that we won't access SDRAM for the duration of
 	 * the programming.
 	 */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	asm("mcr p15, 0, %0, c7, c10, 4" : : "r" (0));
 	udelay(10);
 	__asm__ __volatile__("					\n\
@@ -303,7 +303,7 @@ static int sa1110_target(struct cpufreq_
 		: "r" (&MDCNFG), "r" (&PPCR), "0" (sd.mdcnfg),
 		  "r" (sd.mdrefr), "r" (sd.mdcas[0]),
 		  "r" (sd.mdcas[1]), "r" (sd.mdcas[2]), "r" (ppcr));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	/*
 	 * Now, return the SDRAM refresh back to normal.
Index: linux/arch/arm/mach-sa1100/dma.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/dma.c
+++ linux/arch/arm/mach-sa1100/dma.c
@@ -227,7 +227,7 @@ int sa1100_start_dma(dma_regs_t *regs, d
 	if (size > MAX_DMA_SIZE)
 		return -EOVERFLOW;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	status = regs->RdDCSR;
 
 	/* If both DMA buffers are started, there's nothing else we can do. */
@@ -262,7 +262,7 @@ int sa1100_start_dma(dma_regs_t *regs, d
 	ret = 0;
 
 out:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return ret;
 }
 
Index: linux/arch/arm/mach-sa1100/generic.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/generic.c
+++ linux/arch/arm/mach-sa1100/generic.c
@@ -135,7 +135,7 @@ unsigned long long sched_clock(void)
 static void sa1100_power_off(void)
 {
 	mdelay(100);
-	local_irq_disable();
+	raw_local_irq_disable();
 	/* disable internal oscillator, float CS lines */
 	PCFR = (PCFR_OPDE | PCFR_FP | PCFR_FS);
 	/* enable wake-up on GPIO0 (Assabet...) */
@@ -391,7 +391,7 @@ void __init sa1110_mb_disable(void)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	
 	PGSR &= ~GPIO_MBGNT;
 	GPCR = GPIO_MBGNT;
@@ -399,7 +399,7 @@ void __init sa1110_mb_disable(void)
 
 	GAFR &= ~(GPIO_MBGNT | GPIO_MBREQ);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -410,7 +410,7 @@ void __init sa1110_mb_enable(void)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	PGSR &= ~GPIO_MBGNT;
 	GPCR = GPIO_MBGNT;
@@ -419,6 +419,6 @@ void __init sa1110_mb_enable(void)
 	GAFR |= (GPIO_MBGNT | GPIO_MBREQ);
 	TUCR |= TUCR_MR;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
Index: linux/arch/arm/mach-sa1100/h3600.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/h3600.c
+++ linux/arch/arm/mach-sa1100/h3600.c
@@ -319,7 +319,7 @@ static void h3100_control_egpio(enum ipa
 	}
 
 	if (egpio || gpio) {
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		if (setp) {
 			h3100_egpio |= egpio;
 			GPSR = gpio;
@@ -328,7 +328,7 @@ static void h3100_control_egpio(enum ipa
 			GPCR = gpio;
 		}
 		H3100_EGPIO = h3100_egpio;
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -451,13 +451,13 @@ static void h3600_control_egpio(enum ipa
 	}
 
 	if (egpio) {
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		if (setp)
 			h3600_egpio |= egpio;
 		else
 			h3600_egpio &= ~egpio;
 		H3600_EGPIO = h3600_egpio;
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -788,6 +788,8 @@ static void h3800_unmask_gpio_irq(unsign
 	H3800_ASIC2_GPIINTSTAT |= mask;
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(h3800_IRQ_demux);
+
 static void __init h3800_init_irq(void)
 {
 	int i;
@@ -826,7 +828,7 @@ static void __init h3800_init_irq(void)
 	}
 #endif
 	set_irq_type(IRQ_GPIO_H3800_ASIC, IRQT_RISING);
-	set_irq_chained_handler(IRQ_GPIO_H3800_ASIC, &h3800_IRQ_demux);
+	set_irq_chained_handler(IRQ_GPIO_H3800_ASIC, h3800_IRQ_demux);
 }
 
 
Index: linux/arch/arm/mach-sa1100/irq.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/irq.c
+++ linux/arch/arm/mach-sa1100/irq.c
@@ -11,12 +11,13 @@
  */
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/ioport.h>
 #include <linux/ptrace.h>
 #include <linux/sysdev.h>
 
 #include <asm/hardware.h>
-#include <asm/irq.h>
 #include <asm/mach/irq.h>
 
 #include "generic.h"
@@ -281,6 +282,8 @@ static int __init sa1100irq_init_devicef
 	return sysdev_register(&sa1100irq_device);
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(sa1100_high_gpio_handler);
+
 device_initcall(sa1100irq_init_devicefs);
 
 void __init sa1100_init_irq(void)
Index: linux/arch/arm/mach-sa1100/leds-assabet.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/leds-assabet.c
+++ linux/arch/arm/mach-sa1100/leds-assabet.c
@@ -32,7 +32,7 @@ void assabet_leds_event(led_event_t evt)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch (evt) {
 	case led_start:
@@ -111,5 +111,5 @@ void assabet_leds_event(led_event_t evt)
 	if  (led_state & LED_STATE_ENABLED)
 		ASSABET_BCR_frob(ASSABET_BCR_LED_MASK, hw_led_state);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-sa1100/leds-badge4.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/leds-badge4.c
+++ linux/arch/arm/mach-sa1100/leds-badge4.c
@@ -36,7 +36,7 @@ void badge4_leds_event(led_event_t evt)
 {
         unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
         switch (evt) {
         case led_start:
@@ -108,5 +108,5 @@ void badge4_leds_event(led_event_t evt)
                 GPCR = hw_led_state ^ LED_MASK;
         }
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-sa1100/leds-cerf.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/leds-cerf.c
+++ linux/arch/arm/mach-sa1100/leds-cerf.c
@@ -29,7 +29,7 @@ void cerf_leds_event(led_event_t evt)
 {
         unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
         switch (evt) {
         case led_start:
@@ -107,5 +107,5 @@ void cerf_leds_event(led_event_t evt)
                 GPCR = hw_led_state ^ LED_MASK;
         }
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-sa1100/leds-hackkit.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/leds-hackkit.c
+++ linux/arch/arm/mach-sa1100/leds-hackkit.c
@@ -33,7 +33,7 @@ void hackkit_leds_event(led_event_t evt)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch(evt) {
 		case led_start:
@@ -109,5 +109,5 @@ void hackkit_leds_event(led_event_t evt)
 		GPCR = hw_led_state ^ LED_MASK;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-sa1100/leds-lart.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/leds-lart.c
+++ linux/arch/arm/mach-sa1100/leds-lart.c
@@ -32,7 +32,7 @@ void lart_leds_event(led_event_t evt)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	switch(evt) {
 	case led_start:
@@ -98,5 +98,5 @@ void lart_leds_event(led_event_t evt)
 		GPCR = hw_led_state ^ LED_MASK;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
Index: linux/arch/arm/mach-sa1100/neponset.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/neponset.c
+++ linux/arch/arm/mach-sa1100/neponset.c
@@ -137,6 +137,8 @@ static struct sa1100_port_fns neponset_p
 	.get_mctrl	= neponset_get_mctrl,
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(neponset_irq_handler);
+
 static int neponset_probe(struct device *dev)
 {
 	sa1100_register_uart_fns(&neponset_port_fns);
Index: linux/arch/arm/mach-sa1100/pleb.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/pleb.c
+++ linux/arch/arm/mach-sa1100/pleb.c
@@ -7,6 +7,7 @@
 #include <linux/tty.h>
 #include <linux/ioport.h>
 #include <linux/device.h>
+#include <linux/irq.h>
 
 #include <linux/mtd/partitions.h>
 
Index: linux/arch/arm/mach-sa1100/simpad.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/simpad.c
+++ linux/arch/arm/mach-sa1100/simpad.c
@@ -168,7 +168,7 @@ static void __init simpad_map_io(void)
 
 static void simpad_power_off(void)
 {
-	local_irq_disable(); // was cli
+	raw_local_irq_disable(); // was cli
 	set_cs3(0x800);        /* only SD_MEDIAQ */
 
 	/* disable internal oscillator, float CS lines */
@@ -185,7 +185,7 @@ static void simpad_power_off(void)
 	PMCR = PMCR_SF;
 	while(1);
 
-	local_irq_enable(); /* we won't ever call it */
+	raw_local_irq_enable(); /* we won't ever call it */
 
 
 }
Index: linux/arch/arm/mach-sa1100/time.c
===================================================================
--- linux.orig/arch/arm/mach-sa1100/time.c
+++ linux/arch/arm/mach-sa1100/time.c
@@ -11,6 +11,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/timex.h>
 #include <linux/signal.h>
 
Index: linux/arch/arm/mach-shark/core.c
===================================================================
--- linux.orig/arch/arm/mach-shark/core.c
+++ linux/arch/arm/mach-shark/core.c
@@ -6,6 +6,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/sched.h>
 #include <linux/serial_8250.h>
 
Index: linux/arch/arm/mach-shark/leds.c
===================================================================
--- linux.orig/arch/arm/mach-shark/leds.c
+++ linux/arch/arm/mach-shark/leds.c
@@ -33,7 +33,7 @@ static char led_state;
 static short hw_led_state;
 static short saved_state;
 
-static DEFINE_SPINLOCK(leds_lock);
+static DEFINE_RAW_SPINLOCK(leds_lock);
 
 short sequoia_read(int addr) {
   outw(addr,0x24);
Index: linux/arch/arm/mach-versatile/core.c
===================================================================
--- linux.orig/arch/arm/mach-versatile/core.c
+++ linux/arch/arm/mach-versatile/core.c
@@ -112,6 +112,8 @@ sic_handle_irq(unsigned int irq, struct 
 	} while (status);
 }
 
+static DEFINE_IRQ_CHAINED_TYPE(sic_handle_irq);
+
 #if 1
 #define IRQ_MMCI0A	IRQ_VICSOURCE22
 #define IRQ_AACI	IRQ_VICSOURCE24
@@ -161,7 +163,7 @@ void __init versatile_init_irq(void)
 		}
 	}
 
-	set_irq_handler(IRQ_VICSOURCE31, sic_handle_irq);
+	set_irq_chained_handler(IRQ_VICSOURCE31, sic_handle_irq);
 	vic_unmask_irq(IRQ_VICSOURCE31);
 
 	/* Do second interrupt controller */
@@ -727,7 +729,7 @@ static void versatile_leds_event(led_eve
 	unsigned long flags;
 	u32 val;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	val = readl(VA_LEDS_BASE);
 
 	switch (ledevt) {
@@ -752,7 +754,7 @@ static void versatile_leds_event(led_eve
 	}
 
 	writel(val, VA_LEDS_BASE);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 #endif	/* CONFIG_LEDS */
 
Index: linux/arch/arm/mm/blockops.c
===================================================================
--- linux.orig/arch/arm/mm/blockops.c
+++ linux/arch/arm/mm/blockops.c
@@ -20,7 +20,7 @@ extern struct cpu_cache_fns blk_cache_fn
  *
  *	- kaddr   - kernel address (guaranteed to be page aligned)
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 blk_flush_kern_dcache_page(void *kaddr)
 {
 	asm(
@@ -45,7 +45,7 @@ blk_flush_kern_dcache_page(void *kaddr)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 blk_dma_inv_range_unified(unsigned long start, unsigned long end)
 {
 	asm(
@@ -61,7 +61,7 @@ blk_dma_inv_range_unified(unsigned long 
 	: "I" (L1_CACHE_BYTES - 1));
 }
 
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 blk_dma_inv_range_harvard(unsigned long start, unsigned long end)
 {
 	asm(
@@ -82,7 +82,7 @@ blk_dma_inv_range_harvard(unsigned long 
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 blk_dma_clean_range(unsigned long start, unsigned long end)
 {
 	asm(
@@ -97,7 +97,7 @@ blk_dma_clean_range(unsigned long start,
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 blk_dma_flush_range(unsigned long start, unsigned long end)
 {
 	asm(
Index: linux/arch/arm/mm/consistent.c
===================================================================
--- linux.orig/arch/arm/mm/consistent.c
+++ linux/arch/arm/mm/consistent.c
@@ -30,7 +30,7 @@
  * This is the page table (2MB) covering uncached, DMA consistent allocations
  */
 static pte_t *consistent_pte;
-static DEFINE_SPINLOCK(consistent_lock);
+static DEFINE_RAW_SPINLOCK(consistent_lock);
 
 /*
  * VM region handling support.
Index: linux/arch/arm/mm/copypage-v4mc.c
===================================================================
--- linux.orig/arch/arm/mm/copypage-v4mc.c
+++ linux/arch/arm/mm/copypage-v4mc.c
@@ -29,7 +29,7 @@
 
 #define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
  * ARMv4 mini-dcache optimised copy_user_page
@@ -43,7 +43,7 @@ static DEFINE_SPINLOCK(minicache_lock);
  * instruction.  If your processor does not supply this, you have to write your
  * own copy_user_page that does the right thing.
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 mc_copy_user_page(void *from, void *to)
 {
 	asm volatile(
@@ -82,7 +82,7 @@ void v4_mc_copy_user_page(void *kto, con
 /*
  * ARMv4 optimised clear_user_page
  */
-void __attribute__((naked))
+void notrace __attribute__((naked))
 v4_mc_clear_user_page(void *kaddr, unsigned long vaddr)
 {
 	asm volatile(
Index: linux/arch/arm/mm/copypage-v6.c
===================================================================
--- linux.orig/arch/arm/mm/copypage-v6.c
+++ linux/arch/arm/mm/copypage-v6.c
@@ -28,7 +28,7 @@
 
 #define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
 
-static DEFINE_SPINLOCK(v6_lock);
+static DEFINE_RAW_SPINLOCK(v6_lock);
 
 /*
  * Copy the user page.  No aliasing to deal with so we can just
Index: linux/arch/arm/mm/copypage-xscale.c
===================================================================
--- linux.orig/arch/arm/mm/copypage-xscale.c
+++ linux/arch/arm/mm/copypage-xscale.c
@@ -31,7 +31,7 @@
 
 #define TOP_PTE(x)	pte_offset_kernel(top_pmd, x)
 
-static DEFINE_SPINLOCK(minicache_lock);
+static DEFINE_RAW_SPINLOCK(minicache_lock);
 
 /*
  * XScale mini-dcache optimised copy_user_page
@@ -41,7 +41,7 @@ static DEFINE_SPINLOCK(minicache_lock);
  * Dcache aliasing issue.  The writes will be forwarded to the write buffer,
  * and merged as appropriate.
  */
-static void __attribute__((naked))
+static void notrace __attribute__((naked))
 mc_copy_user_page(void *from, void *to)
 {
 	/*
@@ -104,7 +104,7 @@ void xscale_mc_copy_user_page(void *kto,
 /*
  * XScale optimised clear_user_page
  */
-void __attribute__((naked))
+void notrace __attribute__((naked))
 xscale_mc_clear_user_page(void *kaddr, unsigned long vaddr)
 {
 	asm volatile(
Index: linux/arch/arm/mm/fault-armv.c
===================================================================
--- linux.orig/arch/arm/mm/fault-armv.c
+++ linux/arch/arm/mm/fault-armv.c
@@ -161,7 +161,7 @@ static int __init check_writebuffer(unsi
 {
 	register unsigned long zero = 0, one = 1, val;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	mb();
 	*p1 = one;
 	mb();
@@ -169,7 +169,7 @@ static int __init check_writebuffer(unsi
 	mb();
 	val = *p1;
 	mb();
-	local_irq_enable();
+	raw_local_irq_enable();
 	return val != zero;
 }
 
Index: linux/arch/arm/mm/fault.c
===================================================================
--- linux.orig/arch/arm/mm/fault.c
+++ linux/arch/arm/mm/fault.c
@@ -216,7 +216,7 @@ out:
 	return fault;
 }
 
-static int
+static notrace int
 do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	struct task_struct *tsk;
@@ -316,7 +316,7 @@ no_context:
  * interrupt or a critical region, and should only copy the information
  * from the master page table, nothing more.
  */
-static int
+static notrace int
 do_translation_fault(unsigned long addr, unsigned int fsr,
 		     struct pt_regs *regs)
 {
@@ -362,7 +362,7 @@ bad_area:
  * Some section permission faults need to be handled gracefully.
  * They can happen due to a __{get,put}_user during an oops.
  */
-static int
+static notrace int
 do_sect_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	struct task_struct *tsk = current;
@@ -373,7 +373,7 @@ do_sect_fault(unsigned long addr, unsign
 /*
  * This abort handler always returns "fault".
  */
-static int
+static notrace int
 do_bad(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	return 1;
@@ -428,7 +428,7 @@ static struct fsr_info {
 	{ do_bad,		SIGBUS,  0,		"unknown 31"			   }
 };
 
-void __init
+void __init notrace
 hook_fault_code(int nr, int (*fn)(unsigned long, unsigned int, struct pt_regs *),
 		int sig, const char *name)
 {
@@ -442,7 +442,7 @@ hook_fault_code(int nr, int (*fn)(unsign
 /*
  * Dispatch a data abort to the relevant handler.
  */
-asmlinkage void
+asmlinkage notrace void
 do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
 {
 	const struct fsr_info *inf = fsr_info + (fsr & 15) + ((fsr & (1 << 10)) >> 6);
@@ -461,7 +461,7 @@ do_DataAbort(unsigned long addr, unsigne
 	notify_die("", regs, &info, fsr, 0);
 }
 
-asmlinkage void
+asmlinkage notrace void
 do_PrefetchAbort(unsigned long addr, struct pt_regs *regs)
 {
 	do_translation_fault(addr, 0, regs);
Index: linux/arch/arm/mm/init.c
===================================================================
--- linux.orig/arch/arm/mm/init.c
+++ linux/arch/arm/mm/init.c
@@ -28,7 +28,7 @@
 
 #define TABLE_SIZE	(2 * PTRS_PER_PTE * sizeof(pte_t))
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 extern pgd_t swapper_pg_dir[PTRS_PER_PGD];
 extern void _stext, _text, _etext, __data_start, _end, __init_begin, __init_end;
Index: linux/arch/arm/plat-omap/clock.c
===================================================================
--- linux.orig/arch/arm/plat-omap/clock.c
+++ linux/arch/arm/plat-omap/clock.c
@@ -25,7 +25,7 @@
 
 static LIST_HEAD(clocks);
 static DECLARE_MUTEX(clocks_sem);
-static DEFINE_SPINLOCK(clockfw_lock);
+static DEFINE_RAW_SPINLOCK(clockfw_lock);
 static void propagate_rate(struct clk *  clk);
 /* UART clock function */
 static int set_uart_rate(struct clk * clk, unsigned long rate);
Index: linux/arch/arm/plat-omap/dma.c
===================================================================
--- linux.orig/arch/arm/plat-omap/dma.c
+++ linux/arch/arm/plat-omap/dma.c
@@ -586,7 +586,7 @@ void omap_dma_unlink_lch (int lch_head, 
 
 
 static struct lcd_dma_info {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	int reserved;
 	void (* callback)(u16 status, void *data);
 	void *cb_data;
@@ -948,11 +948,11 @@ void omap_clear_dma(int lch)
 	unsigned long flags;
 	int status;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	omap_writew(omap_readw(OMAP_DMA_CCR(lch)) & ~OMAP_DMA_CCR_EN,
 		    OMAP_DMA_CCR(lch));
 	status = OMAP_DMA_CSR(lch);	/* clear pending interrupts */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
Index: linux/arch/arm/plat-omap/gpio.c
===================================================================
--- linux.orig/arch/arm/plat-omap/gpio.c
+++ linux/arch/arm/plat-omap/gpio.c
@@ -121,7 +121,7 @@ struct gpio_bank {
 	u32 reserved_map;
 	u32 suspend_wakeup;
 	u32 saved_wakeup;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 #define METHOD_MPUIO		0
@@ -736,7 +736,7 @@ static void gpio_irq_handler(unsigned in
 
 	desc->chip->ack(irq);
 
-	bank = (struct gpio_bank *) desc->data;
+	bank = (struct gpio_bank *) desc->handler_data;
 	if (bank->method == METHOD_MPUIO)
 		isr_reg = bank->base + OMAP_MPUIO_GPIO_INT;
 #ifdef CONFIG_ARCH_OMAP1510
@@ -837,6 +837,8 @@ static struct irqchip mpuio_irq_chip = {
 	.unmask = mpuio_unmask_irq
 };
 
+static DEFINE_IRQ_CHAINED_TYPE(gpio_irq_handler);
+
 static int initialized = 0;
 static struct clk * gpio_ck = NULL;
 
Index: linux/arch/arm/plat-omap/mux.c
===================================================================
--- linux.orig/arch/arm/plat-omap/mux.c
+++ linux/arch/arm/plat-omap/mux.c
@@ -40,7 +40,7 @@
 int __init_or_module
 omap_cfg_reg(const reg_cfg_t reg_cfg)
 {
-	static DEFINE_SPINLOCK(mux_spin_lock);
+	static DEFINE_RAW_SPINLOCK(mux_spin_lock);
 
 	unsigned long flags;
 	reg_cfg_set *cfg;
Index: linux/arch/arm/plat-omap/pm.c
===================================================================
--- linux.orig/arch/arm/plat-omap/pm.c
+++ linux/arch/arm/plat-omap/pm.c
@@ -81,11 +81,11 @@ void omap_pm_idle(void)
 	 * seconds for wait for interrupt.
 	 */
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	local_fiq_disable();
 	if (need_resched()) {
 		local_fiq_enable();
-		local_irq_enable();
+		raw_local_irq_enable();
 		return;
 	}
 	mask32 = omap_readl(ARM_SYSST);
@@ -110,7 +110,7 @@ void omap_pm_idle(void)
 		omap_sram_idle();
 
 	local_fiq_enable();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
@@ -171,7 +171,7 @@ void omap_pm_suspend(void)
 	 * Step 1: turn off interrupts (FIXME: NOTE: already disabled)
 	 */
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	local_fiq_disable();
 
 	/*
@@ -308,7 +308,7 @@ void omap_pm_suspend(void)
 	 * Reenable interrupts
 	 */
 
-	local_irq_enable();
+	raw_local_irq_enable();
 	local_fiq_enable();
 
 	omap_serial_wake_trigger(0);
Index: linux/arch/arm26/boot/compressed/misc.c
===================================================================
--- linux.orig/arch/arm26/boot/compressed/misc.c
+++ linux/arch/arm26/boot/compressed/misc.c
@@ -184,6 +184,7 @@ static ulg free_mem_ptr_end;
 
 #define HEAP_SIZE 0x2000
 
+#define ZLIB_INFLATE_NO_INFLATE_LOCK
 #include "../../../../lib/inflate.c"
 
 #ifndef STANDALONE_DEBUG
Index: linux/arch/arm26/kernel/time.c
===================================================================
--- linux.orig/arch/arm26/kernel/time.c
+++ linux/arch/arm26/kernel/time.c
@@ -34,10 +34,6 @@
 #include <asm/irq.h>
 #include <asm/ioc.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;
 
 /* this needs a better home */
Index: linux/arch/cris/kernel/time.c
===================================================================
--- linux.orig/arch/cris/kernel/time.c
+++ linux/arch/cris/kernel/time.c
@@ -32,10 +32,6 @@
 #include <linux/init.h>
 #include <linux/profile.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 int have_rtc;  /* used to remember if we have an RTC or not */;
 
 #define TICK_SIZE tick
Index: linux/arch/frv/kernel/time.c
===================================================================
--- linux.orig/arch/frv/kernel/time.c
+++ linux/arch/frv/kernel/time.c
@@ -34,9 +34,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-EXPORT_SYMBOL(jiffies_64);
-
 unsigned long __nongprelbss __clkin_clock_speed_HZ;
 unsigned long __nongprelbss __ext_bus_clock_speed_HZ;
 unsigned long __nongprelbss __res_bus_clock_speed_HZ;
Index: linux/arch/h8300/kernel/time.c
===================================================================
--- linux.orig/arch/h8300/kernel/time.c
+++ linux/arch/h8300/kernel/time.c
@@ -32,10 +32,6 @@
 
 #define	TICK_SIZE (tick_nsec / 1000)
 
-u64 jiffies_64;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /*
  * timer_interrupt() needs to keep up the real-time clock,
  * as well as call the "do_timer()" routine every clocktick
Index: linux/arch/i386/Kconfig
===================================================================
--- linux.orig/arch/i386/Kconfig
+++ linux/arch/i386/Kconfig
@@ -14,6 +14,10 @@ config X86
 	  486, 586, Pentiums, and various instruction-set-compatible chips by
 	  AMD, Cyrix, and others.
 
+config GENERIC_TIME
+	bool
+	default y
+
 config SEMAPHORE_SLEEPERS
 	bool
 	default y
@@ -376,16 +380,6 @@ config X86_L1_CACHE_SHIFT
 	default "5" if MWINCHIP3D || MWINCHIP2 || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODEGX1
 	default "6" if MK7 || MK8 || MPENTIUMM
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-	depends on M386
-	default y
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-	depends on !M386
-	default y
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
@@ -442,7 +436,7 @@ config X86_USE_PPRO_CHECKSUM
 
 config X86_USE_3DNOW
 	bool
-	depends on MCYRIXIII || MK7
+	depends on (MCYRIXIII || MK7) && !PREEMPT_RT
 	default y
 
 config X86_OOSTORE
@@ -466,6 +460,8 @@ config HPET_EMULATE_RTC
 	depends on HPET_TIMER && RTC=y
 	default y
 
+source "kernel/time/Kconfig"
+
 config SMP
 	bool "Symmetric multi-processing support"
 	---help---
@@ -521,6 +517,20 @@ config SCHED_SMT
 
 source "kernel/Kconfig.preempt"
 
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	depends on M386 || PREEMPT_RT
+	default y
+
+config ASM_SEMAPHORES
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+	depends on !RWSEM_GENERIC_SPINLOCK && !PREEMPT_RT
+	default y
+
 config X86_UP_APIC
 	bool "Local APIC support on uniprocessors"
 	depends on !SMP && !(X86_VISWS || X86_VOYAGER)
@@ -556,6 +566,16 @@ config X86_IO_APIC
 	depends on X86_UP_IOAPIC || (SMP && !(X86_VISWS || X86_VOYAGER))
 	default y
 
+config X86_IOAPIC_FAST
+	bool "enhanced IO-APIC support"
+	depends on X86_IO_APIC
+	default y
+	help
+	  This option activates further optimizations in the IO-APIC
+	  code. NOTE: this is experimental code, yet it defaults to 'y';
+	  say N if you see symptoms such as boot-time lockups, stray or
+	  screaming interrupts or other interrupt-related weirdness.
+
 config X86_VISWS_APIC
 	bool
 	depends on X86_VISWS
@@ -917,7 +937,7 @@ config BOOT_IOREMAP
 
 config REGPARM
 	bool "Use register arguments (EXPERIMENTAL)"
-	depends on EXPERIMENTAL
+	depends on EXPERIMENTAL && !MCOUNT
 	default n
 	help
 	Compile the kernel with -mregparm=3. This uses a different ABI
Index: linux/arch/i386/Kconfig.debug
===================================================================
--- linux.orig/arch/i386/Kconfig.debug
+++ linux/arch/i386/Kconfig.debug
@@ -18,6 +18,7 @@ config EARLY_PRINTK
 config DEBUG_STACKOVERFLOW
 	bool "Check for stack overflows"
 	depends on DEBUG_KERNEL
+	default y
 	help
 	  This option will cause messages to be printed if free stack space
 	  drops below a certain limit.
@@ -35,6 +36,7 @@ config KPROBES
 config DEBUG_STACK_USAGE
 	bool "Stack utilization instrumentation"
 	depends on DEBUG_KERNEL
+	default y
 	help
 	  Enables the display of the minimum amount of free stack which each
 	  task has ever had available in the sysrq-T and sysrq-P debug output.
@@ -69,7 +71,7 @@ config X86_FIND_SMP_CONFIG
 
 config X86_MPPARSE
 	bool
-	depends on X86_LOCAL_APIC && !X86_VISWS
+	depends on X86_LOCAL_APIC && X86_IO_APIC && !X86_VISWS
 	default y
 
 endmenu
Index: linux/arch/i386/boot/compressed/misc.c
===================================================================
--- linux.orig/arch/i386/boot/compressed/misc.c
+++ linux/arch/i386/boot/compressed/misc.c
@@ -15,6 +15,12 @@
 #include <asm/io.h>
 #include <asm/page.h>
 
+#ifdef CONFIG_MCOUNT
+void notrace mcount(void)
+{
+}
+#endif
+
 /*
  * gzip declarations
  */
@@ -112,7 +118,7 @@ static long free_mem_end_ptr;
 #define INPLACE_MOVE_ROUTINE  0x1000
 #define LOW_BUFFER_START      0x2000
 #define LOW_BUFFER_MAX       0x90000
-#define HEAP_SIZE             0x3000
+#define HEAP_SIZE             0x4000
 static unsigned int low_buffer_end, low_buffer_size;
 static int high_loaded =0;
 static uch *high_buffer_start /* = (uch *)(((ulg)&end) + HEAP_SIZE)*/;
@@ -125,6 +131,7 @@ static int lines, cols;
 static void * xquad_portio = NULL;
 #endif
 
+#define ZLIB_INFLATE_NO_INFLATE_LOCK
 #include "../../../../lib/inflate.c"
 
 static void *malloc(int size)
Index: linux/arch/i386/kernel/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/Makefile
+++ linux/arch/i386/kernel/Makefile
@@ -4,13 +4,13 @@
 
 extra-y := head.o init_task.o vmlinux.lds
 
-obj-y	:= process.o semaphore.o signal.o entry.o traps.o irq.o vm86.o \
+obj-y	:= process.o signal.o entry.o traps.o irq.o vm86.o \
 		ptrace.o time.o ioport.o ldt.o setup.o i8259.o sys_i386.o \
 		pci-dma.o i386_ksyms.o i387.o dmi_scan.o bootflag.o \
-		doublefault.o quirks.o i8237.o
+		doublefault.o quirks.o i8237.o i8253.o tsc.o
 
+obj-$(CONFIG_ASM_SEMAPHORES)	+= semaphore.o
 obj-y				+= cpu/
-obj-y				+= timers/
 obj-$(CONFIG_ACPI)		+= acpi/
 obj-$(CONFIG_X86_BIOS_REBOOT)	+= reboot.o
 obj-$(CONFIG_MCA)		+= mca.o
@@ -20,6 +20,7 @@ obj-$(CONFIG_MICROCODE)		+= microcode.o
 obj-$(CONFIG_APM)		+= apm.o
 obj-$(CONFIG_X86_SMP)		+= smp.o smpboot.o
 obj-$(CONFIG_X86_TRAMPOLINE)	+= trampoline.o
+obj-$(CONFIG_MCOUNT)		+= mcount-wrapper.o
 obj-$(CONFIG_X86_MPPARSE)	+= mpparse.o
 obj-$(CONFIG_X86_LOCAL_APIC)	+= apic.o nmi.o
 obj-$(CONFIG_X86_IO_APIC)	+= io_apic.o
@@ -34,6 +35,8 @@ obj-$(CONFIG_ACPI_SRAT) 	+= srat.o
 obj-$(CONFIG_HPET_TIMER) 	+= time_hpet.o
 obj-$(CONFIG_EFI) 		+= efi.o efi_stub.o
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
+obj-$(CONFIG_SYSFS)		+= switch2poll.o
+obj-$(CONFIG_HPET_TIMER)	+= hpet.o
 
 EXTRA_AFLAGS   := -traditional
 
Index: linux/arch/i386/kernel/acpi/boot.c
===================================================================
--- linux.orig/arch/i386/kernel/acpi/boot.c
+++ linux/arch/i386/kernel/acpi/boot.c
@@ -570,7 +570,7 @@ static int __init acpi_parse_sbf(unsigne
 }
 
 #ifdef CONFIG_HPET_TIMER
-
+#include <asm/hpet.h>
 static int __init acpi_parse_hpet(unsigned long phys, unsigned long size)
 {
 	struct acpi_table_hpet *hpet_tbl;
@@ -592,6 +592,7 @@ static int __init acpi_parse_hpet(unsign
 #ifdef	CONFIG_X86_64
 	vxtime.hpet_address = hpet_tbl->addr.addrl |
 	    ((long)hpet_tbl->addr.addrh << 32);
+	hpet_address = vxtime.hpet_address;
 
 	printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
 	       hpet_tbl->id, vxtime.hpet_address);
@@ -600,10 +601,10 @@ static int __init acpi_parse_hpet(unsign
 		extern unsigned long hpet_address;
 
 		hpet_address = hpet_tbl->addr.addrl;
-		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
-		       hpet_tbl->id, hpet_address);
 	}
-#endif				/* X86 */
+#endif	/* X86 */
+		printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n",
+			hpet_tbl->id, hpet_address);
 
 	return 0;
 }
@@ -612,7 +613,8 @@ static int __init acpi_parse_hpet(unsign
 #endif
 
 #ifdef CONFIG_X86_PM_TIMER
-extern u32 pmtmr_ioport;
+u32 acpi_pmtmr_ioport;
+int acpi_pmtmr_buggy;
 #endif
 
 static int __init acpi_parse_fadt(unsigned long phys, unsigned long size)
@@ -640,14 +642,22 @@ static int __init acpi_parse_fadt(unsign
 		    ACPI_ADR_SPACE_SYSTEM_IO)
 			return 0;
 
-		pmtmr_ioport = fadt->xpm_tmr_blk.address;
+		acpi_pmtmr_ioport = fadt->xpm_tmr_blk.address;
+		/*
+		 * "X" fields are optional extensions to the original V1.0
+		 * fields, so we must selectively expand V1.0 fields if the
+		 * corresponding X field is zero.
+		 */
+		if (!acpi_pmtmr_ioport)
+			acpi_pmtmr_ioport = fadt->V1_pm_tmr_blk;
 	} else {
 		/* FADT rev. 1 */
-		pmtmr_ioport = fadt->V1_pm_tmr_blk;
+		acpi_pmtmr_ioport = fadt->V1_pm_tmr_blk;
 	}
-	if (pmtmr_ioport)
-		printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n",
-		       pmtmr_ioport);
+
+	if (acpi_pmtmr_ioport)
+		printk(KERN_INFO PREFIX "PM-Timer IO Port: %#x\n", acpi_pmtmr_ioport);
+
 #endif
 	return 0;
 }
Index: linux/arch/i386/kernel/apic.c
===================================================================
--- linux.orig/arch/i386/kernel/apic.c
+++ linux/arch/i386/kernel/apic.c
@@ -26,6 +26,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/sysdev.h>
 #include <linux/cpu.h>
+#include <linux/clockchips.h>
 
 #include <asm/atomic.h>
 #include <asm/smp.h>
@@ -50,6 +51,23 @@ int enable_local_apic __initdata = 0; /*
  */
 int apic_verbosity;
 
+static unsigned int calibration_result;
+
+static void lapic_next_event(unsigned long evt);
+static void lapic_timer_setup(int mode);
+
+static struct clock_event lapic_clockevent = {
+	.name = "lapic",
+	.capabilities = CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE |
+			CLOCK_HAS_IRQHANDLER
+#ifdef CONFIG_SMP
+			| CLOCK_CAP_UPDATE
+#endif
+	,
+	.shift = 32,
+	.set_mode = lapic_timer_setup,
+	.set_next_event = lapic_next_event,
+};
 
 static void apic_pm_activate(void);
 
@@ -92,10 +110,6 @@ void __init apic_intr_init(void)
 /* Using APIC to generate smp_local_timer_interrupt? */
 int using_apic_timer = 0;
 
-static DEFINE_PER_CPU(int, prof_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_old_multiplier) = 1;
-static DEFINE_PER_CPU(int, prof_counter) = 1;
-
 static int enabled_via_apicbase;
 
 void enable_NMI_through_LVT0 (void * dummy)
@@ -559,15 +573,21 @@ void __devinit setup_local_APIC(void)
  * If Linux enabled the LAPIC against the BIOS default
  * disable it down before re-entering the BIOS on shutdown.
  * Otherwise the BIOS may get confused and not power-off.
+ * Additionally clear all LVT entries before disable_local_APIC
+ * for the case where Linux didn't enable the LAPIC.
  */
 void lapic_shutdown(void)
 {
-	if (!cpu_has_apic || !enabled_via_apicbase)
+	if (!cpu_has_apic)
 		return;
 
-	local_irq_disable();
-	disable_local_APIC();
-	local_irq_enable();
+	raw_local_irq_disable();
+	clear_local_APIC();
+
+	if (enabled_via_apicbase)
+		disable_local_APIC();
+
+	raw_local_irq_enable();
 }
 
 #ifdef CONFIG_PM
@@ -611,9 +631,9 @@ static int lapic_suspend(struct sys_devi
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
 	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
 	
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	disable_local_APIC();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
@@ -625,7 +645,7 @@ static int lapic_resume(struct sys_devic
 	if (!apic_pm_state.active)
 		return 0;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/*
 	 * Make sure the APICBASE points to the right address
@@ -656,7 +676,7 @@ static int lapic_resume(struct sys_devic
 	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
@@ -849,10 +869,10 @@ fake_ioapic_page:
 				ioapic_phys = (unsigned long)
 					      alloc_bootmem_pages(PAGE_SIZE);
 				ioapic_phys = __pa(ioapic_phys);
+				set_fixmap_nocache(idx, ioapic_phys);
+				printk(KERN_DEBUG "faked IOAPIC to %08lx (%08lx)\n",
+				       __fix_to_virt(idx), ioapic_phys);
 			}
-			set_fixmap_nocache(idx, ioapic_phys);
-			printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
-			       __fix_to_virt(idx), ioapic_phys);
 			idx++;
 		}
 	}
@@ -869,6 +889,11 @@ fake_ioapic_page:
  */
 
 /*
+ * FIXME: Move this to i8253.h. There is no need to keep the access to
+ * the PIT scattered all around the place -tglx
+ */
+
+/*
  * The timer chip is already set up at HZ interrupts per second here,
  * but we do not accept timer interrupts yet. We only allow the BP
  * to calibrate.
@@ -926,12 +951,16 @@ void (*wait_timer_tick)(void) __devinitd
 
 #define APIC_DIVISOR 16
 
-static void __setup_APIC_LVTT(unsigned int clocks)
+static void __setup_APIC_LVTT(unsigned int clocks, int oneshot)
 {
 	unsigned int lvtt_value, tmp_value, ver;
 
 	ver = GET_APIC_VERSION(apic_read(APIC_LVR));
-	lvtt_value = APIC_LVT_TIMER_PERIODIC | LOCAL_TIMER_VECTOR;
+
+	lvtt_value = LOCAL_TIMER_VECTOR;
+	if (!oneshot)
+		lvtt_value |= APIC_LVT_TIMER_PERIODIC;
+
 	if (!APIC_INTEGRATED(ver))
 		lvtt_value |= SET_APIC_TIMER_BASE(APIC_TIMER_BASE_DIV);
 	apic_write_around(APIC_LVTT, lvtt_value);
@@ -944,23 +973,27 @@ static void __setup_APIC_LVTT(unsigned i
 				& ~(APIC_TDR_DIV_1 | APIC_TDR_DIV_TMBASE))
 				| APIC_TDR_DIV_16);
 
-	apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
+	if (!oneshot)
+		apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR);
 }
 
-static void __devinit setup_APIC_timer(unsigned int clocks)
+static void lapic_next_event(unsigned long evt)
 {
-	unsigned long flags;
-
-	local_irq_save(flags);
+	apic_write_around(APIC_TMICT, evt);
+}
 
-	/*
-	 * Wait for IRQ0's slice:
-	 */
-	wait_timer_tick();
+static void lapic_timer_setup(int mode)
+{
+	unsigned long flags;
 
-	__setup_APIC_LVTT(clocks);
+	raw_local_irq_save(flags);
+	__setup_APIC_LVTT(calibration_result, mode == CLOCK_EVT_ONESHOT);
+	raw_local_irq_restore(flags);
+}
 
-	local_irq_restore(flags);
+static void __devinit setup_APIC_timer(void)
+{
+	setup_local_clockevent(&lapic_clockevent, CPU_MASK_NONE);
 }
 
 /*
@@ -969,6 +1002,8 @@ static void __devinit setup_APIC_timer(u
  * to calibrate, since some later bootup code depends on getting
  * the first irq? Ugh.
  *
+ * TODO: Fix this rather than saying "Ugh" -tglx
+ *
  * We want to do the calibration only once since we
  * want to have local timer irqs syncron. CPUs connected
  * by the same APIC bus have the very same bus frequency.
@@ -991,7 +1026,7 @@ static int __init calibrate_APIC_clock(v
 	 * value into the APIC clock, we just want to get the
 	 * counter running for calibration.
 	 */
-	__setup_APIC_LVTT(1000000000);
+	__setup_APIC_LVTT(1000000000, 0);
 
 	/*
 	 * The timer chip counts down to zero. Let's wait
@@ -1028,6 +1063,13 @@ static int __init calibrate_APIC_clock(v
 
 	result = (tt1-tt2)*APIC_DIVISOR/LOOPS;
 
+	/* Calculate the scaled math multiplication factor */
+	lapic_clockevent.mult = div_sc32(tt1-tt2, TICK_NSEC * LOOPS);
+	lapic_clockevent.max_delta_ns =
+		clockevent_delta2ns(0x7FFFFF, &lapic_clockevent);
+	lapic_clockevent.min_delta_ns =
+		clockevent_delta2ns(0xF, &lapic_clockevent);
+
 	if (cpu_has_tsc)
 		apic_printk(APIC_VERBOSE, "..... CPU clock speed is "
 			"%ld.%04ld MHz.\n",
@@ -1042,27 +1084,26 @@ static int __init calibrate_APIC_clock(v
 	return result;
 }
 
-static unsigned int calibration_result;
-
 void __init setup_boot_APIC_clock(void)
 {
+	unsigned long flags;
 	apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n");
 	using_apic_timer = 1;
 
-	local_irq_disable();
+	raw_local_irq_save(flags);
 
 	calibration_result = calibrate_APIC_clock();
 	/*
 	 * Now set up the timer for real.
 	 */
-	setup_APIC_timer(calibration_result);
+	setup_APIC_timer();
 
-	local_irq_enable();
+	raw_local_irq_restore(flags);
 }
 
 void __devinit setup_secondary_APIC_clock(void)
 {
-	setup_APIC_timer(calibration_result);
+	setup_APIC_timer();
 }
 
 void __devinit disable_APIC_timer(void)
@@ -1085,6 +1126,8 @@ void enable_APIC_timer(void)
 	}
 }
 
+static DEFINE_PER_CPU(int, prof_multiplier) = 1;
+
 /*
  * the frequency of the profiling timer can be changed
  * by writing a multiplier value into /proc/profile.
@@ -1112,60 +1155,6 @@ int setup_profiling_timer(unsigned int m
 
 	return 0;
 }
-
-#undef APIC_DIVISOR
-
-/*
- * Local timer interrupt handler. It does both profiling and
- * process statistics/rescheduling.
- *
- * We do profiling in every local tick, statistics/rescheduling
- * happen only every 'profiling multiplier' ticks. The default
- * multiplier is 1 and it can be changed by writing the new multiplier
- * value into /proc/profile.
- */
-
-inline void smp_local_timer_interrupt(struct pt_regs * regs)
-{
-	int cpu = smp_processor_id();
-
-	profile_tick(CPU_PROFILING, regs);
-	if (--per_cpu(prof_counter, cpu) <= 0) {
-		/*
-		 * The multiplier may have changed since the last time we got
-		 * to this point as a result of the user writing to
-		 * /proc/profile. In this case we need to adjust the APIC
-		 * timer accordingly.
-		 *
-		 * Interrupts are already masked off at this point.
-		 */
-		per_cpu(prof_counter, cpu) = per_cpu(prof_multiplier, cpu);
-		if (per_cpu(prof_counter, cpu) !=
-					per_cpu(prof_old_multiplier, cpu)) {
-			__setup_APIC_LVTT(
-					calibration_result/
-					per_cpu(prof_counter, cpu));
-			per_cpu(prof_old_multiplier, cpu) =
-						per_cpu(prof_counter, cpu);
-		}
-
-#ifdef CONFIG_SMP
-		update_process_times(user_mode_vm(regs));
-#endif
-	}
-
-	/*
-	 * We take the 'long' return path, and there every subsystem
-	 * grabs the apropriate locks (kernel lock/ irq lock).
-	 *
-	 * we might want to decouple profiling from the 'long path',
-	 * and do the profiling totally in assembly.
-	 *
-	 * Currently this isn't too much of an issue (performance wise),
-	 * we can take more than 100K local irqs per second on a 100 MHz P5.
-	 */
-}
-
 /*
  * Local APIC timer interrupt. This is the most natural way for doing
  * local interrupts, but local timer interrupts can be emulated by
@@ -1175,7 +1164,7 @@ inline void smp_local_timer_interrupt(st
  *   interrupt as well. Thus we cannot inline the local irq ... ]
  */
 
-fastcall void smp_apic_timer_interrupt(struct pt_regs *regs)
+fastcall notrace void smp_apic_timer_interrupt(struct pt_regs *regs)
 {
 	int cpu = smp_processor_id();
 
@@ -1184,6 +1173,8 @@ fastcall void smp_apic_timer_interrupt(s
 	 */
 	per_cpu(irq_stat, cpu).apic_timer_irqs++;
 
+        trace_special(regs->eip, 0, 0);
+
 	/*
 	 * NOTE! We'd better ACK the irq immediately,
 	 * because timer handling can be slow.
@@ -1195,7 +1186,17 @@ fastcall void smp_apic_timer_interrupt(s
 	 * interrupt lock, which is the WrongThing (tm) to do.
 	 */
 	irq_enter();
-	smp_local_timer_interrupt(regs);
+	/*
+	 * If the task is currently running in user mode, don't
+	 * detect soft lockups.  If CONFIG_DETECT_SOFTLOCKUP is not
+	 * configured, this should be optimized out.
+	 */
+	if (user_mode(regs))
+		touch_light_softlockup_watchdog();
+
+	if (lapic_clockevent.event_handler)
+		lapic_clockevent.event_handler(regs);
+
 	irq_exit();
 }
 
@@ -1250,6 +1251,7 @@ fastcall void smp_error_interrupt(struct
 	*/
 	printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n",
 	        smp_processor_id(), v , v1);
+	dump_stack();
 	irq_exit();
 }
 
Index: linux/arch/i386/kernel/apm.c
===================================================================
--- linux.orig/arch/i386/kernel/apm.c
+++ linux/arch/i386/kernel/apm.c
@@ -552,9 +552,9 @@ static inline void apm_restore_cpus(cpum
  */
 #define APM_DO_CLI	\
 	if (apm_info.allow_ints) \
-		local_irq_enable(); \
+		raw_local_irq_enable(); \
 	else \
-		local_irq_disable();
+		raw_local_irq_disable();
 
 #ifdef APM_ZERO_SEGS
 #	define APM_DECL_SEGS \
@@ -604,12 +604,12 @@ static u8 apm_bios_call(u32 func, u32 eb
 	save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8];
 	per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc;
 
-	local_save_flags(flags);
+	raw_local_save_flags(flags);
 	APM_DO_CLI;
 	APM_DO_SAVE_SEGS;
 	apm_bios_call_asm(func, ebx_in, ecx_in, eax, ebx, ecx, edx, esi);
 	APM_DO_RESTORE_SEGS;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = save_desc_40;
 	put_cpu();
 	apm_restore_cpus(cpus);
@@ -647,12 +647,12 @@ static u8 apm_bios_call_simple(u32 func,
 	save_desc_40 = per_cpu(cpu_gdt_table, cpu)[0x40 / 8];
 	per_cpu(cpu_gdt_table, cpu)[0x40 / 8] = bad_bios_desc;
 
-	local_save_flags(flags);
+	raw_local_save_flags(flags);
 	APM_DO_CLI;
 	APM_DO_SAVE_SEGS;
 	error = apm_bios_call_simple_asm(func, ebx_in, ecx_in, eax);
 	APM_DO_RESTORE_SEGS;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	__get_cpu_var(cpu_gdt_table)[0x40 / 8] = save_desc_40;
 	put_cpu();
 	apm_restore_cpus(cpus);
@@ -1194,7 +1194,7 @@ static int suspend(int vetoable)
 	}
 
 	device_suspend(PMSG_SUSPEND);
-	local_irq_disable();
+	raw_local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
 
 	/* serialize with the timer interrupt */
@@ -1210,14 +1210,14 @@ static int suspend(int vetoable)
 	 */
 	spin_unlock(&i8253_lock);
 	write_sequnlock(&xtime_lock);
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	save_processor_state();
 	err = set_system_power_state(APM_STATE_SUSPEND);
 	ignore_normal_resume = 1;
 	restore_processor_state();
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	write_seqlock(&xtime_lock);
 	spin_lock(&i8253_lock);
 	reinit_timer();
@@ -1232,7 +1232,7 @@ static int suspend(int vetoable)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
 	device_power_up();
-	local_irq_enable();
+	raw_local_irq_enable();
 	device_resume();
 	pm_send_all(PM_RESUME, (void *)0);
 	queue_event(APM_NORMAL_RESUME, NULL);
@@ -1251,22 +1251,22 @@ static void standby(void)
 {
 	int	err;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
 	/* serialize with the timer interrupt */
 	write_seqlock(&xtime_lock);
 	/* If needed, notify drivers here */
 	get_time_diff();
 	write_sequnlock(&xtime_lock);
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
 	if ((err != APM_SUCCESS) && (err != APM_NO_ERROR))
 		apm_error("standby", err);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	device_power_up();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 static apm_event_t get_event(void)
Index: linux/arch/i386/kernel/cpu/cpufreq/longhaul.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/cpufreq/longhaul.c
+++ linux/arch/i386/kernel/cpu/cpufreq/longhaul.c
@@ -144,7 +144,7 @@ static void do_powersaver(union msr_long
 	longhaul->bits.RevisionKey = 0;
 
 	preempt_disable();
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/*
 	 * get current pci bus master state for all devices
@@ -166,11 +166,11 @@ static void do_powersaver(union msr_long
 	outb(0xFE,0x21);	/* TMR0 only */
 	outb(0xFF,0x80);	/* delay */
 
-	safe_halt();
+	raw_safe_halt();
 	wrmsrl(MSR_VIA_LONGHAUL, longhaul->val);
 	halt();
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	outb(tmp_mask,0x21);	/* restore mask */
 
@@ -184,7 +184,7 @@ static void do_powersaver(union msr_long
 			pci_write_config_byte(dev, PCI_COMMAND, pci_cmd);
 		}
 	} while (dev != NULL);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	preempt_enable();
 
 	/* disable bus ratio bit */
@@ -245,16 +245,16 @@ static void longhaul_setstate(unsigned i
 		/* Enable software clock multiplier */
 		bcr2.bits.ESOFTBF = 1;
 		bcr2.bits.CLOCKMUL = clock_ratio_index;
-		local_irq_disable();
+		raw_local_irq_disable();
 		wrmsrl (MSR_VIA_BCR2, bcr2.val);
-		safe_halt();
+		raw_safe_halt();
 
 		/* Disable software clock multiplier */
 		rdmsrl (MSR_VIA_BCR2, bcr2.val);
 		bcr2.bits.ESOFTBF = 0;
-		local_irq_disable();
+		raw_local_irq_disable();
 		wrmsrl (MSR_VIA_BCR2, bcr2.val);
-		local_irq_enable();
+		raw_local_irq_enable();
 		break;
 
 	/*
Index: linux/arch/i386/kernel/cpu/mtrr/cyrix.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mtrr/cyrix.c
+++ linux/arch/i386/kernel/cpu/mtrr/cyrix.c
@@ -17,7 +17,7 @@ cyrix_get_arr(unsigned int reg, unsigned
 	arr = CX86_ARR_BASE + (reg << 1) + reg;	/* avoid multiplication by 3 */
 
 	/* Save flags and disable interrupts */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	ccr3 = getCx86(CX86_CCR3);
 	setCx86(CX86_CCR3, (ccr3 & 0x0f) | 0x10);	/* enable MAPEN */
@@ -28,7 +28,7 @@ cyrix_get_arr(unsigned int reg, unsigned
 	setCx86(CX86_CCR3, ccr3);	/* disable MAPEN */
 
 	/* Enable interrupts if it was enabled previously */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	shift = ((unsigned char *) base)[1] & 0x0f;
 	*base >>= PAGE_SHIFT;
 
Index: linux/arch/i386/kernel/cpu/mtrr/generic.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mtrr/generic.c
+++ linux/arch/i386/kernel/cpu/mtrr/generic.c
@@ -234,7 +234,7 @@ static unsigned long set_mtrr_state(u32 
 
 static unsigned long cr4 = 0;
 static u32 deftype_lo, deftype_hi;
-static DEFINE_SPINLOCK(set_atomicity_lock);
+static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
 
 /*
  * Since we are disabling the cache don't allow any interrupts - they
@@ -296,14 +296,14 @@ static void generic_set_all(void)
 	unsigned long mask, count;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	prepare_set();
 
 	/* Actually set the state */
 	mask = set_mtrr_state(deftype_lo,deftype_hi);
 
 	post_set();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	/*  Use the atomic bitops to update the global mask  */
 	for (count = 0; count < sizeof mask * 8; ++count) {
@@ -331,7 +331,7 @@ static void generic_set_mtrr(unsigned in
 
 	vr = &mtrr_state.var_ranges[reg];
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	prepare_set();
 
 	if (size == 0) {
@@ -350,7 +350,7 @@ static void generic_set_mtrr(unsigned in
 	}
 
 	post_set();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 int generic_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
Index: linux/arch/i386/kernel/cpu/mtrr/main.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mtrr/main.c
+++ linux/arch/i386/kernel/cpu/mtrr/main.c
@@ -146,7 +146,7 @@ static void ipi_handler(void *info)
 	struct set_mtrr_data *data = info;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	atomic_dec(&data->count);
 	while(!atomic_read(&data->gate))
@@ -164,7 +164,7 @@ static void ipi_handler(void *info)
 		cpu_relax();
 
 	atomic_dec(&data->count);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #endif
@@ -225,7 +225,7 @@ static void set_mtrr(unsigned int reg, u
 	if (smp_call_function(ipi_handler, &data, 1, 0) != 0)
 		panic("mtrr: timed out waiting for other CPUs\n");
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	while(atomic_read(&data.count))
 		cpu_relax();
@@ -259,7 +259,7 @@ static void set_mtrr(unsigned int reg, u
 	while(atomic_read(&data.count))
 		cpu_relax();
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /**
@@ -687,11 +687,11 @@ void mtrr_ap_init(void)
 	 * 2.cpu hotadd time. We let mtrr_add/del_page hold cpuhotplug lock to
 	 * prevent mtrr entry changes
 	 */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	mtrr_if->set_all();
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static int __init mtrr_init_finialize(void)
Index: linux/arch/i386/kernel/cpu/mtrr/state.c
===================================================================
--- linux.orig/arch/i386/kernel/cpu/mtrr/state.c
+++ linux/arch/i386/kernel/cpu/mtrr/state.c
@@ -12,7 +12,7 @@ void set_mtrr_prepare_save(struct set_mt
 	unsigned int cr0;
 
 	/*  Disable interrupts locally  */
-	local_irq_save(ctxt->flags);
+	raw_local_irq_save(ctxt->flags);
 
 	if (use_intel() || is_cpu(CYRIX)) {
 
@@ -73,6 +73,6 @@ void set_mtrr_done(struct set_mtrr_conte
 			write_cr4(ctxt->cr4val);
 	}
 	/*  Re-enable interrupts locally (if enabled previously)  */
-	local_irq_restore(ctxt->flags);
+	raw_local_irq_restore(ctxt->flags);
 }
 
Index: linux/arch/i386/kernel/entry.S
===================================================================
--- linux.orig/arch/i386/kernel/entry.S
+++ linux/arch/i386/kernel/entry.S
@@ -76,10 +76,10 @@ NT_MASK		= 0x00004000
 VM_MASK		= 0x00020000
 
 #ifdef CONFIG_PREEMPT
-#define preempt_stop		cli
+# define preempt_stop		cli
 #else
-#define preempt_stop
-#define resume_kernel		restore_nocheck
+# define preempt_stop
+# define resume_kernel		restore_nocheck
 #endif
 
 #define SAVE_ALL \
@@ -160,14 +160,17 @@ ENTRY(resume_userspace)
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
 	cli
+	cmpl $0, kernel_preemption
+	jz restore_nocheck
 	cmpl $0,TI_preempt_count(%ebp)	# non-zero preempt_count ?
 	jnz restore_nocheck
 need_resched:
 	movl TI_flags(%ebp), %ecx	# need_resched set ?
 	testb $_TIF_NEED_RESCHED, %cl
-	jz restore_all
+	jz restore_nocheck
 	testl $IF_MASK,EFLAGS(%esp)     # interrupts off (exception path) ?
-	jz restore_all
+	jz restore_nocheck
+	cli
 	call preempt_schedule_irq
 	jmp need_resched
 #endif
@@ -200,6 +203,11 @@ sysenter_past_esp:
 
 	pushl %eax
 	SAVE_ALL
+#ifdef CONFIG_LATENCY_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -213,6 +221,11 @@ sysenter_past_esp:
 	movl TI_flags(%ebp), %ecx
 	testw $_TIF_ALLWORK_MASK, %cx
 	jne syscall_exit_work
+#ifdef CONFIG_LATENCY_TRACE
+	pushl %eax
+	call sys_ret
+	popl %eax
+#endif
 /* if something modifies registers it must also disable sysexit */
 	movl EIP(%esp), %edx
 	movl OLDESP(%esp), %ecx
@@ -225,6 +238,11 @@ sysenter_past_esp:
 ENTRY(system_call)
 	pushl %eax			# save orig_eax
 	SAVE_ALL
+#ifdef CONFIG_LATENCY_TRACE
+	pushl %edx; pushl %ecx; pushl %ebx; pushl %eax
+	call sys_call
+	popl %eax; popl %ebx; popl %ecx; popl %edx
+#endif
 	GET_THREAD_INFO(%ebp)
 					# system call tracing in operation / emulation
 	/* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */
@@ -254,6 +272,17 @@ restore_all:
 	cmpl $((4 << 8) | 3), %eax
 	je ldt_ss			# returning to user-space with LDT SS
 restore_nocheck:
+#if defined(CONFIG_CRITICAL_IRQSOFF_TIMING) || defined(CONFIG_LATENCY_TRACE)
+	pushl %eax
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+	call trace_irqs_on
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	call sys_ret
+#endif
+	popl %eax
+#endif
+restore_nocheck_nmi:
 	RESTORE_REGS
 	addl $4, %esp
 1:	iret
@@ -297,18 +326,19 @@ ldt_ss:
 	# perform work that needs to be done immediately before resumption
 	ALIGN
 work_pending:
-	testb $_TIF_NEED_RESCHED, %cl
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx
 	jz work_notifysig
 work_resched:
-	call schedule
-	cli				# make sure we don't miss an interrupt
+	cli
+	call __schedule
+					# make sure we don't miss an interrupt
 					# setting need_resched or sigpending
 					# between sampling and the iret
 	movl TI_flags(%ebp), %ecx
 	andl $_TIF_WORK_MASK, %ecx	# is there any work to be done other
 					# than syscall tracing?
 	jz restore_all
-	testb $_TIF_NEED_RESCHED, %cl
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), %ecx
 	jnz work_resched
 
 work_notifysig:				# deal with pending signals and
@@ -351,6 +381,11 @@ syscall_trace_entry:
 syscall_exit_work:
 	testb $(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SINGLESTEP), %cl
 	jz work_pending
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+	pushl %eax
+	call trace_irqs_on
+	popl %eax
+#endif
 	sti				# could let do_syscall_trace() call
 					# schedule() instead
 	movl %esp, %eax
@@ -412,9 +447,16 @@ ENTRY(irq_entries_start)
 vector=vector+1
 .endr
 
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+# define TRACE_IRQS_OFF call trace_irqs_off_lowlevel;
+#else
+# define TRACE_IRQS_OFF
+#endif
+
 	ALIGN
 common_interrupt:
 	SAVE_ALL
+	TRACE_IRQS_OFF
 	movl %esp,%eax
 	call do_IRQ
 	jmp ret_from_intr
@@ -423,6 +465,7 @@ common_interrupt:
 ENTRY(name)				\
 	pushl $nr-256;			\
 	SAVE_ALL			\
+	TRACE_IRQS_OFF			\
 	movl %esp,%eax;			\
 	call smp_/**/name;		\
 	jmp ret_from_intr;
@@ -552,7 +595,7 @@ nmi_stack_correct:
 	xorl %edx,%edx		# zero error code
 	movl %esp,%eax		# pt_regs pointer
 	call do_nmi
-	jmp restore_all
+	jmp restore_nocheck_nmi
 
 nmi_stack_fixup:
 	FIX_STACK(12,nmi_stack_correct, 1)
Index: linux/arch/i386/kernel/hpet.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/hpet.c
@@ -0,0 +1,81 @@
+#include <linux/clocksource.h>
+#include <linux/errno.h>
+#include <linux/hpet.h>
+#include <linux/init.h>
+
+#include <asm/hpet.h>
+#include <asm/io.h>
+
+#define HPET_MASK	0xFFFFFFFF
+#define HPET_SHIFT	22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC	1000000
+
+/* Mapped address of the HPET main counter; set up by init_hpet_clocksource() */
+static void __iomem *hpet_ptr;
+
+/* Clocksource read callback: one 32-bit read of the mapped counter register */
+static cycle_t read_hpet(void)
+{
+	return (cycle_t)readl(hpet_ptr);
+}
+
+struct clocksource clocksource_hpet = {
+	.name		= "hpet",
+	.rating		= 250,
+	.read		= read_hpet,
+	.mask		= (cycle_t)HPET_MASK,
+	.mult		= 0, /* set below */
+	.shift		= HPET_SHIFT,
+	.is_continuous	= 1,
+};
+
+/*
+ * Map the HPET registers, derive the cycles->ns multiplier from the
+ * period register and register the clocksource.
+ *
+ * Returns 0 on success, -ENODEV if no HPET address is known and
+ * -ENOMEM if the register window cannot be mapped.
+ */
+static int __init init_hpet_clocksource(void)
+{
+	unsigned long hpet_period;
+	void __iomem* hpet_base;
+	u64 tmp;
+
+	if (!hpet_address)
+		return -ENODEV;
+
+	/* calculate the hpet address: */
+	hpet_base =
+		(void __iomem*)ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+	/* A failed mapping must not be dereferenced by readl() below */
+	if (!hpet_base)
+		return -ENOMEM;
+	hpet_ptr = hpet_base + HPET_COUNTER;
+
+	/* calculate the frequency: */
+	hpet_period = readl(hpet_base + HPET_PERIOD);
+
+	/*
+	 * hpet period is in femto seconds per cycle
+	 * so we need to convert this to ns/cyc units
+	 * approximated by mult/2^shift
+	 *
+	 *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+	 *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+	 *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+	 *  (fsec/cyc << shift)/1000000 = mult
+	 *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+	 */
+	tmp = (u64)hpet_period << HPET_SHIFT;
+	do_div(tmp, FSEC_PER_NSEC);
+	clocksource_hpet.mult = (u32)tmp;
+
+	register_clocksource(&clocksource_hpet);
+
+	return 0;
+}
+
+module_init(init_hpet_clocksource);
Index: linux/arch/i386/kernel/i386_ksyms.c
===================================================================
--- linux.orig/arch/i386/kernel/i386_ksyms.c
+++ linux/arch/i386/kernel/i386_ksyms.c
@@ -6,10 +6,12 @@
 /* This is definitely a GPL-only symbol */
 EXPORT_SYMBOL_GPL(cpu_gdt_table);
 
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
+#ifdef CONFIG_ASM_SEMAPHORES
+EXPORT_SYMBOL(__compat_down_failed);
+EXPORT_SYMBOL(__compat_down_failed_interruptible);
+EXPORT_SYMBOL(__compat_down_failed_trylock);
+EXPORT_SYMBOL(__compat_up_wakeup);
+#endif
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_generic);
 
@@ -25,7 +27,7 @@ EXPORT_SYMBOL(__put_user_8);
 EXPORT_SYMBOL(strpbrk);
 EXPORT_SYMBOL(strstr);
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_ASM_SEMAPHORES)
 extern void FASTCALL( __write_lock_failed(rwlock_t *rw));
 extern void FASTCALL( __read_lock_failed(rwlock_t *rw));
 EXPORT_SYMBOL(__write_lock_failed);
Index: linux/arch/i386/kernel/i8253.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/i8253.c
@@ -0,0 +1,138 @@
+/*
+ * i8253.c  8253/PIT functions
+ *
+ */
+#include <linux/clockchips.h>
+#include <linux/spinlock.h>
+#include <linux/jiffies.h>
+#include <linux/sysdev.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/mca.h>
+
+#include <asm/smp.h>
+#include <asm/io_apic.h>
+#include <asm/delay.h>
+#include <asm/i8253.h>
+#include <asm/io.h>
+
+#include "io_ports.h"
+
+DEFINE_RAW_SPINLOCK(i8253_lock);
+EXPORT_SYMBOL(i8253_lock);
+
+static void init_pit_timer(int mode)	/* clock_event .set_mode callback for the PIT */
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8253_lock, flags);	/* serialize PIT port access */
+
+	if (mode != CLOCK_EVT_ONESHOT) {	/* every other mode: program the LATCH count */
+		/* binary, mode 2, LSB/MSB, ch 0 */
+		outb_p(0x34, PIT_MODE);
+		udelay(10);
+		outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
+		outb(LATCH >> 8 , PIT_CH0);	/* MSB */
+	} else {
+		/* One shot setup: binary, mode 4, LSB/MSB, ch 0; count is written later by pit_next_event() */
+		outb_p(0x38, PIT_MODE);
+		udelay(10);
+	}
+
+	spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+static void pit_next_event(unsigned long evt)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&i8253_lock, flags);
+	outb_p(evt & 0xff , PIT_CH0);	/* LSB */
+	outb(evt >> 8 , PIT_CH0);	/* MSB */
+	spin_unlock_irqrestore(&i8253_lock, flags);
+}
+
+static struct clock_event pit_clockevent = {
+	.name		= "pit",
+	.capabilities	= CLOCK_CAP_TICK
+#ifndef CONFIG_SMP
+			| CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE |
+			CLOCK_CAP_UPDATE
+#endif
+	,
+	.set_mode	= init_pit_timer,
+	.set_next_event = pit_next_event,
+	.start_event	= io_apic_timer_ack,
+	.end_event	= mca_timer_ack,
+	.shift		= 32,
+	.irq		= 0,
+};
+
+void setup_pit_timer(void)
+{
+	pit_clockevent.mult = div_sc32(CLOCK_TICK_RATE, NSEC_PER_SEC);
+	pit_clockevent.max_delta_ns =
+		clockevent_delta2ns(0x7FFF, &pit_clockevent);
+	pit_clockevent.min_delta_ns =
+		clockevent_delta2ns(0xF, &pit_clockevent);
+	setup_global_clockevent(&pit_clockevent, CPU_MASK_NONE);
+}
+
+/*
+ * Since the PIT overflows every tick, it's not very useful
+ * to just read by itself. So use jiffies to emulate a free
+ * running counter:
+ */
+static cycle_t pit_read(void)
+{
+	unsigned long flags, seq;
+	int count;
+	u64 jifs;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+
+		spin_lock_irqsave(&i8253_lock, flags);
+		outb_p(0x00, PIT_MODE);	/* latch the count ASAP */
+		count = inb_p(PIT_CH0);	/* read the latched count */
+		count |= inb_p(PIT_CH0) << 8;
+
+		/* VIA686a test code... reset the latch if count > max + 1 */
+		if (count > LATCH) {
+			outb_p(0x34, PIT_MODE);
+			outb_p(LATCH & 0xff, PIT_CH0);
+			outb(LATCH >> 8, PIT_CH0);
+			count = LATCH - 1;
+		}
+		spin_unlock_irqrestore(&i8253_lock, flags);
+
+		jifs = jiffies_64;
+	} while (read_seqretry(&xtime_lock, seq));
+
+	jifs -= INITIAL_JIFFIES;
+	count = (LATCH-1) - count;
+
+	return (cycle_t)(jifs * LATCH) + count;
+}
+
+static struct clocksource clocksource_pit = {
+	.name	= "pit",
+	.rating = 110,
+	.read	= pit_read,
+	.mask	= (cycle_t)-1,
+	.mult	= 0,
+	.shift	= 20,
+};
+
+static int __init init_pit_clocksource(void)
+{
+	/* TODO: bogus limit of 4 CPUs? --mingo */
+	if (num_possible_cpus() > 4) /* PIT does not scale! */
+		return 0;
+
+	clocksource_pit.mult = clocksource_hz2mult(CLOCK_TICK_RATE, 20);
+	register_clocksource(&clocksource_pit);
+
+	return 0;
+}
+module_init(init_pit_clocksource);
Index: linux/arch/i386/kernel/i8259.c
===================================================================
--- linux.orig/arch/i386/kernel/i8259.c
+++ linux/arch/i386/kernel/i8259.c
@@ -35,7 +35,7 @@
  * moves to arch independent land
  */
 
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 
 static void end_8259A_irq (unsigned int irq)
 {
@@ -366,7 +366,7 @@ static irqreturn_t math_error_irq(int cp
  * New motherboards sometimes make IRQ 13 be a PCI interrupt,
  * so allow interrupt sharing.
  */
-static struct irqaction fpu_irq = { math_error_irq, 0, CPU_MASK_NONE, "fpu", NULL, NULL };
+static struct irqaction fpu_irq = { math_error_irq, SA_NODELAY, CPU_MASK_NONE, "fpu", NULL, NULL };
 
 void __init init_ISA_irqs (void)
 {
@@ -422,12 +422,6 @@ void __init init_IRQ(void)
 	intr_init_hook();
 
 	/*
-	 * Set the clock to HZ Hz, we already have a valid
-	 * vector now:
-	 */
-	setup_pit_timer();
-
-	/*
 	 * External FPU? Set up irq13 if so, for
 	 * original braindamaged IBM FERR coupling.
 	 */
Index: linux/arch/i386/kernel/init_task.c
===================================================================
--- linux.orig/arch/i386/kernel/init_task.c
+++ linux/arch/i386/kernel/init_task.c
@@ -10,8 +10,8 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
+static struct fs_struct init_fs = INIT_FS(init_fs);
+static struct files_struct init_files = INIT_FILES(init_files);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
Index: linux/arch/i386/kernel/io_apic.c
===================================================================
--- linux.orig/arch/i386/kernel/io_apic.c
+++ linux/arch/i386/kernel/io_apic.c
@@ -30,6 +30,7 @@
 #include <linux/mc146818rtc.h>
 #include <linux/compiler.h>
 #include <linux/acpi.h>
+#include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/sysdev.h>
 
@@ -46,7 +47,7 @@
 int (*ioapic_renumber_irq)(int ioapic, int irq);
 atomic_t irq_mis_count;
 
-static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
 
 /*
  *	Is the SiS APIC rmw bug present ?
@@ -54,11 +55,6 @@ static DEFINE_SPINLOCK(ioapic_lock);
  */
 int sis_apic_bug = -1;
 
-/*
- * # of IRQ routing registers
- */
-int nr_ioapic_registers[MAX_IO_APICS];
-
 int disable_timer_pin_1 __initdata;
 
 /*
@@ -87,6 +83,27 @@ int vector_irq[NR_VECTORS] __read_mostly
 #define vector_to_irq(vector)	(vector)
 #endif
 
+static int timer_ack;
+
+void io_apic_timer_ack(void *priv)
+{
+	unsigned long flags;
+
+	if (timer_ack) {
+		/*
+		 * Subtle, when I/O APICs are used we have to ack timer IRQ
+		 * manually to reset the IRR bit for do_slow_gettimeoffset().
+		 * This will also deassert NMI lines for the watchdog if run
+		 * on an 82489DX-based system.
+		 */
+		spin_lock_irqsave(&i8259A_lock, flags);
+		outb(0x0c, PIC_MASTER_OCW3);
+		/* Ack the IRQ; AEOI will end it automatically. */
+		inb(PIC_MASTER_POLL);
+		spin_unlock_irqrestore(&i8259A_lock, flags);
+	}
+}
+
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
@@ -130,19 +147,133 @@ static void __init replace_pin_at_irq(un
 	}
 }
 
+#ifdef CONFIG_X86_IOAPIC_FAST
+# define IOAPIC_CACHE
+#endif
+
+struct ioapic_data_struct {
+	struct sys_device dev;
+	int nr_registers;	//  # of IRQ routing registers
+	volatile unsigned int *base;
+	struct IO_APIC_route_entry *entry;
+#ifdef IOAPIC_CACHE
+	unsigned int reg_set;
+	u32 cached_val[0];
+#endif
+};
+
+static struct ioapic_data_struct *ioapic_data[MAX_IO_APICS];
+
+int nr_ioapic_registers(int apic)
+{
+	return ioapic_data[apic]->nr_registers;
+}
+
+static inline unsigned int __raw_io_apic_read(struct ioapic_data_struct *ioapic, unsigned int reg)
+{
+# ifdef IOAPIC_CACHE
+	ioapic->reg_set = reg;
+# endif
+	ioapic->base[0] = reg;
+	return ioapic->base[4];
+}
+
+
+# ifdef IOAPIC_CACHE
+static void __init ioapic_cache_init(struct ioapic_data_struct *ioapic)
+{
+	int reg;
+	for (reg = 0; reg < (0x10 + 2 * ioapic->nr_registers); reg++)
+		ioapic->cached_val[reg] = __raw_io_apic_read(ioapic, reg);
+}
+# endif
+
+
+static unsigned int raw_io_apic_read(struct ioapic_data_struct *ioapic, unsigned int reg)
+{
+	unsigned int val = __raw_io_apic_read(ioapic, reg);
+
+# ifdef IOAPIC_CACHE
+	ioapic->cached_val[reg] = val;
+# endif
+	return val;
+}
+
+static unsigned int io_apic_read(struct ioapic_data_struct *ioapic, unsigned int reg)
+{
+# ifdef IOAPIC_CACHE
+	if (likely(!sis_apic_bug)) {
+		ioapic->reg_set = -1;
+		return ioapic->cached_val[reg];
+	}
+# endif
+	return raw_io_apic_read(ioapic, reg);
+}
+
+static void io_apic_write(struct ioapic_data_struct *ioapic, unsigned int reg, unsigned int val)
+{
+# ifdef IOAPIC_CACHE
+	ioapic->cached_val[reg] = val;
+	ioapic->reg_set = reg;
+# endif
+	ioapic->base[0] = reg;
+	ioapic->base[4] = val;
+}
+
+
+/*
+ * Some systems need a POST flush or else level-triggered interrupts
+ * generate lots of spurious interrupts due to the POST-ed write not
+ * reaching the IOAPIC before the IRQ is ACK-ed in the local APIC.
+ *
+ * It seems most systems need this - disable the optimization for now.
+ */
+#ifndef CONFIG_X86_IOAPIC_FAST
+# define IOAPIC_POSTFLUSH
+#endif
+
+/*
+ * Re-write a value: to be used for read-modify-write
+ * cycles where the read already set up the index register.
+ *
+ * Older SiS APIC requires we rewrite the index register
+ */
+static void io_apic_modify(struct ioapic_data_struct *ioapic, unsigned int reg, unsigned int val)
+{
+#ifdef IOAPIC_CACHE
+	ioapic->cached_val[reg] = val;
+	if (ioapic->reg_set != reg || sis_apic_bug) {
+		ioapic->reg_set = reg;
+#else
+	if (unlikely(sis_apic_bug)) {
+#endif
+		ioapic->base[0] = reg;
+	}
+	ioapic->base[4] = val;
+#ifndef IOAPIC_POSTFLUSH
+	if (unlikely(sis_apic_bug))
+#endif
+		/*
+		 * Force POST flush by reading:
+ 		 */
+		val = ioapic->base[4];
+}
+
 static void __modify_IO_APIC_irq (unsigned int irq, unsigned long enable, unsigned long disable)
 {
 	struct irq_pin_list *entry = irq_2_pin + irq;
-	unsigned int pin, reg;
+	unsigned int pin, val;
+	struct ioapic_data_struct *ioapic;
 
 	for (;;) {
 		pin = entry->pin;
 		if (pin == -1)
 			break;
-		reg = io_apic_read(entry->apic, 0x10 + pin*2);
-		reg &= ~disable;
-		reg |= enable;
-		io_apic_modify(entry->apic, 0x10 + pin*2, reg);
+		ioapic = ioapic_data[entry->apic];
+		val = io_apic_read(ioapic, 0x10 + pin*2);
+		val &= ~disable;
+		val |= enable;
+		io_apic_modify(ioapic, 0x10 + pin*2, val);
 		if (!entry->next)
 			break;
 		entry = irq_2_pin + entry->next;
@@ -150,29 +281,17 @@ static void __modify_IO_APIC_irq (unsign
 }
 
 /* mask = 1 */
-static void __mask_IO_APIC_irq (unsigned int irq)
+static inline void __mask_IO_APIC_irq (unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, 0x00010000, 0);
 }
 
 /* mask = 0 */
-static void __unmask_IO_APIC_irq (unsigned int irq)
+static inline void __unmask_IO_APIC_irq (unsigned int irq)
 {
 	__modify_IO_APIC_irq(irq, 0, 0x00010000);
 }
 
-/* mask = 1, trigger = 0 */
-static void __mask_and_edge_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
-}
-
-/* mask = 0, trigger = 1 */
-static void __unmask_and_level_IO_APIC_irq (unsigned int irq)
-{
-	__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
-}
-
 static void mask_IO_APIC_irq (unsigned int irq)
 {
 	unsigned long flags;
@@ -191,15 +310,15 @@ static void unmask_IO_APIC_irq (unsigned
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
+static void clear_IO_APIC_pin(struct ioapic_data_struct *ioapic, unsigned int pin)
 {
 	struct IO_APIC_route_entry entry;
 	unsigned long flags;
 	
 	/* Check delivery_mode to be sure we're not clearing an SMI pin */
 	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int*)&entry) + 0) = io_apic_read(apic, 0x10 + 2 * pin);
-	*(((int*)&entry) + 1) = io_apic_read(apic, 0x11 + 2 * pin);
+	*(((int*)&entry) + 0) = io_apic_read(ioapic, 0x10 + 2 * pin);
+	*(((int*)&entry) + 1) = io_apic_read(ioapic, 0x11 + 2 * pin);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 	if (entry.delivery_mode == dest_SMI)
 		return;
@@ -210,8 +329,8 @@ static void clear_IO_APIC_pin(unsigned i
 	memset(&entry, 0, sizeof(entry));
 	entry.mask = 1;
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(apic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
-	io_apic_write(apic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
+	io_apic_write(ioapic, 0x10 + 2 * pin, *(((int *)&entry) + 0));
+	io_apic_write(ioapic, 0x11 + 2 * pin, *(((int *)&entry) + 1));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -219,9 +338,14 @@ static void clear_IO_APIC (void)
 {
 	int apic, pin;
 
-	for (apic = 0; apic < nr_ioapics; apic++)
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++)
-			clear_IO_APIC_pin(apic, pin);
+	for (apic = 0; apic < nr_ioapics; apic++) {
+		struct ioapic_data_struct *ioapic = ioapic_data[apic];
+#ifdef IOAPIC_CACHE
+		ioapic->reg_set = -1;
+#endif
+		for (pin = 0; pin < ioapic->nr_registers; pin++)
+			clear_IO_APIC_pin(ioapic, pin);
+	}
 }
 
 #ifdef CONFIG_SMP
@@ -247,7 +371,7 @@ static void set_ioapic_affinity_irq(unsi
 		pin = entry->pin;
 		if (pin == -1)
 			break;
-		io_apic_write(entry->apic, 0x10 + 1 + pin*2, apicid_value);
+		io_apic_write(ioapic_data[entry->apic], 0x10 + 1 + pin*2, apicid_value);
 		if (!entry->next)
 			break;
 		entry = irq_2_pin + entry->next;
@@ -819,7 +943,7 @@ void __init setup_ioapic_dest(void)
 		return;
 
 	for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-		for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+		for (pin = 0; pin < ioapic_data[ioapic]->nr_registers; pin++) {
 			irq_entry = find_irq_entry(ioapic, pin, mp_INT);
 			if (irq_entry == -1)
 				continue;
@@ -1063,7 +1187,7 @@ static int pin_2_irq(int idx, int apic, 
 			 */
 			i = irq = 0;
 			while (i < apic)
-				irq += nr_ioapic_registers[i++];
+				irq += ioapic_data[i++]->nr_registers;
 			irq += pin;
 
 			/*
@@ -1106,7 +1230,7 @@ static inline int IO_APIC_irq_trigger(in
 	int apic, idx, pin;
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
-		for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+		for (pin = 0; pin < ioapic_data[apic]->nr_registers; pin++) {
 			idx = find_irq_entry(apic,pin,mp_INT);
 			if ((idx != -1) && (irq == pin_2_irq(idx,apic,pin)))
 				return irq_trigger(idx);
@@ -1178,11 +1302,13 @@ static void __init setup_IO_APIC_irqs(vo
 	struct IO_APIC_route_entry entry;
 	int apic, pin, idx, irq, first_notcon = 1, vector;
 	unsigned long flags;
+	struct ioapic_data_struct *ioapic;
 
 	apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
-	for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) {
+		ioapic = ioapic_data[apic];
+	for (pin = 0; pin < ioapic->nr_registers; pin++) {
 
 		/*
 		 * add it to the IO-APIC irq-routing table:
@@ -1239,8 +1365,8 @@ static void __init setup_IO_APIC_irqs(vo
 				disable_8259A_irq(irq);
 		}
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0x11+2*pin, *(((int *)&entry)+1));
-		io_apic_write(apic, 0x10+2*pin, *(((int *)&entry)+0));
+		io_apic_write(ioapic, 0x11+2*pin, *(((int *)&entry)+1));
+		io_apic_write(ioapic, 0x10+2*pin, *(((int *)&entry)+0));
 		set_native_irq_info(irq, TARGET_CPUS);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
@@ -1287,8 +1413,8 @@ static void __init setup_ExtINT_IRQ0_pin
 	 * Add it to the IO-APIC irq-routing table:
 	 */
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
-	io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+	io_apic_write(ioapic_data[0], 0x11+2*pin, *(((int *)&entry)+1));
+	io_apic_write(ioapic_data[0], 0x10+2*pin, *(((int *)&entry)+0));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	enable_8259A_irq(0);
@@ -1298,7 +1424,7 @@ static inline void UNEXPECTED_IO_APIC(vo
 {
 }
 
-void __init print_IO_APIC(void)
+void /*__init*/ print_IO_APIC(void)
 {
 	int apic, i;
 	union IO_APIC_reg_00 reg_00;
@@ -1306,6 +1432,7 @@ void __init print_IO_APIC(void)
 	union IO_APIC_reg_02 reg_02;
 	union IO_APIC_reg_03 reg_03;
 	unsigned long flags;
+	struct ioapic_data_struct *ioapic;
 
 	if (apic_verbosity == APIC_QUIET)
 		return;
@@ -1313,7 +1440,7 @@ void __init print_IO_APIC(void)
  	printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
 	for (i = 0; i < nr_ioapics; i++)
 		printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
-		       mp_ioapics[i].mpc_apicid, nr_ioapic_registers[i]);
+		       mp_ioapics[i].mpc_apicid, ioapic_data[i]->nr_registers);
 
 	/*
 	 * We are a bit conservative about what we expect.  We have to
@@ -1322,14 +1449,14 @@ void __init print_IO_APIC(void)
 	printk(KERN_INFO "testing the IO APIC.......................\n");
 
 	for (apic = 0; apic < nr_ioapics; apic++) {
-
+	ioapic = ioapic_data[apic];
 	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(apic, 0);
-	reg_01.raw = io_apic_read(apic, 1);
+	reg_00.raw = io_apic_read(ioapic, 0);
+	reg_01.raw = io_apic_read(ioapic, 1);
 	if (reg_01.bits.version >= 0x10)
-		reg_02.raw = io_apic_read(apic, 2);
+		reg_02.raw = io_apic_read(ioapic, 2);
 	if (reg_01.bits.version >= 0x20)
-		reg_03.raw = io_apic_read(apic, 3);
+		reg_03.raw = io_apic_read(ioapic, 3);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid);
@@ -1400,8 +1527,8 @@ void __init print_IO_APIC(void)
 		struct IO_APIC_route_entry entry;
 
 		spin_lock_irqsave(&ioapic_lock, flags);
-		*(((int *)&entry)+0) = io_apic_read(apic, 0x10+i*2);
-		*(((int *)&entry)+1) = io_apic_read(apic, 0x11+i*2);
+		*(((int *)&entry)+0) = raw_io_apic_read(ioapic, 0x10+i*2);
+		*(((int *)&entry)+1) = raw_io_apic_read(ioapic, 0x11+i*2);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 
 		printk(KERN_DEBUG " %02x %03X %02X  ",
@@ -1447,7 +1574,7 @@ void __init print_IO_APIC(void)
 	return;
 }
 
-#if 0
+#if 1
 
 static void print_APIC_bitfield (int base)
 {
@@ -1594,9 +1721,7 @@ void /*__init*/ print_PIC(void)
 
 static void __init enable_IO_APIC(void)
 {
-	union IO_APIC_reg_01 reg_01;
 	int i;
-	unsigned long flags;
 
 	for (i = 0; i < PIN_MAP_SIZE; i++) {
 		irq_2_pin[i].pin = -1;
@@ -1607,16 +1732,6 @@ static void __init enable_IO_APIC(void)
 			pirq_entries[i] = -1;
 
 	/*
-	 * The number of IO-APIC IRQ registers (== #pins):
-	 */
-	for (i = 0; i < nr_ioapics; i++) {
-		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_01.raw = io_apic_read(i, 1);
-		spin_unlock_irqrestore(&ioapic_lock, flags);
-		nr_ioapic_registers[i] = reg_01.bits.entries+1;
-	}
-
-	/*
 	 * Do not trust the IO-APIC being empty at bootup
 	 */
 	clear_IO_APIC();
@@ -1659,8 +1774,7 @@ void disable_IO_APIC(void)
 		 * Add it to the IO-APIC irq-routing table:
 		 */
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(0, 0x11+2*pin, *(((int *)&entry)+1));
-		io_apic_write(0, 0x10+2*pin, *(((int *)&entry)+0));
+		io_apic_write(ioapic_data[0], 0x11+2*pin, *(((int *)&entry)+1));		io_apic_write(ioapic_data[0], 0x10+2*pin, *(((int *)&entry)+0));
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 	}
 	disconnect_bsp_APIC(pin != -1);
@@ -1682,6 +1796,7 @@ static void __init setup_ioapic_ids_from
 	int i;
 	unsigned char old_id;
 	unsigned long flags;
+	struct ioapic_data_struct *ioapic;
 
 	/*
 	 * Don't check I/O APIC IDs for xAPIC systems.  They have
@@ -1699,10 +1814,10 @@ static void __init setup_ioapic_ids_from
 	 * Set the IOAPIC ID to the value stored in the MPC table.
 	 */
 	for (apic = 0; apic < nr_ioapics; apic++) {
-
+		ioapic = ioapic_data[apic];
 		/* Read the register 0 value */
 		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic, 0);
+		reg_00.raw = io_apic_read(ioapic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		
 		old_id = mp_ioapics[apic].mpc_apicid;
@@ -1763,14 +1878,14 @@ static void __init setup_ioapic_ids_from
 
 		reg_00.bits.ID = mp_ioapics[apic].mpc_apicid;
 		spin_lock_irqsave(&ioapic_lock, flags);
-		io_apic_write(apic, 0, reg_00.raw);
+		io_apic_write(ioapic, 0, reg_00.raw);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 
 		/*
 		 * Sanity check
 		 */
 		spin_lock_irqsave(&ioapic_lock, flags);
-		reg_00.raw = io_apic_read(apic, 0);
+		reg_00.raw = io_apic_read(ioapic, 0);
 		spin_unlock_irqrestore(&ioapic_lock, flags);
 		if (reg_00.bits.ID != mp_ioapics[apic].mpc_apicid)
 			printk("could not set ID!\n");
@@ -1794,7 +1909,7 @@ static int __init timer_irq_works(void)
 {
 	unsigned long t1 = jiffies;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 	/* Let ten ticks pass... */
 	mdelay((10 * 1000) / HZ);
 
@@ -1805,7 +1920,7 @@ static int __init timer_irq_works(void)
 	 * might have cached one ExtINT interrupt.  Finally, at
 	 * least one tick may be lost due to delays.
 	 */
-	if (jiffies - t1 > 4)
+	if (jiffies - t1 > 4 && jiffies - t1 < 16)
 		return 1;
 
 	return 0;
@@ -1858,9 +1973,11 @@ static unsigned int startup_edge_ioapic_
 static void ack_edge_ioapic_irq(unsigned int irq)
 {
 	move_irq(irq);
+#if 0
 	if ((irq_desc[irq].status & (IRQ_PENDING | IRQ_DISABLED))
 					== (IRQ_PENDING | IRQ_DISABLED))
 		mask_IO_APIC_irq(irq);
+#endif
 	ack_APIC_irq();
 }
 
@@ -1885,6 +2002,30 @@ static unsigned int startup_level_ioapic
 	return 0; /* don't check for pending */
 }
 
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+/*
+ * in the PREEMPT_HARDIRQS case we don't want to keep the local
+ * APIC unacked, because that prevents further interrupts from
+ * being handled - and with IRQ threads being delayed arbitrarily,
+ * that's unacceptable. So we first mask the IRQ, then ack it.
+ * The hardirq thread will then unmask it.
+ */
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+	move_irq(irq);
+	mask_IO_APIC_irq(irq);
+	ack_APIC_irq();
+}
+
+#else
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+}
+
+#endif
+
 static void end_level_ioapic_irq (unsigned int irq)
 {
 	unsigned long v;
@@ -1919,8 +2060,10 @@ static void end_level_ioapic_irq (unsign
 	if (!(v & (1 << (i & 0x1f)))) {
 		atomic_inc(&irq_mis_count);
 		spin_lock(&ioapic_lock);
-		__mask_and_edge_IO_APIC_irq(irq);
-		__unmask_and_level_IO_APIC_irq(irq);
+		/* mask = 1, trigger = 0 */
+		__modify_IO_APIC_irq(irq, 0x00010000, 0x00008000);
+		/* mask = 0, trigger = 1 */
+		__modify_IO_APIC_irq(irq, 0x00008000, 0x00010000);
 		spin_unlock(&ioapic_lock);
 	}
 }
@@ -1948,6 +2091,13 @@ static unsigned int startup_level_ioapic
 	return startup_level_ioapic_irq (irq);
 }
 
+static void mask_and_ack_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	mask_and_ack_level_ioapic_irq(irq);
+}
+
 static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
@@ -2111,22 +2261,23 @@ static void setup_nmi (void)
  * cycles as some i82489DX-based boards have glue logic that keeps the
  * 8259A interrupt line asserted until INTA.  --macro
  */
-static inline void unlock_ExtINT_logic(void)
+static void __init unlock_ExtINT_logic(void)
 {
 	int pin, i;
 	struct IO_APIC_route_entry entry0, entry1;
 	unsigned char save_control, save_freq_select;
 	unsigned long flags;
+	struct ioapic_data_struct *ioapic0 = ioapic_data[0];
 
 	pin = find_isa_irq_pin(8, mp_INT);
 	if (pin == -1)
 		return;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	*(((int *)&entry0) + 1) = io_apic_read(0, 0x11 + 2 * pin);
-	*(((int *)&entry0) + 0) = io_apic_read(0, 0x10 + 2 * pin);
+	*(((int *)&entry0) + 1) = io_apic_read(ioapic0, 0x11 + 2 * pin);
+	*(((int *)&entry0) + 0) = io_apic_read(ioapic0, 0x10 + 2 * pin);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
-	clear_IO_APIC_pin(0, pin);
+	clear_IO_APIC_pin(ioapic0, pin);
 
 	memset(&entry1, 0, sizeof(entry1));
 
@@ -2139,8 +2290,8 @@ static inline void unlock_ExtINT_logic(v
 	entry1.vector = 0;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
-	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
+	io_apic_write(ioapic0, 0x11 + 2 * pin, *(((int *)&entry1) + 1));
+	io_apic_write(ioapic0, 0x10 + 2 * pin, *(((int *)&entry1) + 0));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	save_control = CMOS_READ(RTC_CONTROL);
@@ -2158,11 +2309,11 @@ static inline void unlock_ExtINT_logic(v
 
 	CMOS_WRITE(save_control, RTC_CONTROL);
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
-	clear_IO_APIC_pin(0, pin);
+	clear_IO_APIC_pin(ioapic0, pin);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	io_apic_write(0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
-	io_apic_write(0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
+	io_apic_write(ioapic0, 0x11 + 2 * pin, *(((int *)&entry0) + 1));
+	io_apic_write(ioapic0, 0x10 + 2 * pin, *(((int *)&entry0) + 0));
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
@@ -2172,10 +2323,11 @@ static inline void unlock_ExtINT_logic(v
  * is so screwy.  Thanks to Brian Perkins for testing/hacking this beast
  * fanatically on his truly buggy board.
  */
-static inline void check_timer(void)
+static void __init check_timer(void)
 {
 	int pin1, pin2;
 	int vector;
+	struct ioapic_data_struct *ioapic0 = ioapic_data[0];
 
 	/*
 	 * get/set the timer IRQ vector:
@@ -2193,7 +2345,10 @@ static inline void check_timer(void)
 	 */
 	apic_write_around(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
 	init_8259A(1);
-	timer_ack = 1;
+#ifdef CONFIG_PREEMPT_RT
+	if (nmi_watchdog)
+#endif
+		timer_ack = 1;
 	enable_8259A_irq(0);
 
 	pin1 = find_isa_irq_pin(0, mp_INT);
@@ -2216,7 +2371,7 @@ static inline void check_timer(void)
 				clear_IO_APIC_pin(0, pin1);
 			return;
 		}
-		clear_IO_APIC_pin(0, pin1);
+		clear_IO_APIC_pin(ioapic0, pin1);
 		printk(KERN_ERR "..MP-BIOS bug: 8254 timer not connected to IO-APIC\n");
 	}
 
@@ -2241,7 +2396,7 @@ static inline void check_timer(void)
 		/*
 		 * Cleanup, just in case ...
 		 */
-		clear_IO_APIC_pin(0, pin2);
+		clear_IO_APIC_pin(ioapic0, pin2);
 	}
 	printk(" failed.\n");
 
@@ -2282,6 +2437,46 @@ static inline void check_timer(void)
 		"report.  Then try booting with the 'noapic' option");
 }
 
+void __init setup_IO_APIC_early(int _ioapic)
+{
+	union IO_APIC_reg_01 reg_01;
+	unsigned long flags;
+	int size, nr_ioapic_registers;
+	volatile int *ioapic;
+	if (ioapic_data[_ioapic]) {
+		printk("been in %s before !!!!!\n", __FUNCTION__);
+		return;
+	}
+
+	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + _ioapic, mp_ioapics[_ioapic].mpc_apicaddr);
+	printk(KERN_DEBUG "mapped IOAPIC to %08lx (%08lx)\n",
+	       __fix_to_virt(FIX_IO_APIC_BASE_0 + _ioapic), mp_ioapics[_ioapic].mpc_apicaddr);
+	/*
+	 * The number of IO-APIC IRQ registers (== #pins):
+	 */
+	ioapic = IO_APIC_BASE(_ioapic);
+	spin_lock_irqsave(&ioapic_lock, flags);
+	ioapic[0] = 1;
+	reg_01.raw = ioapic[4];
+	spin_unlock_irqrestore(&ioapic_lock, flags);
+	nr_ioapic_registers = reg_01.bits.entries+1;
+
+	/*
+	 * Initialize ioapic_data struct:
+	 */
+	size = sizeof(struct ioapic_data_struct);
+#ifdef IOAPIC_CACHE
+	size += 0x10 * sizeof(u32) + nr_ioapic_registers * sizeof(struct IO_APIC_route_entry);
+#endif
+	ioapic_data[_ioapic] = alloc_bootmem(size);
+	memset(ioapic_data[_ioapic], 0, size);
+	ioapic_data[_ioapic]->nr_registers = nr_ioapic_registers;
+	ioapic_data[_ioapic]->base = ioapic;
+#ifdef IOAPIC_CACHE
+	ioapic_cache_init(ioapic_data[_ioapic]);
+#endif
+}
+
 /*
  *
  * IRQ's that are handled by the PIC in the MPS IOAPIC case.
@@ -2329,25 +2524,22 @@ static int __init io_apic_bug_finalize(v
 
 late_initcall(io_apic_bug_finalize);
 
-struct sysfs_ioapic_data {
-	struct sys_device dev;
-	struct IO_APIC_route_entry entry[0];
-};
-static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
-
 static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
 {
 	struct IO_APIC_route_entry *entry;
-	struct sysfs_ioapic_data *data;
+	struct ioapic_data_struct *data;
 	unsigned long flags;
 	int i;
+	struct ioapic_data_struct *ioapic;
 	
-	data = container_of(dev, struct sysfs_ioapic_data, dev);
+	data = container_of(dev, struct ioapic_data_struct, dev);
 	entry = data->entry;
+
+	ioapic = ioapic_data[dev->id];
 	spin_lock_irqsave(&ioapic_lock, flags);
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		*(((int *)entry) + 1) = io_apic_read(dev->id, 0x11 + 2 * i);
-		*(((int *)entry) + 0) = io_apic_read(dev->id, 0x10 + 2 * i);
+	for (i = 0; i < ioapic_data[dev->id]->nr_registers; i ++, entry ++) {
+		*(((int *)entry) + 1) = io_apic_read(ioapic, 0x11 + 2 * i);
+		*(((int *)entry) + 0) = io_apic_read(ioapic, 0x10 + 2 * i);
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
@@ -2357,23 +2549,25 @@ static int ioapic_suspend(struct sys_dev
 static int ioapic_resume(struct sys_device *dev)
 {
 	struct IO_APIC_route_entry *entry;
-	struct sysfs_ioapic_data *data;
+	struct ioapic_data_struct *data;
 	unsigned long flags;
 	union IO_APIC_reg_00 reg_00;
 	int i;
-	
-	data = container_of(dev, struct sysfs_ioapic_data, dev);
+	struct ioapic_data_struct *ioapic;
+
+	data = container_of(dev, struct ioapic_data_struct, dev);
 	entry = data->entry;
 
+	ioapic = ioapic_data[dev->id];
 	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_00.raw = io_apic_read(dev->id, 0);
+	reg_00.raw = io_apic_read(ioapic, 0);
 	if (reg_00.bits.ID != mp_ioapics[dev->id].mpc_apicid) {
 		reg_00.bits.ID = mp_ioapics[dev->id].mpc_apicid;
-		io_apic_write(dev->id, 0, reg_00.raw);
+		io_apic_write(ioapic, 0, reg_00.raw);
 	}
-	for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ ) {
-		io_apic_write(dev->id, 0x11+2*i, *(((int *)entry)+1));
-		io_apic_write(dev->id, 0x10+2*i, *(((int *)entry)+0));
+	for (i = 0; i < ioapic_data[dev->id]->nr_registers; i ++, entry ++) {
+		io_apic_write(ioapic, 0x11+2*i, *(((int *)entry)+1));
+		io_apic_write(ioapic, 0x10+2*i, *(((int *)entry)+0));
 	}
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
@@ -2396,21 +2590,20 @@ static int __init ioapic_init_sysfs(void
 		return error;
 
 	for (i = 0; i < nr_ioapics; i++ ) {
-		size = sizeof(struct sys_device) + nr_ioapic_registers[i] 
-			* sizeof(struct IO_APIC_route_entry);
-		mp_ioapic_data[i] = kmalloc(size, GFP_KERNEL);
-		if (!mp_ioapic_data[i]) {
+		size = ioapic_data[i]->nr_registers * sizeof(struct IO_APIC_route_entry);
+		ioapic_data[i]->entry = kmalloc(size, GFP_KERNEL);
+		if (!ioapic_data[i]->entry) {
 			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
 			continue;
 		}
-		memset(mp_ioapic_data[i], 0, size);
-		dev = &mp_ioapic_data[i]->dev;
+		memset(ioapic_data[i]->entry, 0, size);
+		dev = &ioapic_data[i]->dev;
 		dev->id = i; 
 		dev->cls = &ioapic_sysdev_class;
 		error = sysdev_register(dev);
 		if (error) {
-			kfree(mp_ioapic_data[i]);
-			mp_ioapic_data[i] = NULL;
+			kfree(ioapic_data[i]->entry);
+			ioapic_data[i]->entry = NULL;
 			printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
 			continue;
 		}
@@ -2427,13 +2620,14 @@ device_initcall(ioapic_init_sysfs);
 
 #ifdef CONFIG_ACPI
 
-int __init io_apic_get_unique_id (int ioapic, int apic_id)
+int __init io_apic_get_unique_id (int apic, int apic_id)
 {
 	union IO_APIC_reg_00 reg_00;
 	static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
 	physid_mask_t tmp;
 	unsigned long flags;
 	int i = 0;
+	struct ioapic_data_struct *ioapic = ioapic_data[apic];
 
 	/*
 	 * The P4 platform supports up to 256 APIC IDs on two separate APIC 
@@ -2453,7 +2647,7 @@ int __init io_apic_get_unique_id (int io
 
 	if (apic_id >= get_physical_broadcast()) {
 		printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
-			"%d\n", ioapic, apic_id, reg_00.bits.ID);
+			"%d\n", apic, apic_id, reg_00.bits.ID);
 		apic_id = reg_00.bits.ID;
 	}
 
@@ -2472,7 +2666,7 @@ int __init io_apic_get_unique_id (int io
 			panic("Max apic_id exceeded!\n");
 
 		printk(KERN_WARNING "IOAPIC[%d]: apic_id %d already used, "
-			"trying %d\n", ioapic, apic_id, i);
+			"trying %d\n", apic, apic_id, i);
 
 		apic_id = i;
 	} 
@@ -2490,50 +2684,50 @@ int __init io_apic_get_unique_id (int io
 
 		/* Sanity check */
 		if (reg_00.bits.ID != apic_id)
-			panic("IOAPIC[%d]: Unable change apic_id!\n", ioapic);
+			panic("IOAPIC[%d]: Unable change apic_id!\n", apic);
 	}
 
 	apic_printk(APIC_VERBOSE, KERN_INFO
-			"IOAPIC[%d]: Assigned apic_id %d\n", ioapic, apic_id);
+			"IOAPIC[%d]: Assigned apic_id %d\n", apic, apic_id);
 
 	return apic_id;
 }
 
 
-int __init io_apic_get_version (int ioapic)
+int __init io_apic_get_version (int apic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_01.raw = io_apic_read(ioapic, 1);
+	reg_01.raw = io_apic_read(ioapic_data[apic], 1);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return reg_01.bits.version;
 }
 
 
-int __init io_apic_get_redir_entries (int ioapic)
+int __init io_apic_get_redir_entries (int apic)
 {
 	union IO_APIC_reg_01	reg_01;
 	unsigned long flags;
 
 	spin_lock_irqsave(&ioapic_lock, flags);
-	reg_01.raw = io_apic_read(ioapic, 1);
+	reg_01.raw = io_apic_read(ioapic_data[apic], 1);
 	spin_unlock_irqrestore(&ioapic_lock, flags);
 
 	return reg_01.bits.entries;
 }
 
 
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int edge_level, int active_high_low)
+int io_apic_set_pci_routing (int apic, int pin, int irq, int edge_level, int active_high_low)
 {
 	struct IO_APIC_route_entry entry;
 	unsigned long flags;
-
+	struct ioapic_data_struct *ioapic = ioapic_data[apic];
 	if (!IO_APIC_IRQ(irq)) {
 		printk(KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-			ioapic);
+			apic);
 		return -EINVAL;
 	}
 
@@ -2556,18 +2750,18 @@ int io_apic_set_pci_routing (int ioapic,
 	 * IRQs < 16 are already in the irq_2_pin[] map
 	 */
 	if (irq >= 16)
-		add_pin_to_irq(irq, ioapic, pin);
+		add_pin_to_irq(irq, apic, pin);
 
 	entry.vector = assign_irq_vector(irq);
 
 	apic_printk(APIC_DEBUG, KERN_DEBUG "IOAPIC[%d]: Set PCI routing entry "
-		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", ioapic,
-		mp_ioapics[ioapic].mpc_apicid, pin, entry.vector, irq,
+		"(%d-%d -> 0x%x -> IRQ %d Mode:%i Active:%i)\n", apic,
+		mp_ioapics[apic].mpc_apicid, pin, entry.vector, irq,
 		edge_level, active_high_low);
 
 	ioapic_register_intr(irq, entry.vector, edge_level);
 
-	if (!ioapic && (irq < 16))
+	if (!apic && (irq < 16))
 		disable_8259A_irq(irq);
 
 	spin_lock_irqsave(&ioapic_lock, flags);
Index: linux/arch/i386/kernel/irq.c
===================================================================
--- linux.orig/arch/i386/kernel/irq.c
+++ linux/arch/i386/kernel/irq.c
@@ -51,7 +51,7 @@ static union irq_ctx *softirq_ctx[NR_CPU
  * SMP cross-CPU interrupts have their own specific
  * handlers).
  */
-fastcall unsigned int do_IRQ(struct pt_regs *regs)
+fastcall notrace unsigned int do_IRQ(struct pt_regs *regs)
 {	
 	/* high bits used in ret_from_ code */
 	int irq = regs->orig_eax & 0xff;
@@ -59,8 +59,12 @@ fastcall unsigned int do_IRQ(struct pt_r
 	union irq_ctx *curctx, *irqctx;
 	u32 *isp;
 #endif
-
 	irq_enter();
+#ifdef CONFIG_LATENCY_TRACE
+	if (irq == trace_user_trigger_irq)
+		user_trace_start();
+#endif
+	trace_special(regs->eip, irq, 0);
 #ifdef CONFIG_DEBUG_STACKOVERFLOW
 	/* Debugging check for stack overflow: is there less than 1KB free? */
 	{
@@ -69,7 +73,7 @@ fastcall unsigned int do_IRQ(struct pt_r
 		__asm__ __volatile__("andl %%esp,%0" :
 					"=r" (esp) : "0" (THREAD_SIZE - 1));
 		if (unlikely(esp < (sizeof(struct thread_info) + STACK_WARN))) {
-			printk("do_IRQ: stack overflow: %ld\n",
+			printk("BUG: do_IRQ: stack overflow: %ld\n",
 				esp - sizeof(struct thread_info));
 			dump_stack();
 		}
@@ -173,7 +177,7 @@ asmlinkage void do_softirq(void)
 	if (in_interrupt())
 		return;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (local_softirq_pending()) {
 		curctx = current_thread_info();
@@ -194,7 +198,7 @@ asmlinkage void do_softirq(void)
 		);
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(do_softirq);
@@ -224,8 +228,10 @@ int show_interrupts(struct seq_file *p, 
 	}
 
 	if (i < NR_IRQS) {
-		spin_lock_irqsave(&irq_desc[i].lock, flags);
-		action = irq_desc[i].action;
+		irq_desc_t *desc = irq_desc + i;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		action = desc->action;
 		if (!action)
 			goto skip;
 		seq_printf(p, "%3d: ",i);
@@ -235,15 +241,27 @@ int show_interrupts(struct seq_file *p, 
 		for_each_cpu(j)
 			seq_printf(p, "%10u ", kstat_cpu(j).irqs[i]);
 #endif
-		seq_printf(p, " %14s", irq_desc[i].handler->typename);
+		seq_printf(p, " %-14s", desc->handler->typename);
+#define F(x,c) ((desc->status & x) ? c : '.')
+		seq_printf(p, " [%c%c%c%c%c%c%c%c%c/",
+			F(IRQ_INPROGRESS,	'I'),
+			F(IRQ_DISABLED,		'D'),
+			F(IRQ_PENDING,		'P'),
+			F(IRQ_REPLAY,		'R'),
+			F(IRQ_AUTODETECT,	'A'),
+			F(IRQ_WAITING,		'W'),
+			F(IRQ_LEVEL,		'L'),
+			F(IRQ_MASKED,		'M'),
+			F(IRQ_NODELAY,		'N'));
+#undef F
+		seq_printf(p, "%3d]", desc->irqs_unhandled);
 		seq_printf(p, "  %s", action->name);
-
 		for (action=action->next; action; action = action->next)
 			seq_printf(p, ", %s", action->name);
 
 		seq_putc(p, '\n');
 skip:
-		spin_unlock_irqrestore(&irq_desc[i].lock, flags);
+		spin_unlock_irqrestore(&desc->lock, flags);
 	} else if (i == NR_IRQS) {
 		seq_printf(p, "NMI: ");
 		for_each_cpu(j)
@@ -298,9 +316,9 @@ void fixup_irqs(cpumask_t map)
 	barrier();
 #else
 	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
+	raw_local_irq_enable();
 	mdelay(1);
-	local_irq_disable();
+	raw_local_irq_disable();
 #endif
 }
 #endif
Index: linux/arch/i386/kernel/mca.c
===================================================================
--- linux.orig/arch/i386/kernel/mca.c
+++ linux/arch/i386/kernel/mca.c
@@ -472,3 +472,22 @@ void mca_handle_nmi(void)
 
 	mca_nmi_hook();
 } /* mca_handle_nmi */
+
+void mca_timer_ack(void *priv)
+{
+	int irq;
+
+	if (MCA_bus) {
+		/* The PS/2 uses level-triggered interrupts.  You can't
+		turn them off, nor would you want to (any attempt to
+		enable edge-triggered interrupts usually gets intercepted by a
+		special hardware circuit).  Hence we have to acknowledge
+		the timer interrupt.  Through some incredibly stupid
+		design idea, the reset for IRQ 0 is done by setting the
+		high bit of the PPI port B (0x61).  Note that some PS/2s,
+		notably the 55SX, work fine if this is removed.  */
+
+		irq = inb_p( 0x61 );	/* read the current state */
+		outb_p( irq|0x80, 0x61 );	/* reset the IRQ */
+	}
+}
Index: linux/arch/i386/kernel/mcount-wrapper.S
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/mcount-wrapper.S
@@ -0,0 +1,27 @@
+/*
+ *  linux/arch/i386/kernel/mcount-wrapper.S
+ *
+ *  Copyright (C) 2004 Ingo Molnar
+ */
+
+.globl mcount
+mcount:
+
+	cmpl $0, mcount_enabled
+	jz out
+
+	push %ebp
+	mov %esp, %ebp
+	pushl %eax
+	pushl %ecx
+	pushl %edx
+
+	call __mcount
+
+	popl %edx
+	popl %ecx
+	popl %eax
+	popl %ebp
+out:
+	ret
+
Index: linux/arch/i386/kernel/microcode.c
===================================================================
--- linux.orig/arch/i386/kernel/microcode.c
+++ linux/arch/i386/kernel/microcode.c
@@ -109,7 +109,7 @@ MODULE_LICENSE("GPL");
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
 /* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
+static DEFINE_RAW_SPINLOCK(microcode_update_lock);
 
 /* no concurrent ->write()s are allowed on /dev/cpu/microcode */
 static DECLARE_MUTEX(microcode_sem);
Index: linux/arch/i386/kernel/mpparse.c
===================================================================
--- linux.orig/arch/i386/kernel/mpparse.c
+++ linux/arch/i386/kernel/mpparse.c
@@ -271,6 +271,7 @@ static void __init MP_ioapic_info (struc
 		return;
 	}
 	mp_ioapics[nr_ioapics] = *m;
+	setup_IO_APIC_early(nr_ioapics);
 	nr_ioapics++;
 }
 
@@ -919,7 +920,7 @@ void __init mp_register_ioapic (
 	mp_ioapics[idx].mpc_flags = MPC_APIC_USABLE;
 	mp_ioapics[idx].mpc_apicaddr = address;
 
-	set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
+	setup_IO_APIC_early(idx);
 	if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) && (boot_cpu_data.x86 < 15))
 		mp_ioapics[idx].mpc_apicid = io_apic_get_unique_id(idx, id);
 	else
Index: linux/arch/i386/kernel/nmi.c
===================================================================
--- linux.orig/arch/i386/kernel/nmi.c
+++ linux/arch/i386/kernel/nmi.c
@@ -34,7 +34,7 @@
 
 unsigned int nmi_watchdog = NMI_NONE;
 extern int unknown_nmi_panic;
-static unsigned int nmi_hz = HZ;
+static unsigned int nmi_hz = 1000;
 static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
 static unsigned int nmi_p4_cccr_val;
 extern void show_registers(struct pt_regs *regs);
@@ -112,8 +112,8 @@ static int __init check_nmi_watchdog(voi
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		prev_nmi_count[cpu] = per_cpu(irq_stat, cpu).__nmi_count;
-	local_irq_enable();
-	mdelay((10*1000)/nmi_hz); // wait 10 ticks
+	raw_local_irq_enable();
+	mdelay((100*1000)/nmi_hz); // wait 100 ticks
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
 #ifdef CONFIG_SMP
@@ -134,7 +134,7 @@ static int __init check_nmi_watchdog(voi
 	/* now that we know it works we can reduce NMI frequency to
 	   something more reasonable; makes a difference in some configs */
 	if (nmi_watchdog == NMI_LOCAL_APIC)
-		nmi_hz = 1;
+		nmi_hz = 10000;
 
 	return 0;
 }
@@ -486,9 +486,34 @@ void touch_nmi_watchdog (void)
 
 extern void die_nmi(struct pt_regs *, const char *msg);
 
-void nmi_watchdog_tick (struct pt_regs * regs)
+int nmi_show_regs[NR_CPUS];
+
+void nmi_show_all_regs(void)
 {
+	int i;
+
+	if (nmi_watchdog == NMI_NONE)
+		return;
+	if (system_state != SYSTEM_RUNNING) {
+		printk("nmi_show_all_regs(): system state %d, not doing.\n",
+			system_state);
+		return;
+	}
+	printk("nmi_show_all_regs(): start at CPU#%d.\n",
+		raw_smp_processor_id());
+	dump_stack();
+
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+	for_each_online_cpu(i)
+		while (nmi_show_regs[i] == 1)
+			barrier();
+}
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
 
+void notrace nmi_watchdog_tick (struct pt_regs * regs)
+{
 	/*
 	 * Since current_thread_info()-> is always on the stack, and we
 	 * always switch the stack NMI-atomically, it's safe to use
@@ -496,7 +521,16 @@ void nmi_watchdog_tick (struct pt_regs *
 	 */
 	int sum, cpu = smp_processor_id();
 
-	sum = per_cpu(irq_stat, cpu).apic_timer_irqs;
+	sum = per_cpu(irq_stat, cpu).apic_timer_irqs + kstat_irqs(0);
+
+	profile_tick(CPU_PROFILING, regs);
+	if (nmi_show_regs[cpu]) {
+		nmi_show_regs[cpu] = 0;
+		spin_lock(&nmi_print_lock);
+		printk("NMI show regs on CPU#%d:\n", cpu);
+		show_regs(regs);
+		spin_unlock(&nmi_print_lock);
+	}
 
 	if (last_irq_sums[cpu] == sum) {
 		/*
@@ -504,12 +538,25 @@ void nmi_watchdog_tick (struct pt_regs *
 		 * wait a few IRQs (5 seconds) before doing the oops ...
 		 */
 		alert_counter[cpu]++;
-		if (alert_counter[cpu] == 5*nmi_hz)
-			/*
-			 * die_nmi will return ONLY if NOTIFY_STOP happens..
-			 */
-			die_nmi(regs, "NMI Watchdog detected LOCKUP");
+		if (alert_counter[cpu] && !(alert_counter[cpu] % (5*nmi_hz))) {
+			int i;
+
+			bust_spinlocks(1);
+			spin_lock(&nmi_print_lock);
+			printk("NMI watchdog detected lockup on CPU#%d (%d/%d)\n", cpu, alert_counter[cpu], 5*nmi_hz);
+			show_regs(regs);
+			spin_unlock(&nmi_print_lock);
+
+			for_each_online_cpu(i)
+				if (i != cpu)
+					nmi_show_regs[i] = 1;
+			for_each_online_cpu(i)
+				while (nmi_show_regs[i] == 1)
+					barrier();
 
+			die_nmi(regs, "NMI Watchdog detected LOCKUP");
+		}
+	} else {
 		last_irq_sums[cpu] = sum;
 		alert_counter[cpu] = 0;
 	}
Index: linux/arch/i386/kernel/process.c
===================================================================
--- linux.orig/arch/i386/kernel/process.c
+++ linux/arch/i386/kernel/process.c
@@ -39,6 +39,7 @@
 #include <linux/ptrace.h>
 #include <linux/random.h>
 #include <linux/kprobes.h>
+#include <linux/spinlock.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -64,6 +65,12 @@ static int hlt_counter;
 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
 
+DEFINE_SPINLOCK(pm_idle_switch_lock);
+EXPORT_SYMBOL_GPL(pm_idle_switch_lock);
+
+int pm_idle_locked = 0;
+EXPORT_SYMBOL_GPL(pm_idle_locked);
+
 /*
  * Return saved PC of a blocked thread.
  */
@@ -100,12 +107,13 @@ EXPORT_SYMBOL(enable_hlt);
 void default_idle(void)
 {
 	if (!hlt_counter && boot_cpu_data.hlt_works_ok) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
+		raw_local_irq_disable();
+		if (!need_resched() && !need_resched_delayed())
+			raw_safe_halt();
 		else
-			local_irq_enable();
+			raw_local_irq_enable();
 	} else {
+		raw_local_irq_enable();
 		cpu_relax();
 	}
 }
@@ -118,11 +126,11 @@ EXPORT_SYMBOL(default_idle);
  * to poll the ->work.need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
-static void poll_idle (void)
+void poll_idle (void)
 {
 	int oldval;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	/*
 	 * Deal with another CPU just having chosen a thread to
@@ -137,7 +145,7 @@ static void poll_idle (void)
 			"testl %0, %1;"
 			"rep; nop;"
 			"je 2b;"
-			: : "i"(_TIF_NEED_RESCHED), "m" (current_thread_info()->flags));
+			: : "i"(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED), "m" (current_thread_info()->flags));
 
 		clear_thread_flag(TIF_POLLING_NRFLAG);
 	} else {
@@ -160,7 +168,7 @@ static inline void play_dead(void)
 	/*
 	 * With physical CPU hotplug, we should halt the cpu
 	 */
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1)
 		halt();
 }
@@ -183,7 +191,9 @@ void cpu_idle(void)
 
 	/* endless idle loop with no priority at all */
 	while (1) {
-		while (!need_resched()) {
+		BUG_ON(raw_irqs_disabled());
+
+		while (!need_resched() && !need_resched_delayed()) {
 			void (*idle)(void);
 
 			if (__get_cpu_var(cpu_idle_state))
@@ -199,9 +209,13 @@ void cpu_idle(void)
 				play_dead();
 
 			__get_cpu_var(irq_stat).idle_timestamp = jiffies;
+			stop_critical_timing();
+			propagate_preempt_locks_value();
 			idle();
 		}
-		schedule();
+		raw_local_irq_disable();
+		__schedule();
+		raw_local_irq_enable();
 	}
 }
 
@@ -242,16 +256,16 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 static void mwait_idle(void)
 {
-	local_irq_enable();
+	raw_local_irq_enable();
 
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		set_thread_flag(TIF_POLLING_NRFLAG);
 		do {
 			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			if (need_resched())
+			if (need_resched() || need_resched_delayed())
 				break;
 			__mwait(0, 0);
-		} while (!need_resched());
+		} while (!need_resched() && !need_resched_delayed());
 		clear_thread_flag(TIF_POLLING_NRFLAG);
 	}
 }
@@ -378,11 +392,16 @@ void exit_thread(void)
 
 	/* The process may have allocated an io port bitmap... nuke it. */
 	if (unlikely(NULL != t->io_bitmap_ptr)) {
-		int cpu = get_cpu();
-		struct tss_struct *tss = &per_cpu(init_tss, cpu);
+		int cpu;
+		struct tss_struct *tss;
+		void *io_bitmap_ptr = t->io_bitmap_ptr;
 
-		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
+		mb();
+		kfree(io_bitmap_ptr);
+
+		cpu = get_cpu();
+		tss = &per_cpu(init_tss, cpu);
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
Index: linux/arch/i386/kernel/reboot.c
===================================================================
--- linux.orig/arch/i386/kernel/reboot.c
+++ linux/arch/i386/kernel/reboot.c
@@ -194,7 +194,7 @@ void machine_real_restart(unsigned char 
 {
 	unsigned long flags;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Write zero to CMOS register number 0x0f, which the BIOS POST
 	   routine will recognize as telling it to do a proper reboot.  (Well
Index: linux/arch/i386/kernel/semaphore.c
===================================================================
--- linux.orig/arch/i386/kernel/semaphore.c
+++ linux/arch/i386/kernel/semaphore.c
@@ -13,6 +13,7 @@
  * rw semaphores implemented November 1999 by Benjamin LaHaise <bcrl@kvack.org>
  */
 #include <linux/config.h>
+#include <linux/module.h>
 #include <asm/semaphore.h>
 
 /*
@@ -28,15 +29,15 @@
 asm(
 ".section .sched.text\n"
 ".align 4\n"
-".globl __down_failed\n"
-"__down_failed:\n\t"
+".globl __compat_down_failed\n"
+"__compat_down_failed:\n\t"
 #if defined(CONFIG_FRAME_POINTER)
 	"pushl %ebp\n\t"
 	"movl  %esp,%ebp\n\t"
 #endif
 	"pushl %edx\n\t"
 	"pushl %ecx\n\t"
-	"call __down\n\t"
+	"call __compat_down\n\t"
 	"popl %ecx\n\t"
 	"popl %edx\n\t"
 #if defined(CONFIG_FRAME_POINTER)
@@ -49,15 +50,15 @@ asm(
 asm(
 ".section .sched.text\n"
 ".align 4\n"
-".globl __down_failed_interruptible\n"
-"__down_failed_interruptible:\n\t"
+".globl __compat_down_failed_interruptible\n"
+"__compat_down_failed_interruptible:\n\t"
 #if defined(CONFIG_FRAME_POINTER)
 	"pushl %ebp\n\t"
 	"movl  %esp,%ebp\n\t"
 #endif
 	"pushl %edx\n\t"
 	"pushl %ecx\n\t"
-	"call __down_interruptible\n\t"
+	"call __compat_down_interruptible\n\t"
 	"popl %ecx\n\t"
 	"popl %edx\n\t"
 #if defined(CONFIG_FRAME_POINTER)
@@ -70,15 +71,15 @@ asm(
 asm(
 ".section .sched.text\n"
 ".align 4\n"
-".globl __down_failed_trylock\n"
-"__down_failed_trylock:\n\t"
+".globl __compat_down_failed_trylock\n"
+"__compat_down_failed_trylock:\n\t"
 #if defined(CONFIG_FRAME_POINTER)
 	"pushl %ebp\n\t"
 	"movl  %esp,%ebp\n\t"
 #endif
 	"pushl %edx\n\t"
 	"pushl %ecx\n\t"
-	"call __down_trylock\n\t"
+	"call __compat_down_trylock\n\t"
 	"popl %ecx\n\t"
 	"popl %edx\n\t"
 #if defined(CONFIG_FRAME_POINTER)
@@ -91,45 +92,13 @@ asm(
 asm(
 ".section .sched.text\n"
 ".align 4\n"
-".globl __up_wakeup\n"
-"__up_wakeup:\n\t"
+".globl __compat_up_wakeup\n"
+"__compat_up_wakeup:\n\t"
 	"pushl %edx\n\t"
 	"pushl %ecx\n\t"
-	"call __up\n\t"
+	"call __compat_up\n\t"
 	"popl %ecx\n\t"
 	"popl %edx\n\t"
 	"ret"
 );
 
-/*
- * rw spinlock fallbacks
- */
-#if defined(CONFIG_SMP)
-asm(
-".section .sched.text\n"
-".align	4\n"
-".globl	__write_lock_failed\n"
-"__write_lock_failed:\n\t"
-	LOCK "addl	$" RW_LOCK_BIAS_STR ",(%eax)\n"
-"1:	rep; nop\n\t"
-	"cmpl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
-	"jne	1b\n\t"
-	LOCK "subl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
-	"jnz	__write_lock_failed\n\t"
-	"ret"
-);
-
-asm(
-".section .sched.text\n"
-".align	4\n"
-".globl	__read_lock_failed\n"
-"__read_lock_failed:\n\t"
-	LOCK "incl	(%eax)\n"
-"1:	rep; nop\n\t"
-	"cmpl	$1,(%eax)\n\t"
-	"js	1b\n\t"
-	LOCK "decl	(%eax)\n\t"
-	"js	__read_lock_failed\n\t"
-	"ret"
-);
-#endif
Index: linux/arch/i386/kernel/setup.c
===================================================================
--- linux.orig/arch/i386/kernel/setup.c
+++ linux/arch/i386/kernel/setup.c
@@ -1612,6 +1612,7 @@ void __init setup_arch(char **cmdline_p)
 	conswitchp = &dummy_con;
 #endif
 #endif
+	tsc_init();
 }
 
 #include "setup_arch_post.h"
Index: linux/arch/i386/kernel/signal.c
===================================================================
--- linux.orig/arch/i386/kernel/signal.c
+++ linux/arch/i386/kernel/signal.c
@@ -604,6 +604,13 @@ int fastcall do_signal(struct pt_regs *r
 	int signr;
 	struct k_sigaction ka;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
Index: linux/arch/i386/kernel/smp.c
===================================================================
--- linux.orig/arch/i386/kernel/smp.c
+++ linux/arch/i386/kernel/smp.c
@@ -163,7 +163,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu
 	unsigned long cfg;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	WARN_ON(mask & ~cpus_addr(cpu_online_map)[0]);
 	/*
 	 * Wait for idle.
@@ -186,7 +186,7 @@ void send_IPI_mask_bitmask(cpumask_t cpu
 	 */
 	apic_write_around(APIC_ICR, cfg);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void send_IPI_mask_sequence(cpumask_t mask, int vector)
@@ -200,7 +200,7 @@ void send_IPI_mask_sequence(cpumask_t ma
 	 * should be modified to do 1 message per cluster ID - mbligh
 	 */ 
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for (query_cpu = 0; query_cpu < NR_CPUS; ++query_cpu) {
 		if (cpu_isset(query_cpu, mask)) {
@@ -227,7 +227,7 @@ void send_IPI_mask_sequence(cpumask_t ma
 			apic_write_around(APIC_ICR, cfg);
 		}
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #include <mach_ipi.h> /* must come after the send_IPI functions above for inlining */
@@ -245,7 +245,7 @@ void send_IPI_mask_sequence(cpumask_t ma
 static cpumask_t flush_cpumask;
 static struct mm_struct * flush_mm;
 static unsigned long flush_va;
-static DEFINE_SPINLOCK(tlbstate_lock);
+static DEFINE_RAW_SPINLOCK(tlbstate_lock);
 #define FLUSH_ALL	0xffffffff
 
 /*
@@ -390,7 +390,7 @@ static void flush_tlb_others(cpumask_t c
 
 	while (!cpus_empty(flush_cpumask))
 		/* nothing. lockup detection does not belong here */
-		mb();
+		cpu_relax();
 
 	flush_mm = NULL;
 	flush_va = 0;
@@ -481,10 +481,20 @@ void smp_send_reschedule(int cpu)
 }
 
 /*
+ * this function sends a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them:
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
+/*
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 struct call_data_struct {
 	void (*func) (void *info);
@@ -538,7 +548,7 @@ int smp_call_function (void (*func) (voi
 	}
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	WARN_ON(raw_irqs_disabled());
 
 	data.func = func;
 	data.info = info;
@@ -572,7 +582,7 @@ static void stop_this_cpu (void * dummy)
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_disable();
+	raw_local_irq_disable();
 	disable_local_APIC();
 	if (cpu_data[smp_processor_id()].hlt_works_ok)
 		for(;;) halt();
@@ -587,19 +597,20 @@ void smp_send_stop(void)
 {
 	smp_call_function(stop_this_cpu, NULL, 1, 0);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	disable_local_APIC();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
- * Reschedule call back. Nothing to do,
- * all the work is done automatically when
- * we return from the interrupt.
+ * Reschedule call back. Trigger a reschedule pass so that
+ * RT-overload balancing can pass tasks around.
  */
-fastcall void smp_reschedule_interrupt(struct pt_regs *regs)
+fastcall notrace void smp_reschedule_interrupt(struct pt_regs *regs)
 {
+	trace_special(regs->eip, 0, 0);
 	ack_APIC_irq();
+	set_tsk_need_resched(current);
 }
 
 fastcall void smp_call_function_interrupt(struct pt_regs *regs)
Index: linux/arch/i386/kernel/smpboot.c
===================================================================
--- linux.orig/arch/i386/kernel/smpboot.c
+++ linux/arch/i386/kernel/smpboot.c
@@ -208,142 +208,299 @@ valid_k7:
 	;
 }
 
-/*
- * TSC synchronization.
- *
- * We first check whether all CPUs have their TSC's synchronized,
- * then we print a warning if not, and always resync.
- */
+static atomic_t tsc_start_flag, tsc_check_start, tsc_check_stop;
 
-static atomic_t tsc_start_flag = ATOMIC_INIT(0);
-static atomic_t tsc_count_start = ATOMIC_INIT(0);
-static atomic_t tsc_count_stop = ATOMIC_INIT(0);
-static unsigned long long tsc_values[NR_CPUS];
-
-#define NR_LOOPS 5
-
-static void __init synchronize_tsc_bp (void)
+static int __init check_tsc_warp(void)
 {
-	int i;
-	unsigned long long t0;
-	unsigned long long sum, avg;
-	long long delta;
-	unsigned int one_usec;
-	int buggy = 0;
-
-	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus());
-
-	/* convert from kcyc/sec to cyc/usec */
-	one_usec = cpu_khz / 1000;
+	static DEFINE_RAW_SPINLOCK(warp_lock);
+	static long long prev;
+	static unsigned int error;
 
-	atomic_set(&tsc_start_flag, 1);
-	wmb();
+	int cpus = num_booting_cpus(), nr = 0;
+	long long start, now, end, delta;
 
+	atomic_inc(&tsc_check_start);
+	while (atomic_read(&tsc_check_start) != cpus)
+		cpu_relax();
 	/*
-	 * We loop a few times to get a primed instruction cache,
-	 * then the last pass is more or less synchronized and
-	 * the BP and APs set their cycle counters to zero all at
-	 * once. This reduces the chance of having random offsets
-	 * between the processors, and guarantees that the maximum
-	 * delay between the cycle counters is never bigger than
-	 * the latency of information-passing (cachelines) between
-	 * two CPUs.
+	 * Run the check for 500 msecs:
 	 */
-	for (i = 0; i < NR_LOOPS; i++) {
-		/*
-		 * all APs synchronize but they loop on '== num_cpus'
-		 */
-		while (atomic_read(&tsc_count_start) != num_booting_cpus()-1)
-			mb();
-		atomic_set(&tsc_count_stop, 0);
-		wmb();
-		/*
-		 * this lets the APs save their current TSC:
-		 */
-		atomic_inc(&tsc_count_start);
+	rdtscll(start);
+	end = start + cpu_khz*500;
 
-		rdtscll(tsc_values[smp_processor_id()]);
+	for (;;) {
 		/*
-		 * We clear the TSC in the last loop:
+		 * Check for the TSC going backwards (between CPUs):
 		 */
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
+		spin_lock(&warp_lock);
+		rdtscll(now);
+		delta = now - prev;
+		prev = now;
+		spin_unlock(&warp_lock);
+		if (unlikely(delta < 0))
+			error = 1;
 
+		if (now > end)
+			break;
 		/*
-		 * Wait for all APs to leave the synchronization point:
+		 * Take it easy every couple of iterations,
+		 * to not starve other CPUs:
 		 */
-		while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1)
-			mb();
-		atomic_set(&tsc_count_start, 0);
-		wmb();
-		atomic_inc(&tsc_count_stop);
+		nr++;
+		if (!(nr % 31))
+			cpu_relax();
 	}
 
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (cpu_isset(i, cpu_callout_map)) {
-			t0 = tsc_values[i];
-			sum += t0;
-		}
-	}
-	avg = sum;
-	do_div(avg, num_booting_cpus());
+	atomic_inc(&tsc_check_stop);
+	while (atomic_read(&tsc_check_stop) != cpus)
+		cpu_relax();
 
-	sum = 0;
-	for (i = 0; i < NR_CPUS; i++) {
-		if (!cpu_isset(i, cpu_callout_map))
-			continue;
-		delta = tsc_values[i] - avg;
-		if (delta < 0)
-			delta = -delta;
-		/*
-		 * We report bigger than 2 microseconds clock differences.
-		 */
-		if (delta > 2*one_usec) {
-			long realdelta;
-			if (!buggy) {
-				buggy = 1;
-				printk("\n");
-			}
-			realdelta = delta;
-			do_div(realdelta, one_usec);
-			if (tsc_values[i] < avg)
-				realdelta = -realdelta;
+	return error;
+}
 
-			printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta);
-		}
+/*
+ * TSC synchronization based on ia64 itc synchronization code.  Synchronize
+ * pairs of processors rather than trying to synchronize all of the processors
+ * with a single event.  When several processors are all waiting for an
+ * event they don't all see it at the same time.  The write will cause
+ * an invalidate on each processor's cache and then they all scramble to
+ * re-read that cache line.
+ *
+ * Writing the TSC resets the upper 32-bits, so we need to be careful
+ * that all of the cpus can be synchronized before we overflow the
+ * 32-bit count.
+ */
 
-		sum += delta;
+#define MASTER	0
+#define SLAVE	(SMP_CACHE_BYTES/sizeof(long))
+
+#define NUM_ROUNDS	64	/* magic value */
+#define NUM_ITERS	5	/* likewise */
+
+static volatile unsigned long go[2*SLAVE] __cacheline_aligned;
+static volatile int current_slave = -1;
+static volatile int tsc_sync_complete = 0;
+static volatile int tsc_adj_latency = 0;
+static unsigned int max_rt = 0;
+static unsigned int max_delta = 0;
+
+#define DEBUG_TSC_SYNC	0
+#if DEBUG_TSC_SYNC
+struct tsc_sync_debug {
+	long rt;	/* roundtrip time */
+	long master;	/* master's timestamp */
+	long diff;	/* difference between midpoint and master's timestamp */
+	long lat;	/* estimate of tsc adjustment latency */
+} tsc_sync_debug[NUM_ROUNDS*NR_CPUS];
+#endif
+
+void
+sync_master(void)
+{
+	unsigned long  n, tsc, last_go_master;
+
+	last_go_master = 0;
+	while (1) {
+		while ((n = go[MASTER]) == last_go_master)
+			rep_nop();
+		if (n == ~0)
+			break;
+		rdtscl(tsc);
+		if (unlikely(!tsc))
+			tsc = 1;
+		go[SLAVE] = tsc;
+		last_go_master = n;
 	}
-	if (!buggy)
-		printk("passed.\n");
 }
 
-static void __init synchronize_tsc_ap (void)
+/*
+ * Return the number of cycles by which our TSC differs from the TSC on
+ * the master (time-keeper) CPU.  A positive number indicates our TSC is
+ * ahead of the master, negative that it is behind.
+ */
+static inline long
+get_delta (long *rt, long *master)
 {
-	int i;
+	unsigned long best_t0 = 0, best_t1 = ~0UL, best_tm = 0;
+	unsigned long tcenter, t0, t1, tm, last_go_slave;
+	long i;
+
+	last_go_slave = go[SLAVE];
+	for (i = 0; i < NUM_ITERS; ++i) {
+		rdtscl(t0);
+		go[MASTER] = i+1;
+		while ((tm = go[SLAVE]) == last_go_slave)
+			rep_nop();
+		rdtscl(t1);
+
+		if (t1 - t0 < best_t1 - best_t0)
+			best_t0 = t0, best_t1 = t1, best_tm = tm;
+		last_go_slave = tm;
+	}
+
+	*rt = best_t1 - best_t0;
+	*master = best_tm - best_t0;
+
+	/* average best_t0 and best_t1 without overflow: */
+	tcenter = (best_t0/2 + best_t1/2);
+	if (best_t0 % 2 + best_t1 % 2 == 2)
+		++tcenter;
+	return tcenter - best_tm;
+}
+
+/*
+ * Synchronize TSC of the current (slave) CPU with the TSC of the MASTER CPU
+ * (normally the time-keeper CPU).  We use a closed loop to eliminate the
+ * possibility of unaccounted-for errors (such as getting a machine check in
+ * the middle of a calibration step).  The basic idea is for the slave to ask
+ * the master what TSC value it has and to read its own TSC before and after
+ * the master responds.  Each iteration gives us three
+ * timestamps:
+ *
+ *	slave		master
+ *
+ *	t0 ---\
+ *             ---\
+ *		   --->
+ *			tm
+ *		   /---
+ *	       /---
+ *	t1 <---
+ *
+ *
+ * The goal is to adjust the slave's TSC such that tm falls exactly half-way
+ * between t0 and t1.  If we achieve this, the clocks are synchronized provided
+ * the interconnect between the slave and the master is symmetric.  Even if the
+ * interconnect were asymmetric, we would still know that the synchronization
+ * error is smaller than the roundtrip latency (t1 - t0).
+ *
+ * When the interconnect is quiet and symmetric, this lets us synchronize the
+ * TSC to within one or two cycles.  However, we can only *guarantee* that the
+ * synchronization is accurate to within a round-trip time, which is typically
+ * in the range of several hundred cycles (e.g., ~500 cycles).  In practice,
+ * this means that the TSC's are usually almost perfectly synchronized, but we
+ * shouldn't assume that the accuracy is much better than half a microsecond
+ * or so.
+ */
+
+static void __init
+synchronize_tsc_ap (void)
+{
+	long i, delta, adj, adjust_latency, n_rounds;
+	unsigned long rt, master_time_stamp,  tsc;
+#if DEBUG_TSC_SYNC
+	struct tsc_sync_debug *t =
+		 &tsc_sync_debug[smp_processor_id() * NUM_ROUNDS];
+#endif
+
+	while (!atomic_read(&tsc_start_flag))
+		mb();
+
+	if (!check_tsc_warp())
+		return;
 
 	/*
-	 * Not every cpu is online at the time
-	 * this gets called, so we first wait for the BP to
-	 * finish SMP initialization:
+	 * Wait for our turn to synchronize with the boot processor.
 	 */
-	while (!atomic_read(&tsc_start_flag)) mb();
+	while (current_slave != smp_processor_id())
+		rep_nop();
+	adjust_latency = tsc_adj_latency;
+
+	go[SLAVE] = 0;
+	go[MASTER] = 0;
+	write_tsc(0,0);
+	for (i = 0; i < NUM_ROUNDS; ++i) {
+		delta = get_delta(&rt, &master_time_stamp);
+		if (delta == 0)
+			break;
+
+		if (i > 0)
+			adjust_latency += -delta;
+		adj = -delta + adjust_latency/8;
+		rdtscl(tsc);
+		write_tsc(tsc + adj, 0);
+#if DEBUG_TSC_SYNC
+		t[i].rt = rt;
+		t[i].master = master_time_stamp;
+		t[i].diff = delta;
+		t[i].lat = adjust_latency/8;
+#endif
+	}
+	n_rounds = i;
+	go[MASTER] = ~0;
+
+#if (DEBUG_TSC_SYNC == 2)
+	for (i = 0; i < n_rounds; ++i)
+		printk("rt=%5ld master=%5ld diff=%5ld adjlat=%5ld\n",
+		       t[i].rt, t[i].master, t[i].diff, t[i].lat);
+
+	printk("CPU %d: synchronized TSC (last diff %ld cycles, maxerr %lu cycles)\n",
+	       smp_processor_id(), delta, rt);
+
+	printk("It took %ld rounds\n", n_rounds);
+#endif
+	if (rt > max_rt)
+		max_rt = rt;
+	if (delta < 0)
+		delta = -delta;
+	if (delta > max_delta)
+		max_delta = delta;
+	tsc_adj_latency = adjust_latency;
+	current_slave = -1;
+	while (!tsc_sync_complete)
+		rep_nop();
+}
+
+/*
+ * The boot processor sets its own TSC to zero and then gives each
+ * slave processor the chance to synchronize itself.
+ */
 
-	for (i = 0; i < NR_LOOPS; i++) {
-		atomic_inc(&tsc_count_start);
-		while (atomic_read(&tsc_count_start) != num_booting_cpus())
-			mb();
+static void __init synchronize_tsc_bp (void)
+{
+	unsigned int tsc_low, tsc_high, error;
+	int cpu;
+
+	atomic_set(&tsc_start_flag, 1);
 
-		rdtscll(tsc_values[smp_processor_id()]);
-		if (i == NR_LOOPS-1)
-			write_tsc(0, 0);
+	printk(KERN_INFO "checking TSC synchronization across %u CPUs: ",
+		num_booting_cpus());
 
-		atomic_inc(&tsc_count_stop);
-		while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb();
+	if (!check_tsc_warp()) {
+		printk("passed.\n");
+		return;
+	}
+	printk("failed.\n");
+
+	printk(KERN_INFO "starting TSC synchronization\n");
+	write_tsc(0, 0);
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (!cpu_isset(cpu, cpu_callout_map))
+			continue;
+		if (cpu == smp_processor_id())
+			continue;
+		go[MASTER] = 0;
+		current_slave = cpu;
+		sync_master();
+		while (current_slave != -1)
+			rep_nop();
+	}
+	rdtsc(tsc_low, tsc_high);
+	if (tsc_high)
+		printk("TSC overflowed during synchronization\n");
+	else
+		printk("TSC synchronization complete max_delta=%d cycles\n",
+			max_delta);
+	if (max_rt < 4293) {
+		error = (max_rt * 1000000)/cpu_khz;
+		printk("TSC sync round-trip time %d.%03d microseconds\n",
+			error/1000, error%1000);
+	} else {
+		printk("TSC sync round-trip time %d cycles\n", max_rt);
 	}
+	tsc_sync_complete = 1;
 }
-#undef NR_LOOPS
 
 extern void calibrate_delay(void);
 
@@ -517,7 +674,7 @@ static void __devinit start_secondary(vo
 	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
 
 	/* We can take interrupts now: we're officially "up". */
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	wmb();
 	cpu_idle();
@@ -1305,9 +1462,9 @@ int __cpu_disable(void)
 	/* We enable the timer again on the exit path of the death loop */
 	disable_APIC_timer();
 	/* Allow any queued timer interrupts to get serviced */
-	local_irq_enable();
+	raw_local_irq_enable();
 	mdelay(1);
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	remove_siblinginfo(cpu);
 
@@ -1351,11 +1508,11 @@ int __devinit __cpu_up(unsigned int cpu)
 	/* In case one didn't come up */
 	if (!cpu_isset(cpu, cpu_callin_map)) {
 		printk(KERN_DEBUG "skipping cpu%d, didn't come online\n", cpu);
-		local_irq_enable();
+		raw_local_irq_enable();
 		return -EIO;
 	}
 
-	local_irq_enable();
+	raw_local_irq_enable();
 	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
 	/* Unleash the CPU! */
 	cpu_set(cpu, smp_commenced_mask);
Index: linux/arch/i386/kernel/switch2poll.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/switch2poll.c
@@ -0,0 +1,5 @@
+/*
+ * Same type of hack used for early_printk.  This keeps the code
+ * in one place.
+ */
+#include "../../x86_64/kernel/switch2poll.c"
Index: linux/arch/i386/kernel/time.c
===================================================================
--- linux.orig/arch/i386/kernel/time.c
+++ linux/arch/i386/kernel/time.c
@@ -46,6 +46,7 @@
 #include <linux/bcd.h>
 #include <linux/efi.h>
 #include <linux/mca.h>
+#include <linux/clockchips.h>
 
 #include <asm/io.h>
 #include <asm/smp.h>
@@ -56,6 +57,7 @@
 #include <asm/uaccess.h>
 #include <asm/processor.h>
 #include <asm/timer.h>
+#include <asm/timeofday.h>
 
 #include "mach_time.h"
 
@@ -74,25 +76,14 @@ int pit_latch_buggy;              /* ext
 
 #include "do_timer.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 unsigned int cpu_khz;	/* Detected as we calibrate the TSC */
 EXPORT_SYMBOL(cpu_khz);
 
 extern unsigned long wall_jiffies;
 
-DEFINE_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(rtc_lock);
 EXPORT_SYMBOL(rtc_lock);
 
-#include <asm/i8253.h>
-
-DEFINE_SPINLOCK(i8253_lock);
-EXPORT_SYMBOL(i8253_lock);
-
-struct timer_opts *cur_timer __read_mostly = &timer_none;
-
 /*
  * This is a special lock that is owned by the CPU and holds the index
  * register we are working with.  It is required for NMI access to the
@@ -122,118 +113,25 @@ void rtc_cmos_write(unsigned char val, u
 }
 EXPORT_SYMBOL(rtc_cmos_write);
 
-/*
- * This version of gettimeofday has microsecond resolution
- * and better than microsecond precision on fast x86 machines with TSC.
- */
-void do_gettimeofday(struct timeval *tv)
-{
-	unsigned long seq;
-	unsigned long usec, sec;
-	unsigned long max_ntp_tick;
-
-	do {
-		unsigned long lost;
-
-		seq = read_seqbegin(&xtime_lock);
-
-		usec = cur_timer->get_offset();
-		lost = jiffies - wall_jiffies;
-
-		/*
-		 * If time_adjust is negative then NTP is slowing the clock
-		 * so make sure not to go into next possible interval.
-		 * Better to lose some accuracy than have time go backwards..
-		 */
-		if (unlikely(time_adjust < 0)) {
-			max_ntp_tick = (USEC_PER_SEC / HZ) - tickadj;
-			usec = min(usec, max_ntp_tick);
-
-			if (lost)
-				usec += lost * max_ntp_tick;
-		}
-		else if (unlikely(lost))
-			usec += lost * (USEC_PER_SEC / HZ);
-
-		sec = xtime.tv_sec;
-		usec += (xtime.tv_nsec / 1000);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	while (usec >= 1000000) {
-		usec -= 1000000;
-		sec++;
-	}
-
-	tv->tv_sec = sec;
-	tv->tv_usec = usec;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-	/*
-	 * This is revolting. We need to set "xtime" correctly. However, the
-	 * value in this location is the value at the most recent update of
-	 * wall time.  Discover what correction gettimeofday() would have
-	 * made, and then undo it!
-	 */
-	nsec -= cur_timer->get_offset() * NSEC_PER_USEC;
-	nsec -= (jiffies - wall_jiffies) * TICK_NSEC;
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-	return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
 static int set_rtc_mmss(unsigned long nowtime)
 {
 	int retval;
-
-	WARN_ON(irqs_disabled());
+	unsigned long flags;
 
 	/* gets recalled with irq locally disabled */
-	spin_lock_irq(&rtc_lock);
+	/* XXX - does irqsave resolve this? -johnstul */
+	spin_lock_irqsave(&rtc_lock, flags);
 	if (efi_enabled)
 		retval = efi_set_rtc_mmss(nowtime);
 	else
 		retval = mach_set_rtc_mmss(nowtime);
-	spin_unlock_irq(&rtc_lock);
+	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	return retval;
 }
 
-
-int timer_ack;
-
-/* monotonic_clock(): returns # of nanoseconds passed since time_init()
- *		Note: This function is required to return accurate
- *		time even in the absence of multiple timer ticks.
- */
-unsigned long long monotonic_clock(void)
-{
-	return cur_timer->monotonic_clock();
-}
-EXPORT_SYMBOL(monotonic_clock);
-
 #if defined(CONFIG_SMP) && defined(CONFIG_FRAME_POINTER)
-unsigned long profile_pc(struct pt_regs *regs)
+unsigned long notrace profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
@@ -245,70 +143,6 @@ unsigned long profile_pc(struct pt_regs 
 EXPORT_SYMBOL(profile_pc);
 #endif
 
-/*
- * timer_interrupt() needs to keep up the real-time clock,
- * as well as call the "do_timer()" routine every clocktick
- */
-static inline void do_timer_interrupt(int irq, struct pt_regs *regs)
-{
-#ifdef CONFIG_X86_IO_APIC
-	if (timer_ack) {
-		/*
-		 * Subtle, when I/O APICs are used we have to ack timer IRQ
-		 * manually to reset the IRR bit for do_slow_gettimeoffset().
-		 * This will also deassert NMI lines for the watchdog if run
-		 * on an 82489DX-based system.
-		 */
-		spin_lock(&i8259A_lock);
-		outb(0x0c, PIC_MASTER_OCW3);
-		/* Ack the IRQ; AEOI will end it automatically. */
-		inb(PIC_MASTER_POLL);
-		spin_unlock(&i8259A_lock);
-	}
-#endif
-
-	do_timer_interrupt_hook(regs);
-
-
-	if (MCA_bus) {
-		/* The PS/2 uses level-triggered interrupts.  You can't
-		turn them off, nor would you want to (any attempt to
-		enable edge-triggered interrupts usually gets intercepted by a
-		special hardware circuit).  Hence we have to acknowledge
-		the timer interrupt.  Through some incredibly stupid
-		design idea, the reset for IRQ 0 is done by setting the
-		high bit of the PPI port B (0x61).  Note that some PS/2s,
-		notably the 55SX, work fine if this is removed.  */
-
-		irq = inb_p( 0x61 );	/* read the current state */
-		outb_p( irq|0x80, 0x61 );	/* reset the IRQ */
-	}
-}
-
-/*
- * This is the same as the above, except we _also_ save the current
- * Time Stamp Counter value at the time of the timer interrupt, so that
- * we later on can estimate the time of day more exactly.
- */
-irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
-{
-	/*
-	 * Here we are in the timer irq handler. We just have irqs locally
-	 * disabled but we don't know if the timer_bh is running on the other
-	 * CPU. We need to avoid to SMP race with it. NOTE: we don' t need
-	 * the irq version of write_lock because as just said we have irq
-	 * locally disabled. -arca
-	 */
-	write_seqlock(&xtime_lock);
-
-	cur_timer->mark_offset();
- 
-	do_timer_interrupt(irq, regs);
-
-	write_sequnlock(&xtime_lock);
-	return IRQ_HANDLED;
-}
-
 /* not static: needed by APM */
 unsigned long get_cmos_time(void)
 {
@@ -327,139 +161,42 @@ unsigned long get_cmos_time(void)
 }
 EXPORT_SYMBOL(get_cmos_time);
 
-static void sync_cmos_clock(unsigned long dummy);
-
-static DEFINE_TIMER(sync_cmos_timer, sync_cmos_clock, 0, 0);
-
-static void sync_cmos_clock(unsigned long dummy)
+/* arch specific timeofday hooks, backed here by the CMOS clock (get_cmos_time/set_rtc_mmss) */
+nsec_t read_persistent_clock(void)
 {
-	struct timeval now, next;
-	int fail = 1;
+	return (nsec_t)get_cmos_time() * NSEC_PER_SEC; /* scale the CMOS reading by NSEC_PER_SEC */
+}
 
+void sync_persistent_clock(struct timespec ts)
+{
+	static unsigned long last_rtc_update; /* ts.tv_sec of the last RTC write (backdated on failure) */
 	/*
 	 * If we have an externally synchronized Linux clock, then update
 	 * CMOS clock accordingly every ~11 minutes. Set_rtc_mmss() has to be
 	 * called as close as possible to 500 ms before the new second starts.
-	 * This code is run on a timer.  If the clock is set, that timer
-	 * may not expire at the correct time.  Thus, we adjust...
 	 */
-	if (!ntp_synced())
-		/*
-		 * Not synced, exit, do not restart a timer (if one is
-		 * running, let it run out).
-		 */
+	if (ts.tv_sec <= last_rtc_update + 660)
 		return;
 
-	do_gettimeofday(&now);
-	if (now.tv_usec >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
-	    now.tv_usec <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2)
-		fail = set_rtc_mmss(now.tv_sec);
-
-	next.tv_usec = USEC_AFTER - now.tv_usec;
-	if (next.tv_usec <= 0)
-		next.tv_usec += USEC_PER_SEC;
-
-	if (!fail)
-		next.tv_sec = 659;
-	else
-		next.tv_sec = 0;
-
-	if (next.tv_usec >= USEC_PER_SEC) {
-		next.tv_sec++;
-		next.tv_usec -= USEC_PER_SEC;
+	if((ts.tv_nsec / 1000) >= USEC_AFTER - ((unsigned) TICK_SIZE) / 2 &&
+		(ts.tv_nsec / 1000) <= USEC_BEFORE + ((unsigned) TICK_SIZE) / 2) {
+		/* horrible...FIXME */
+		if (set_rtc_mmss(ts.tv_sec) == 0)
+			last_rtc_update = ts.tv_sec;
+		else
+			last_rtc_update = ts.tv_sec - 600; /* write failed: backdate so the 660 s gate above reopens in 60 s */
 	}
-	mod_timer(&sync_cmos_timer, jiffies + timeval_to_jiffies(&next));
-}
-
-void notify_arch_cmos_timer(void)
-{
-	mod_timer(&sync_cmos_timer, jiffies + 1);
 }
 
-static long clock_cmos_diff, sleep_start;
-
-static struct timer_opts *last_timer;
-static int timer_suspend(struct sys_device *dev, pm_message_t state)
-{
-	/*
-	 * Estimate time zone so that set_time can update the clock
-	 */
-	clock_cmos_diff = -get_cmos_time();
-	clock_cmos_diff += get_seconds();
-	sleep_start = get_cmos_time();
-	last_timer = cur_timer;
-	cur_timer = &timer_none;
-	if (last_timer->suspend)
-		last_timer->suspend(state);
-	return 0;
-}
-
-static int timer_resume(struct sys_device *dev)
-{
-	unsigned long flags;
-	unsigned long sec;
-	unsigned long sleep_length;
-
-#ifdef CONFIG_HPET_TIMER
-	if (is_hpet_enabled())
-		hpet_reenable();
-#endif
-	setup_pit_timer();
-	sec = get_cmos_time() + clock_cmos_diff;
-	sleep_length = (get_cmos_time() - sleep_start) * HZ;
-	write_seqlock_irqsave(&xtime_lock, flags);
-	xtime.tv_sec = sec;
-	xtime.tv_nsec = 0;
-	write_sequnlock_irqrestore(&xtime_lock, flags);
-	jiffies += sleep_length;
-	wall_jiffies += sleep_length;
-	if (last_timer->resume)
-		last_timer->resume();
-	cur_timer = last_timer;
-	last_timer = NULL;
-	touch_softlockup_watchdog();
-	return 0;
-}
-
-static struct sysdev_class timer_sysclass = {
-	.resume = timer_resume,
-	.suspend = timer_suspend,
-	set_kset_name("timer"),
-};
-
-
-/* XXX this driverfs stuff should probably go elsewhere later -john */
-static struct sys_device device_timer = {
-	.id	= 0,
-	.cls	= &timer_sysclass,
-};
-
-static int time_init_device(void)
-{
-	int error = sysdev_class_register(&timer_sysclass);
-	if (!error)
-		error = sysdev_register(&device_timer);
-	return error;
-}
-
-device_initcall(time_init_device);
-
 #ifdef CONFIG_HPET_TIMER
 extern void (*late_time_init)(void);
 /* Duplicate of time_init() below, with hpet_enable part added */
 static void __init hpet_time_init(void)
 {
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
-
 	if ((hpet_enable() >= 0) && hpet_use_timer) {
 		printk("Using HPET for base-timer\n");
 	}
 
-	cur_timer = select_timer();
-	printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
 
 	time_init_hook();
 }
@@ -467,6 +204,9 @@ static void __init hpet_time_init(void)
 
 void __init time_init(void)
 {
+	/* Set the clock to HZ Hz: */
+	setup_pit_timer();
+
 #ifdef CONFIG_HPET_TIMER
 	if (is_hpet_capable()) {
 		/*
@@ -477,13 +217,5 @@ void __init time_init(void)
 		return;
 	}
 #endif
-	xtime.tv_sec = get_cmos_time();
-	xtime.tv_nsec = (INITIAL_JIFFIES % HZ) * (NSEC_PER_SEC / HZ);
-	set_normalized_timespec(&wall_to_monotonic,
-		-xtime.tv_sec, -xtime.tv_nsec);
-
-	cur_timer = select_timer();
-	printk(KERN_INFO "Using %s for high-res timesource\n",cur_timer->name);
-
 	time_init_hook();
 }
Index: linux/arch/i386/kernel/time_hpet.c
===================================================================
--- linux.orig/arch/i386/kernel/time_hpet.c
+++ linux/arch/i386/kernel/time_hpet.c
@@ -302,11 +302,11 @@ int hpet_rtc_timer_init(void)
 	else
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	cnt = hpet_readl(HPET_COUNTER);
 	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
 	hpet_writel(cnt, HPET_T1_CMP);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
 	cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
Index: linux/arch/i386/kernel/timers/Makefile
===================================================================
--- linux.orig/arch/i386/kernel/timers/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for x86 timers
-#
-
-obj-y := timer.o timer_none.o timer_tsc.o timer_pit.o common.o
-
-obj-$(CONFIG_X86_CYCLONE_TIMER)	+= timer_cyclone.o
-obj-$(CONFIG_HPET_TIMER)	+= timer_hpet.o
-obj-$(CONFIG_X86_PM_TIMER)	+= timer_pm.o
Index: linux/arch/i386/kernel/timers/common.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/common.c
+++ /dev/null
@@ -1,172 +0,0 @@
-/*
- *	Common functions used across the timers go here
- */
-
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/jiffies.h>
-#include <linux/module.h>
-
-#include <asm/io.h>
-#include <asm/timer.h>
-#include <asm/hpet.h>
-
-#include "mach_timer.h"
-
-/* ------ Calibrate the TSC -------
- * Return 2^32 * (1 / (TSC clocks per usec)) for do_fast_gettimeoffset().
- * Too much 64-bit arithmetic here to do this cleanly in C, and for
- * accuracy's sake we want to keep the overhead on the CTC speaker (channel 2)
- * output busy loop as low as possible. We avoid reading the CTC registers
- * directly because of the awkward 8-bit access mechanism of the 82C54
- * device.
- */
-
-#define CALIBRATE_TIME	(5 * 1000020/HZ)
-
-unsigned long calibrate_tsc(void)
-{
-	mach_prepare_counter();
-
-	{
-		unsigned long startlow, starthigh;
-		unsigned long endlow, endhigh;
-		unsigned long count;
-
-		rdtsc(startlow,starthigh);
-		mach_countup(&count);
-		rdtsc(endlow,endhigh);
-
-
-		/* Error: ECTCNEVERSET */
-		if (count <= 1)
-			goto bad_ctc;
-
-		/* 64-bit subtract - gcc just messes up with long longs */
-		__asm__("subl %2,%0\n\t"
-			"sbbl %3,%1"
-			:"=a" (endlow), "=d" (endhigh)
-			:"g" (startlow), "g" (starthigh),
-			 "0" (endlow), "1" (endhigh));
-
-		/* Error: ECPUTOOFAST */
-		if (endhigh)
-			goto bad_ctc;
-
-		/* Error: ECPUTOOSLOW */
-		if (endlow <= CALIBRATE_TIME)
-			goto bad_ctc;
-
-		__asm__("divl %2"
-			:"=a" (endlow), "=d" (endhigh)
-			:"r" (endlow), "0" (0), "1" (CALIBRATE_TIME));
-
-		return endlow;
-	}
-
-	/*
-	 * The CTC wasn't reliable: we got a hit on the very first read,
-	 * or the CPU was so fast/slow that the quotient wouldn't fit in
-	 * 32 bits..
-	 */
-bad_ctc:
-	return 0;
-}
-
-#ifdef CONFIG_HPET_TIMER
-/* ------ Calibrate the TSC using HPET -------
- * Return 2^32 * (1 / (TSC clocks per usec)) for getting the CPU freq.
- * Second output is parameter 1 (when non NULL)
- * Set 2^32 * (1 / (tsc per HPET clk)) for delay_hpet().
- * calibrate_tsc() calibrates the processor TSC by comparing
- * it to the HPET timer of known frequency.
- * Too much 64-bit arithmetic here to do this cleanly in C
- */
-#define CALIBRATE_CNT_HPET 	(5 * hpet_tick)
-#define CALIBRATE_TIME_HPET 	(5 * KERNEL_TICK_USEC)
-
-unsigned long __devinit calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr)
-{
-	unsigned long tsc_startlow, tsc_starthigh;
-	unsigned long tsc_endlow, tsc_endhigh;
-	unsigned long hpet_start, hpet_end;
-	unsigned long result, remain;
-
-	hpet_start = hpet_readl(HPET_COUNTER);
-	rdtsc(tsc_startlow, tsc_starthigh);
-	do {
-		hpet_end = hpet_readl(HPET_COUNTER);
-	} while ((hpet_end - hpet_start) < CALIBRATE_CNT_HPET);
-	rdtsc(tsc_endlow, tsc_endhigh);
-
-	/* 64-bit subtract - gcc just messes up with long longs */
-	__asm__("subl %2,%0\n\t"
-		"sbbl %3,%1"
-		:"=a" (tsc_endlow), "=d" (tsc_endhigh)
-		:"g" (tsc_startlow), "g" (tsc_starthigh),
-		 "0" (tsc_endlow), "1" (tsc_endhigh));
-
-	/* Error: ECPUTOOFAST */
-	if (tsc_endhigh)
-		goto bad_calibration;
-
-	/* Error: ECPUTOOSLOW */
-	if (tsc_endlow <= CALIBRATE_TIME_HPET)
-		goto bad_calibration;
-
-	ASM_DIV64_REG(result, remain, tsc_endlow, 0, CALIBRATE_TIME_HPET);
-	if (remain > (tsc_endlow >> 1))
-		result++; /* rounding the result */
-
-	if (tsc_hpet_quotient_ptr) {
-		unsigned long tsc_hpet_quotient;
-
-		ASM_DIV64_REG(tsc_hpet_quotient, remain, tsc_endlow, 0,
-			CALIBRATE_CNT_HPET);
-		if (remain > (tsc_endlow >> 1))
-			tsc_hpet_quotient++; /* rounding the result */
-		*tsc_hpet_quotient_ptr = tsc_hpet_quotient;
-	}
-
-	return result;
-bad_calibration:
-	/*
-	 * the CPU was so fast/slow that the quotient wouldn't fit in
-	 * 32 bits..
-	 */
-	return 0;
-}
-#endif
-
-
-unsigned long read_timer_tsc(void)
-{
-	unsigned long retval;
-	rdtscl(retval);
-	return retval;
-}
-
-
-/* calculate cpu_khz */
-void init_cpu_khz(void)
-{
-	if (cpu_has_tsc) {
-		unsigned long tsc_quotient = calibrate_tsc();
-		if (tsc_quotient) {
-			/* report CPU clock rate in Hz.
-			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
-			 * clock/second. Our precision is about 100 ppm.
-			 */
-			{	unsigned long eax=0, edx=1000;
-				__asm__("divl %2"
-		       		:"=a" (cpu_khz), "=d" (edx)
-        	       		:"r" (tsc_quotient),
-	                	"0" (eax), "1" (edx));
-				printk("Detected %u.%03u MHz processor.\n",
-					cpu_khz / 1000, cpu_khz % 1000);
-			}
-		}
-	}
-}
-
Index: linux/arch/i386/kernel/timers/timer.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer.c
+++ /dev/null
@@ -1,75 +0,0 @@
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/string.h>
-#include <asm/timer.h>
-
-#ifdef CONFIG_HPET_TIMER
-/*
- * HPET memory read is slower than tsc reads, but is more dependable as it
- * always runs at constant frequency and reduces complexity due to
- * cpufreq. So, we prefer HPET timer to tsc based one. Also, we cannot use
- * timer_pit when HPET is active. So, we default to timer_tsc.
- */
-#endif
-/* list of timers, ordered by preference, NULL terminated */
-static struct init_timer_opts* __initdata timers[] = {
-#ifdef CONFIG_X86_CYCLONE_TIMER
-	&timer_cyclone_init,
-#endif
-#ifdef CONFIG_HPET_TIMER
-	&timer_hpet_init,
-#endif
-#ifdef CONFIG_X86_PM_TIMER
-	&timer_pmtmr_init,
-#endif
-	&timer_tsc_init,
-	&timer_pit_init,
-	NULL,
-};
-
-static char clock_override[10] __initdata;
-
-static int __init clock_setup(char* str)
-{
-	if (str)
-		strlcpy(clock_override, str, sizeof(clock_override));
-	return 1;
-}
-__setup("clock=", clock_setup);
-
-
-/* The chosen timesource has been found to be bad.
- * Fall back to a known good timesource (the PIT)
- */
-void clock_fallback(void)
-{
-	cur_timer = &timer_pit;
-}
-
-/* iterates through the list of timers, returning the first 
- * one that initializes successfully.
- */
-struct timer_opts* __init select_timer(void)
-{
-	int i = 0;
-	
-	/* find most preferred working timer */
-	while (timers[i]) {
-		if (timers[i]->init)
-			if (timers[i]->init(clock_override) == 0)
-				return timers[i]->opts;
-		++i;
-	}
-		
-	panic("select_timer: Cannot find a suitable timer\n");
-	return NULL;
-}
-
-int read_current_timer(unsigned long *timer_val)
-{
-	if (cur_timer->read_timer) {
-		*timer_val = cur_timer->read_timer();
-		return 0;
-	}
-	return -1;
-}
Index: linux/arch/i386/kernel/timers/timer_cyclone.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer_cyclone.c
+++ /dev/null
@@ -1,259 +0,0 @@
-/*	Cyclone-timer: 
- *		This code implements timer_ops for the cyclone counter found
- *		on IBM x440, x360, and other Summit based systems.
- *
- *	Copyright (C) 2002 IBM, John Stultz (johnstul@us.ibm.com)
- */
-
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/pgtable.h>
-#include <asm/fixmap.h>
-#include <asm/i8253.h>
-
-#include "io_ports.h"
-
-/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
-
-#define CYCLONE_CBAR_ADDR 0xFEB00CD0
-#define CYCLONE_PMCC_OFFSET 0x51A0
-#define CYCLONE_MPMC_OFFSET 0x51D0
-#define CYCLONE_MPCS_OFFSET 0x51A8
-#define CYCLONE_TIMER_FREQ 100000000
-#define CYCLONE_TIMER_MASK (((u64)1<<40)-1) /* 40 bit mask */
-int use_cyclone = 0;
-
-static u32* volatile cyclone_timer;	/* Cyclone MPMC0 register */
-static u32 last_cyclone_low;
-static u32 last_cyclone_high;
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-/* helper macro to atomically read both cyclone counter registers */
-#define read_cyclone_counter(low,high) \
-	do{ \
-		high = cyclone_timer[1]; low = cyclone_timer[0]; \
-	} while (high != cyclone_timer[1]);
-
-
-static void mark_offset_cyclone(void)
-{
-	unsigned long lost, delay;
-	unsigned long delta = last_cyclone_low;
-	int count;
-	unsigned long long this_offset, last_offset;
-
-	write_seqlock(&monotonic_lock);
-	last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
-	
-	spin_lock(&i8253_lock);
-	read_cyclone_counter(last_cyclone_low,last_cyclone_high);
-
-	/* read values for delay_at_last_interrupt */
-	outb_p(0x00, 0x43);     /* latch the count ASAP */
-
-	count = inb_p(0x40);    /* read the latched count */
-	count |= inb(0x40) << 8;
-
-	/*
-	 * VIA686a test code... reset the latch if count > max + 1
-	 * from timer_pit.c - cjb
-	 */
-	if (count > LATCH) {
-		outb_p(0x34, PIT_MODE);
-		outb_p(LATCH & 0xff, PIT_CH0);
-		outb(LATCH >> 8, PIT_CH0);
-		count = LATCH - 1;
-	}
-	spin_unlock(&i8253_lock);
-
-	/* lost tick compensation */
-	delta = last_cyclone_low - delta;	
-	delta /= (CYCLONE_TIMER_FREQ/1000000);
-	delta += delay_at_last_interrupt;
-	lost = delta/(1000000/HZ);
-	delay = delta%(1000000/HZ);
-	if (lost >= 2)
-		jiffies_64 += lost-1;
-	
-	/* update the monotonic base value */
-	this_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
-	monotonic_base += (this_offset - last_offset) & CYCLONE_TIMER_MASK;
-	write_sequnlock(&monotonic_lock);
-
-	/* calculate delay_at_last_interrupt */
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	delay_at_last_interrupt = (count + LATCH/2) / LATCH;
-
-
-	/* catch corner case where tick rollover occured 
-	 * between cyclone and pit reads (as noted when 
-	 * usec delta is > 90% # of usecs/tick)
-	 */
-	if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
-		jiffies_64++;
-}
-
-static unsigned long get_offset_cyclone(void)
-{
-	u32 offset;
-
-	if(!cyclone_timer)
-		return delay_at_last_interrupt;
-
-	/* Read the cyclone timer */
-	offset = cyclone_timer[0];
-
-	/* .. relative to previous jiffy */
-	offset = offset - last_cyclone_low;
-
-	/* convert cyclone ticks to microseconds */	
-	/* XXX slow, can we speed this up? */
-	offset = offset/(CYCLONE_TIMER_FREQ/1000000);
-
-	/* our adjusted time offset in microseconds */
-	return delay_at_last_interrupt + offset;
-}
-
-static unsigned long long monotonic_clock_cyclone(void)
-{
-	u32 now_low, now_high;
-	unsigned long long last_offset, this_offset, base;
-	unsigned long long ret;
-	unsigned seq;
-
-	/* atomically read monotonic base & last_offset */
-	do {
-		seq = read_seqbegin(&monotonic_lock);
-		last_offset = ((unsigned long long)last_cyclone_high<<32)|last_cyclone_low;
-		base = monotonic_base;
-	} while (read_seqretry(&monotonic_lock, seq));
-
-
-	/* Read the cyclone counter */
-	read_cyclone_counter(now_low,now_high);
-	this_offset = ((unsigned long long)now_high<<32)|now_low;
-
-	/* convert to nanoseconds */
-	ret = base + ((this_offset - last_offset)&CYCLONE_TIMER_MASK);
-	return ret * (1000000000 / CYCLONE_TIMER_FREQ);
-}
-
-static int __init init_cyclone(char* override)
-{
-	u32* reg;	
-	u32 base;		/* saved cyclone base address */
-	u32 pageaddr;	/* page that contains cyclone_timer register */
-	u32 offset;		/* offset from pageaddr to cyclone_timer register */
-	int i;
-	
-	/* check clock override */
-	if (override[0] && strncmp(override,"cyclone",7))
-			return -ENODEV;
-
-	/*make sure we're on a summit box*/
-	if(!use_cyclone) return -ENODEV; 
-	
-	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
-
-	/* find base address */
-	pageaddr = (CYCLONE_CBAR_ADDR)&PAGE_MASK;
-	offset = (CYCLONE_CBAR_ADDR)&(~PAGE_MASK);
-	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
-	reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
-		return -ENODEV;
-	}
-	base = *reg;	
-	if(!base){
-		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
-		return -ENODEV;
-	}
-	
-	/* setup PMCC */
-	pageaddr = (base + CYCLONE_PMCC_OFFSET)&PAGE_MASK;
-	offset = (base + CYCLONE_PMCC_OFFSET)&(~PAGE_MASK);
-	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
-	reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
-		return -ENODEV;
-	}
-	reg[0] = 0x00000001;
-
-	/* setup MPCS */
-	pageaddr = (base + CYCLONE_MPCS_OFFSET)&PAGE_MASK;
-	offset = (base + CYCLONE_MPCS_OFFSET)&(~PAGE_MASK);
-	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
-	reg = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
-	if(!reg){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
-		return -ENODEV;
-	}
-	reg[0] = 0x00000001;
-
-	/* map in cyclone_timer */
-	pageaddr = (base + CYCLONE_MPMC_OFFSET)&PAGE_MASK;
-	offset = (base + CYCLONE_MPMC_OFFSET)&(~PAGE_MASK);
-	set_fixmap_nocache(FIX_CYCLONE_TIMER, pageaddr);
-	cyclone_timer = (u32*)(fix_to_virt(FIX_CYCLONE_TIMER) + offset);
-	if(!cyclone_timer){
-		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
-		return -ENODEV;
-	}
-
-	/*quick test to make sure its ticking*/
-	for(i=0; i<3; i++){
-		u32 old = cyclone_timer[0];
-		int stall = 100;
-		while(stall--) barrier();
-		if(cyclone_timer[0] == old){
-			printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
-			cyclone_timer = 0;
-			return -ENODEV;
-		}
-	}
-
-	init_cpu_khz();
-
-	/* Everything looks good! */
-	return 0;
-}
-
-
-static void delay_cyclone(unsigned long loops)
-{
-	unsigned long bclock, now;
-	if(!cyclone_timer)
-		return;
-	bclock = cyclone_timer[0];
-	do {
-		rep_nop();
-		now = cyclone_timer[0];
-	} while ((now-bclock) < loops);
-}
-/************************************************************/
-
-/* cyclone timer_opts struct */
-static struct timer_opts timer_cyclone = {
-	.name = "cyclone",
-	.mark_offset = mark_offset_cyclone, 
-	.get_offset = get_offset_cyclone,
-	.monotonic_clock =	monotonic_clock_cyclone,
-	.delay = delay_cyclone,
-};
-
-struct init_timer_opts __initdata timer_cyclone_init = {
-	.init = init_cyclone,
-	.opts = &timer_cyclone,
-};
Index: linux/arch/i386/kernel/timers/timer_hpet.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer_hpet.c
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-#include <asm/processor.h>
-
-#include "io_ports.h"
-#include "mach_timer.h"
-#include <asm/hpet.h>
-
-static unsigned long hpet_usec_quotient __read_mostly;	/* convert hpet clks to usec */
-static unsigned long tsc_hpet_quotient __read_mostly;	/* convert tsc to hpet clks */
-static unsigned long hpet_last; 	/* hpet counter value at last tick*/
-static unsigned long last_tsc_low;	/* lsb 32 bits of Time Stamp Counter */
-static unsigned long last_tsc_high; 	/* msb 32 bits of Time Stamp Counter */
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-/* convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_mhz * 10^6))
- *		ns = cycles * (10^3 / cpu_mhz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^3 * SC / cpu_mhz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-static unsigned long cyc2ns_scale;
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
-{
-	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
-static unsigned long long monotonic_clock_hpet(void)
-{
-	unsigned long long last_offset, this_offset, base;
-	unsigned seq;
-
-	/* atomically read monotonic base & last_offset */
-	do {
-		seq = read_seqbegin(&monotonic_lock);
-		last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-		base = monotonic_base;
-	} while (read_seqretry(&monotonic_lock, seq));
-
-	/* Read the Time Stamp Counter */
-	rdtscll(this_offset);
-
-	/* return the value in ns */
-	return base + cycles_2_ns(this_offset - last_offset);
-}
-
-static unsigned long get_offset_hpet(void)
-{
-	register unsigned long eax, edx;
-
-	eax = hpet_readl(HPET_COUNTER);
-	eax -= hpet_last;	/* hpet delta */
-	eax = min(hpet_tick, eax);
-	/*
-         * Time offset = (hpet delta) * ( usecs per HPET clock )
-	 *             = (hpet delta) * ( usecs per tick / HPET clocks per tick)
-	 *             = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
-	 *
-	 * Where,
-	 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
-	 *
-	 * Using a mull instead of a divl saves some cycles in critical path.
-         */
-	ASM_MUL64_REG(eax, edx, hpet_usec_quotient, eax);
-
-	/* our adjusted time offset in microseconds */
-	return edx;
-}
-
-static void mark_offset_hpet(void)
-{
-	unsigned long long this_offset, last_offset;
-	unsigned long offset;
-
-	write_seqlock(&monotonic_lock);
-	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-	rdtsc(last_tsc_low, last_tsc_high);
-
-	if (hpet_use_timer)
-		offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
-	else
-		offset = hpet_readl(HPET_COUNTER);
-	if (unlikely(((offset - hpet_last) >= (2*hpet_tick)) && (hpet_last != 0))) {
-		int lost_ticks = ((offset - hpet_last) / hpet_tick) - 1;
-		jiffies_64 += lost_ticks;
-	}
-	hpet_last = offset;
-
-	/* update the monotonic base value */
-	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-	monotonic_base += cycles_2_ns(this_offset - last_offset);
-	write_sequnlock(&monotonic_lock);
-}
-
-static void delay_hpet(unsigned long loops)
-{
-	unsigned long hpet_start, hpet_end;
-	unsigned long eax;
-
-	/* loops is the number of cpu cycles. Convert it to hpet clocks */
-	ASM_MUL64_REG(eax, loops, tsc_hpet_quotient, loops);
-
-	hpet_start = hpet_readl(HPET_COUNTER);
-	do {
-		rep_nop();
-		hpet_end = hpet_readl(HPET_COUNTER);
-	} while ((hpet_end - hpet_start) < (loops));
-}
-
-static struct timer_opts timer_hpet;
-
-static int __init init_hpet(char* override)
-{
-	unsigned long result, remain;
-
-	/* check clock override */
-	if (override[0] && strncmp(override,"hpet",4))
-		return -ENODEV;
-
-	if (!is_hpet_enabled())
-		return -ENODEV;
-
-	printk("Using HPET for gettimeofday\n");
-	if (cpu_has_tsc) {
-		unsigned long tsc_quotient = calibrate_tsc_hpet(&tsc_hpet_quotient);
-		if (tsc_quotient) {
-			/* report CPU clock rate in Hz.
-			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
-			 * clock/second. Our precision is about 100 ppm.
-			 */
-			{	unsigned long eax=0, edx=1000;
-				ASM_DIV64_REG(cpu_khz, edx, tsc_quotient,
-						eax, edx);
-				printk("Detected %u.%03u MHz processor.\n",
-					cpu_khz / 1000, cpu_khz % 1000);
-			}
-			set_cyc2ns_scale(cpu_khz/1000);
-		}
-		/* set this only when cpu_has_tsc */
-		timer_hpet.read_timer = read_timer_tsc;
-	}
-
-	/*
-	 * Math to calculate hpet to usec multiplier
-	 * Look for the comments at get_offset_hpet()
-	 */
-	ASM_DIV64_REG(result, remain, hpet_tick, 0, KERNEL_TICK_USEC);
-	if (remain > (hpet_tick >> 1))
-		result++; /* rounding the result */
-	hpet_usec_quotient = result;
-
-	return 0;
-}
-
-static int hpet_resume(void)
-{
-	write_seqlock(&monotonic_lock);
-	/* Assume this is the last mark offset time */
-	rdtsc(last_tsc_low, last_tsc_high);
-
-	if (hpet_use_timer)
-		hpet_last = hpet_readl(HPET_T0_CMP) - hpet_tick;
-	else
-		hpet_last = hpet_readl(HPET_COUNTER);
-	write_sequnlock(&monotonic_lock);
-	return 0;
-}
-/************************************************************/
-
-/* tsc timer_opts struct */
-static struct timer_opts timer_hpet __read_mostly = {
-	.name = 		"hpet",
-	.mark_offset =		mark_offset_hpet,
-	.get_offset =		get_offset_hpet,
-	.monotonic_clock =	monotonic_clock_hpet,
-	.delay = 		delay_hpet,
-	.resume	=		hpet_resume,
-};
-
-struct init_timer_opts __initdata timer_hpet_init = {
-	.init =	init_hpet,
-	.opts = &timer_hpet,
-};
Index: linux/arch/i386/kernel/timers/timer_none.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer_none.c
+++ /dev/null
@@ -1,39 +0,0 @@
-#include <linux/init.h>
-#include <asm/timer.h>
-
-static void mark_offset_none(void)
-{
-	/* nothing needed */
-}
-
-static unsigned long get_offset_none(void)
-{
-	return 0;
-}
-
-static unsigned long long monotonic_clock_none(void)
-{
-	return 0;
-}
-
-static void delay_none(unsigned long loops)
-{
-	int d0;
-	__asm__ __volatile__(
-		"\tjmp 1f\n"
-		".align 16\n"
-		"1:\tjmp 2f\n"
-		".align 16\n"
-		"2:\tdecl %0\n\tjns 2b"
-		:"=&a" (d0)
-		:"0" (loops));
-}
-
-/* none timer_opts struct */
-struct timer_opts timer_none = {
-	.name = 	"none",
-	.mark_offset =	mark_offset_none, 
-	.get_offset =	get_offset_none,
-	.monotonic_clock =	monotonic_clock_none,
-	.delay = delay_none,
-};
Index: linux/arch/i386/kernel/timers/timer_pit.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer_pit.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- */
-
-#include <linux/spinlock.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/sysdev.h>
-#include <linux/timex.h>
-#include <asm/delay.h>
-#include <asm/mpspec.h>
-#include <asm/timer.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-#include <asm/arch_hooks.h>
-#include <asm/i8253.h>
-
-#include "do_timer.h"
-#include "io_ports.h"
-
-static int count_p; /* counter in get_offset_pit() */
-
-static int __init init_pit(char* override)
-{
- 	/* check clock override */
- 	if (override[0] && strncmp(override,"pit",3))
- 		printk(KERN_ERR "Warning: clock= override failed. Defaulting to PIT\n");
- 
-	count_p = LATCH;
-	return 0;
-}
-
-static void mark_offset_pit(void)
-{
-	/* nothing needed */
-}
-
-static unsigned long long monotonic_clock_pit(void)
-{
-	return 0;
-}
-
-static void delay_pit(unsigned long loops)
-{
-	int d0;
-	__asm__ __volatile__(
-		"\tjmp 1f\n"
-		".align 16\n"
-		"1:\tjmp 2f\n"
-		".align 16\n"
-		"2:\tdecl %0\n\tjns 2b"
-		:"=&a" (d0)
-		:"0" (loops));
-}
-
-
-/* This function must be called with xtime_lock held.
- * It was inspired by Steve McCanne's microtime-i386 for BSD.  -- jrs
- * 
- * However, the pc-audio speaker driver changes the divisor so that
- * it gets interrupted rather more often - it loads 64 into the
- * counter rather than 11932! This has an adverse impact on
- * do_gettimeoffset() -- it stops working! What is also not
- * good is that the interval that our timer function gets called
- * is no longer 10.0002 ms, but 9.9767 ms. To get around this
- * would require using a different timing source. Maybe someone
- * could use the RTC - I know that this can interrupt at frequencies
- * ranging from 8192Hz to 2Hz. If I had the energy, I'd somehow fix
- * it so that at startup, the timer code in sched.c would select
- * using either the RTC or the 8253 timer. The decision would be
- * based on whether there was any other device around that needed
- * to trample on the 8253. I'd set up the RTC to interrupt at 1024 Hz,
- * and then do some jiggery to have a version of do_timer that 
- * advanced the clock by 1/1024 s. Every time that reached over 1/100
- * of a second, then do all the old code. If the time was kept correct
- * then do_gettimeoffset could just return 0 - there is no low order
- * divider that can be accessed.
- *
- * Ideally, you would be able to use the RTC for the speaker driver,
- * but it appears that the speaker driver really needs interrupt more
- * often than every 120 us or so.
- *
- * Anyway, this needs more thought....		pjsg (1993-08-28)
- * 
- * If you are really that interested, you should be reading
- * comp.protocols.time.ntp!
- */
-
-static unsigned long get_offset_pit(void)
-{
-	int count;
-	unsigned long flags;
-	static unsigned long jiffies_p = 0;
-
-	/*
-	 * cache volatile jiffies temporarily; we have xtime_lock. 
-	 */
-	unsigned long jiffies_t;
-
-	spin_lock_irqsave(&i8253_lock, flags);
-	/* timer count may underflow right here */
-	outb_p(0x00, PIT_MODE);	/* latch the count ASAP */
-
-	count = inb_p(PIT_CH0);	/* read the latched count */
-
-	/*
-	 * We do this guaranteed double memory access instead of a _p 
-	 * postfix in the previous port access. Wheee, hackady hack
-	 */
- 	jiffies_t = jiffies;
-
-	count |= inb_p(PIT_CH0) << 8;
-	
-        /* VIA686a test code... reset the latch if count > max + 1 */
-        if (count > LATCH) {
-                outb_p(0x34, PIT_MODE);
-                outb_p(LATCH & 0xff, PIT_CH0);
-                outb(LATCH >> 8, PIT_CH0);
-                count = LATCH - 1;
-        }
-	
-	/*
-	 * avoiding timer inconsistencies (they are rare, but they happen)...
-	 * there are two kinds of problems that must be avoided here:
-	 *  1. the timer counter underflows
-	 *  2. hardware problem with the timer, not giving us continuous time,
-	 *     the counter does small "jumps" upwards on some Pentium systems,
-	 *     (see c't 95/10 page 335 for Neptun bug.)
-	 */
-
-	if( jiffies_t == jiffies_p ) {
-		if( count > count_p ) {
-			/* the nutcase */
-			count = do_timer_overflow(count);
-		}
-	} else
-		jiffies_p = jiffies_t;
-
-	count_p = count;
-
-	spin_unlock_irqrestore(&i8253_lock, flags);
-
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	count = (count + LATCH/2) / LATCH;
-
-	return count;
-}
-
-
-/* tsc timer_opts struct */
-struct timer_opts timer_pit = {
-	.name = "pit",
-	.mark_offset = mark_offset_pit, 
-	.get_offset = get_offset_pit,
-	.monotonic_clock = monotonic_clock_pit,
-	.delay = delay_pit,
-};
-
-struct init_timer_opts __initdata timer_pit_init = {
-	.init = init_pit, 
-	.opts = &timer_pit,
-};
-
-void setup_pit_timer(void)
-{
-	unsigned long flags;
-
-	spin_lock_irqsave(&i8253_lock, flags);
-	outb_p(0x34,PIT_MODE);		/* binary, mode 2, LSB/MSB, ch 0 */
-	udelay(10);
-	outb_p(LATCH & 0xff , PIT_CH0);	/* LSB */
-	udelay(10);
-	outb(LATCH >> 8 , PIT_CH0);	/* MSB */
-	spin_unlock_irqrestore(&i8253_lock, flags);
-}
Index: linux/arch/i386/kernel/timers/timer_pm.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer_pm.c
+++ /dev/null
@@ -1,268 +0,0 @@
-/*
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- */
-
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/device.h>
-#include <linux/init.h>
-#include <asm/types.h>
-#include <asm/timer.h>
-#include <asm/smp.h>
-#include <asm/io.h>
-#include <asm/arch_hooks.h>
-
-#include <linux/timex.h>
-#include "mach_timer.h"
-
-/* Number of PMTMR ticks expected during calibration run */
-#define PMTMR_TICKS_PER_SEC 3579545
-#define PMTMR_EXPECTED_RATE \
-  ((CALIBRATE_LATCH * (PMTMR_TICKS_PER_SEC >> 10)) / (CLOCK_TICK_RATE>>10))
-
-
-/* The I/O port the PMTMR resides at.
- * The location is detected during setup_arch(),
- * in arch/i386/acpi/boot.c */
-u32 pmtmr_ioport = 0;
-
-
-/* value of the Power timer at last timer interrupt */
-static u32 offset_tick;
-static u32 offset_delay;
-
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
-
-/*helper function to safely read acpi pm timesource*/
-static inline u32 read_pmtmr(void)
-{
-	u32 v1=0,v2=0,v3=0;
-	/* It has been reported that because of various broken
-	 * chipsets (ICH4, PIIX4 and PIIX4E) where the ACPI PM time
-	 * source is not latched, so you must read it multiple
-	 * times to insure a safe value is read.
-	 */
-	do {
-		v1 = inl(pmtmr_ioport);
-		v2 = inl(pmtmr_ioport);
-		v3 = inl(pmtmr_ioport);
-	} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
-			|| (v3 > v1 && v3 < v2));
-
-	/* mask the output to 24 bits */
-	return v2 & ACPI_PM_MASK;
-}
-
-
-/*
- * Some boards have the PMTMR running way too fast. We check
- * the PMTMR rate against PIT channel 2 to catch these cases.
- */
-static int verify_pmtmr_rate(void)
-{
-	u32 value1, value2;
-	unsigned long count, delta;
-
-	mach_prepare_counter();
-	value1 = read_pmtmr();
-	mach_countup(&count);
-	value2 = read_pmtmr();
-	delta = (value2 - value1) & ACPI_PM_MASK;
-
-	/* Check that the PMTMR delta is within 5% of what we expect */
-	if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
-	    delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
-		printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
-		return -1;
-	}
-
-	return 0;
-}
-
-
-static int init_pmtmr(char* override)
-{
-	u32 value1, value2;
-	unsigned int i;
-
- 	if (override[0] && strncmp(override,"pmtmr",5))
-		return -ENODEV;
-
-	if (!pmtmr_ioport)
-		return -ENODEV;
-
-	/* we use the TSC for delay_pmtmr, so make sure it exists */
-	if (!cpu_has_tsc)
-		return -ENODEV;
-
-	/* "verify" this timing source */
-	value1 = read_pmtmr();
-	for (i = 0; i < 10000; i++) {
-		value2 = read_pmtmr();
-		if (value2 == value1)
-			continue;
-		if (value2 > value1)
-			goto pm_good;
-		if ((value2 < value1) && ((value2) < 0xFFF))
-			goto pm_good;
-		printk(KERN_INFO "PM-Timer had inconsistent results: 0x%#x, 0x%#x - aborting.\n", value1, value2);
-		return -EINVAL;
-	}
-	printk(KERN_INFO "PM-Timer had no reasonable result: 0x%#x - aborting.\n", value1);
-	return -ENODEV;
-
-pm_good:
-	if (verify_pmtmr_rate() != 0)
-		return -ENODEV;
-
-	init_cpu_khz();
-	return 0;
-}
-
-static inline u32 cyc2us(u32 cycles)
-{
-	/* The Power Management Timer ticks at 3.579545 ticks per microsecond.
-	 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
-	 *
-	 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
-	 * easily be multiplied with 286 (=0x11E) without having to fear
-	 * u32 overflows.
-	 */
-	cycles *= 286;
-	return (cycles >> 10);
-}
-
-/*
- * this gets called during each timer interrupt
- *   - Called while holding the writer xtime_lock
- */
-static void mark_offset_pmtmr(void)
-{
-	u32 lost, delta, last_offset;
-	static int first_run = 1;
-	last_offset = offset_tick;
-
-	write_seqlock(&monotonic_lock);
-
-	offset_tick = read_pmtmr();
-
-	/* calculate tick interval */
-	delta = (offset_tick - last_offset) & ACPI_PM_MASK;
-
-	/* convert to usecs */
-	delta = cyc2us(delta);
-
-	/* update the monotonic base value */
-	monotonic_base += delta * NSEC_PER_USEC;
-	write_sequnlock(&monotonic_lock);
-
-	/* convert to ticks */
-	delta += offset_delay;
-	lost = delta / (USEC_PER_SEC / HZ);
-	offset_delay = delta % (USEC_PER_SEC / HZ);
-
-
-	/* compensate for lost ticks */
-	if (lost >= 2)
-		jiffies_64 += lost - 1;
-
-	/* don't calculate delay for first run,
-	   or if we've got less then a tick */
-	if (first_run || (lost < 1)) {
-		first_run = 0;
-		offset_delay = 0;
-	}
-}
-
-static int pmtmr_resume(void)
-{
-	write_seqlock(&monotonic_lock);
-	/* Assume this is the last mark offset time */
-	offset_tick = read_pmtmr();
-	write_sequnlock(&monotonic_lock);
-	return 0;
-}
-
-static unsigned long long monotonic_clock_pmtmr(void)
-{
-	u32 last_offset, this_offset;
-	unsigned long long base, ret;
-	unsigned seq;
-
-
-	/* atomically read monotonic base & last_offset */
-	do {
-		seq = read_seqbegin(&monotonic_lock);
-		last_offset = offset_tick;
-		base = monotonic_base;
-	} while (read_seqretry(&monotonic_lock, seq));
-
-	/* Read the pmtmr */
-	this_offset =  read_pmtmr();
-
-	/* convert to nanoseconds */
-	ret = (this_offset - last_offset) & ACPI_PM_MASK;
-	ret = base + (cyc2us(ret) * NSEC_PER_USEC);
-	return ret;
-}
-
-static void delay_pmtmr(unsigned long loops)
-{
-	unsigned long bclock, now;
-
-	rdtscl(bclock);
-	do
-	{
-		rep_nop();
-		rdtscl(now);
-	} while ((now-bclock) < loops);
-}
-
-
-/*
- * get the offset (in microseconds) from the last call to mark_offset()
- *	- Called holding a reader xtime_lock
- */
-static unsigned long get_offset_pmtmr(void)
-{
-	u32 now, offset, delta = 0;
-
-	offset = offset_tick;
-	now = read_pmtmr();
-	delta = (now - offset)&ACPI_PM_MASK;
-
-	return (unsigned long) offset_delay + cyc2us(delta);
-}
-
-
-/* acpi timer_opts struct */
-static struct timer_opts timer_pmtmr = {
-	.name			= "pmtmr",
-	.mark_offset		= mark_offset_pmtmr,
-	.get_offset		= get_offset_pmtmr,
-	.monotonic_clock 	= monotonic_clock_pmtmr,
-	.delay 			= delay_pmtmr,
-	.read_timer 		= read_timer_tsc,
-	.resume			= pmtmr_resume,
-};
-
-struct init_timer_opts __initdata timer_pmtmr_init = {
-	.init = init_pmtmr,
-	.opts = &timer_pmtmr,
-};
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
-MODULE_DESCRIPTION("Power Management Timer (PMTMR) as primary timing source for x86");
Index: linux/arch/i386/kernel/timers/timer_tsc.c
===================================================================
--- linux.orig/arch/i386/kernel/timers/timer_tsc.c
+++ /dev/null
@@ -1,595 +0,0 @@
-/*
- * This code largely moved from arch/i386/kernel/time.c.
- * See comments there for proper credits.
- *
- * 2004-06-25    Jesper Juhl
- *      moved mark_offset_tsc below cpufreq_delayed_get to avoid gcc 3.4
- *      failing to inline.
- */
-
-#include <linux/spinlock.h>
-#include <linux/init.h>
-#include <linux/timex.h>
-#include <linux/errno.h>
-#include <linux/cpufreq.h>
-#include <linux/string.h>
-#include <linux/jiffies.h>
-
-#include <asm/timer.h>
-#include <asm/io.h>
-/* processor.h for distable_tsc flag */
-#include <asm/processor.h>
-
-#include "io_ports.h"
-#include "mach_timer.h"
-
-#include <asm/hpet.h>
-#include <asm/i8253.h>
-
-#ifdef CONFIG_HPET_TIMER
-static unsigned long hpet_usec_quotient;
-static unsigned long hpet_last;
-static struct timer_opts timer_tsc;
-#endif
-
-static inline void cpufreq_delayed_get(void);
-
-int tsc_disable __devinitdata = 0;
-
-static int use_tsc;
-/* Number of usecs that the last interrupt was delayed */
-static int delay_at_last_interrupt;
-
-static unsigned long last_tsc_low; /* lsb 32 bits of Time Stamp Counter */
-static unsigned long last_tsc_high; /* msb 32 bits of Time Stamp Counter */
-static unsigned long long monotonic_base;
-static seqlock_t monotonic_lock = SEQLOCK_UNLOCKED;
-
-/* convert from cycles(64bits) => nanoseconds (64bits)
- *  basic equation:
- *		ns = cycles / (freq / ns_per_sec)
- *		ns = cycles * (ns_per_sec / freq)
- *		ns = cycles * (10^9 / (cpu_mhz * 10^6))
- *		ns = cycles * (10^3 / cpu_mhz)
- *
- *	Then we use scaling math (suggested by george@mvista.com) to get:
- *		ns = cycles * (10^3 * SC / cpu_mhz) / SC
- *		ns = cycles * cyc2ns_scale / SC
- *
- *	And since SC is a constant power of two, we can convert the div
- *  into a shift.   
- *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
- */
-static unsigned long cyc2ns_scale; 
-#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
-
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
-{
-	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
-}
-
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
-{
-	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
-}
-
-static int count2; /* counter for mark_offset_tsc() */
-
-/* Cached *multiplier* to convert TSC counts to microseconds.
- * (see the equation below).
- * Equal to 2^32 * (1 / (clocks per usec) ).
- * Initialized in time_init.
- */
-static unsigned long fast_gettimeoffset_quotient;
-
-static unsigned long get_offset_tsc(void)
-{
-	register unsigned long eax, edx;
-
-	/* Read the Time Stamp Counter */
-
-	rdtsc(eax,edx);
-
-	/* .. relative to previous jiffy (32 bits is enough) */
-	eax -= last_tsc_low;	/* tsc_low delta */
-
-	/*
-         * Time offset = (tsc_low delta) * fast_gettimeoffset_quotient
-         *             = (tsc_low delta) * (usecs_per_clock)
-         *             = (tsc_low delta) * (usecs_per_jiffy / clocks_per_jiffy)
-	 *
-	 * Using a mull instead of a divl saves up to 31 clock cycles
-	 * in the critical path.
-         */
-
-	__asm__("mull %2"
-		:"=a" (eax), "=d" (edx)
-		:"rm" (fast_gettimeoffset_quotient),
-		 "0" (eax));
-
-	/* our adjusted time offset in microseconds */
-	return delay_at_last_interrupt + edx;
-}
-
-static unsigned long long monotonic_clock_tsc(void)
-{
-	unsigned long long last_offset, this_offset, base;
-	unsigned seq;
-	
-	/* atomically read monotonic base & last_offset */
-	do {
-		seq = read_seqbegin(&monotonic_lock);
-		last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-		base = monotonic_base;
-	} while (read_seqretry(&monotonic_lock, seq));
-
-	/* Read the Time Stamp Counter */
-	rdtscll(this_offset);
-
-	/* return the value in ns */
-	return base + cycles_2_ns(this_offset - last_offset);
-}
-
-/*
- * Scheduler clock - returns current time in nanosec units.
- */
-unsigned long long sched_clock(void)
-{
-	unsigned long long this_offset;
-
-	/*
-	 * In the NUMA case we dont use the TSC as they are not
-	 * synchronized across all CPUs.
-	 */
-#ifndef CONFIG_NUMA
-	if (!use_tsc)
-#endif
-		/* no locking but a rare wrong value is not a big deal */
-		return jiffies_64 * (1000000000 / HZ);
-
-	/* Read the Time Stamp Counter */
-	rdtscll(this_offset);
-
-	/* return the value in ns */
-	return cycles_2_ns(this_offset);
-}
-
-static void delay_tsc(unsigned long loops)
-{
-	unsigned long bclock, now;
-	
-	rdtscl(bclock);
-	do
-	{
-		rep_nop();
-		rdtscl(now);
-	} while ((now-bclock) < loops);
-}
-
-#ifdef CONFIG_HPET_TIMER
-static void mark_offset_tsc_hpet(void)
-{
-	unsigned long long this_offset, last_offset;
- 	unsigned long offset, temp, hpet_current;
-
-	write_seqlock(&monotonic_lock);
-	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-	/*
-	 * It is important that these two operations happen almost at
-	 * the same time. We do the RDTSC stuff first, since it's
-	 * faster. To avoid any inconsistencies, we need interrupts
-	 * disabled locally.
-	 */
-	/*
-	 * Interrupts are just disabled locally since the timer irq
-	 * has the SA_INTERRUPT flag set. -arca
-	 */
-	/* read Pentium cycle counter */
-
-	hpet_current = hpet_readl(HPET_COUNTER);
-	rdtsc(last_tsc_low, last_tsc_high);
-
-	/* lost tick compensation */
-	offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
-	if (unlikely(((offset - hpet_last) > hpet_tick) && (hpet_last != 0))) {
-		int lost_ticks = (offset - hpet_last) / hpet_tick;
-		jiffies_64 += lost_ticks;
-	}
-	hpet_last = hpet_current;
-
-	/* update the monotonic base value */
-	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-	monotonic_base += cycles_2_ns(this_offset - last_offset);
-	write_sequnlock(&monotonic_lock);
-
-	/* calculate delay_at_last_interrupt */
-	/*
-	 * Time offset = (hpet delta) * ( usecs per HPET clock )
-	 *             = (hpet delta) * ( usecs per tick / HPET clocks per tick)
-	 *             = (hpet delta) * ( hpet_usec_quotient ) / (2^32)
-	 * Where,
-	 * hpet_usec_quotient = (2^32 * usecs per tick)/HPET clocks per tick
-	 */
-	delay_at_last_interrupt = hpet_current - offset;
-	ASM_MUL64_REG(temp, delay_at_last_interrupt,
-			hpet_usec_quotient, delay_at_last_interrupt);
-}
-#endif
-
-
-#ifdef CONFIG_CPU_FREQ
-#include <linux/workqueue.h>
-
-static unsigned int cpufreq_delayed_issched = 0;
-static unsigned int cpufreq_init = 0;
-static struct work_struct cpufreq_delayed_get_work;
-
-static void handle_cpufreq_delayed_get(void *v)
-{
-	unsigned int cpu;
-	for_each_online_cpu(cpu) {
-		cpufreq_get(cpu);
-	}
-	cpufreq_delayed_issched = 0;
-}
-
-/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
- * to verify the CPU frequency the timing core thinks the CPU is running
- * at is still correct.
- */
-static inline void cpufreq_delayed_get(void) 
-{
-	if (cpufreq_init && !cpufreq_delayed_issched) {
-		cpufreq_delayed_issched = 1;
-		printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
-		schedule_work(&cpufreq_delayed_get_work);
-	}
-}
-
-/* If the CPU frequency is scaled, TSC-based delays will need a different
- * loops_per_jiffy value to function properly.
- */
-
-static unsigned int  ref_freq = 0;
-static unsigned long loops_per_jiffy_ref = 0;
-
-#ifndef CONFIG_SMP
-static unsigned long fast_gettimeoffset_ref = 0;
-static unsigned int cpu_khz_ref = 0;
-#endif
-
-static int
-time_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
-		       void *data)
-{
-	struct cpufreq_freqs *freq = data;
-
-	if (val != CPUFREQ_RESUMECHANGE)
-		write_seqlock_irq(&xtime_lock);
-	if (!ref_freq) {
-		ref_freq = freq->old;
-		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
-#ifndef CONFIG_SMP
-		fast_gettimeoffset_ref = fast_gettimeoffset_quotient;
-		cpu_khz_ref = cpu_khz;
-#endif
-	}
-
-	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
-	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
-	    (val == CPUFREQ_RESUMECHANGE)) {
-		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
-			cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
-#ifndef CONFIG_SMP
-		if (cpu_khz)
-			cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
-		if (use_tsc) {
-			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
-				fast_gettimeoffset_quotient = cpufreq_scale(fast_gettimeoffset_ref, freq->new, ref_freq);
-				set_cyc2ns_scale(cpu_khz/1000);
-			}
-		}
-#endif
-	}
-
-	if (val != CPUFREQ_RESUMECHANGE)
-		write_sequnlock_irq(&xtime_lock);
-
-	return 0;
-}
-
-static struct notifier_block time_cpufreq_notifier_block = {
-	.notifier_call	= time_cpufreq_notifier
-};
-
-
-static int __init cpufreq_tsc(void)
-{
-	int ret;
-	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
-	ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
-					CPUFREQ_TRANSITION_NOTIFIER);
-	if (!ret)
-		cpufreq_init = 1;
-	return ret;
-}
-core_initcall(cpufreq_tsc);
-
-#else /* CONFIG_CPU_FREQ */
-static inline void cpufreq_delayed_get(void) { return; }
-#endif 
-
-int recalibrate_cpu_khz(void)
-{
-#ifndef CONFIG_SMP
-	unsigned int cpu_khz_old = cpu_khz;
-
-	if (cpu_has_tsc) {
-		init_cpu_khz();
-		cpu_data[0].loops_per_jiffy =
-		    cpufreq_scale(cpu_data[0].loops_per_jiffy,
-			          cpu_khz_old,
-				  cpu_khz);
-		return 0;
-	} else
-		return -ENODEV;
-#else
-	return -ENODEV;
-#endif
-}
-EXPORT_SYMBOL(recalibrate_cpu_khz);
-
-static void mark_offset_tsc(void)
-{
-	unsigned long lost,delay;
-	unsigned long delta = last_tsc_low;
-	int count;
-	int countmp;
-	static int count1 = 0;
-	unsigned long long this_offset, last_offset;
-	static int lost_count = 0;
-
-	write_seqlock(&monotonic_lock);
-	last_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-	/*
-	 * It is important that these two operations happen almost at
-	 * the same time. We do the RDTSC stuff first, since it's
-	 * faster. To avoid any inconsistencies, we need interrupts
-	 * disabled locally.
-	 */
-
-	/*
-	 * Interrupts are just disabled locally since the timer irq
-	 * has the SA_INTERRUPT flag set. -arca
-	 */
-
-	/* read Pentium cycle counter */
-
-	rdtsc(last_tsc_low, last_tsc_high);
-
-	spin_lock(&i8253_lock);
-	outb_p(0x00, PIT_MODE);     /* latch the count ASAP */
-
-	count = inb_p(PIT_CH0);    /* read the latched count */
-	count |= inb(PIT_CH0) << 8;
-
-	/*
-	 * VIA686a test code... reset the latch if count > max + 1
-	 * from timer_pit.c - cjb
-	 */
-	if (count > LATCH) {
-		outb_p(0x34, PIT_MODE);
-		outb_p(LATCH & 0xff, PIT_CH0);
-		outb(LATCH >> 8, PIT_CH0);
-		count = LATCH - 1;
-	}
-
-	spin_unlock(&i8253_lock);
-
-	if (pit_latch_buggy) {
-		/* get center value of last 3 time lutch */
-		if ((count2 >= count && count >= count1)
-		    || (count1 >= count && count >= count2)) {
-			count2 = count1; count1 = count;
-		} else if ((count1 >= count2 && count2 >= count)
-			   || (count >= count2 && count2 >= count1)) {
-			countmp = count;count = count2;
-			count2 = count1;count1 = countmp;
-		} else {
-			count2 = count1; count1 = count; count = count1;
-		}
-	}
-
-	/* lost tick compensation */
-	delta = last_tsc_low - delta;
-	{
-		register unsigned long eax, edx;
-		eax = delta;
-		__asm__("mull %2"
-		:"=a" (eax), "=d" (edx)
-		:"rm" (fast_gettimeoffset_quotient),
-		 "0" (eax));
-		delta = edx;
-	}
-	delta += delay_at_last_interrupt;
-	lost = delta/(1000000/HZ);
-	delay = delta%(1000000/HZ);
-	if (lost >= 2) {
-		jiffies_64 += lost-1;
-
-		/* sanity check to ensure we're not always losing ticks */
-		if (lost_count++ > 100) {
-			printk(KERN_WARNING "Losing too many ticks!\n");
-			printk(KERN_WARNING "TSC cannot be used as a timesource.  \n");
-			printk(KERN_WARNING "Possible reasons for this are:\n");
-			printk(KERN_WARNING "  You're running with Speedstep,\n");
-			printk(KERN_WARNING "  You don't have DMA enabled for your hard disk (see hdparm),\n");
-			printk(KERN_WARNING "  Incorrect TSC synchronization on an SMP system (see dmesg).\n");
-			printk(KERN_WARNING "Falling back to a sane timesource now.\n");
-
-			clock_fallback();
-		}
-		/* ... but give the TSC a fair chance */
-		if (lost_count > 25)
-			cpufreq_delayed_get();
-	} else
-		lost_count = 0;
-	/* update the monotonic base value */
-	this_offset = ((unsigned long long)last_tsc_high<<32)|last_tsc_low;
-	monotonic_base += cycles_2_ns(this_offset - last_offset);
-	write_sequnlock(&monotonic_lock);
-
-	/* calculate delay_at_last_interrupt */
-	count = ((LATCH-1) - count) * TICK_SIZE;
-	delay_at_last_interrupt = (count + LATCH/2) / LATCH;
-
-	/* catch corner case where tick rollover occured
-	 * between tsc and pit reads (as noted when
-	 * usec delta is > 90% # of usecs/tick)
-	 */
-	if (lost && abs(delay - delay_at_last_interrupt) > (900000/HZ))
-		jiffies_64++;
-}
-
-static int __init init_tsc(char* override)
-{
-
-	/* check clock override */
-	if (override[0] && strncmp(override,"tsc",3)) {
-#ifdef CONFIG_HPET_TIMER
-		if (is_hpet_enabled()) {
-			printk(KERN_ERR "Warning: clock= override failed. Defaulting to tsc\n");
-		} else
-#endif
-		{
-			return -ENODEV;
-		}
-	}
-
-	/*
-	 * If we have APM enabled or the CPU clock speed is variable
-	 * (CPU stops clock on HLT or slows clock to save power)
-	 * then the TSC timestamps may diverge by up to 1 jiffy from
-	 * 'real time' but nothing will break.
-	 * The most frequent case is that the CPU is "woken" from a halt
-	 * state by the timer interrupt itself, so we get 0 error. In the
-	 * rare cases where a driver would "wake" the CPU and request a
-	 * timestamp, the maximum error is < 1 jiffy. But timestamps are
-	 * still perfectly ordered.
-	 * Note that the TSC counter will be reset if APM suspends
-	 * to disk; this won't break the kernel, though, 'cuz we're
-	 * smart.  See arch/i386/kernel/apm.c.
-	 */
- 	/*
- 	 *	Firstly we have to do a CPU check for chips with
- 	 * 	a potentially buggy TSC. At this point we haven't run
- 	 *	the ident/bugs checks so we must run this hook as it
- 	 *	may turn off the TSC flag.
- 	 *
- 	 *	NOTE: this doesn't yet handle SMP 486 machines where only
- 	 *	some CPU's have a TSC. Thats never worked and nobody has
- 	 *	moaned if you have the only one in the world - you fix it!
- 	 */
-
-	count2 = LATCH; /* initialize counter for mark_offset_tsc() */
-
-	if (cpu_has_tsc) {
-		unsigned long tsc_quotient;
-#ifdef CONFIG_HPET_TIMER
-		if (is_hpet_enabled() && hpet_use_timer) {
-			unsigned long result, remain;
-			printk("Using TSC for gettimeofday\n");
-			tsc_quotient = calibrate_tsc_hpet(NULL);
-			timer_tsc.mark_offset = &mark_offset_tsc_hpet;
-			/*
-			 * Math to calculate hpet to usec multiplier
-			 * Look for the comments at get_offset_tsc_hpet()
-			 */
-			ASM_DIV64_REG(result, remain, hpet_tick,
-					0, KERNEL_TICK_USEC);
-			if (remain > (hpet_tick >> 1))
-				result++; /* rounding the result */
-
-			hpet_usec_quotient = result;
-		} else
-#endif
-		{
-			tsc_quotient = calibrate_tsc();
-		}
-
-		if (tsc_quotient) {
-			fast_gettimeoffset_quotient = tsc_quotient;
-			use_tsc = 1;
-			/*
-			 *	We could be more selective here I suspect
-			 *	and just enable this for the next intel chips ?
-			 */
-			/* report CPU clock rate in Hz.
-			 * The formula is (10^6 * 2^32) / (2^32 * 1 / (clocks/us)) =
-			 * clock/second. Our precision is about 100 ppm.
-			 */
-			{	unsigned long eax=0, edx=1000;
-				__asm__("divl %2"
-		       		:"=a" (cpu_khz), "=d" (edx)
-        	       		:"r" (tsc_quotient),
-	                	"0" (eax), "1" (edx));
-				printk("Detected %u.%03u MHz processor.\n",
-					cpu_khz / 1000, cpu_khz % 1000);
-			}
-			set_cyc2ns_scale(cpu_khz/1000);
-			return 0;
-		}
-	}
-	return -ENODEV;
-}
-
-static int tsc_resume(void)
-{
-	write_seqlock(&monotonic_lock);
-	/* Assume this is the last mark offset time */
-	rdtsc(last_tsc_low, last_tsc_high);
-#ifdef CONFIG_HPET_TIMER
-	if (is_hpet_enabled() && hpet_use_timer)
-		hpet_last = hpet_readl(HPET_COUNTER);
-#endif
-	write_sequnlock(&monotonic_lock);
-	return 0;
-}
-
-#ifndef CONFIG_X86_TSC
-/* disable flag for tsc.  Takes effect by clearing the TSC cpu flag
- * in cpu/common.c */
-static int __init tsc_setup(char *str)
-{
-	tsc_disable = 1;
-	return 1;
-}
-#else
-static int __init tsc_setup(char *str)
-{
-	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
-				"cannot disable TSC.\n");
-	return 1;
-}
-#endif
-__setup("notsc", tsc_setup);
-
-
-
-/************************************************************/
-
-/* tsc timer_opts struct */
-static struct timer_opts timer_tsc = {
-	.name = "tsc",
-	.mark_offset = mark_offset_tsc, 
-	.get_offset = get_offset_tsc,
-	.monotonic_clock = monotonic_clock_tsc,
-	.delay = delay_tsc,
-	.read_timer = read_timer_tsc,
-	.resume	= tsc_resume,
-};
-
-struct init_timer_opts __initdata timer_tsc_init = {
-	.init = init_tsc,
-	.opts = &timer_tsc,
-};
Index: linux/arch/i386/kernel/traps.c
===================================================================
--- linux.orig/arch/i386/kernel/traps.c
+++ linux/arch/i386/kernel/traps.c
@@ -93,7 +93,7 @@ asmlinkage void machine_check(void);
 
 static int kstack_depth_to_print = 24;
 struct notifier_block *i386die_chain;
-static DEFINE_SPINLOCK(die_notifier_lock);
+static DEFINE_RAW_SPINLOCK(die_notifier_lock);
 
 int register_die_notifier(struct notifier_block *nb)
 {
@@ -116,22 +116,27 @@ static inline unsigned long print_contex
 				unsigned long *stack, unsigned long ebp)
 {
 	unsigned long addr;
+#ifndef CONFIG_FRAME_POINTER
+	unsigned long prev_frame;
+#endif
 
-#ifdef	CONFIG_FRAME_POINTER
+#ifdef CONFIG_FRAME_POINTER
 	while (valid_stack_ptr(tinfo, (void *)ebp)) {
 		addr = *(unsigned long *)(ebp + 4);
 		printk(" [<%08lx>] ", addr);
 		print_symbol("%s", addr);
-		printk("\n");
+		printk(" (%ld)\n", *(unsigned long *)ebp - ebp);
 		ebp = *(unsigned long *)ebp;
 	}
 #else
+	prev_frame = (unsigned long)stack;
 	while (valid_stack_ptr(tinfo, stack)) {
 		addr = *stack++;
 		if (__kernel_text_address(addr)) {
 			printk(" [<%08lx>]", addr);
 			print_symbol(" %s", addr);
-			printk("\n");
+			printk(" (%ld)\n", (unsigned long)stack - prev_frame);
+			prev_frame = (unsigned long)stack;
 		}
 	}
 #endif
@@ -163,6 +168,8 @@ void show_trace(struct task_struct *task
 			break;
 		printk(" =======================\n");
 	}
+	print_traces(task);
+	show_held_locks(task);
 }
 
 void show_stack(struct task_struct *task, unsigned long *esp)
@@ -201,6 +208,12 @@ void dump_stack(void)
 
 EXPORT_SYMBOL(dump_stack);
 
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_LATENCY_TRACE)
+extern unsigned long worst_stack_left;
+#else
+# define worst_stack_left (-1L)	/* not tracked: show_registers() prints -1 */
+#endif
+
 void show_registers(struct pt_regs *regs)
 {
 	int i;
@@ -225,10 +238,17 @@ void show_registers(struct pt_regs *regs
 		regs->eax, regs->ebx, regs->ecx, regs->edx);
 	printk("esi: %08lx   edi: %08lx   ebp: %08lx   esp: %08lx\n",
 		regs->esi, regs->edi, regs->ebp, esp);
-	printk("ds: %04x   es: %04x   ss: %04x\n",
-		regs->xds & 0xffff, regs->xes & 0xffff, ss);
-	printk("Process %s (pid: %d, threadinfo=%p task=%p)",
+	printk("ds: %04x   es: %04x   ss: %04x   preempt: %08x\n",
+		regs->xds & 0xffff, regs->xes & 0xffff, ss, preempt_count());
+	printk("Process %s (pid: %d, threadinfo=%p task=%p",
 		current->comm, current->pid, current_thread_info(), current);
+
+	if (in_kernel)
+		printk(" stack_left=%ld worst_left=%ld)",
+			(esp & (THREAD_SIZE-1))-sizeof(struct thread_info),
+			worst_stack_left);
+	else
+		printk(")");
 	/*
 	 * When in-kernel, we also print out the stack and code at the
 	 * time of the fault..
@@ -297,11 +317,11 @@ bug:
 void die(const char * str, struct pt_regs * regs, long err)
 {
 	static struct {
-		spinlock_t lock;
+		raw_spinlock_t lock;
 		u32 lock_owner;
 		int lock_owner_depth;
 	} die = {
-		.lock =			SPIN_LOCK_UNLOCKED,
+		.lock =			RAW_SPIN_LOCK_UNLOCKED,
 		.lock_owner =		-1,
 		.lock_owner_depth =	0
 	};
@@ -379,6 +399,11 @@ static void __kprobes do_trap(int trapnr
 	if (!user_mode(regs))
 		goto kernel_trap;
 
+#ifdef CONFIG_PREEMPT_RT
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	trap_signal: {
 		if (info)
 			force_sig_info(signr, info, tsk);
@@ -508,7 +533,7 @@ fastcall void __kprobes do_general_prote
 	return;
 
 gp_in_vm86:
-	local_irq_enable();
+	raw_local_irq_enable();
 	handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
 	return;
 
@@ -562,10 +587,12 @@ static void unknown_nmi_error(unsigned c
 	printk("Do you have a strange power saving mode enabled?\n");
 }
 
-static DEFINE_SPINLOCK(nmi_print_lock);
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
 
 void die_nmi (struct pt_regs *regs, const char *msg)
 {
+	deadlock_trace_off();
+
 	if (notify_die(DIE_NMIWATCHDOG, msg, regs, 0, 0, SIGINT) ==
 	    NOTIFY_STOP)
 		return;
@@ -593,10 +620,11 @@ void die_nmi (struct pt_regs *regs, cons
 		crash_kexec(regs);
 	}
 
+	nmi_exit();
 	do_exit(SIGSEGV);
 }
 
-static void default_do_nmi(struct pt_regs * regs)
+static void notrace default_do_nmi(struct pt_regs * regs)
 {
 	unsigned char reason = 0;
 
@@ -615,6 +643,7 @@ static void default_do_nmi(struct pt_reg
 		 */
 		if (nmi_watchdog) {
 			nmi_watchdog_tick(regs);
+//			trace_special(6, 1, 0);
 			return;
 		}
 #endif
@@ -634,18 +663,19 @@ static void default_do_nmi(struct pt_reg
 	reassert_nmi();
 }
 
-static int dummy_nmi_callback(struct pt_regs * regs, int cpu)
+static notrace int dummy_nmi_callback(struct pt_regs * regs, int cpu)
 {
 	return 0;
 }
  
 static nmi_callback_t nmi_callback = dummy_nmi_callback;
  
-fastcall void do_nmi(struct pt_regs * regs, long error_code)
+fastcall notrace void do_nmi(struct pt_regs * regs, long error_code)
 {
 	int cpu;
 
 	nmi_enter();
+	nmi_trace((unsigned long)do_nmi, regs->eip, regs->eflags);
 
 	cpu = smp_processor_id();
 
@@ -723,7 +753,7 @@ fastcall void __kprobes do_debug(struct 
 		return;
 	/* It's safe to allow irq's after DR6 has been saved */
 	if (regs->eflags & X86_EFLAGS_IF)
-		local_irq_enable();
+		raw_local_irq_enable();
 
 	/* Mask out spurious debug traps due to lazy DR7 setting */
 	if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) {
Index: linux/arch/i386/kernel/tsc.c
===================================================================
--- /dev/null
+++ linux/arch/i386/kernel/tsc.c
@@ -0,0 +1,493 @@
+/*
+ * This code largely moved from arch/i386/kernel/timer/timer_tsc.c
+ * which was originally moved from arch/i386/kernel/time.c.
+ * See comments there for proper credits.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/workqueue.h>
+#include <linux/cpufreq.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+#include <asm/delay.h>
+#include <asm/tsc.h>
+#include <asm/io.h>
+
+#include "mach_timer.h"
+
+/*
+ * On some systems the TSC frequency does not
+ * change with the cpu frequency. So we need
+ * an extra value to store the TSC freq
+ */
+unsigned int tsc_khz;
+
+int tsc_disable __initdata = 0;
+
+#ifdef CONFIG_X86_TSC
+static int __init tsc_setup(char *str)
+{
+	printk(KERN_WARNING "notsc: Kernel compiled with CONFIG_X86_TSC, "
+				"cannot disable TSC.\n");
+	return 1;
+}
+#else
+/*
+ * disable flag for tsc. Takes effect by clearing the TSC cpu flag
+ * in cpu/common.c
+ */
+static int __init tsc_setup(char *str)
+{
+	tsc_disable = 1;
+
+	return 1;
+}
+#endif
+
+__setup("notsc", tsc_setup);
+
+/*
+ * code to mark and check if the TSC is unstable
+ * due to cpufreq or due to unsynced TSCs
+ */
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+	return tsc_unstable;
+}
+
+void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;
+}
+
+/* Code to compensate for C3 stalls */
+static u64 tsc_c3_offset;
+
+void tsc_c3_compensate(unsigned long nsecs)
+{
+	/* this could definitely be optimized */
+	u64 cycles = ((u64)nsecs * tsc_khz);
+
+	do_div(cycles, 1000000);
+	tsc_c3_offset += cycles;
+}
+
+EXPORT_SYMBOL_GPL(tsc_c3_compensate);
+
+static inline u64 tsc_read_c3_time(void)
+{
+	return tsc_c3_offset;
+}
+
+/* Accelerators for sched_clock()
+ * convert from cycles(64bits) => nanoseconds (64bits)
+ *  basic equation:
+ *		ns = cycles / (freq / ns_per_sec)
+ *		ns = cycles * (ns_per_sec / freq)
+ *		ns = cycles * (10^9 / (cpu_khz * 10^3))
+ *		ns = cycles * (10^6 / cpu_khz)
+ *
+ *	Then we use scaling math (suggested by george@mvista.com) to get:
+ *		ns = cycles * (10^6 * SC / cpu_khz) / SC
+ *		ns = cycles * cyc2ns_scale / SC
+ *
+ *	And since SC is a constant power of two, we can convert the div
+ *  into a shift.
+ *
+ *  We can use khz divisor instead of mhz to keep a better precision, since
+ *  cyc2ns_scale is limited to 10^6 * 2^10, which fits in 32 bits.
+ *  (mathieu.desnoyers@polymtl.ca)
+ *
+ *			-johnstul@us.ibm.com "math is hard, lets go shopping!"
+ */
+static unsigned long cyc2ns_scale;
+
+#define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
+
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
+{
+	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
+}
+
+static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+{
+	return (cyc * cyc2ns_scale) >> CYC2NS_SCALE_FACTOR;
+}
+
+/*
+ * Scheduler clock - returns current time in nanosec units.
+ */
+unsigned long long sched_clock(void)
+{
+	unsigned long long this_offset;
+
+	/*
+	 * in the NUMA case we don't use the TSC as they are not
+	 * synchronized across all CPUs.
+	 */
+#ifndef CONFIG_NUMA
+	if (!cpu_khz || check_tsc_unstable())
+#endif
+		/* no locking but a rare wrong value is not a big deal */
+		return (jiffies_64 - INITIAL_JIFFIES) * (1000000000 / HZ);
+
+	/* read the Time Stamp Counter: */
+	rdtscll(this_offset);
+	this_offset += tsc_read_c3_time();
+
+	/* return the value in ns */
+	return cycles_2_ns(this_offset);
+}
+
+static unsigned long calculate_cpu_khz(void)
+{
+	unsigned long long start, end;
+	unsigned long count;
+	u64 delta64;
+	int i;
+
+	/* run 3 times to ensure the cache is warm */
+	for (i = 0; i < 3; i++) {
+		mach_prepare_counter();
+		rdtscll(start);
+		mach_countup(&count);
+		rdtscll(end);
+	}
+	/*
+	 * Error: ECTCNEVERSET
+	 * The CTC wasn't reliable: we got a hit on the very first read,
+	 * or the CPU was so fast/slow that the quotient wouldn't fit in
+	 * 32 bits..
+	 */
+	if (count <= 1)
+		return 0;
+
+	delta64 = end - start;
+
+	/* cpu freq too fast: */
+	if (delta64 > (1ULL<<32))
+		return 0;
+
+	/* cpu freq too slow: */
+	if (delta64 <= CALIBRATE_TIME_MSEC)
+		return 0;
+
+	delta64 += CALIBRATE_TIME_MSEC/2; /* round for do_div */
+	do_div(delta64,CALIBRATE_TIME_MSEC);
+
+	return (unsigned long)delta64;
+}
+
+int recalibrate_cpu_khz(void)
+{
+#ifndef CONFIG_SMP
+	unsigned long cpu_khz_old = cpu_khz;
+
+	if (cpu_has_tsc) {
+		cpu_khz = calculate_cpu_khz();
+		tsc_khz = cpu_khz;
+		cpu_data[0].loops_per_jiffy =
+			cpufreq_scale(cpu_data[0].loops_per_jiffy,
+					cpu_khz_old, cpu_khz);
+		return 0;
+	} else
+		return -ENODEV;
+#else
+	return -ENODEV;
+#endif
+}
+
+EXPORT_SYMBOL(recalibrate_cpu_khz);
+
+void tsc_init(void)
+{
+	if (!cpu_has_tsc || tsc_disable)
+		return;
+
+	cpu_khz = calculate_cpu_khz();
+	tsc_khz = cpu_khz;
+
+	if (!cpu_khz)
+		return;
+
+	printk("Detected %lu.%03lu MHz processor.\n",
+				(unsigned long)cpu_khz / 1000,
+				(unsigned long)cpu_khz % 1000);
+
+	set_cyc2ns_scale(cpu_khz);
+	use_tsc_delay();
+}
+
+#ifdef CONFIG_CPU_FREQ
+
+static unsigned int cpufreq_delayed_issched = 0;
+static unsigned int cpufreq_init = 0;
+static struct work_struct cpufreq_delayed_get_work;
+
+static void handle_cpufreq_delayed_get(void *v)
+{
+	unsigned int cpu;
+
+	for_each_online_cpu(cpu)
+		cpufreq_get(cpu);
+
+	cpufreq_delayed_issched = 0;
+}
+
+/*
+ * if we notice cpufreq oddness, schedule a call to cpufreq_get() for every
+ * online CPU, to verify that the CPU frequency the timing core assumes
+ * is still correct.
+ */
+static inline void cpufreq_delayed_get(void)
+{
+	if (cpufreq_init && !cpufreq_delayed_issched) {
+		cpufreq_delayed_issched = 1;
+		printk(KERN_DEBUG "Checking if CPU frequency changed.\n");
+		schedule_work(&cpufreq_delayed_get_work);
+	}
+}
+
+/*
+ * if the CPU frequency is scaled, TSC-based delays will need a different
+ * loops_per_jiffy value to function properly.
+ */
+static unsigned int ref_freq = 0;
+static unsigned long loops_per_jiffy_ref = 0;
+
+#ifndef CONFIG_SMP
+static unsigned long cpu_khz_ref = 0;
+#endif
+
+static int
+time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, void *data)
+{
+	struct cpufreq_freqs *freq = data;
+
+	if (val != CPUFREQ_RESUMECHANGE)
+		write_seqlock_irq(&xtime_lock);
+
+	if (!ref_freq) {
+		ref_freq = freq->old;
+		loops_per_jiffy_ref = cpu_data[freq->cpu].loops_per_jiffy;
+#ifndef CONFIG_SMP
+		cpu_khz_ref = cpu_khz;
+#endif
+	}
+
+	if ((val == CPUFREQ_PRECHANGE  && freq->old < freq->new) ||
+	    (val == CPUFREQ_POSTCHANGE && freq->old > freq->new) ||
+	    (val == CPUFREQ_RESUMECHANGE)) {
+		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+			cpu_data[freq->cpu].loops_per_jiffy = cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
+
+		if (cpu_khz) {
+#ifndef CONFIG_SMP
+			cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq,
+						freq->new);
+#endif
+			if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
+				tsc_khz = cpu_khz;
+				set_cyc2ns_scale(cpu_khz);
+				/*
+				 * TSC based sched_clock turns
+				 * to junk w/ cpufreq
+				 */
+				mark_tsc_unstable();
+			}
+		}
+	}
+
+	if (val != CPUFREQ_RESUMECHANGE)
+		write_sequnlock_irq(&xtime_lock);
+
+	return 0;
+}
+
+static struct notifier_block time_cpufreq_notifier_block = {
+	.notifier_call	= time_cpufreq_notifier
+};
+
+static int __init cpufreq_tsc(void)
+{
+	int ret;
+
+	INIT_WORK(&cpufreq_delayed_get_work, handle_cpufreq_delayed_get, NULL);
+	ret = cpufreq_register_notifier(&time_cpufreq_notifier_block,
+					CPUFREQ_TRANSITION_NOTIFIER);
+	if (!ret)
+		cpufreq_init = 1;
+
+	return ret;
+}
+
+core_initcall(cpufreq_tsc);
+
+#endif
+
+/* Clock source code */
+
+static unsigned long current_tsc_khz = 0;
+static int tsc_update_callback(void);
+
+#ifdef CONFIG_PARANOID_GENERIC_TIME
+/* This will hurt performance! */
+static DEFINE_RAW_SPINLOCK(checktsc_lock);
+static cycle_t last_tsc;
+
+static cycle_t read_tsc(void)
+{
+	static int once = 1;
+
+	unsigned long flags;
+	cycle_t ret;
+
+	spin_lock_irqsave(&checktsc_lock, flags);
+
+	rdtscll(ret);
+
+	if (once && ret < last_tsc) {
+		once = 0;
+		spin_unlock_irqrestore(&checktsc_lock, flags);
+		printk("BUG in read_tsc(): TSC went backward!\n");
+		if (num_online_cpus() > 1)
+			printk("... Unsynced TSCs?\n");
+		printk("... [ from %016Lx to %016Lx ]\n", last_tsc, ret);
+
+	} else {
+		last_tsc = ret;
+		spin_unlock_irqrestore(&checktsc_lock, flags);
+	}
+
+	return ret;
+}
+
+static cycle_t read_tsc_c3(void)
+{
+	static int once = 1;
+
+	unsigned long flags;
+	cycle_t ret;
+
+	spin_lock_irqsave(&checktsc_lock, flags);
+
+	rdtscll(ret);
+	ret += tsc_read_c3_time();
+
+	if (once && ret < last_tsc) {
+		once = 0;
+		spin_unlock_irqrestore(&checktsc_lock, flags);
+		printk("BUG in read_tsc_c3(): TSC went backward!\n");
+		if (num_online_cpus() > 1)
+			printk("... Unsynced TSCs?\n");
+		printk("... [ from %016Lx to %016Lx ]\n", last_tsc, ret);
+	} else {
+		last_tsc = ret;
+		spin_unlock_irqrestore(&checktsc_lock, flags);
+	}
+
+	return ret;
+}
+
+#else /* CONFIG_PARANOID_GENERIC_TIME */
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+
+	rdtscll(ret);
+
+	return ret;
+}
+
+static cycle_t read_tsc_c3(void)
+{
+	cycle_t ret;
+
+	rdtscll(ret);
+
+	return ret + tsc_read_c3_time();
+}
+
+#endif /* CONFIG_PARANOID_GENERIC_TIME */
+
+static struct clocksource clocksource_tsc = {
+	.name			= "tsc",
+	.rating			= 300,
+	.read			= read_tsc,
+	.mask			= (cycle_t)-1,
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.update_callback	= tsc_update_callback,
+	.is_continuous		= 1,
+};
+
+static int tsc_update_callback(void)
+{
+	int change = 0;
+
+	/* check to see if we should switch to the safe clocksource: */
+	if (tsc_read_c3_time() && strncmp(clocksource_tsc.name, "c3tsc", 5)) {
+		printk("Falling back to C3 safe TSC\n");
+		clocksource_tsc.read = read_tsc_c3;
+		clocksource_tsc.name = "c3tsc";
+		change = 1;
+	}
+
+	if (clocksource_tsc.rating != 50 && check_tsc_unstable()) {
+		clocksource_tsc.rating = 50;
+		reselect_clocksource();
+		change = 1;
+	}
+
+	/* only update if tsc_khz has changed: */
+	if (current_tsc_khz != tsc_khz) {
+		current_tsc_khz = tsc_khz;
+		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+							clocksource_tsc.shift);
+		change = 1;
+	}
+
+	return change;
+}
+
+/*
+ * Make an educated guess if the TSC is trustworthy and synchronized
+ * over all CPUs.
+ */
+static __init int unsynchronized_tsc(void)
+{
+	/*
+	 * Intel systems are normally all synchronized.
+	 * Exceptions must mark TSC as unstable:
+	 */
+	if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
+ 		return 0;
+
+	/* assume multi socket systems are not synchronized: */
+ 	return num_possible_cpus() > 1;
+}
+
+/* NUMAQ can't use TSC: */
+#ifndef CONFIG_X86_NUMAQ
+static int __init init_tsc_clocksource(void)
+{
+	/* tsc_khz is set up by tsc_init() in this file */
+	if (cpu_has_tsc && tsc_khz) {
+		if (unsynchronized_tsc()) /* lower rating if unsynced */
+			clocksource_tsc.rating = 150;
+		current_tsc_khz = tsc_khz;
+		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+							clocksource_tsc.shift);
+		register_clocksource(&clocksource_tsc);
+	}
+
+	return 0;
+}
+
+module_init(init_tsc_clocksource);
+
+#endif
Index: linux/arch/i386/kernel/vm86.c
===================================================================
--- linux.orig/arch/i386/kernel/vm86.c
+++ linux/arch/i386/kernel/vm86.c
@@ -105,9 +105,10 @@ struct pt_regs * fastcall save_v86_state
 	 * from process context. Enable interrupts here, before trying
 	 * to access user space.
 	 */
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	if (!current->thread.vm86_info) {
+		raw_local_irq_disable();
 		printk("no vm86_info: BAD\n");
 		do_exit(SIGSEGV);
 	}
Index: linux/arch/i386/lib/bitops.c
===================================================================
--- linux.orig/arch/i386/lib/bitops.c
+++ linux/arch/i386/lib/bitops.c
@@ -68,3 +68,37 @@ int find_next_zero_bit(const unsigned lo
 	return (offset + set + res);
 }
 EXPORT_SYMBOL(find_next_zero_bit);
+
+
+/*
+ * rw spinlock fallbacks
+ */
+#if defined(CONFIG_SMP)
+asm(
+".section .sched.text\n"
+".align	4\n"
+".globl	__write_lock_failed\n"
+"__write_lock_failed:\n\t"
+	LOCK "addl	$" RW_LOCK_BIAS_STR ",(%eax)\n"
+"1:	rep; nop\n\t"
+	"cmpl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
+	"jne	1b\n\t"
+	LOCK "subl	$" RW_LOCK_BIAS_STR ",(%eax)\n\t"
+	"jnz	__write_lock_failed\n\t"
+	"ret"
+);
+
+asm(
+".section .sched.text\n"
+".align	4\n"
+".globl	__read_lock_failed\n"
+"__read_lock_failed:\n\t"
+	LOCK "incl	(%eax)\n"
+"1:	rep; nop\n\t"
+	"cmpl	$1,(%eax)\n\t"
+	"js	1b\n\t"
+	LOCK "decl	(%eax)\n\t"
+	"js	__read_lock_failed\n\t"
+	"ret"
+);
+#endif
Index: linux/arch/i386/lib/delay.c
===================================================================
--- linux.orig/arch/i386/lib/delay.c
+++ linux/arch/i386/lib/delay.c
@@ -10,43 +10,93 @@
  *	we have to worry about.
  */
 
+#include <linux/timeofday.h>
+#include <linux/module.h>
 #include <linux/config.h>
 #include <linux/sched.h>
 #include <linux/delay.h>
-#include <linux/module.h>
+
 #include <asm/processor.h>
 #include <asm/delay.h>
 #include <asm/timer.h>
 
 #ifdef CONFIG_SMP
-#include <asm/smp.h>
+# include <asm/smp.h>
 #endif
 
-extern struct timer_opts* timer;
+/* simple loop based delay: */
+static void delay_loop(unsigned long loops)
+{
+	int d0;
+
+	__asm__ __volatile__(
+		"\tjmp 1f\n"
+		".align 16\n"
+		"1:\tjmp 2f\n"
+		".align 16\n"
+		"2:\tdecl %0\n\tjns 2b"
+		:"=&a" (d0)
+		:"0" (loops));
+}
+
+/* TSC based delay: */
+static void delay_tsc(unsigned long loops)
+{
+	unsigned long bclock, now;
+
+	rdtscl(bclock);
+	do {
+		rep_nop();
+		rdtscl(now);
+	} while ((now-bclock) < loops);
+}
+
+/*
+ * Since we calibrate only once at boot, this
+ * function should be set once at boot and not changed
+ */
+static void (*delay_fn)(unsigned long) = delay_loop;
+
+void use_tsc_delay(void)
+{
+	delay_fn = delay_tsc;
+}
+
+int read_current_timer(unsigned long *timer_val)
+{
+	if (delay_fn == delay_tsc) {
+		rdtscl(*timer_val);
+		return 0;
+	}
+	return -1;
+}
 
 void __delay(unsigned long loops)
 {
-	cur_timer->delay(loops);
+	delay_fn(loops);
 }
 
 inline void __const_udelay(unsigned long xloops)
 {
 	int d0;
+
 	xloops *= 4;
 	__asm__("mull %0"
 		:"=d" (xloops), "=&a" (d0)
-		:"1" (xloops),"0" (cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
-        __delay(++xloops);
+		:"1" (xloops), "0"
+		(cpu_data[raw_smp_processor_id()].loops_per_jiffy * (HZ/4)));
+
+	__delay(++xloops);
 }
 
 void __udelay(unsigned long usecs)
 {
-	__const_udelay(usecs * 0x000010c7);  /* 2**32 / 1000000 (rounded up) */
+	__const_udelay(usecs * 0x000010c7); /* 2**32 / 1000000 (rounded up) */
 }
 
 void __ndelay(unsigned long nsecs)
 {
-	__const_udelay(nsecs * 0x00005);  /* 2**32 / 1000000000 (rounded up) */
+	__const_udelay(nsecs * 0x00005); /* 2**32 / 1000000000 (rounded up) */
 }
 
 EXPORT_SYMBOL(__delay);
Index: linux/arch/i386/mach-default/setup.c
===================================================================
--- linux.orig/arch/i386/mach-default/setup.c
+++ linux/arch/i386/mach-default/setup.c
@@ -34,7 +34,7 @@ void __init pre_intr_init_hook(void)
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL};
 
 /**
  * intr_init_hook - post gate setup interrupt initialisation
@@ -78,8 +78,6 @@ void __init trap_init_hook(void)
 {
 }
 
-static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL};
-
 /**
  * time_init_hook - do any specific initialisations for the system timer.
  *
@@ -89,7 +87,6 @@ static struct irqaction irq0  = { timer_
  **/
 void __init time_init_hook(void)
 {
-	setup_irq(0, &irq0);
 }
 
 #ifdef CONFIG_MCA
Index: linux/arch/i386/mach-es7000/es7000plat.c
===================================================================
--- linux.orig/arch/i386/mach-es7000/es7000plat.c
+++ linux/arch/i386/mach-es7000/es7000plat.c
@@ -65,7 +65,7 @@ es7000_rename_gsi(int ioapic, int gsi)
 	if (!base) {
 		int i;
 		for (i = 0; i < nr_ioapics; i++)
-			base += nr_ioapic_registers[i];
+			base += nr_ioapic_registers(i);
 	}
 
 	if (!ioapic && (gsi < 16)) 
Index: linux/arch/i386/mach-visws/setup.c
===================================================================
--- linux.orig/arch/i386/mach-visws/setup.c
+++ linux/arch/i386/mach-visws/setup.c
@@ -113,7 +113,7 @@ void __init pre_setup_arch_hook()
 
 static struct irqaction irq0 = {
 	.handler =	timer_interrupt,
-	.flags =	SA_INTERRUPT,
+	.flags =	SA_INTERRUPT | SA_NODELAY,
 	.name =		"timer",
 };
 
Index: linux/arch/i386/mach-visws/visws_apic.c
===================================================================
--- linux.orig/arch/i386/mach-visws/visws_apic.c
+++ linux/arch/i386/mach-visws/visws_apic.c
@@ -260,11 +260,13 @@ out_unlock:
 static struct irqaction master_action = {
 	.handler =	piix4_master_intr,
 	.name =		"PIIX4-8259",
+	.flags =	SA_NODELAY,
 };
 
 static struct irqaction cascade_action = {
 	.handler = 	no_action,
 	.name =		"cascade",
+	.flags =	SA_NODELAY,
 };
 
 
Index: linux/arch/i386/mach-voyager/setup.c
===================================================================
--- linux.orig/arch/i386/mach-voyager/setup.c
+++ linux/arch/i386/mach-voyager/setup.c
@@ -16,7 +16,7 @@ void __init pre_intr_init_hook(void)
 /*
  * IRQ2 is cascade interrupt to second interrupt controller
  */
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL};
 
 void __init intr_init_hook(void)
 {
@@ -39,7 +39,7 @@ void __init trap_init_hook(void)
 {
 }
 
-static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL};
+static struct irqaction irq0  = { timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL};
 
 void __init time_init_hook(void)
 {
Index: linux/arch/i386/mm/fault.c
===================================================================
--- linux.orig/arch/i386/mm/fault.c
+++ linux/arch/i386/mm/fault.c
@@ -39,6 +39,8 @@ void bust_spinlocks(int yes)
 	int loglevel_save = console_loglevel;
 
 	if (yes) {
+		stop_trace();
+		zap_rt_locks();
 		oops_in_progress = 1;
 		return;
 	}
@@ -224,8 +226,8 @@ fastcall void do_invalid_op(struct pt_re
  *	bit 1 == 0 means read, 1 means write
  *	bit 2 == 0 means kernel, 1 means user-mode
  */
-fastcall void __kprobes do_page_fault(struct pt_regs *regs,
-				      unsigned long error_code)
+fastcall notrace void __kprobes do_page_fault(struct pt_regs *regs,
+					      unsigned long error_code)
 {
 	struct task_struct *tsk;
 	struct mm_struct *mm;
@@ -236,13 +238,14 @@ fastcall void __kprobes do_page_fault(st
 
 	/* get the address */
         address = read_cr2();
+	trace_special(regs->eip, error_code, address);
 
 	if (notify_die(DIE_PAGE_FAULT, "page fault", regs, error_code, 14,
 					SIGSEGV) == NOTIFY_STOP)
 		return;
 	/* It's safe to allow irq's after cr2 has been saved */
 	if (regs->eflags & (X86_EFLAGS_IF|VM_MASK))
-		local_irq_enable();
+		raw_local_irq_enable();
 
 	tsk = current;
 
@@ -449,9 +452,9 @@ no_context:
 	}
 #endif
 	if (address < PAGE_SIZE)
-		printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference");
+		printk(KERN_ALERT "BUG: Unable to handle kernel NULL pointer dereference");
 	else
-		printk(KERN_ALERT "Unable to handle kernel paging request");
+		printk(KERN_ALERT "BUG: Unable to handle kernel paging request");
 	printk(" at virtual address %08lx\n",address);
 	printk(KERN_ALERT " printing eip:\n");
 	printk("%08lx\n", regs->eip);
Index: linux/arch/i386/mm/highmem.c
===================================================================
--- linux.orig/arch/i386/mm/highmem.c
+++ linux/arch/i386/mm/highmem.c
@@ -18,6 +18,27 @@ void kunmap(struct page *page)
 	kunmap_high(page);
 }
 
+void kunmap_virt(void *ptr)
+{
+	struct page *page;
+
+	if ((unsigned long)ptr < PKMAP_ADDR(0))
+		return;
+	page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+	kunmap(page);
+}
+
+struct page *kmap_to_page(void *ptr)
+{
+	struct page *page;
+
+	if ((unsigned long)ptr < PKMAP_ADDR(0))
+		return virt_to_page(ptr);
+	page = pte_page(pkmap_page_table[PKMAP_NR((unsigned long)ptr)]);
+	return page;
+}
+
+
 /*
  * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
  * no global lock is needed and because the kmap code must perform a global TLB
@@ -26,7 +47,7 @@ void kunmap(struct page *page)
  * However when holding an atomic kmap is is not legal to sleep, so atomic
  * kmaps are appropriate for short, tight code paths only.
  */
-void *kmap_atomic(struct page *page, enum km_type type)
+void *__kmap_atomic(struct page *page, enum km_type type)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
@@ -48,7 +69,7 @@ void *kmap_atomic(struct page *page, enu
 	return (void*) vaddr;
 }
 
-void kunmap_atomic(void *kvaddr, enum km_type type)
+void __kunmap_atomic(void *kvaddr, enum km_type type)
 {
 #ifdef CONFIG_DEBUG_HIGHMEM
 	unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
@@ -78,7 +99,7 @@ void kunmap_atomic(void *kvaddr, enum km
 /* This is the same as kmap_atomic() but can map memory that doesn't
  * have a struct page associated with it.
  */
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
+void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type)
 {
 	enum fixed_addresses idx;
 	unsigned long vaddr;
@@ -93,7 +114,7 @@ void *kmap_atomic_pfn(unsigned long pfn,
 	return (void*) vaddr;
 }
 
-struct page *kmap_atomic_to_page(void *ptr)
+struct page *__kmap_atomic_to_page(void *ptr)
 {
 	unsigned long idx, vaddr = (unsigned long)ptr;
 	pte_t *pte;
@@ -108,6 +129,7 @@ struct page *kmap_atomic_to_page(void *p
 
 EXPORT_SYMBOL(kmap);
 EXPORT_SYMBOL(kunmap);
-EXPORT_SYMBOL(kmap_atomic);
-EXPORT_SYMBOL(kunmap_atomic);
-EXPORT_SYMBOL(kmap_atomic_to_page);
+EXPORT_SYMBOL(kunmap_virt);
+EXPORT_SYMBOL(__kmap_atomic);
+EXPORT_SYMBOL(__kunmap_atomic);
+EXPORT_SYMBOL(__kmap_atomic_to_page);
Index: linux/arch/i386/mm/init.c
===================================================================
--- linux.orig/arch/i386/mm/init.c
+++ linux/arch/i386/mm/init.c
@@ -42,7 +42,7 @@
 
 unsigned int __VMALLOC_RESERVE = 128 << 20;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static int noinline do_test_wp_bit(void);
Index: linux/arch/i386/mm/pageattr.c
===================================================================
--- linux.orig/arch/i386/mm/pageattr.c
+++ linux/arch/i386/mm/pageattr.c
@@ -207,6 +207,9 @@ void kernel_map_pages(struct page *page,
 {
 	if (PageHighMem(page))
 		return;
+	if (!enable)
+		check_no_locks_freed(page_address(page), page_address(page+numpages));
+
 	/* the return value is ignored - the calls cannot fail,
 	 * large pages are disabled at boot time.
 	 */
Index: linux/arch/i386/mm/pgtable.c
===================================================================
--- linux.orig/arch/i386/mm/pgtable.c
+++ linux/arch/i386/mm/pgtable.c
@@ -180,7 +180,7 @@ void pmd_ctor(void *pmd, kmem_cache_t *c
  * recommendations and having no core impact whatsoever.
  * -- wli
  */
-DEFINE_SPINLOCK(pgd_lock);
+DEFINE_RAW_SPINLOCK(pgd_lock);
 struct page *pgd_list;
 
 static inline void pgd_list_add(pgd_t *pgd)
Index: linux/arch/i386/oprofile/Kconfig
===================================================================
--- linux.orig/arch/i386/oprofile/Kconfig
+++ linux/arch/i386/oprofile/Kconfig
@@ -19,5 +19,9 @@ config OPROFILE
 
 	  If unsure, say N.
 
+config PROFILE_NMI
+	bool
+	default y
+
 endmenu
 
Index: linux/arch/i386/pci/direct.c
===================================================================
--- linux.orig/arch/i386/pci/direct.c
+++ linux/arch/i386/pci/direct.c
@@ -211,16 +211,23 @@ static int __init pci_check_type1(void)
 	unsigned int tmp;
 	int works = 0;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb(0x01, 0xCFB);
 	tmp = inl(0xCF8);
 	outl(0x80000000, 0xCF8);
-	if (inl(0xCF8) == 0x80000000 && pci_sanity_check(&pci_direct_conf1)) {
-		works = 1;
+
+	if (inl(0xCF8) == 0x80000000) {
+		spin_unlock_irqrestore(&pci_config_lock, flags);
+
+		if (pci_sanity_check(&pci_direct_conf1))
+			works = 1;
+
+		spin_lock_irqsave(&pci_config_lock, flags);
 	}
 	outl(tmp, 0xCF8);
-	local_irq_restore(flags);
+
+	spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return works;
 }
@@ -230,17 +237,19 @@ static int __init pci_check_type2(void)
 	unsigned long flags;
 	int works = 0;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&pci_config_lock, flags);
 
 	outb(0x00, 0xCFB);
 	outb(0x00, 0xCF8);
 	outb(0x00, 0xCFA);
-	if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00 &&
-	    pci_sanity_check(&pci_direct_conf2)) {
-		works = 1;
-	}
 
-	local_irq_restore(flags);
+	if (inb(0xCF8) == 0x00 && inb(0xCFA) == 0x00) {
+		spin_unlock_irqrestore(&pci_config_lock, flags);
+
+		if (pci_sanity_check(&pci_direct_conf2))
+			works = 1;
+	} else
+		spin_unlock_irqrestore(&pci_config_lock, flags);
 
 	return works;
 }
Index: linux/arch/i386/pci/pcbios.c
===================================================================
--- linux.orig/arch/i386/pci/pcbios.c
+++ linux/arch/i386/pci/pcbios.c
@@ -70,7 +70,7 @@ static unsigned long bios32_service(unsi
 	unsigned long entry;		/* %edx */
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	__asm__("lcall *(%%edi); cld"
 		: "=a" (return_code),
 		  "=b" (address),
@@ -79,7 +79,7 @@ static unsigned long bios32_service(unsi
 		: "0" (service),
 		  "1" (0),
 		  "D" (&bios32_indirect));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	switch (return_code) {
 		case 0:
@@ -110,7 +110,7 @@ static int __devinit check_pcibios(void)
 	if ((pcibios_entry = bios32_service(PCI_SERVICE))) {
 		pci_indirect.address = pcibios_entry + PAGE_OFFSET;
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		__asm__(
 			"lcall *(%%edi); cld\n\t"
 			"jc 1f\n\t"
@@ -123,7 +123,7 @@ static int __devinit check_pcibios(void)
 			: "1" (PCIBIOS_PCI_BIOS_PRESENT),
 			  "D" (&pci_indirect)
 			: "memory");
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 
 		status = (eax >> 8) & 0xff;
 		hw_mech = eax & 0xff;
Index: linux/arch/ia64/kernel/time.c
===================================================================
--- linux.orig/arch/ia64/kernel/time.c
+++ linux/arch/ia64/kernel/time.c
@@ -32,10 +32,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #define TIME_KEEPER_ID	0	/* smp_processor_id() of time-keeper */
 
 #ifdef CONFIG_IA64_DEBUG_IRQ
Index: linux/arch/m32r/kernel/time.c
===================================================================
--- linux.orig/arch/m32r/kernel/time.c
+++ linux/arch/m32r/kernel/time.c
@@ -39,10 +39,6 @@ extern void send_IPI_allbutself(int, int
 extern void smp_local_timer_interrupt(struct pt_regs *);
 #endif
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;
 #define TICK_SIZE	(tick_nsec / 1000)
 
Index: linux/arch/m68k/kernel/time.c
===================================================================
--- linux.orig/arch/m68k/kernel/time.c
+++ linux/arch/m68k/kernel/time.c
@@ -27,10 +27,6 @@
 #include <linux/timex.h>
 #include <linux/profile.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 static inline int set_rtc_mmss(unsigned long nowtime)
 {
   if (mach_set_clock_mmss)
Index: linux/arch/m68knommu/kernel/time.c
===================================================================
--- linux.orig/arch/m68knommu/kernel/time.c
+++ linux/arch/m68knommu/kernel/time.c
@@ -27,10 +27,6 @@
 
 #define	TICK_SIZE (tick_nsec / 1000)
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 extern unsigned long wall_jiffies;
 
 
Index: linux/arch/mips/Kconfig
===================================================================
--- linux.orig/arch/mips/Kconfig
+++ linux/arch/mips/Kconfig
@@ -364,6 +364,7 @@ config MIPS_SEAD
 config MOMENCO_OCELOT
 	bool "Support for Momentum Ocelot board"
 	select DMA_NONCOHERENT
+	select NO_SPINLOCK
 	select HW_HAS_PCI
 	select IRQ_CPU
 	select IRQ_CPU_RM7K
@@ -750,6 +751,7 @@ config SIBYTE_SB1xxx_SOC
 	depends on EXPERIMENTAL
 	select BOOT_ELF32
 	select DMA_COHERENT
+	select NO_SPINLOCK
 	select SWAP_IO_SPACE
 	select SYS_SUPPORTS_32BIT_KERNEL
 	select SYS_SUPPORTS_64BIT_KERNEL
@@ -998,12 +1000,21 @@ config TOSHIBA_FPCIB0
 	bool "FPCIB0 Backplane Support"
 	depends on TOSHIBA_RBTX4927
 
+source "kernel/Kconfig.preempt"
+
 config RWSEM_GENERIC_SPINLOCK
 	bool
+	depends on !PREEMPT_RT
 	default y
 
 config RWSEM_XCHGADD_ALGORITHM
 	bool
+	depends on !PREEMPT_RT
+
+config ASM_SEMAPHORES
+	bool
+#	depends on !PREEMPT_RT
+	default y
 
 config GENERIC_CALIBRATE_DELAY
 	bool
@@ -1034,6 +1045,9 @@ config DMA_NONCOHERENT
 config DMA_NEED_PCI_MAP_STATE
 	bool
 
+config NO_SPINLOCK
+	bool
+
 config EARLY_PRINTK
 	bool
 	depends on MACH_DECSTATION
@@ -1543,15 +1557,6 @@ config NR_CPUS
 	  This is purely to save memory - each supported CPU adds
 	  approximately eight kilobytes to the kernel image.
 
-config PREEMPT
-	bool "Preemptible Kernel"
-	help
-	  This option reduces the latency of the kernel when reacting to
-	  real-time or interactive events by allowing a low priority process to
-	  be preempted even if it is in kernel mode executing a system call.
-	  This allows applications to run more reliably even when the system is
-	  under load.
-
 config RTC_DS1742
 	bool "DS1742 BRAM/RTC support"
 	depends on TOSHIBA_JMR3927 || TOSHIBA_RBTX4927
@@ -1566,10 +1571,6 @@ config MIPS_INSANE_LARGE
 	  This will result in additional memory usage, so it is not
 	  recommended for normal users.
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-	default y
-
 endmenu
 
 menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)"
Index: linux/arch/mips/arc/misc.c
===================================================================
--- linux.orig/arch/mips/arc/misc.c
+++ linux/arch/mips/arc/misc.c
@@ -27,7 +27,7 @@ VOID
 ArcHalt(VOID)
 {
 	bc_disable();
-	local_irq_disable();
+	raw_local_irq_disable();
 #ifdef CONFIG_SCSI_SGIWD93
 	reset_wd33c93(sgiwd93_host);
 #endif
@@ -39,7 +39,7 @@ VOID
 ArcPowerDown(VOID)
 {
 	bc_disable();
-	local_irq_disable();
+	raw_local_irq_disable();
 #ifdef CONFIG_SCSI_SGIWD93
 	reset_wd33c93(sgiwd93_host);
 #endif
@@ -52,7 +52,7 @@ VOID
 ArcRestart(VOID)
 {
 	bc_disable();
-	local_irq_disable();
+	raw_local_irq_disable();
 #ifdef CONFIG_SCSI_SGIWD93
 	reset_wd33c93(sgiwd93_host);
 #endif
@@ -64,7 +64,7 @@ VOID
 ArcReboot(VOID)
 {
 	bc_disable();
-	local_irq_disable();
+	raw_local_irq_disable();
 #ifdef CONFIG_SCSI_SGIWD93
 	reset_wd33c93(sgiwd93_host);
 #endif
@@ -76,7 +76,7 @@ VOID
 ArcEnterInteractiveMode(VOID)
 {
 	bc_disable();
-	local_irq_disable();
+	raw_local_irq_disable();
 #ifdef CONFIG_SCSI_SGIWD93
 	reset_wd33c93(sgiwd93_host);
 #endif
Index: linux/arch/mips/au1000/common/irq.c
===================================================================
--- linux.orig/arch/mips/au1000/common/irq.c
+++ linux/arch/mips/au1000/common/irq.c
@@ -253,47 +253,43 @@ void restore_local_and_enable(int contro
 
 
 static struct hw_interrupt_type rise_edge_irq_type = {
-	"Au1000 Rise Edge",
-	startup_irq,
-	shutdown_irq,
-	local_enable_irq,
-	local_disable_irq,
-	mask_and_ack_rise_edge_irq,
-	end_irq,
-	NULL
+	.typename = "Au1000 Rise Edge",
+	.startup = startup_irq,
+	.shutdown = shutdown_irq,
+	.enable = local_enable_irq,
+	.disable = local_disable_irq,
+	.ack = mask_and_ack_rise_edge_irq,
+	.end = end_irq,
 };
 
 static struct hw_interrupt_type fall_edge_irq_type = {
-	"Au1000 Fall Edge",
-	startup_irq,
-	shutdown_irq,
-	local_enable_irq,
-	local_disable_irq,
-	mask_and_ack_fall_edge_irq,
-	end_irq,
-	NULL
+	.typename = "Au1000 Fall Edge",
+	.startup = startup_irq,
+	.shutdown = shutdown_irq,
+	.enable = local_enable_irq,
+	.disable = local_disable_irq,
+	.ack = mask_and_ack_fall_edge_irq,
+	.end = end_irq,
 };
 
 static struct hw_interrupt_type either_edge_irq_type = {
-	"Au1000 Rise or Fall Edge",
-	startup_irq,
-	shutdown_irq,
-	local_enable_irq,
-	local_disable_irq,
-	mask_and_ack_either_edge_irq,
-	end_irq,
-	NULL
+	.typename = "Au1000 Rise or Fall Edge",
+	.startup = startup_irq,
+	.shutdown = shutdown_irq,
+	.enable = local_enable_irq,
+	.disable = local_disable_irq,
+	.ack = mask_and_ack_either_edge_irq,
+	.end = end_irq,
 };
 
 static struct hw_interrupt_type level_irq_type = {
-	"Au1000 Level",
-	startup_irq,
-	shutdown_irq,
-	local_enable_irq,
-	local_disable_irq,
-	mask_and_ack_level_irq,
-	end_irq,
-	NULL
+	.typename = "Au1000 Level",
+	.startup = startup_irq,
+	.shutdown = shutdown_irq,
+	.enable = local_enable_irq,
+	.disable = local_disable_irq,
+	.ack = mask_and_ack_level_irq,
+	.end = end_irq,
 };
 
 #ifdef CONFIG_PM
Index: linux/arch/mips/ddb5xxx/ddb5074/nile4_pic.c
===================================================================
--- linux.orig/arch/mips/ddb5xxx/ddb5074/nile4_pic.c
+++ linux/arch/mips/ddb5xxx/ddb5074/nile4_pic.c
@@ -209,14 +209,13 @@ static void nile4_irq_end(unsigned int i
 #define nile4_irq_shutdown nile4_disable_irq
 
 static hw_irq_controller nile4_irq_controller = {
-    "nile4",
-    nile4_irq_startup,
-    nile4_irq_shutdown,
-    nile4_enable_irq,
-    nile4_disable_irq,
-    nile4_ack_irq,
-    nile4_irq_end,
-    NULL
+	.typename = "nile4",
+	.startup = nile4_irq_startup,
+	.shutdown = nile4_irq_shutdown,
+	.enable = nile4_enable_irq,
+	.disable = nile4_disable_irq,
+	.ack = nile4_ack_irq,
+	.end = nile4_irq_end,
 };
 
 void nile4_irq_setup(u32 base) {
Index: linux/arch/mips/ddb5xxx/ddb5476/vrc5476_irq.c
===================================================================
--- linux.orig/arch/mips/ddb5xxx/ddb5476/vrc5476_irq.c
+++ linux/arch/mips/ddb5xxx/ddb5476/vrc5476_irq.c
@@ -53,14 +53,13 @@ static void vrc5476_irq_end(uint irq)
 }
 
 static hw_irq_controller vrc5476_irq_controller = {
-	"vrc5476",
-	vrc5476_irq_startup,
-	vrc5476_irq_shutdown,
-	vrc5476_irq_enable,
-	vrc5476_irq_disable,
-	vrc5476_irq_ack,
-	vrc5476_irq_end,
-	NULL				/* no affinity stuff for UP */
+	.typename = "vrc5476",
+	.startup = vrc5476_irq_startup,
+	.shutdown = vrc5476_irq_shutdown,
+	.enable = vrc5476_irq_enable,
+	.disable = vrc5476_irq_disable,
+	.ack = vrc5476_irq_ack,
+	.end = vrc5476_irq_end,
 };
 
 void __init
Index: linux/arch/mips/ddb5xxx/ddb5477/irq_5477.c
===================================================================
--- linux.orig/arch/mips/ddb5xxx/ddb5477/irq_5477.c
+++ linux/arch/mips/ddb5xxx/ddb5477/irq_5477.c
@@ -90,14 +90,13 @@ vrc5477_irq_end(unsigned int irq)
 }
 
 hw_irq_controller vrc5477_irq_controller = {
-	"vrc5477_irq",
-	vrc5477_irq_startup,
-	vrc5477_irq_shutdown,
-	vrc5477_irq_enable,
-	vrc5477_irq_disable,
-	vrc5477_irq_ack,
-	vrc5477_irq_end,
-	NULL			/* no affinity stuff for UP */
+	.typename = "vrc5477_irq",
+	.startup = vrc5477_irq_startup,
+	.shutdown = vrc5477_irq_shutdown,
+	.enable = vrc5477_irq_enable,
+	.disable = vrc5477_irq_disable,
+	.ack = vrc5477_irq_ack,
+	.end = vrc5477_irq_end,
 };
 
 void __init vrc5477_irq_init(u32 irq_base)
Index: linux/arch/mips/gt64120/ev64120/irq.c
===================================================================
--- linux.orig/arch/mips/gt64120/ev64120/irq.c
+++ linux/arch/mips/gt64120/ev64120/irq.c
@@ -60,25 +60,25 @@ static void disable_ev64120_irq(unsigned
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (irq_nr >= 8) {	// All PCI interrupts are on line 5 or 2
 		clear_c0_status(9 << 10);
 	} else {
 		clear_c0_status(1 << (irq_nr + 8));
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void enable_ev64120_irq(unsigned int irq_nr)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (irq_nr >= 8)	// All PCI interrupts are on line 5 or 2
 		set_c0_status(9 << 10);
 	else
 		set_c0_status(1 << (irq_nr + 8));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_ev64120_irq(unsigned int irq)
@@ -119,7 +119,7 @@ void gt64120_irq_setup(void)
 	/* Sets the exception_handler array. */
 	set_except_vector(0, galileo_handle_int);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/*
 	 * Enable timer.  Other interrupts will be enabled as they are
Index: linux/arch/mips/gt64120/momenco_ocelot/irq.c
===================================================================
--- linux.orig/arch/mips/gt64120/momenco_ocelot/irq.c
+++ linux/arch/mips/gt64120/momenco_ocelot/irq.c
@@ -57,7 +57,7 @@ void __init arch_init_irq(void)
 	 * int-handler is not on bootstrap
 	 */
 	clear_c0_status(ST0_IM);
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Sets the first-level interrupt dispatcher. */
 	set_except_vector(0, ocelot_handle_int);
Index: linux/arch/mips/ite-boards/generic/irq.c
===================================================================
--- linux.orig/arch/mips/ite-boards/generic/irq.c
+++ linux/arch/mips/ite-boards/generic/irq.c
@@ -138,14 +138,13 @@ static void end_ite_irq(unsigned int irq
 }
 
 static struct hw_interrupt_type it8172_irq_type = {
-	"ITE8172",
-	startup_ite_irq,
-	shutdown_ite_irq,
-	enable_it8172_irq,
-	disable_it8172_irq,
-	mask_and_ack_ite_irq,
-	end_ite_irq,
-	NULL
+	.typename = "ITE8172",
+	.startup = startup_ite_irq,
+	.shutdown = shutdown_ite_irq,
+	.enable = enable_it8172_irq,
+	.disable = disable_it8172_irq,
+	.ack = mask_and_ack_ite_irq,
+	.end = end_ite_irq,
 };
 
 
@@ -159,22 +158,22 @@ static void ack_none(unsigned int irq) {
 #define end_none	enable_none
 
 static struct hw_interrupt_type cp0_irq_type = {
-	"CP0 Count",
-	startup_none,
-	shutdown_none,
-	enable_none,
-	disable_none,
-	ack_none,
-	end_none
+	.typename = "CP0 Count",
+	.startup = startup_none,
+	.shutdown = shutdown_none,
+	.enable = enable_none,
+	.disable = disable_none,
+	.ack = ack_none,
+	.end = end_none,
 };
 
 void enable_cpu_timer(void)
 {
         unsigned long flags;
 
-        local_irq_save(flags);
+        raw_local_irq_save(flags);
 	set_c0_status(0x100 << EXT_IRQ5_TO_IP);
-        local_irq_restore(flags);
+        raw_local_irq_restore(flags);
 }
 
 void __init arch_init_irq(void)
Index: linux/arch/mips/ite-boards/generic/time.c
===================================================================
--- linux.orig/arch/mips/ite-boards/generic/time.c
+++ linux/arch/mips/ite-boards/generic/time.c
@@ -124,7 +124,7 @@ static unsigned long __init cal_r4koff(v
 {
 	unsigned int flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* Start counter exactly on falling edge of update flag */
 	while (CMOS_READ(RTC_REG_A) & RTC_UIP);
@@ -140,7 +140,7 @@ static unsigned long __init cal_r4koff(v
 	mips_hpt_frequency = read_c0_count();
 
 	/* restore interrupts */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return (mips_hpt_frequency / HZ);
 }
@@ -153,11 +153,11 @@ it8172_rtc_get_time(void)
 
 	/* avoid update-in-progress. */
 	for (;;) {
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		if (! (CMOS_READ(RTC_REG_A) & RTC_UIP))
 			break;
 		/* don't hold intr closed all the time */
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 
 	/* Read regs. */
@@ -170,7 +170,7 @@ it8172_rtc_get_time(void)
 		hw_to_bin(*rtc_century_reg) * 100;
 
 	/* restore interrupts */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return mktime(year, mon, day, hour, min, sec);
 }
@@ -186,11 +186,11 @@ it8172_rtc_set_time(unsigned long t)
 
 	/* avoid update-in-progress. */
 	for (;;) {
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		if (! (CMOS_READ(RTC_REG_A) & RTC_UIP))
 			break;
 		/* don't hold intr closed all the time */
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 
 	*rtc_century_reg = bin_to_hw(tm.tm_year/100);
@@ -202,7 +202,7 @@ it8172_rtc_set_time(unsigned long t)
 	CMOS_WRITE(bin_to_hw(tm.tm_year%100), RTC_YEAR);
 
 	/* restore interrupts */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return 0;
 }
@@ -211,7 +211,7 @@ void __init it8172_time_init(void)
 {
         unsigned int est_freq, flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	saved_control = CMOS_READ(RTC_CONTROL);
 
@@ -225,7 +225,7 @@ void __init it8172_time_init(void)
 	printk("CPU frequency %d.%02d MHz\n", est_freq/1000000,
 	       (est_freq%1000000)*100/1000000);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	rtc_get_time = it8172_rtc_get_time;
 	rtc_set_time = it8172_rtc_set_time;
Index: linux/arch/mips/jazz/irq.c
===================================================================
--- linux.orig/arch/mips/jazz/irq.c
+++ linux/arch/mips/jazz/irq.c
@@ -58,14 +58,13 @@ static void end_r4030_irq(unsigned int i
 }
 
 static struct hw_interrupt_type r4030_irq_type = {
-	"R4030",
-	startup_r4030_irq,
-	shutdown_r4030_irq,
-	enable_r4030_irq,
-	disable_r4030_irq,
-	mask_and_ack_r4030_irq,
-	end_r4030_irq,
-	NULL
+	.typename = "R4030",
+	.startup = startup_r4030_irq,
+	.shutdown = shutdown_r4030_irq,
+	.enable = enable_r4030_irq,
+	.disable = disable_r4030_irq,
+	.ack = mask_and_ack_r4030_irq,
+	.end = end_r4030_irq,
 };
 
 void __init init_r4030_ints(void)
Index: linux/arch/mips/jmr3927/rbhma3100/irq.c
===================================================================
--- linux.orig/arch/mips/jmr3927/rbhma3100/irq.c
+++ linux/arch/mips/jmr3927/rbhma3100/irq.c
@@ -412,13 +412,13 @@ void __init arch_init_irq(void)
 }
 
 static hw_irq_controller jmr3927_irq_controller = {
-	"jmr3927_irq",
-	jmr3927_irq_startup,
-	jmr3927_irq_shutdown,
-	jmr3927_irq_enable,
-	jmr3927_irq_disable,
-	jmr3927_irq_ack,
-	jmr3927_irq_end,
+	.typename = "jmr3927_irq",
+	.startup = jmr3927_irq_startup,
+	.shutdown = jmr3927_irq_shutdown,
+	.enable = jmr3927_irq_enable,
+	.disable = jmr3927_irq_disable,
+	.ack = jmr3927_irq_ack,
+	.end = jmr3927_irq_end,
 };
 
 void jmr3927_irq_init(u32 irq_base)
Index: linux/arch/mips/jmr3927/rbhma3100/setup.c
===================================================================
--- linux.orig/arch/mips/jmr3927/rbhma3100/setup.c
+++ linux/arch/mips/jmr3927/rbhma3100/setup.c
@@ -108,7 +108,7 @@ static inline void do_reset(void)
 
 static void jmr3927_machine_restart(char *command)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	puts("Rebooting...");
 	do_reset();
 }
Index: linux/arch/mips/kernel/Makefile
===================================================================
--- linux.orig/arch/mips/kernel/Makefile
+++ linux/arch/mips/kernel/Makefile
@@ -5,7 +5,7 @@
 extra-y		:= head.o init_task.o vmlinux.lds
 
 obj-y		+= cpu-probe.o branch.o entry.o genex.o irq.o process.o \
-		   ptrace.o reset.o semaphore.o setup.o signal.o syscall.o \
+		   ptrace.o reset.o setup.o signal.o syscall.o \
 		   time.o traps.o unaligned.o
 
 binfmt_irix-objs	:= irixelf.o irixinv.o irixioctl.o irixsig.o	\
@@ -17,6 +17,8 @@ obj-$(CONFIG_32BIT)		+= module-elf32.o
 obj-$(CONFIG_64BIT)		+= module-elf64.o
 endif
 
+obj-$(CONFIG_ASM_SEMAPHORES)	+= semaphore.o
+
 obj-$(CONFIG_CPU_R3000)		+= r2300_fpu.o r2300_switch.o
 obj-$(CONFIG_CPU_TX39XX)	+= r2300_fpu.o r2300_switch.o
 obj-$(CONFIG_CPU_TX49XX)	+= r4k_fpu.o r4k_switch.o
Index: linux/arch/mips/kernel/asm-offsets.c
===================================================================
--- linux.orig/arch/mips/kernel/asm-offsets.c
+++ linux/arch/mips/kernel/asm-offsets.c
@@ -11,6 +11,9 @@
 #include <linux/config.h>
 #include <linux/compat.h>
 #include <linux/types.h>
+#include <linux/linkage.h>
+#include <linux/rt_irq.h>
+#include <asm/interrupt.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
Index: linux/arch/mips/kernel/cpu-bugs64.c
===================================================================
--- linux.orig/arch/mips/kernel/cpu-bugs64.c
+++ linux/arch/mips/kernel/cpu-bugs64.c
@@ -48,7 +48,7 @@ static inline void mult_sh_align_mod(lon
 	 * used for.
 	 */
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/*
 	 * The following code leads to a wrong result of the first
 	 * dsll32 when executed on R4000 rev. 2.2 or 3.0 (PRId
@@ -101,7 +101,7 @@ static inline void mult_sh_align_mod(lon
 		""
 		: "=r" (lv2)
 		: "0" (lv2), "r" (p));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	*v1 = lv1;
 	*v2 = lv2;
@@ -182,7 +182,7 @@ static inline void check_daddi(void)
 
 	printk("Checking for the daddi bug... ");
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	handler = set_except_vector(12, handle_daddi_ov);
 	/*
 	 * The following code fails to trigger an overflow exception
@@ -208,7 +208,7 @@ static inline void check_daddi(void)
 		: "=r" (v), "=&r" (tmp)
 		: "I" (0xffffffffffffdb9a), "I" (0x1234));
 	set_except_vector(12, handler);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	if (daddi_ov) {
 		printk("no.\n");
@@ -217,7 +217,7 @@ static inline void check_daddi(void)
 
 	printk("yes, workaround... ");
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	handler = set_except_vector(12, handle_daddi_ov);
 	asm volatile(
 		"addiu	%1, $0, %2\n\t"
@@ -226,7 +226,7 @@ static inline void check_daddi(void)
 		: "=r" (v), "=&r" (tmp)
 		: "I" (0xffffffffffffdb9a), "I" (0x1234));
 	set_except_vector(12, handler);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	if (daddi_ov) {
 		printk("yes.\n");
Index: linux/arch/mips/kernel/entry.S
===================================================================
--- linux.orig/arch/mips/kernel/entry.S
+++ linux/arch/mips/kernel/entry.S
@@ -23,7 +23,7 @@
 	.endm
 #else
 	.macro	preempt_stop reg=t0
-	local_irq_disable \reg
+	mips_raw_local_irq_disable \reg
 	.endm
 #define resume_kernel	restore_all
 #endif
@@ -38,7 +38,7 @@ FEXPORT(ret_from_irq)
 	beqz	t0, resume_kernel
 
 FEXPORT(resume_userspace)
-	local_irq_disable	t0	# make sure we dont miss an
+	mips_raw_local_irq_disable	t0	# make sure we dont miss an
 					# interrupt setting need_resched
 					# between sampling and return
 	LONG_L	a2, TI_FLAGS($28)	# current->work
@@ -48,6 +48,8 @@ FEXPORT(resume_userspace)
 
 #ifdef CONFIG_PREEMPT
 ENTRY(resume_kernel)
+	lw	t0, kernel_preemption
+	beqz	t0, restore_all
 	lw	t0, TI_PRE_COUNT($28)
 	bnez	t0, restore_all
 need_resched:
@@ -57,12 +59,9 @@ need_resched:
 	LONG_L	t0, PT_STATUS(sp)		# Interrupts off?
 	andi	t0, 1
 	beqz	t0, restore_all
-	li	t0, PREEMPT_ACTIVE
-	sw	t0, TI_PRE_COUNT($28)
-	local_irq_enable t0
-	jal	schedule
+	mips_raw_local_irq_disable t0
+	jal	preempt_schedule_irq
 	sw	zero, TI_PRE_COUNT($28)
-	local_irq_disable t0
 	b	need_resched
 #endif
 
@@ -70,7 +69,7 @@ FEXPORT(ret_from_fork)
 	jal	schedule_tail		# a0 = task_t *prev
 
 FEXPORT(syscall_exit)
-	local_irq_disable		# make sure need_resched and
+	mips_raw_local_irq_disable		# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)	# current->work
@@ -89,19 +88,19 @@ FEXPORT(restore_partial)		# restore part
 	.set	at
 
 FEXPORT(work_pending)
-	andi	t0, a2, _TIF_NEED_RESCHED
+	andi	t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	beqz	t0, work_notifysig
 work_resched:
-	jal	schedule
-
-	local_irq_disable t0		# make sure need_resched and
+	mips_raw_local_irq_disable
+	jal	__schedule
+					# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)
 	andi	t0, a2, _TIF_WORK_MASK	# is there any work to be done
 					# other than syscall tracing?
 	beqz	t0, restore_all
-	andi	t0, a2, _TIF_NEED_RESCHED
+	andi	t0, a2, (_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bnez	t0, work_resched
 
 work_notifysig:				# deal with pending signals and
@@ -118,7 +117,7 @@ FEXPORT(syscall_exit_work)
 	li	t1, _TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT
 	and	t0, t1
 	beqz	t0, work_pending	# trace bit is set
-	local_irq_enable		# could let do_syscall_trace()
+	mips_raw_local_irq_enable		# could let do_syscall_trace()
 					# call schedule() instead
 	move	a0, sp
 	li	a1, 1
Index: linux/arch/mips/kernel/gdb-stub.c
===================================================================
--- linux.orig/arch/mips/kernel/gdb-stub.c
+++ linux/arch/mips/kernel/gdb-stub.c
@@ -400,7 +400,7 @@ void set_debug_traps(void)
 	unsigned long flags;
 	unsigned char c;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
 		saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low);
 
@@ -416,7 +416,7 @@ void set_debug_traps(void)
 	putDebugChar('+'); /* ack it */
 
 	initialized = 1;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void restore_debug_traps(void)
@@ -424,10 +424,10 @@ void restore_debug_traps(void)
 	struct hard_trap_info *ht;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
 		set_except_vector(ht->tt, saved_vectors[ht->tt]);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -643,9 +643,11 @@ void set_async_breakpoint(unsigned long 
 	if ((*epc & 0x80000000) == 0)
 		return;
 
+#ifdef CONFIG_SMP
 	/* avoid deadlock if someone is make IPC */
 	if (spin_is_locked(&smp_call_lock))
 		return;
+#endif
 
 	async_bp.addr = *epc;
 	*epc = (unsigned long)async_breakpoint;
@@ -656,12 +658,12 @@ void kgdb_wait(void *arg)
 	unsigned flags;
 	int cpu = smp_processor_id();
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	spin_lock(&kgdb_cpulock[cpu]);
 	spin_unlock(&kgdb_cpulock[cpu]);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 
Index: linux/arch/mips/kernel/i8259.c
===================================================================
--- linux.orig/arch/mips/kernel/i8259.c
+++ linux/arch/mips/kernel/i8259.c
@@ -31,7 +31,7 @@ void disable_8259A_irq(unsigned int irq)
  * moves to arch independent land
  */
 
-spinlock_t DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 
 static void end_8259A_irq (unsigned int irq)
 {
@@ -52,14 +52,13 @@ static unsigned int startup_8259A_irq(un
 }
 
 static struct hw_interrupt_type i8259A_irq_type = {
-	"XT-PIC",
-	startup_8259A_irq,
-	shutdown_8259A_irq,
-	enable_8259A_irq,
-	disable_8259A_irq,
-	mask_and_ack_8259A,
-	end_8259A_irq,
-	NULL
+	.typename = "XT-PIC",
+	.startup = startup_8259A_irq,
+	.shutdown = shutdown_8259A_irq,
+	.enable = enable_8259A_irq,
+	.disable = disable_8259A_irq,
+	.ack = mask_and_ack_8259A,
+	.end = end_8259A_irq,
 };
 
 /*
Index: linux/arch/mips/kernel/init_task.c
===================================================================
--- linux.orig/arch/mips/kernel/init_task.c
+++ linux/arch/mips/kernel/init_task.c
@@ -9,8 +9,8 @@
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
+static struct fs_struct init_fs = INIT_FS(init_fs);
+static struct files_struct init_files = INIT_FILES(init_files);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
Index: linux/arch/mips/kernel/irq-msc01.c
===================================================================
--- linux.orig/arch/mips/kernel/irq-msc01.c
+++ linux/arch/mips/kernel/irq-msc01.c
@@ -129,25 +129,23 @@ msc_bind_eic_interrupt (unsigned int irq
 #define shutdown_msc_irq	disable_msc_irq
 
 struct hw_interrupt_type msc_levelirq_type = {
-	"SOC-it-Level",
-	startup_msc_irq,
-	shutdown_msc_irq,
-	enable_msc_irq,
-	disable_msc_irq,
-	level_mask_and_ack_msc_irq,
-	end_msc_irq,
-	NULL
+	.typename = "SOC-it-Level",
+	.startup = startup_msc_irq,
+	.shutdown = shutdown_msc_irq,
+	.enable = enable_msc_irq,
+	.disable = disable_msc_irq,
+	.ack = level_mask_and_ack_msc_irq,
+	.end = end_msc_irq,
 };
 
 struct hw_interrupt_type msc_edgeirq_type = {
-	"SOC-it-Edge",
-	startup_msc_irq,
-	shutdown_msc_irq,
-	enable_msc_irq,
-	disable_msc_irq,
-	edge_mask_and_ack_msc_irq,
-	end_msc_irq,
-	NULL
+	.typename = "SOC-it-Edge",
+	.startup = startup_msc_irq,
+	.shutdown = shutdown_msc_irq,
+	.enable = enable_msc_irq,
+	.disable = disable_msc_irq,
+	.ack = edge_mask_and_ack_msc_irq,
+	.end = end_msc_irq,
 };
 
 
Index: linux/arch/mips/kernel/irq-mv6434x.c
===================================================================
--- linux.orig/arch/mips/kernel/irq-mv6434x.c
+++ linux/arch/mips/kernel/irq-mv6434x.c
@@ -135,14 +135,13 @@ void ll_mv64340_irq(struct pt_regs *regs
 #define shutdown_mv64340_irq	disable_mv64340_irq
 
 struct hw_interrupt_type mv64340_irq_type = {
-	"MV-64340",
-	startup_mv64340_irq,
-	shutdown_mv64340_irq,
-	enable_mv64340_irq,
-	disable_mv64340_irq,
-	mask_and_ack_mv64340_irq,
-	end_mv64340_irq,
-	NULL
+	.typename = "MV-64340",
+	.startup = startup_mv64340_irq,
+	.shutdown = shutdown_mv64340_irq,
+	.enable = enable_mv64340_irq,
+	.disable = disable_mv64340_irq,
+	.ack = mask_and_ack_mv64340_irq,
+	.end = end_mv64340_irq,
 };
 
 void __init mv64340_irq_init(unsigned int base)
Index: linux/arch/mips/kernel/irq-rm7000.c
===================================================================
--- linux.orig/arch/mips/kernel/irq-rm7000.c
+++ linux/arch/mips/kernel/irq-rm7000.c
@@ -33,18 +33,18 @@ static inline void rm7k_cpu_irq_enable(u
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	unmask_rm7k_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void rm7k_cpu_irq_disable(unsigned int irq)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	mask_rm7k_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int rm7k_cpu_irq_startup(unsigned int irq)
@@ -72,13 +72,13 @@ static void rm7k_cpu_irq_end(unsigned in
 }
 
 static hw_irq_controller rm7k_irq_controller = {
-	"RM7000",
-	rm7k_cpu_irq_startup,
-	rm7k_cpu_irq_shutdown,
-	rm7k_cpu_irq_enable,
-	rm7k_cpu_irq_disable,
-	rm7k_cpu_irq_ack,
-	rm7k_cpu_irq_end,
+	.typename = "RM7000",
+	.startup = rm7k_cpu_irq_startup,
+	.shutdown = rm7k_cpu_irq_shutdown,
+	.enable = rm7k_cpu_irq_enable,
+	.disable = rm7k_cpu_irq_disable,
+	.ack = rm7k_cpu_irq_ack,
+	.end = rm7k_cpu_irq_end,
 };
 
 void __init rm7k_cpu_irq_init(int base)
Index: linux/arch/mips/kernel/irq-rm9000.c
===================================================================
--- linux.orig/arch/mips/kernel/irq-rm9000.c
+++ linux/arch/mips/kernel/irq-rm9000.c
@@ -34,18 +34,18 @@ static inline void rm9k_cpu_irq_enable(u
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	unmask_rm9k_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void rm9k_cpu_irq_disable(unsigned int irq)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	mask_rm9k_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int rm9k_cpu_irq_startup(unsigned int irq)
@@ -79,9 +79,9 @@ static void local_rm9k_perfcounter_irq_s
 	unsigned int irq = (unsigned int) args;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	mask_rm9k_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void rm9k_perfcounter_irq_shutdown(unsigned int irq)
@@ -106,23 +106,23 @@ static void rm9k_cpu_irq_end(unsigned in
 }
 
 static hw_irq_controller rm9k_irq_controller = {
-	"RM9000",
-	rm9k_cpu_irq_startup,
-	rm9k_cpu_irq_shutdown,
-	rm9k_cpu_irq_enable,
-	rm9k_cpu_irq_disable,
-	rm9k_cpu_irq_ack,
-	rm9k_cpu_irq_end,
+	.typename = "RM9000",
+	.startup = rm9k_cpu_irq_startup,
+	.shutdown = rm9k_cpu_irq_shutdown,
+	.enable = rm9k_cpu_irq_enable,
+	.disable = rm9k_cpu_irq_disable,
+	.ack = rm9k_cpu_irq_ack,
+	.end = rm9k_cpu_irq_end,
 };
 
 static hw_irq_controller rm9k_perfcounter_irq = {
-	"RM9000",
-	rm9k_perfcounter_irq_startup,
-	rm9k_perfcounter_irq_shutdown,
-	rm9k_cpu_irq_enable,
-	rm9k_cpu_irq_disable,
-	rm9k_cpu_irq_ack,
-	rm9k_cpu_irq_end,
+	.typename = "RM9000",
+	.startup = rm9k_perfcounter_irq_startup,
+	.shutdown = rm9k_perfcounter_irq_shutdown,
+	.enable = rm9k_cpu_irq_enable,
+	.disable = rm9k_cpu_irq_disable,
+	.ack = rm9k_cpu_irq_ack,
+	.end = rm9k_cpu_irq_end,
 };
 
 unsigned int rm9000_perfcount_irq;
Index: linux/arch/mips/kernel/irq.c
===================================================================
--- linux.orig/arch/mips/kernel/irq.c
+++ linux/arch/mips/kernel/irq.c
@@ -125,7 +125,10 @@ void __init init_IRQ(void)
 		irq_desc[i].action  = NULL;
 		irq_desc[i].depth   = 1;
 		irq_desc[i].handler = &no_irq_type;
-		spin_lock_init(&irq_desc[i].lock);
+		__raw_spin_lock_init(&irq_desc[i].lock);
+#ifdef CONFIG_PREEMPT_HARDIRQS
+		irq_desc[i].thread = NULL;
+#endif
 	}
 
 	arch_init_irq();
Index: linux/arch/mips/kernel/irq_cpu.c
===================================================================
--- linux.orig/arch/mips/kernel/irq_cpu.c
+++ linux/arch/mips/kernel/irq_cpu.c
@@ -50,18 +50,18 @@ static inline void mips_cpu_irq_enable(u
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	unmask_mips_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void mips_cpu_irq_disable(unsigned int irq)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	mask_mips_irq(irq);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int mips_cpu_irq_startup(unsigned int irq)
@@ -92,14 +92,13 @@ static void mips_cpu_irq_end(unsigned in
 }
 
 static hw_irq_controller mips_cpu_irq_controller = {
-	"MIPS",
-	mips_cpu_irq_startup,
-	mips_cpu_irq_shutdown,
-	mips_cpu_irq_enable,
-	mips_cpu_irq_disable,
-	mips_cpu_irq_ack,
-	mips_cpu_irq_end,
-	NULL			/* no affinity stuff for UP */
+	.typename = "MIPS",
+	.startup = mips_cpu_irq_startup,
+	.shutdown = mips_cpu_irq_shutdown,
+	.enable = mips_cpu_irq_enable,
+	.disable = mips_cpu_irq_disable,
+	.ack = mips_cpu_irq_ack,
+	.end = mips_cpu_irq_end,
 };
 
 
Index: linux/arch/mips/kernel/module.c
===================================================================
--- linux.orig/arch/mips/kernel/module.c
+++ linux/arch/mips/kernel/module.c
@@ -2,7 +2,7 @@
 #include <linux/spinlock.h>
 
 static LIST_HEAD(dbe_list);
-static DEFINE_SPINLOCK(dbe_lock);
+static DEFINE_RAW_SPINLOCK(dbe_lock);
 
 /* Given an address, look for it in the module exception tables. */
 const struct exception_table_entry *search_module_dbetables(unsigned long addr)
Index: linux/arch/mips/kernel/process.c
===================================================================
--- linux.orig/arch/mips/kernel/process.c
+++ linux/arch/mips/kernel/process.c
@@ -55,10 +55,12 @@ ATTRIB_NORET void cpu_idle(void)
 {
 	/* endless idle loop with no priority at all */
 	while (1) {
-		while (!need_resched())
+		while (!need_resched() && !need_resched_delayed())
 			if (cpu_wait)
 				(*cpu_wait)();
-		schedule();
+		raw_local_irq_disable();
+		__schedule();
+		raw_local_irq_enable();
 	}
 }
 
Index: linux/arch/mips/kernel/scall32-o32.S
===================================================================
--- linux.orig/arch/mips/kernel/scall32-o32.S
+++ linux/arch/mips/kernel/scall32-o32.S
@@ -72,7 +72,7 @@ stack_done:
 1:	sw	v0, PT_R2(sp)		# result
 
 o32_syscall_exit:
-	local_irq_disable		# make sure need_resched and
+	mips_raw_local_irq_disable	# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	lw	a2, TI_FLAGS($28)	# current->work
Index: linux/arch/mips/kernel/scall64-64.S
===================================================================
--- linux.orig/arch/mips/kernel/scall64-64.S
+++ linux/arch/mips/kernel/scall64-64.S
@@ -71,7 +71,7 @@ NESTED(handle_sys64, PT_SIZE, sp)
 1:	sd	v0, PT_R2(sp)		# result
 
 n64_syscall_exit:
-	local_irq_disable		# make sure need_resched and
+	mips_raw_local_irq_disable	# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)	# current->work
Index: linux/arch/mips/kernel/scall64-n32.S
===================================================================
--- linux.orig/arch/mips/kernel/scall64-n32.S
+++ linux/arch/mips/kernel/scall64-n32.S
@@ -68,7 +68,7 @@ NESTED(handle_sysn32, PT_SIZE, sp)
 	sd	v0, PT_R0(sp)		# set flag for syscall restarting
 1:	sd	v0, PT_R2(sp)		# result
 
-	local_irq_disable		# make sure need_resched and
+	mips_raw_local_irq_disable	# make sure need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L  a2, TI_FLAGS($28)	# current->work
Index: linux/arch/mips/kernel/scall64-o32.S
===================================================================
--- linux.orig/arch/mips/kernel/scall64-o32.S
+++ linux/arch/mips/kernel/scall64-o32.S
@@ -97,7 +97,7 @@ NESTED(handle_sys, PT_SIZE, sp)
 1:	sd	v0, PT_R2(sp)		# result
 
 o32_syscall_exit:
-	local_irq_disable		# make need_resched and
+	mips_raw_local_irq_disable	# make need_resched and
 					# signals dont change between
 					# sampling and return
 	LONG_L	a2, TI_FLAGS($28)
Index: linux/arch/mips/kernel/semaphore.c
===================================================================
--- linux.orig/arch/mips/kernel/semaphore.c
+++ linux/arch/mips/kernel/semaphore.c
@@ -36,7 +36,7 @@
  * sem->count and sem->waking atomic.  Scalability isn't an issue because
  * this lock is used on UP only so it's just an empty variable.
  */
-static inline int __sem_update_count(struct semaphore *sem, int incr)
+static inline int __sem_update_count(struct compat_semaphore *sem, int incr)
 {
 	int old_count, tmp;
 
@@ -63,7 +63,7 @@ static inline int __sem_update_count(str
 		: "=&r" (old_count), "=&r" (tmp), "=m" (sem->count)
 		: "r" (incr), "m" (sem->count));
 	} else {
-		static DEFINE_SPINLOCK(semaphore_lock);
+		static DEFINE_RAW_SPINLOCK(semaphore_lock);
 		unsigned long flags;
 
 		spin_lock_irqsave(&semaphore_lock, flags);
@@ -76,7 +76,7 @@ static inline int __sem_update_count(str
 	return old_count;
 }
 
-void __up(struct semaphore *sem)
+void __compat_up(struct compat_semaphore *sem)
 {
 	/*
 	 * Note that we incremented count in up() before we came here,
@@ -90,7 +90,7 @@ void __up(struct semaphore *sem)
 	wake_up(&sem->wait);
 }
 
-EXPORT_SYMBOL(__up);
+EXPORT_SYMBOL(__compat_up);
 
 /*
  * Note that when we come in to __down or __down_interruptible,
@@ -100,7 +100,7 @@ EXPORT_SYMBOL(__up);
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -129,9 +129,9 @@ void __sched __down(struct semaphore *se
 	wake_up(&sem->wait);
 }
 
-EXPORT_SYMBOL(__down);
+EXPORT_SYMBOL(__compat_down);
 
-int __sched __down_interruptible(struct semaphore * sem)
+int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -161,4 +161,4 @@ int __sched __down_interruptible(struct 
 	return retval;
 }
 
-EXPORT_SYMBOL(__down_interruptible);
+EXPORT_SYMBOL(__compat_down_interruptible);
Index: linux/arch/mips/kernel/signal.c
===================================================================
--- linux.orig/arch/mips/kernel/signal.c
+++ linux/arch/mips/kernel/signal.c
@@ -448,6 +448,10 @@ static int do_signal(sigset_t *oldset, s
 	}
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which is why we may in certain
 	 * cases get here from kernel mode. Just return without doing anything
Index: linux/arch/mips/kernel/signal32.c
===================================================================
--- linux.orig/arch/mips/kernel/signal32.c
+++ linux/arch/mips/kernel/signal32.c
@@ -765,6 +765,10 @@ int do_signal32(sigset_t *oldset, struct
 	siginfo_t info;
 	int signr;
 
+#ifdef CONFIG_PREEMPT_RT
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which is why we may in certain
 	 * cases get here from kernel mode. Just return without doing anything
Index: linux/arch/mips/kernel/smp.c
===================================================================
--- linux.orig/arch/mips/kernel/smp.c
+++ linux/arch/mips/kernel/smp.c
@@ -105,7 +105,22 @@ asmlinkage void start_secondary(void)
 	cpu_idle();
 }
 
-DEFINE_SPINLOCK(smp_call_lock);
+DEFINE_RAW_SPINLOCK(smp_call_lock);
+
+/*
+ * Send a 'reschedule' IPI to every other online CPU.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them.
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	int cpu = smp_processor_id();	/* excluded from the IPI loop below */
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		if (cpu_online(i) && i != cpu)
+			core_send_ipi(i, SMP_RESCHEDULE_YOURSELF);
+}
 
 struct call_data_struct *call_data;
 
@@ -197,7 +212,7 @@ static void stop_this_cpu(void *dummy)
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_enable();	/* May need to service _machine_restart IPI */
+	raw_local_irq_enable();	/* May need to service _machine_restart IPI */
 	for (;;);		/* Wait if available. */
 }
 
@@ -284,6 +299,8 @@ int setup_profiling_timer(unsigned int m
 	return 0;
 }
 
+static DEFINE_RAW_SPINLOCK(tlbstate_lock);
+
 static void flush_tlb_all_ipi(void *info)
 {
 	local_flush_tlb_all();
@@ -315,6 +332,7 @@ static void flush_tlb_mm_ipi(void *mm)
 void flush_tlb_mm(struct mm_struct *mm)
 {
 	preempt_disable();
+	spin_lock(&tlbstate_lock);
 
 	if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) {
 		smp_call_function(flush_tlb_mm_ipi, (void *)mm, 1, 1);
@@ -324,6 +342,7 @@ void flush_tlb_mm(struct mm_struct *mm)
 			if (smp_processor_id() != i)
 				cpu_context(i, mm) = 0;
 	}
+	spin_unlock(&tlbstate_lock);
 	local_flush_tlb_mm(mm);
 
 	preempt_enable();
@@ -347,6 +366,8 @@ void flush_tlb_range(struct vm_area_stru
 	struct mm_struct *mm = vma->vm_mm;
 
 	preempt_disable();
+	spin_lock(&tlbstate_lock);
+
 	if ((atomic_read(&mm->mm_users) != 1) || (current->mm != mm)) {
 		struct flush_tlb_data fd;
 
@@ -360,6 +381,7 @@ void flush_tlb_range(struct vm_area_stru
 			if (smp_processor_id() != i)
 				cpu_context(i, mm) = 0;
 	}
+	spin_unlock(&tlbstate_lock);
 	local_flush_tlb_range(vma, start, end);
 	preempt_enable();
 }
@@ -390,6 +412,8 @@ static void flush_tlb_page_ipi(void *inf
 void flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
 {
 	preempt_disable();
+	spin_lock(&tlbstate_lock);
+
 	if ((atomic_read(&vma->vm_mm->mm_users) != 1) || (current->mm != vma->vm_mm)) {
 		struct flush_tlb_data fd;
 
@@ -402,6 +426,7 @@ void flush_tlb_page(struct vm_area_struc
 			if (smp_processor_id() != i)
 				cpu_context(i, vma->vm_mm) = 0;
 	}
+	spin_unlock(&tlbstate_lock);
 	local_flush_tlb_page(vma, page);
 	preempt_enable();
 }
Index: linux/arch/mips/kernel/time.c
===================================================================
--- linux.orig/arch/mips/kernel/time.c
+++ linux/arch/mips/kernel/time.c
@@ -43,16 +43,12 @@
 
 #define TICK_SIZE	(tick_nsec / 1000)
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /*
  * forward reference
  */
 extern volatile unsigned long wall_jiffies;
 
-DEFINE_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(rtc_lock);
 
 /*
  * By default we provide the null RTC ops
@@ -552,7 +548,7 @@ unsigned int mips_hpt_frequency;
 
 static struct irqaction timer_irqaction = {
 	.handler = timer_interrupt,
-	.flags = SA_INTERRUPT,
+	.flags = SA_NODELAY | SA_INTERRUPT,
 	.name = "timer",
 };
 
Index: linux/arch/mips/kernel/traps.c
===================================================================
--- linux.orig/arch/mips/kernel/traps.c
+++ linux/arch/mips/kernel/traps.c
@@ -250,7 +250,7 @@ void show_registers(struct pt_regs *regs
 	printk("\n");
 }
 
-static DEFINE_SPINLOCK(die_lock);
+static DEFINE_RAW_SPINLOCK(die_lock);
 
 NORET_TYPE void __die(const char * str, struct pt_regs * regs,
 	const char * file, const char * func, unsigned long line)
Index: linux/arch/mips/lasat/interrupt.c
===================================================================
--- linux.orig/arch/mips/lasat/interrupt.c
+++ linux/arch/mips/lasat/interrupt.c
@@ -39,18 +39,18 @@ void disable_lasat_irq(unsigned int irq_
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	*lasat_int_mask &= ~(1 << irq_nr) << lasat_int_mask_shift;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void enable_lasat_irq(unsigned int irq_nr)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	*lasat_int_mask |= (1 << irq_nr) << lasat_int_mask_shift;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_lasat_irq(unsigned int irq)
@@ -71,14 +71,13 @@ static void end_lasat_irq(unsigned int i
 }
 
 static struct hw_interrupt_type lasat_irq_type = {
-	"Lasat",
-	startup_lasat_irq,
-	shutdown_lasat_irq,
-	enable_lasat_irq,
-	disable_lasat_irq,
-	mask_and_ack_lasat_irq,
-	end_lasat_irq,
-	NULL
+	.typename = "Lasat",
+	.startup = startup_lasat_irq,
+	.shutdown = shutdown_lasat_irq,
+	.enable = enable_lasat_irq,
+	.disable = disable_lasat_irq,
+	.ack = mask_and_ack_lasat_irq,
+	.end = end_lasat_irq,
 };
 
 static inline int ls1bit32(unsigned int x)
Index: linux/arch/mips/lasat/reset.c
===================================================================
--- linux.orig/arch/mips/lasat/reset.c
+++ linux/arch/mips/lasat/reset.c
@@ -33,7 +33,7 @@ int lasat_boot_to_service = 0;
 
 static void lasat_machine_restart(char *command)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	if (lasat_boot_to_service) {
 		printk("machine_restart: Rebooting to service mode\n");
@@ -47,7 +47,7 @@ static void lasat_machine_restart(char *
 #define MESSAGE "System halted"
 static void lasat_machine_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Disable interrupts and loop forever */
 	printk(KERN_NOTICE MESSAGE "\n");
Index: linux/arch/mips/lib-32/dump_tlb.c
===================================================================
--- linux.orig/arch/mips/lib-32/dump_tlb.c
+++ linux/arch/mips/lib-32/dump_tlb.c
@@ -111,7 +111,7 @@ void dump_tlb_addr(unsigned long addr)
 	unsigned int flags, oldpid;
 	int index;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	oldpid = read_c0_entryhi() & 0xff;
 	BARRIER();
 	write_c0_entryhi((addr & PAGE_MASK) | oldpid);
@@ -120,7 +120,7 @@ void dump_tlb_addr(unsigned long addr)
 	BARRIER();
 	index = read_c0_index();
 	write_c0_entryhi(oldpid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	if (index < 0) {
 		printk("No entry for address 0x%08lx in TLB\n", addr);
Index: linux/arch/mips/lib-32/r3k_dump_tlb.c
===================================================================
--- linux.orig/arch/mips/lib-32/r3k_dump_tlb.c
+++ linux/arch/mips/lib-32/r3k_dump_tlb.c
@@ -79,13 +79,13 @@ void dump_tlb_addr(unsigned long addr)
 	unsigned long flags, oldpid;
 	int index;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	oldpid = read_c0_entryhi() & 0xff;
 	write_c0_entryhi((addr & PAGE_MASK) | oldpid);
 	tlb_probe();
 	index = read_c0_index();
 	write_c0_entryhi(oldpid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	if (index < 0) {
 		printk("No entry for address 0x%08lx in TLB\n", addr);
Index: linux/arch/mips/lib-64/dump_tlb.c
===================================================================
--- linux.orig/arch/mips/lib-64/dump_tlb.c
+++ linux/arch/mips/lib-64/dump_tlb.c
@@ -112,7 +112,7 @@ void dump_tlb_addr(unsigned long addr)
 	unsigned int flags, oldpid;
 	int index;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	oldpid = read_c0_entryhi() & 0xff;
 	BARRIER();
 	write_c0_entryhi((addr & PAGE_MASK) | oldpid);
@@ -121,7 +121,7 @@ void dump_tlb_addr(unsigned long addr)
 	BARRIER();
 	index = read_c0_index();
 	write_c0_entryhi(oldpid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	if (index < 0) {
 		printk("No entry for address 0x%08lx in TLB\n", addr);
Index: linux/arch/mips/math-emu/cp1emu.c
===================================================================
--- linux.orig/arch/mips/math-emu/cp1emu.c
+++ linux/arch/mips/math-emu/cp1emu.c
@@ -1310,7 +1310,9 @@ int fpu_emulator_cop1Handler(int xcptno,
 		if (sig)
 			break;
 
+		preempt_enable();
 		cond_resched();
+		preempt_disable();
 	} while (xcp->cp0_epc > prevepc);
 
 	/* SIGILL indicates a non-fpu instruction */
Index: linux/arch/mips/mips-boards/atlas/atlas_int.c
===================================================================
--- linux.orig/arch/mips/mips-boards/atlas/atlas_int.c
+++ linux/arch/mips/mips-boards/atlas/atlas_int.c
@@ -76,14 +76,13 @@ static void end_atlas_irq(unsigned int i
 }
 
 static struct hw_interrupt_type atlas_irq_type = {
-	"Atlas",
-	startup_atlas_irq,
-	shutdown_atlas_irq,
-	enable_atlas_irq,
-	disable_atlas_irq,
-	mask_and_ack_atlas_irq,
-	end_atlas_irq,
-	NULL
+	.typename = "Atlas",
+	.startup = startup_atlas_irq,
+	.shutdown = shutdown_atlas_irq,
+	.enable = enable_atlas_irq,
+	.disable = disable_atlas_irq,
+	.ack = mask_and_ack_atlas_irq,
+	.end = end_atlas_irq,
 };
 
 static inline int ls1bit32(unsigned int x)
Index: linux/arch/mips/mips-boards/generic/mipsIRQ.S
===================================================================
--- linux.orig/arch/mips/mips-boards/generic/mipsIRQ.S
+++ linux/arch/mips/mips-boards/generic/mipsIRQ.S
@@ -143,11 +143,23 @@
 	 * time we take the exception the IRQ pin goes low, so just leave if
 	 * this is the case.
 	 */
+#define	PREEMPT_RT_MALTA_DEBUG
+#ifdef	PREEMPT_RT_MALTA_DEBUG
+	lui	t0, 0x1000
+	and	a0, s0, t0
+	bne	a0, zero, 1f
+	 nop				# delay slot
+	beq	s0, zero, 1f
+	 nop				# delay slot
+#endif
 	move	a1,s0
 	PRINT("Got interrupt: c0_cause = %08x\n")
 	mfc0	a1, CP0_EPC
 	PRINT("c0_epc = %08x\n")
 
+#ifdef	PREEMPT_RT_MALTA_DEBUG
+1:
+#endif
 	j	ret_from_irq
 	 nop
 	END(mipsIRQ)
Index: linux/arch/mips/mips-boards/generic/time.c
===================================================================
--- linux.orig/arch/mips/mips-boards/generic/time.c
+++ linux/arch/mips/mips-boards/generic/time.c
@@ -99,7 +99,7 @@ static unsigned int __init estimate_cpu_
 #if defined(CONFIG_MIPS_ATLAS) || defined(CONFIG_MIPS_MALTA)
 	unsigned int flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* Start counter exactly on falling edge of update flag */
 	while (CMOS_READ(RTC_REG_A) & RTC_UIP);
@@ -115,7 +115,7 @@ static unsigned int __init estimate_cpu_
 	count = read_c0_count();
 
 	/* restore interrupts */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 #endif
 
 	mips_hpt_frequency = count;
@@ -138,7 +138,7 @@ void __init mips_time_init(void)
 {
 	unsigned int est_freq, flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 #if defined(CONFIG_MIPS_ATLAS) || defined(CONFIG_MIPS_MALTA)
         /* Set Data mode - binary. */
@@ -152,7 +152,7 @@ void __init mips_time_init(void)
 
         cpu_khz = est_freq / 1000;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void __init mips_timer_setup(struct irqaction *irq)
Index: linux/arch/mips/mm/c-r4k.c
===================================================================
--- linux.orig/arch/mips/mm/c-r4k.c
+++ linux/arch/mips/mm/c-r4k.c
@@ -110,9 +110,9 @@ static inline void blast_r4600_v1_icache
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	blast_icache32();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline void tx49_blast_icache32(void)
@@ -140,9 +140,9 @@ static inline void blast_icache32_r4600_
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	blast_icache32_page_indexed(page);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline void tx49_blast_icache32_page_indexed(unsigned long page)
@@ -1063,7 +1063,7 @@ static int __init probe_scache(void)
 	 * This is such a bitch, you'd think they would make it easy to do
 	 * this.  Away you daemons of stupidity!
 	 */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* Fill each size-multiple cache line with a valid tag. */
 	pow2 = (64 * 1024);
@@ -1091,7 +1091,7 @@ static int __init probe_scache(void)
 			break;
 		pow2 <<= 1;
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	addr -= begin;
 
 	scache_size = addr;
Index: linux/arch/mips/mm/c-tx39.c
===================================================================
--- linux.orig/arch/mips/mm/c-tx39.c
+++ linux/arch/mips/mm/c-tx39.c
@@ -49,7 +49,7 @@ static void tx39h_flush_icache_all(void)
 	unsigned long flags, config;
 
 	/* disable icache (set ICE#) */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	config = read_c0_conf();
 	write_c0_conf(config & ~TX39_CONF_ICE);
 	TX39_STOP_STREAMING();
@@ -61,7 +61,7 @@ static void tx39h_flush_icache_all(void)
 	}
 
 	write_c0_conf(config);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void tx39h_dma_cache_wback_inv(unsigned long addr, unsigned long size)
@@ -104,39 +104,39 @@ static inline void tx39_blast_icache_pag
 {
 	unsigned long flags, config;
 	/* disable icache (set ICE#) */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	config = read_c0_conf();
 	write_c0_conf(config & ~TX39_CONF_ICE);
 	TX39_STOP_STREAMING();
 	blast_icache16_page(addr);
 	write_c0_conf(config);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline void tx39_blast_icache_page_indexed(unsigned long addr)
 {
 	unsigned long flags, config;
 	/* disable icache (set ICE#) */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	config = read_c0_conf();
 	write_c0_conf(config & ~TX39_CONF_ICE);
 	TX39_STOP_STREAMING();
 	blast_icache16_page_indexed(addr);
 	write_c0_conf(config);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline void tx39_blast_icache(void)
 {
 	unsigned long flags, config;
 	/* disable icache (set ICE#) */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	config = read_c0_conf();
 	write_c0_conf(config & ~TX39_CONF_ICE);
 	TX39_STOP_STREAMING();
 	blast_icache16();
 	write_c0_conf(config);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline void tx39_flush_cache_all(void)
@@ -263,7 +263,7 @@ static void tx39_flush_icache_range(unsi
 		addr = start & ~(dc_lsize - 1);
 		aend = (end - 1) & ~(dc_lsize - 1);
 		/* disable icache (set ICE#) */
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		config = read_c0_conf();
 		write_c0_conf(config & ~TX39_CONF_ICE);
 		TX39_STOP_STREAMING();
@@ -275,7 +275,7 @@ static void tx39_flush_icache_range(unsi
 			addr += dc_lsize;
 		}
 		write_c0_conf(config);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -364,13 +364,13 @@ static void tx39_flush_cache_sigtramp(un
 	protected_writeback_dcache_line(addr & ~(dc_lsize - 1));
 
 	/* disable icache (set ICE#) */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	config = read_c0_conf();
 	write_c0_conf(config & ~TX39_CONF_ICE);
 	TX39_STOP_STREAMING();
 	protected_flush_icache_line(addr & ~(ic_lsize - 1));
 	write_c0_conf(config);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static __init void tx39_probe_cache(void)
Index: linux/arch/mips/mm/init.c
===================================================================
--- linux.orig/arch/mips/mm/init.c
+++ linux/arch/mips/mm/init.c
@@ -35,7 +35,7 @@
 #include <asm/pgalloc.h>
 #include <asm/tlb.h>
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 unsigned long highstart_pfn, highend_pfn;
 
Index: linux/arch/mips/mm/sc-ip22.c
===================================================================
--- linux.orig/arch/mips/mm/sc-ip22.c
+++ linux/arch/mips/mm/sc-ip22.c
@@ -72,7 +72,7 @@ static void indy_sc_wback_invalidate(uns
 	first_line = SC_INDEX(addr);
 	last_line = SC_INDEX(addr + size - 1);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (first_line <= last_line) {
 		indy_sc_wipe(first_line, last_line);
 		goto out;
@@ -81,7 +81,7 @@ static void indy_sc_wback_invalidate(uns
 	indy_sc_wipe(first_line, SC_SIZE - SC_LINE);
 	indy_sc_wipe(0, last_line);
 out:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void indy_sc_enable(void)
Index: linux/arch/mips/mm/sc-r5k.c
===================================================================
--- linux.orig/arch/mips/mm/sc-r5k.c
+++ linux/arch/mips/mm/sc-r5k.c
@@ -61,20 +61,20 @@ static void r5k_sc_enable(void)
 {
         unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	set_c0_config(R5K_CONF_SE);
 	blast_r5000_scache();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void r5k_sc_disable(void)
 {
         unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	blast_r5000_scache();
 	clear_c0_config(R5K_CONF_SE);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline int __init r5k_sc_probe(void)
Index: linux/arch/mips/mm/tlb-andes.c
===================================================================
--- linux.orig/arch/mips/mm/tlb-andes.c
+++ linux/arch/mips/mm/tlb-andes.c
@@ -27,7 +27,7 @@ void local_flush_tlb_all(void)
 	unsigned long old_ctx;
 	unsigned long entry;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* Save old context and create impossible VPN2 value */
 	old_ctx = read_c0_entryhi() & ASID_MASK;
 	write_c0_entryhi(CKSEG0);
@@ -43,7 +43,7 @@ void local_flush_tlb_all(void)
 		entry++;
 	}
 	write_c0_entryhi(old_ctx);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_mm(struct mm_struct *mm)
@@ -64,7 +64,7 @@ void local_flush_tlb_range(struct vm_are
 		unsigned long flags;
 		int size;
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 		size = (size + 1) >> 1;
 		if (size <= NTLB_ENTRIES_HALF) {
@@ -93,7 +93,7 @@ void local_flush_tlb_range(struct vm_are
 		} else {
 			drop_mmu_context(mm, cpu);
 		}
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -105,7 +105,7 @@ void local_flush_tlb_kernel_range(unsign
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 	size = (size + 1) >> 1;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (size <= NTLB_ENTRIES_HALF) {
 		int pid = read_c0_entryhi();
 
@@ -131,7 +131,7 @@ void local_flush_tlb_kernel_range(unsign
 	} else {
 		local_flush_tlb_all();
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
@@ -143,7 +143,7 @@ void local_flush_tlb_page(struct vm_area
 		newpid = (cpu_context(smp_processor_id(), vma->vm_mm) &
 			  ASID_MASK);
 		page &= (PAGE_MASK << 1);
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		oldpid = (read_c0_entryhi() & ASID_MASK);
 		write_c0_entryhi(page | newpid);
 		tlb_probe();
@@ -157,7 +157,7 @@ void local_flush_tlb_page(struct vm_area
 
 	finish:
 		write_c0_entryhi(oldpid);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -170,7 +170,7 @@ void local_flush_tlb_one(unsigned long p
 	unsigned long flags;
 	int oldpid, idx;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	page &= (PAGE_MASK << 1);
 	oldpid = read_c0_entryhi() & 0xff;
 	write_c0_entryhi(page);
@@ -185,7 +185,7 @@ void local_flush_tlb_one(unsigned long p
 	}
 	write_c0_entryhi(oldpid);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* XXX Simplify this.  On the R10000 writing a TLB entry for an virtual
@@ -215,7 +215,7 @@ void __update_tlb(struct vm_area_struct 
 		       vma->vm_mm) & ASID_MASK), pid);
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	address &= (PAGE_MASK << 1);
 	write_c0_entryhi(address | (pid));
 	pgdp = pgd_offset(vma->vm_mm, address);
@@ -232,7 +232,7 @@ void __update_tlb(struct vm_area_struct 
 		tlb_write_indexed();
 	}
 	write_c0_entryhi(pid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void __init tlb_init(void)
Index: linux/arch/mips/mm/tlb-r3k.c
===================================================================
--- linux.orig/arch/mips/mm/tlb-r3k.c
+++ linux/arch/mips/mm/tlb-r3k.c
@@ -49,7 +49,7 @@ void local_flush_tlb_all(void)
 	printk("[tlball]");
 #endif
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	old_ctx = read_c0_entryhi() & ASID_MASK;
 	write_c0_entrylo0(0);
 	entry = r3k_have_wired_reg ? read_c0_wired() : 8;
@@ -60,7 +60,7 @@ void local_flush_tlb_all(void)
 		tlb_write_indexed();
 	}
 	write_c0_entryhi(old_ctx);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_mm(struct mm_struct *mm)
@@ -89,7 +89,7 @@ void local_flush_tlb_range(struct vm_are
 		printk("[tlbrange<%lu,0x%08lx,0x%08lx>]",
 			cpu_context(cpu, mm) & ASID_MASK, start, end);
 #endif
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 		if (size <= current_cpu_data.tlbsize) {
 			int oldpid = read_c0_entryhi() & ASID_MASK;
@@ -115,7 +115,7 @@ void local_flush_tlb_range(struct vm_are
 		} else {
 			drop_mmu_context(mm, cpu);
 		}
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -127,7 +127,7 @@ void local_flush_tlb_kernel_range(unsign
 #ifdef DEBUG_TLB
 	printk("[tlbrange<%lu,0x%08lx,0x%08lx>]", start, end);
 #endif
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 	if (size <= current_cpu_data.tlbsize) {
 		int pid = read_c0_entryhi();
@@ -153,7 +153,7 @@ void local_flush_tlb_kernel_range(unsign
 	} else {
 		local_flush_tlb_all();
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
@@ -169,7 +169,7 @@ void local_flush_tlb_page(struct vm_area
 #endif
 		newpid = cpu_context(cpu, vma->vm_mm) & ASID_MASK;
 		page &= PAGE_MASK;
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		oldpid = read_c0_entryhi() & ASID_MASK;
 		write_c0_entryhi(page | newpid);
 		BARRIER;
@@ -183,7 +183,7 @@ void local_flush_tlb_page(struct vm_area
 
 finish:
 		write_c0_entryhi(oldpid);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -207,7 +207,7 @@ void __update_tlb(struct vm_area_struct 
 	}
 #endif
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	address &= PAGE_MASK;
 	write_c0_entryhi(address | pid);
 	BARRIER;
@@ -221,7 +221,7 @@ void __update_tlb(struct vm_area_struct 
 		tlb_write_indexed();
 	}
 	write_c0_entryhi(pid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void __init add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
@@ -240,7 +240,7 @@ void __init add_wired_entry(unsigned lon
 		       entrylo0, entryhi, pagemask);
 #endif
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		/* Save old context and create impossible VPN2 value */
 		old_ctx = read_c0_entryhi() & ASID_MASK;
 		old_pagemask = read_c0_pagemask();
@@ -260,7 +260,7 @@ void __init add_wired_entry(unsigned lon
 		write_c0_entryhi(old_ctx);
 		write_c0_pagemask(old_pagemask);
 		local_flush_tlb_all();
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 
 	} else if (wired < 8) {
 #ifdef DEBUG_TLB
@@ -268,7 +268,7 @@ void __init add_wired_entry(unsigned lon
 		       entrylo0, entryhi);
 #endif
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		old_ctx = read_c0_entryhi() & ASID_MASK;
 		write_c0_entrylo0(entrylo0);
 		write_c0_entryhi(entryhi);
@@ -277,7 +277,7 @@ void __init add_wired_entry(unsigned lon
 		tlb_write_indexed();
 		write_c0_entryhi(old_ctx);
 		local_flush_tlb_all();
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
Index: linux/arch/mips/mm/tlb-r4k.c
===================================================================
--- linux.orig/arch/mips/mm/tlb-r4k.c
+++ linux/arch/mips/mm/tlb-r4k.c
@@ -32,7 +32,7 @@ void local_flush_tlb_all(void)
 	unsigned long old_ctx;
 	int entry;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* Save old context and create impossible VPN2 value */
 	old_ctx = read_c0_entryhi();
 	write_c0_entrylo0(0);
@@ -54,7 +54,7 @@ void local_flush_tlb_all(void)
 	}
 	tlbw_use_hazard();
 	write_c0_entryhi(old_ctx);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_mm(struct mm_struct *mm)
@@ -75,7 +75,7 @@ void local_flush_tlb_range(struct vm_are
 		unsigned long flags;
 		int size;
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 		size = (size + 1) >> 1;
 		if (size <= current_cpu_data.tlbsize/2) {
@@ -109,7 +109,7 @@ void local_flush_tlb_range(struct vm_are
 		} else {
 			drop_mmu_context(mm, cpu);
 		}
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -118,7 +118,7 @@ void local_flush_tlb_kernel_range(unsign
 	unsigned long flags;
 	int size;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 	size = (size + 1) >> 1;
 	if (size <= current_cpu_data.tlbsize / 2) {
@@ -151,7 +151,7 @@ void local_flush_tlb_kernel_range(unsign
 	} else {
 		local_flush_tlb_all();
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
@@ -164,7 +164,7 @@ void local_flush_tlb_page(struct vm_area
 
 		newpid = cpu_asid(cpu, vma->vm_mm);
 		page &= (PAGE_MASK << 1);
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		oldpid = read_c0_entryhi();
 		write_c0_entryhi(page | newpid);
 		mtc0_tlbw_hazard();
@@ -183,7 +183,7 @@ void local_flush_tlb_page(struct vm_area
 
 	finish:
 		write_c0_entryhi(oldpid);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -196,7 +196,7 @@ void local_flush_tlb_one(unsigned long p
 	unsigned long flags;
 	int oldpid, idx;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	page &= (PAGE_MASK << 1);
 	oldpid = read_c0_entryhi();
 	write_c0_entryhi(page);
@@ -215,7 +215,7 @@ void local_flush_tlb_one(unsigned long p
 	}
 	write_c0_entryhi(oldpid);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -239,7 +239,7 @@ void __update_tlb(struct vm_area_struct 
 
 	pid = read_c0_entryhi() & ASID_MASK;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	address &= (PAGE_MASK << 1);
 	write_c0_entryhi(address | pid);
 	pgdp = pgd_offset(vma->vm_mm, address);
@@ -266,7 +266,7 @@ void __update_tlb(struct vm_area_struct 
 		tlb_write_indexed();
 	tlbw_use_hazard();
 	write_c0_entryhi(pid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #if 0
@@ -280,7 +280,7 @@ static void r4k_update_mmu_cache_hwbug(s
 	pte_t *ptep;
 	int idx;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	address &= (PAGE_MASK << 1);
 	asid = read_c0_entryhi() & ASID_MASK;
 	write_c0_entryhi(address | asid);
@@ -299,7 +299,7 @@ static void r4k_update_mmu_cache_hwbug(s
 	else
 		tlb_write_indexed();
 	tlbw_use_hazard();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 #endif
 
@@ -311,7 +311,7 @@ void __init add_wired_entry(unsigned lon
 	unsigned long old_pagemask;
 	unsigned long old_ctx;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* Save old context and create impossible VPN2 value */
 	old_ctx = read_c0_entryhi();
 	old_pagemask = read_c0_pagemask();
@@ -331,7 +331,7 @@ void __init add_wired_entry(unsigned lon
 	BARRIER;
 	write_c0_pagemask(old_pagemask);
 	local_flush_tlb_all();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -351,7 +351,7 @@ __init int add_temporary_entry(unsigned 
 	unsigned long old_pagemask;
 	unsigned long old_ctx;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* Save old context and create impossible VPN2 value */
 	old_ctx = read_c0_entryhi();
 	old_pagemask = read_c0_pagemask();
@@ -374,7 +374,7 @@ __init int add_temporary_entry(unsigned 
 	write_c0_entryhi(old_ctx);
 	write_c0_pagemask(old_pagemask);
 out:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return ret;
 }
 
Index: linux/arch/mips/mm/tlb-r8k.c
===================================================================
--- linux.orig/arch/mips/mm/tlb-r8k.c
+++ linux/arch/mips/mm/tlb-r8k.c
@@ -35,7 +35,7 @@ void local_flush_tlb_all(void)
 	unsigned long old_ctx;
 	int entry;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* Save old context and create impossible VPN2 value */
 	old_ctx = read_c0_entryhi();
 	write_c0_entrylo(0);
@@ -49,7 +49,7 @@ void local_flush_tlb_all(void)
 	}
 	tlbw_use_hazard();
 	write_c0_entryhi(old_ctx);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_mm(struct mm_struct *mm)
@@ -74,7 +74,7 @@ void local_flush_tlb_range(struct vm_are
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 	size = (size + 1) >> 1;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (size > TFP_TLB_SIZE / 2) {
 		drop_mmu_context(mm, cpu);
@@ -106,7 +106,7 @@ void local_flush_tlb_range(struct vm_are
 	write_c0_entryhi(oldpid);
 
 out_restore:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* Usable for KV1 addresses only! */
@@ -123,7 +123,7 @@ void local_flush_tlb_kernel_range(unsign
 		return;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	write_c0_entrylo(0);
 
@@ -145,7 +145,7 @@ void local_flush_tlb_kernel_range(unsign
 		tlb_write();
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
@@ -160,7 +160,7 @@ void local_flush_tlb_page(struct vm_area
 
 	newpid = cpu_asid(cpu, vma->vm_mm);
 	page &= PAGE_MASK;
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	oldpid = read_c0_entryhi();
 	write_c0_vaddr(page);
 	write_c0_entryhi(newpid);
@@ -175,7 +175,7 @@ void local_flush_tlb_page(struct vm_area
 
 finish:
 	write_c0_entryhi(oldpid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -199,7 +199,7 @@ void __update_tlb(struct vm_area_struct 
 
 	pid = read_c0_entryhi() & ASID_MASK;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	address &= PAGE_MASK;
 	write_c0_vaddr(address);
 	write_c0_entryhi(pid);
@@ -212,7 +212,7 @@ void __update_tlb(struct vm_area_struct 
 	tlb_write();
 
 	write_c0_entryhi(pid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void __init probe_tlb(unsigned long config)
Index: linux/arch/mips/mm/tlb-sb1.c
===================================================================
--- linux.orig/arch/mips/mm/tlb-sb1.c
+++ linux/arch/mips/mm/tlb-sb1.c
@@ -66,7 +66,7 @@ void sb1_dump_tlb(void)
 	unsigned long old_ctx;
 	unsigned long flags;
 	int entry;
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	old_ctx = read_c0_entryhi();
 	printk("Current TLB registers state:\n"
 	       "      EntryHi       EntryLo0          EntryLo1     PageMask  Index\n"
@@ -83,7 +83,7 @@ void sb1_dump_tlb(void)
 	}
 	printk("\n");
 	write_c0_entryhi(old_ctx);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_all(void)
@@ -92,7 +92,7 @@ void local_flush_tlb_all(void)
 	unsigned long old_ctx;
 	int entry;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* Save old context and create impossible VPN2 value */
 	old_ctx = read_c0_entryhi() & ASID_MASK;
 	write_c0_entrylo0(0);
@@ -106,7 +106,7 @@ void local_flush_tlb_all(void)
 		entry++;
 	}
 	write_c0_entryhi(old_ctx);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 
@@ -147,7 +147,7 @@ void local_flush_tlb_range(struct vm_are
 	unsigned long flags;
 	int cpu;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	cpu = smp_processor_id();
 	if (cpu_context(cpu, mm) != 0) {
 		int size;
@@ -179,7 +179,7 @@ void local_flush_tlb_range(struct vm_are
 			drop_mmu_context(mm, cpu);
 		}
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_kernel_range(unsigned long start, unsigned long end)
@@ -190,7 +190,7 @@ void local_flush_tlb_kernel_range(unsign
 	size = (end - start + (PAGE_SIZE - 1)) >> PAGE_SHIFT;
 	size = (size + 1) >> 1;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (size <= (current_cpu_data.tlbsize/2)) {
 		int pid = read_c0_entryhi();
 
@@ -216,7 +216,7 @@ void local_flush_tlb_kernel_range(unsign
 	} else {
 		local_flush_tlb_all();
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void local_flush_tlb_page(struct vm_area_struct *vma, unsigned long page)
@@ -224,7 +224,7 @@ void local_flush_tlb_page(struct vm_area
 	unsigned long flags;
 	int cpu = smp_processor_id();
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (cpu_context(cpu, vma->vm_mm) != 0) {
 		int oldpid, newpid, idx;
 		newpid = cpu_asid(cpu, vma->vm_mm);
@@ -243,7 +243,7 @@ void local_flush_tlb_page(struct vm_area
 	finish:
 		write_c0_entryhi(oldpid);
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -258,7 +258,7 @@ void local_flush_tlb_one(unsigned long p
 	page &= (PAGE_MASK << 1);
 	oldpid = read_c0_entryhi() & ASID_MASK;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	write_c0_entryhi(page);
 	tlb_probe();
 	idx = read_c0_index();
@@ -271,7 +271,7 @@ void local_flush_tlb_one(unsigned long p
 	}
 
 	write_c0_entryhi(oldpid);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* All entries common to a mm share an asid.  To effectively flush
@@ -307,7 +307,7 @@ void __update_tlb(struct vm_area_struct 
 	if (current->active_mm != vma->vm_mm)
 		return;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	pid = read_c0_entryhi() & ASID_MASK;
 	address &= (PAGE_MASK << 1);
@@ -324,7 +324,7 @@ void __update_tlb(struct vm_area_struct 
 	} else {
 		tlb_write_indexed();
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void __init add_wired_entry(unsigned long entrylo0, unsigned long entrylo1,
@@ -335,7 +335,7 @@ void __init add_wired_entry(unsigned lon
 	unsigned long old_pagemask;
 	unsigned long old_ctx;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	old_ctx = read_c0_entryhi() & 0xff;
 	old_pagemask = read_c0_pagemask();
 	wired = read_c0_wired();
@@ -352,7 +352,7 @@ void __init add_wired_entry(unsigned lon
 	write_c0_pagemask(old_pagemask);
 
 	local_flush_tlb_all();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
Index: linux/arch/mips/momentum/ocelot_c/cpci-irq.c
===================================================================
--- linux.orig/arch/mips/momentum/ocelot_c/cpci-irq.c
+++ linux/arch/mips/momentum/ocelot_c/cpci-irq.c
@@ -129,14 +129,13 @@ void ll_cpci_irq(struct pt_regs *regs)
 #define shutdown_cpci_irq	disable_cpci_irq
 
 struct hw_interrupt_type cpci_irq_type = {
-	"CPCI/FPGA",
-	startup_cpci_irq,
-	shutdown_cpci_irq,
-	enable_cpci_irq,
-	disable_cpci_irq,
-	mask_and_ack_cpci_irq,
-	end_cpci_irq,
-	NULL
+	.typename = "CPCI/FPGA",
+	.startup = startup_cpci_irq,
+	.shutdown = shutdown_cpci_irq,
+	.enable = enable_cpci_irq,
+	.disable = disable_cpci_irq,
+	.ack = mask_and_ack_cpci_irq,
+	.end = end_cpci_irq,
 };
 
 void cpci_irq_init(void)
Index: linux/arch/mips/momentum/ocelot_c/uart-irq.c
===================================================================
--- linux.orig/arch/mips/momentum/ocelot_c/uart-irq.c
+++ linux/arch/mips/momentum/ocelot_c/uart-irq.c
@@ -122,14 +122,13 @@ void ll_uart_irq(struct pt_regs *regs)
 #define shutdown_uart_irq	disable_uart_irq
 
 struct hw_interrupt_type uart_irq_type = {
-	"UART/FPGA",
-	startup_uart_irq,
-	shutdown_uart_irq,
-	enable_uart_irq,
-	disable_uart_irq,
-	mask_and_ack_uart_irq,
-	end_uart_irq,
-	NULL
+	.typename = "UART/FPGA",
+	.startup = startup_uart_irq,
+	.shutdown = shutdown_uart_irq,
+	.enable = enable_uart_irq,
+	.disable = disable_uart_irq,
+	.ack = mask_and_ack_uart_irq,
+	.end = end_uart_irq,
 };
 
 void uart_irq_init(void)
Index: linux/arch/mips/momentum/ocelot_g/irq.c
===================================================================
--- linux.orig/arch/mips/momentum/ocelot_g/irq.c
+++ linux/arch/mips/momentum/ocelot_g/irq.c
@@ -58,7 +58,7 @@ void __init arch_init_irq(void)
 	 * int-handler is not on bootstrap
 	 */
 	clear_c0_status(ST0_IM);
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Sets the first-level interrupt dispatcher. */
 	set_except_vector(0, ocelot_handle_int);
Index: linux/arch/mips/pci/ops-au1000.c
===================================================================
--- linux.orig/arch/mips/pci/ops-au1000.c
+++ linux/arch/mips/pci/ops-au1000.c
@@ -102,7 +102,7 @@ static int config_access(unsigned char a
 		return -1;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	au_writel(((0x2000 << 16) | (au_readl(Au1500_PCI_STATCMD) & 0xffff)),
 			Au1500_PCI_STATCMD);
 	au_sync_udelay(1);
@@ -135,7 +135,7 @@ static int config_access(unsigned char a
 	if (board_pci_idsel) {
 		if (board_pci_idsel(device, 1) == 0) {
 			*data = 0xffffffff;
-			local_irq_restore(flags);
+			raw_local_irq_restore(flags);
 			return -1;
 		}
 	}
@@ -194,7 +194,7 @@ static int config_access(unsigned char a
 		(void)board_pci_idsel(device, 0);
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return error;
 #endif
 }
Index: linux/arch/mips/pmc-sierra/yosemite/smp.c
===================================================================
--- linux.orig/arch/mips/pmc-sierra/yosemite/smp.c
+++ linux/arch/mips/pmc-sierra/yosemite/smp.c
@@ -19,7 +19,7 @@ static unsigned char launchstack[LAUNCHS
 
 static void __init prom_smp_bootstrap(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	while (spin_is_locked(&launch_lock));
 
Index: linux/arch/mips/sgi-ip22/ip22-eisa.c
===================================================================
--- linux.orig/arch/mips/sgi-ip22/ip22-eisa.c
+++ linux/arch/mips/sgi-ip22/ip22-eisa.c
@@ -107,13 +107,13 @@ static void enable_eisa1_irq(unsigned in
 	unsigned long flags;
 	u8 mask;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	mask = EISA_READ_8(EISA_INT1_MASK);
 	mask &= ~((u8) (1 << irq));
 	EISA_WRITE_8(EISA_INT1_MASK, mask);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_eisa1_irq(unsigned int irq)
@@ -169,13 +169,13 @@ static void enable_eisa2_irq(unsigned in
 	unsigned long flags;
 	u8 mask;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	mask = EISA_READ_8(EISA_INT2_MASK);
 	mask &= ~((u8) (1 << (irq - 8)));
 	EISA_WRITE_8(EISA_INT2_MASK, mask);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_eisa2_irq(unsigned int irq)
Index: linux/arch/mips/sgi-ip22/ip22-int.c
===================================================================
--- linux.orig/arch/mips/sgi-ip22/ip22-int.c
+++ linux/arch/mips/sgi-ip22/ip22-int.c
@@ -44,12 +44,12 @@ static void enable_local0_irq(unsigned i
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* don't allow mappable interrupt to be enabled from setup_irq,
 	 * we have our own way to do so */
 	if (irq != SGI_MAP_0_IRQ)
 		sgint->imask0 |= (1 << (irq - SGINT_LOCAL0));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_local0_irq(unsigned int irq)
@@ -62,9 +62,9 @@ static void disable_local0_irq(unsigned 
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	sgint->imask0 &= ~(1 << (irq - SGINT_LOCAL0));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #define shutdown_local0_irq	disable_local0_irq
@@ -90,12 +90,12 @@ static void enable_local1_irq(unsigned i
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	/* don't allow mappable interrupt to be enabled from setup_irq,
 	 * we have our own way to do so */
 	if (irq != SGI_MAP_1_IRQ)
 		sgint->imask1 |= (1 << (irq - SGINT_LOCAL1));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_local1_irq(unsigned int irq)
@@ -108,9 +108,9 @@ void disable_local1_irq(unsigned int irq
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	sgint->imask1 &= ~(1 << (irq - SGINT_LOCAL1));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #define shutdown_local1_irq	disable_local1_irq
@@ -136,10 +136,10 @@ static void enable_local2_irq(unsigned i
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	sgint->imask0 |= (1 << (SGI_MAP_0_IRQ - SGINT_LOCAL0));
 	sgint->cmeimask0 |= (1 << (irq - SGINT_LOCAL2));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_local2_irq(unsigned int irq)
@@ -152,11 +152,11 @@ void disable_local2_irq(unsigned int irq
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	sgint->cmeimask0 &= ~(1 << (irq - SGINT_LOCAL2));
 	if (!sgint->cmeimask0)
 		sgint->imask0 &= ~(1 << (SGI_MAP_0_IRQ - SGINT_LOCAL0));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #define shutdown_local2_irq disable_local2_irq
@@ -182,10 +182,10 @@ static void enable_local3_irq(unsigned i
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	sgint->imask1 |= (1 << (SGI_MAP_1_IRQ - SGINT_LOCAL1));
 	sgint->cmeimask1 |= (1 << (irq - SGINT_LOCAL3));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static unsigned int startup_local3_irq(unsigned int irq)
@@ -198,11 +198,11 @@ void disable_local3_irq(unsigned int irq
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	sgint->cmeimask1 &= ~(1 << (irq - SGINT_LOCAL3));
 	if (!sgint->cmeimask1)
 		sgint->imask1 &= ~(1 << (SGI_MAP_1_IRQ - SGINT_LOCAL1));
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #define shutdown_local3_irq disable_local3_irq
Index: linux/arch/mips/sgi-ip22/ip22-reset.c
===================================================================
--- linux.orig/arch/mips/sgi-ip22/ip22-reset.c
+++ linux/arch/mips/sgi-ip22/ip22-reset.c
@@ -66,7 +66,7 @@ static void sgi_machine_power_off(void)
 {
 	unsigned int tmp;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Disable watchdog */
 	tmp = hpc3c0->rtcregs[RTC_CMD] & 0xff;
Index: linux/arch/mips/sgi-ip27/ip27-smp.c
===================================================================
--- linux.orig/arch/mips/sgi-ip27/ip27-smp.c
+++ linux/arch/mips/sgi-ip27/ip27-smp.c
@@ -188,7 +188,7 @@ void __init prom_boot_secondary(int cpu,
 void prom_init_secondary(void)
 {
 	per_cpu_init();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 void __init prom_cpus_done(void)
Index: linux/arch/mips/sgi-ip32/ip32-irq.c
===================================================================
--- linux.orig/arch/mips/sgi-ip32/ip32-irq.c
+++ linux/arch/mips/sgi-ip32/ip32-irq.c
@@ -163,14 +163,13 @@ static void end_cpu_irq(unsigned int irq
 #define mask_and_ack_cpu_irq disable_cpu_irq
 
 static struct hw_interrupt_type ip32_cpu_interrupt = {
-	"IP32 CPU",
-	startup_cpu_irq,
-	shutdown_cpu_irq,
-	enable_cpu_irq,
-	disable_cpu_irq,
-	mask_and_ack_cpu_irq,
-	end_cpu_irq,
-	NULL
+	.typename = "IP32 CPU",
+	.startup = startup_cpu_irq,
+	.shutdown = shutdown_cpu_irq,
+	.enable = enable_cpu_irq,
+	.disable = disable_cpu_irq,
+	.ack = mask_and_ack_cpu_irq,
+	.end = end_cpu_irq,
 };
 
 /*
@@ -234,14 +233,13 @@ static void end_crime_irq(unsigned int i
 #define shutdown_crime_irq disable_crime_irq
 
 static struct hw_interrupt_type ip32_crime_interrupt = {
-	"IP32 CRIME",
-	startup_crime_irq,
-	shutdown_crime_irq,
-	enable_crime_irq,
-	disable_crime_irq,
-	mask_and_ack_crime_irq,
-	end_crime_irq,
-	NULL
+	.typename = "IP32 CRIME",
+	.startup = startup_crime_irq,
+	.shutdown = shutdown_crime_irq,
+	.enable = enable_crime_irq,
+	.disable = disable_crime_irq,
+	.ack = mask_and_ack_crime_irq,
+	.end = end_crime_irq,
 };
 
 /*
@@ -294,14 +292,13 @@ static void end_macepci_irq(unsigned int
 #define mask_and_ack_macepci_irq disable_macepci_irq
 
 static struct hw_interrupt_type ip32_macepci_interrupt = {
-	"IP32 MACE PCI",
-	startup_macepci_irq,
-	shutdown_macepci_irq,
-	enable_macepci_irq,
-	disable_macepci_irq,
-	mask_and_ack_macepci_irq,
-	end_macepci_irq,
-	NULL
+	.typename = "IP32 MACE PCI",
+	.startup = startup_macepci_irq,
+	.shutdown = shutdown_macepci_irq,
+	.enable = enable_macepci_irq,
+	.disable = disable_macepci_irq,
+	.ack = mask_and_ack_macepci_irq,
+	.end = end_macepci_irq,
 };
 
 /* This is used for MACE ISA interrupts.  That means bits 4-6 in the
@@ -425,14 +422,13 @@ static void end_maceisa_irq(unsigned irq
 #define shutdown_maceisa_irq disable_maceisa_irq
 
 static struct hw_interrupt_type ip32_maceisa_interrupt = {
-	"IP32 MACE ISA",
-	startup_maceisa_irq,
-	shutdown_maceisa_irq,
-	enable_maceisa_irq,
-	disable_maceisa_irq,
-	mask_and_ack_maceisa_irq,
-	end_maceisa_irq,
-	NULL
+	.typename = "IP32 MACE ISA",
+	.startup = startup_maceisa_irq,
+	.shutdown = shutdown_maceisa_irq,
+	.enable = enable_maceisa_irq,
+	.disable = disable_maceisa_irq,
+	.ack = mask_and_ack_maceisa_irq,
+	.end = end_maceisa_irq,
 };
 
 /* This is used for regular non-ISA, non-PCI MACE interrupts.  That means
@@ -476,14 +472,13 @@ static void end_mace_irq(unsigned int ir
 #define mask_and_ack_mace_irq disable_mace_irq
 
 static struct hw_interrupt_type ip32_mace_interrupt = {
-	"IP32 MACE",
-	startup_mace_irq,
-	shutdown_mace_irq,
-	enable_mace_irq,
-	disable_mace_irq,
-	mask_and_ack_mace_irq,
-	end_mace_irq,
-	NULL
+	.typename = "IP32 MACE",
+	.startup = startup_mace_irq,
+	.shutdown = shutdown_mace_irq,
+	.enable = enable_mace_irq,
+	.disable = disable_mace_irq,
+	.ack = mask_and_ack_mace_irq,
+	.end = end_mace_irq,
 };
 
 static void ip32_unknown_interrupt(struct pt_regs *regs)
Index: linux/arch/mips/sibyte/sb1250/irq.c
===================================================================
--- linux.orig/arch/mips/sibyte/sb1250/irq.c
+++ linux/arch/mips/sibyte/sb1250/irq.c
@@ -71,24 +71,22 @@ extern char sb1250_duart_present[];
 #endif
 
 static struct hw_interrupt_type sb1250_irq_type = {
-	"SB1250-IMR",
-	startup_sb1250_irq,
-	shutdown_sb1250_irq,
-	enable_sb1250_irq,
-	disable_sb1250_irq,
-	ack_sb1250_irq,
-	end_sb1250_irq,
+	.typename = "SB1250-IMR",
+	.startup = startup_sb1250_irq,
+	.shutdown = shutdown_sb1250_irq,
+	.enable = enable_sb1250_irq,
+	.disable = disable_sb1250_irq,
+	.ack = ack_sb1250_irq,
+	.end = end_sb1250_irq,
 #ifdef CONFIG_SMP
-	sb1250_set_affinity
-#else
-	NULL
+	.set_affinity = sb1250_set_affinity
 #endif
 };
 
 /* Store the CPU id (not the logical number) */
 int sb1250_irq_owner[SB1250_NR_IRQS];
 
-DEFINE_SPINLOCK(sb1250_imr_lock);
+DEFINE_RAW_SPINLOCK(sb1250_imr_lock);
 
 void sb1250_mask_irq(int cpu, int irq)
 {
@@ -276,7 +274,7 @@ static irqreturn_t  sb1250_dummy_handler
 
 static struct irqaction sb1250_dummy_action = {
 	.handler = sb1250_dummy_handler,
-	.flags   = 0,
+	.flags   = SA_NODELAY,
 	.mask    = CPU_MASK_NONE,
 	.name    = "sb1250-private",
 	.next    = NULL,
Index: linux/arch/mips/sibyte/sb1250/smp.c
===================================================================
--- linux.orig/arch/mips/sibyte/sb1250/smp.c
+++ linux/arch/mips/sibyte/sb1250/smp.c
@@ -59,7 +59,7 @@ void sb1250_smp_finish(void)
 {
 	extern void sb1250_time_init(void);
 	sb1250_time_init();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
Index: linux/arch/mips/sibyte/sb1250/time.c
===================================================================
--- linux.orig/arch/mips/sibyte/sb1250/time.c
+++ linux/arch/mips/sibyte/sb1250/time.c
@@ -115,10 +115,12 @@ void sb1250_timer_interrupt(struct pt_re
 		ll_timer_interrupt(irq, regs);
 	}
 
-	/*
-	 * every CPU should do profiling and process accouting
-	 */
-	ll_local_timer_interrupt(irq, regs);
+	if (cpu != 0) {
+		/*
+		 * every CPU should do profiling and process accounting
+		 */
+		ll_local_timer_interrupt(irq, regs);
+	}
 }
 
 /*
Index: linux/arch/mips/sni/irq.c
===================================================================
--- linux.orig/arch/mips/sni/irq.c
+++ linux/arch/mips/sni/irq.c
@@ -58,14 +58,13 @@ static void end_pciasic_irq(unsigned int
 }
 
 static struct hw_interrupt_type pciasic_irq_type = {
-	"ASIC-PCI",
-	startup_pciasic_irq,
-	shutdown_pciasic_irq,
-	enable_pciasic_irq,
-	disable_pciasic_irq,
-	mask_and_ack_pciasic_irq,
-	end_pciasic_irq,
-	NULL
+	.typename = "ASIC-PCI",
+	.startup = startup_pciasic_irq,
+	.shutdown = shutdown_pciasic_irq,
+	.enable = enable_pciasic_irq,
+	.disable = disable_pciasic_irq,
+	.ack = mask_and_ack_pciasic_irq,
+	.end = end_pciasic_irq,
 };
 
 /*
Index: linux/arch/mips/sni/reset.c
===================================================================
--- linux.orig/arch/mips/sni/reset.c
+++ linux/arch/mips/sni/reset.c
@@ -30,7 +30,7 @@ void sni_machine_restart(char *command)
 
 	/* This does a normal via the keyboard controller like a PC.
 	   We can do that easier ...  */
-	local_irq_disable();
+	raw_local_irq_disable();
 	for (;;) {
 		for (i=0; i<100; i++) {
 			kb_wait();
Index: linux/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c
===================================================================
--- linux.orig/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c
+++ linux/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_irq.c
@@ -669,7 +669,7 @@ void __init arch_init_irq(void)
 {
 	extern void tx4927_irq_init(void);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	tx4927_irq_init();
 	toshiba_rbtx4927_irq_ioc_init();
Index: linux/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_setup.c
===================================================================
--- linux.orig/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_setup.c
+++ linux/arch/mips/tx4927/toshiba_rbtx4927/toshiba_rbtx4927_setup.c
@@ -727,7 +727,7 @@ void toshiba_rbtx4927_restart(char *comm
 	reg_wr08(RBTX4927_SW_RESET_DO, RBTX4927_SW_RESET_DO_SET);
 
 	/* do something passive while waiting for reset */
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1)
 		asm_wait();
 
@@ -738,7 +738,7 @@ void toshiba_rbtx4927_restart(char *comm
 void toshiba_rbtx4927_halt(void)
 {
 	printk(KERN_NOTICE "System Halted\n");
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1) {
 		asm_wait();
 	}
Index: linux/arch/mips/vr41xx/common/pmu.c
===================================================================
--- linux.orig/arch/mips/vr41xx/common/pmu.c
+++ linux/arch/mips/vr41xx/common/pmu.c
@@ -62,7 +62,7 @@ static inline void software_reset(void)
 
 static void vr41xx_restart(char *command)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	software_reset();
 	printk(KERN_NOTICE "\nYou can reset your system\n");
 	while (1) ;
@@ -70,14 +70,14 @@ static void vr41xx_restart(char *command
 
 static void vr41xx_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	printk(KERN_NOTICE "\nYou can turn off the power supply\n");
 	while (1) ;
 }
 
 static void vr41xx_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	printk(KERN_NOTICE "\nYou can turn off the power supply\n");
 	while (1) ;
 }
Index: linux/arch/parisc/kernel/time.c
===================================================================
--- linux.orig/arch/parisc/kernel/time.c
+++ linux/arch/parisc/kernel/time.c
@@ -33,10 +33,6 @@
 
 #include <linux/timex.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /* xtime and wall_jiffies keep wall-clock time */
 extern unsigned long wall_jiffies;
 
Index: linux/arch/ppc/8260_io/enet.c
===================================================================
--- linux.orig/arch/ppc/8260_io/enet.c
+++ linux/arch/ppc/8260_io/enet.c
@@ -116,7 +116,7 @@ struct scc_enet_private {
 	scc_t	*sccp;
 	struct	net_device_stats stats;
 	uint	tx_full;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 static int scc_enet_open(struct net_device *dev);
Index: linux/arch/ppc/8260_io/fcc_enet.c
===================================================================
--- linux.orig/arch/ppc/8260_io/fcc_enet.c
+++ linux/arch/ppc/8260_io/fcc_enet.c
@@ -377,7 +377,7 @@ struct fcc_enet_private {
 	volatile fcc_enet_t	*ep;
 	struct	net_device_stats stats;
 	uint	tx_free;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 #ifdef	CONFIG_USE_MDIO
 	uint	phy_id;
Index: linux/arch/ppc/8xx_io/commproc.c
===================================================================
--- linux.orig/arch/ppc/8xx_io/commproc.c
+++ linux/arch/ppc/8xx_io/commproc.c
@@ -356,7 +356,7 @@ cpm_setbrg(uint brg, uint rate)
 /*
  * dpalloc / dpfree bits.
  */
-static spinlock_t cpm_dpmem_lock;
+static raw_spinlock_t cpm_dpmem_lock;
 /*
  * 16 blocks should be enough to satisfy all requests
  * until the memory subsystem goes up...
Index: linux/arch/ppc/8xx_io/enet.c
===================================================================
--- linux.orig/arch/ppc/8xx_io/enet.c
+++ linux/arch/ppc/8xx_io/enet.c
@@ -144,7 +144,7 @@ struct scc_enet_private {
 	unsigned char *rx_vaddr[RX_RING_SIZE];
 	struct	net_device_stats stats;
 	uint	tx_full;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 };
 
 static int scc_enet_open(struct net_device *dev);
Index: linux/arch/ppc/8xx_io/fec.c
===================================================================
--- linux.orig/arch/ppc/8xx_io/fec.c
+++ linux/arch/ppc/8xx_io/fec.c
@@ -165,7 +165,7 @@ struct fec_enet_private {
 
 	struct	net_device_stats stats;
 	uint	tx_full;
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 #ifdef	CONFIG_USE_MDIO
 	uint	phy_id;
Index: linux/arch/ppc/Kconfig
===================================================================
--- linux.orig/arch/ppc/Kconfig
+++ linux/arch/ppc/Kconfig
@@ -15,13 +15,6 @@ config GENERIC_HARDIRQS
 	bool
 	default y
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-	default y
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
@@ -922,6 +915,14 @@ config HIGHMEM
 
 source kernel/Kconfig.hz
 source kernel/Kconfig.preempt
+
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
 source "mm/Kconfig"
 
 source "fs/Kconfig.binfmt"
Index: linux/arch/ppc/boot/Makefile
===================================================================
--- linux.orig/arch/ppc/boot/Makefile
+++ linux/arch/ppc/boot/Makefile
@@ -11,6 +11,15 @@
 #
 
 CFLAGS	 	+= -fno-builtin -D__BOOTER__ -Iarch/$(ARCH)/boot/include
+
+ifdef CONFIG_MCOUNT
+# do not trace the boot loader
+nullstring :=
+space      := $(nullstring) # end of the line
+pg_flag     = $(nullstring) -pg # end of the line
+CFLAGS     := $(subst ${pg_flag},${space},${CFLAGS})
+endif
+
 HOSTCFLAGS	+= -Iarch/$(ARCH)/boot/include
 
 BOOT_TARGETS	= zImage zImage.initrd znetboot znetboot.initrd
Index: linux/arch/ppc/boot/lib/Makefile
===================================================================
--- linux.orig/arch/ppc/boot/lib/Makefile
+++ linux/arch/ppc/boot/lib/Makefile
@@ -5,19 +5,49 @@
 CFLAGS_kbd.o	:= -Idrivers/char
 CFLAGS_vreset.o := -I$(srctree)/arch/ppc/boot/include
 
-zlib  := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c
-	 
-lib-y += $(zlib:.c=.o) div64.o
-lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o
-
+zlib       := infblock.c infcodes.c inffast.c inflate.c inftrees.c infutil.c
+zlibheader := infblock.h infcodes.h inffast.h inftrees.h infutil.h
+zliblinuxheader := zlib.h zconf.h zutil.h
+
+$(addprefix $(obj)/,$(zlib)): $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader))
+
+src-boot := div64.S
+src-boot += $(zlib)
+#src-boot := $(addprefix $(obj)/, $(src-boot))
+obj-boot := $(addsuffix .o, $(basename $(src-boot)))
 
-# zlib files needs header from their original place
-EXTRA_CFLAGS += -Ilib/zlib_inflate
+BOOTCFLAGS	+= -I$(obj) -I$(srctree)/$(obj) $(CFLAGS)
 
 quiet_cmd_copy_zlib = COPY    $@
-      cmd_copy_zlib = cat $< > $@
+      cmd_copy_zlib = sed "s@__attribute_used__@@;s@.include.<linux/module.h>@@;s@.include.<linux/spinlock.h>@@;s@.*spin.*lock.*@@;s@.*SPINLOCK.*@@;s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@
+
+quiet_cmd_copy_zlibheader = COPY    $@
+      cmd_copy_zlibheader = sed "s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@
+# stddef.h for NULL
+quiet_cmd_copy_zliblinuxheader = COPY    $@
+      cmd_copy_zliblinuxheader = sed "s@.include.<linux/string.h>@@;s@.include.<linux/errno.h>@@;s@<linux/kernel.h>@<stddef.h>@;s@<linux/\([^>]\+\).*@\"\1\"@" $< > $@
 
 $(addprefix $(obj)/,$(zlib)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
 	$(call cmd,copy_zlib)
 
-clean-files := $(zlib)
+$(addprefix $(obj)/,$(zlibheader)): $(obj)/%: $(srctree)/lib/zlib_inflate/%
+	$(call cmd,copy_zlibheader)
+
+$(addprefix $(obj)/,$(zliblinuxheader)): $(obj)/%: $(srctree)/include/linux/%
+	$(call cmd,copy_zliblinuxheader)
+
+clean-files := $(zlib) $(zlibheader) $(zliblinuxheader)
+
+quiet_cmd_bootcc = BOOTCC  $@
+      cmd_bootcc = $(CC) -Wp,-MD,$(depfile) $(BOOTCFLAGS) -c -o $@ $<
+
+quiet_cmd_bootas = BOOTAS  $@
+      cmd_bootas = $(CC) -Wp,-MD,$(depfile) $(BOOTAFLAGS) -c -o $@ $<
+
+$(patsubst %.c,%.o, $(filter %.c, $(src-boot))): %.o: %.c
+	$(call if_changed_dep,bootcc)
+$(patsubst %.S,%.o, $(filter %.S, $(src-boot))): %.o: %.S
+	$(call if_changed_dep,bootas)
+
+lib-y += $(obj-boot)
+lib-$(CONFIG_VGA_CONSOLE) += vreset.o kbd.o
Index: linux/arch/ppc/kernel/dma-mapping.c
===================================================================
--- linux.orig/arch/ppc/kernel/dma-mapping.c
+++ linux/arch/ppc/kernel/dma-mapping.c
@@ -71,7 +71,7 @@ int map_page(unsigned long va, phys_addr
  * This is the page table (2MB) covering uncached, DMA consistent allocations
  */
 static pte_t *consistent_pte;
-static DEFINE_SPINLOCK(consistent_lock);
+static DEFINE_RAW_SPINLOCK(consistent_lock);
 
 /*
  * VM region handling support.
@@ -407,7 +407,7 @@ static inline void __dma_sync_page_highm
 	int nr_segs = 1 + ((size - seg_size) + PAGE_SIZE - 1)/PAGE_SIZE;
 	int seg_nr = 0;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	do {
 		start = (unsigned long)kmap_atomic(page + seg_nr,
@@ -426,7 +426,7 @@ static inline void __dma_sync_page_highm
 		seg_offset = 0;
 	} while (seg_nr < nr_segs);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 #endif /* CONFIG_HIGHMEM */
 
Index: linux/arch/ppc/kernel/entry.S
===================================================================
--- linux.orig/arch/ppc/kernel/entry.S
+++ linux/arch/ppc/kernel/entry.S
@@ -240,7 +240,7 @@ ret_from_syscall:
 	SYNC
 	MTMSRD(r10)
 	lwz	r9,TI_FLAGS(r12)
-	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED)
+	andi.	r0,r9,(_TIF_SYSCALL_T_OR_A|_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne-	syscall_exit_work
 syscall_exit_cont:
 #if defined(CONFIG_4xx) || defined(CONFIG_BOOKE)
@@ -318,7 +318,7 @@ syscall_exit_work:
 	rlwinm	r12,r1,0,0,18	/* current_thread_info() */
 	lwz	r9,TI_FLAGS(r12)
 5:
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne	1f
 	lwz	r5,_MSR(r1)
 	andi.	r5,r5,MSR_PR
@@ -658,7 +658,7 @@ user_exc_return:		/* r10 contains MSR_KE
 	/* Check current_thread_info()->flags */
 	rlwinm	r9,r1,0,0,18
 	lwz	r9,TI_FLAGS(r9)
-	andi.	r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED)
+	andi.	r0,r9,(_TIF_SIGPENDING|_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne	do_work
 
 restore_user:
@@ -876,7 +876,7 @@ load_dbcr0:
 #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */
 
 do_work:			/* r10 contains MSR_KERNEL here */
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	beq	do_user_signal
 
 do_resched:			/* r10 contains MSR_KERNEL here */
@@ -890,7 +890,7 @@ recheck:
 	MTMSRD(r10)		/* disable interrupts */
 	rlwinm	r9,r1,0,0,18
 	lwz	r9,TI_FLAGS(r9)
-	andi.	r0,r9,_TIF_NEED_RESCHED
+	andi.	r0,r9,(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED)
 	bne-	do_resched
 	andi.	r0,r9,_TIF_SIGPENDING
 	beq	restore_user
@@ -1000,3 +1000,85 @@ machine_check_in_rtas:
 	/* XXX load up BATs and panic */
 
 #endif /* CONFIG_PPC_OF */
+
+#ifdef CONFIG_MCOUNT
+
+/*
+ * mcount() is not the same as _mcount().  The callers of mcount() have a
+ * normal context.  The callers of _mcount() do not have a stack frame and
+ * have not saved the "caller saves" registers.
+ */
+_GLOBAL(mcount)
+	stwu	r1,-16(r1)	/* push a 16-byte frame; the old r1 is stored at 0(r1) */
+	mflr	r3	/* r3 = our return address (lr) */
+	lis	r5,mcount_enabled@ha
+	lwz	r5,mcount_enabled@l(r5)	/* r5 = current value of mcount_enabled */
+	stw	r3,20(r1)	/* save lr at 20(r1), i.e. offset 4 from the old r1 (16+4) */
+	cmpwi	r5,0
+	beq	1f	/* mcount_enabled == 0: skip the __trace() call */
+	/* r3 contains lr (eip), put parent lr (parent_eip) in r4 */
+	lwz	r4,16(r1)	/* r4 = word at 0(old r1) */
+	lwz	r4,4(r4)	/* r4 = word at offset 4 from that address */
+	bl	__trace
+1:
+	lwz	r0,20(r1)	/* reload the lr value saved above */
+	mtlr	r0
+	addi	r1,r1,16	/* pop the 16-byte frame */
+	blr
+
+/*
+ * The -pg flag, which is specified in the case of CONFIG_MCOUNT, causes the
+ * C compiler to add a call to _mcount() at the start of each function preamble,
+ * before the stack frame is created.  An example of this preamble code is:
+ *
+ * 	mflr    r0
+ * 	lis     r12,-16354
+ * 	stw     r0,4(r1)
+ * 	addi    r0,r12,-19652
+ * 	bl      0xc00034c8 <_mcount>
+ * 	mflr    r0
+ * 	stwu    r1,-16(r1)
+ */
+_GLOBAL(_mcount)
+#define M_STK_SIZE 48
+	/* Would not expect to need to save cr, but glibc version of */
+	/* _mcount() does, so cautiously saving it here too.         */
+	stwu	r1,-M_STK_SIZE(r1)	/* push an M_STK_SIZE-byte frame */
+	stw	r3, 12(r1)	/* save r3-r6 at 12..24(r1); r3-r5 are overwritten below */
+	stw	r4, 16(r1)
+	stw	r5, 20(r1)
+	stw	r6, 24(r1)
+	mflr	r3		/* will use as first arg to __trace() */
+	mfcr	r4	/* r4 = condition register, stored at 8(r1) below */
+	lis	r5,mcount_enabled@ha
+	lwz	r5,mcount_enabled@l(r5)	/* r5 = current value of mcount_enabled */
+	cmpwi	r5,0	/* tested by the beq below, after the remaining saves */
+	stw	r3, 44(r1)	/* lr */
+	stw	r4,  8(r1)	/* cr */
+	stw	r7, 28(r1)	/* save r7-r10 at 28..40(r1) */
+	stw	r8, 32(r1)
+	stw	r9, 36(r1)
+	stw	r10,40(r1)
+	beq	1f	/* mcount_enabled == 0: skip the __trace() call */
+	/* r3 contains lr (eip), put parent lr (parent_eip) in r4 */
+	lwz	r4,M_STK_SIZE+4(r1)
+	bl	__trace
+1:
+	lwz	r8,  8(r1)	/* cr */
+	lwz	r9, 44(r1)	/* lr */
+	lwz	r3, 12(r1)	/* restore r3-r5 */
+	lwz	r4, 16(r1)
+	lwz	r5, 20(r1)
+	mtcrf	0xff,r8	/* write the saved cr value back (mask 0xff = all fields) */
+	mtctr	r9	/* ctr = lr value saved at entry; bctr below returns through it */
+	lwz	r0, 52(r1)	/* 52 == M_STK_SIZE+4, the same slot read into r4 above */
+	lwz	r6, 24(r1)	/* restore r6-r10 (r8/r9 were used as scratch just above) */
+	lwz	r7, 28(r1)
+	lwz	r8, 32(r1)
+	lwz	r9, 36(r1)
+	lwz	r10,40(r1)
+	addi	r1,r1,M_STK_SIZE	/* pop the frame */
+	mtlr	r0	/* lr = value loaded from M_STK_SIZE+4(r1) */
+	bctr	/* return via ctr so that lr keeps the value set above */
+
+#endif /* CONFIG_MCOUNT */
Index: linux/arch/ppc/kernel/idle.c
===================================================================
--- linux.orig/arch/ppc/kernel/idle.c
+++ linux/arch/ppc/kernel/idle.c
@@ -40,7 +40,7 @@ void default_idle(void)
 
 	powersave = ppc_md.power_save;
 
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		if (powersave != NULL)
 			powersave();
 #ifdef CONFIG_SMP
@@ -52,8 +52,11 @@ void default_idle(void)
 		}
 #endif
 	}
-	if (need_resched())
-		schedule();
+	if (need_resched()) {
+		raw_local_irq_disable();
+		__schedule();
+		raw_local_irq_enable();
+	}
 	if (cpu_is_offline(cpu) && system_state == SYSTEM_RUNNING)
 		cpu_die();
 }
@@ -63,11 +66,15 @@ void default_idle(void)
  */
 void cpu_idle(void)
 {
-	for (;;)
+	for (;;) {
+		BUG_ON(raw_irqs_disabled());
+		stop_critical_timing();
+		propagate_preempt_locks_value();
 		if (ppc_md.idle != NULL)
 			ppc_md.idle();
 		else
 			default_idle();
+	}
 }
 
 #if defined(CONFIG_SYSCTL) && defined(CONFIG_6xx)
Index: linux/arch/ppc/kernel/irq.c
===================================================================
--- linux.orig/arch/ppc/kernel/irq.c
+++ linux/arch/ppc/kernel/irq.c
@@ -138,6 +138,7 @@ skip:
 void do_IRQ(struct pt_regs *regs)
 {
 	int irq, first = 1;
+
         irq_enter();
 
 	/*
@@ -149,6 +150,7 @@ void do_IRQ(struct pt_regs *regs)
 	 * has already been handled. -- Tom
 	 */
 	while ((irq = ppc_md.get_irq(regs)) >= 0) {
+		trace_special(regs->nip, irq, 0);
 		__do_IRQ(irq, regs);
 		first = 0;
 	}
Index: linux/arch/ppc/kernel/misc.S
===================================================================
--- linux.orig/arch/ppc/kernel/misc.S
+++ linux/arch/ppc/kernel/misc.S
@@ -302,8 +302,8 @@ _GLOBAL(local_save_flags_ptr)
 	nop
 _GLOBAL(local_save_flags_ptr_end)
 
-/* void local_irq_restore(unsigned long flags) */
-_GLOBAL(local_irq_restore)
+/* void __raw_local_irq_restore(unsigned long flags) */
+_GLOBAL(__raw_local_irq_restore)
 /*
  * Just set/clear the MSR_EE bit through restore/flags but do not
  * change anything else.  This is needed by the RT system and makes
@@ -341,9 +341,9 @@ _GLOBAL(local_irq_restore)
 	nop
 	nop
 	nop
-_GLOBAL(local_irq_restore_end)
+_GLOBAL(__raw_local_irq_restore_end)
 
-_GLOBAL(local_irq_disable)
+_GLOBAL(__raw_local_irq_disable)
 	mfmsr	r0		/* Get current interrupt state */
 	rlwinm	r3,r0,16+1,32-1,31	/* Extract old value of 'EE' */
 	rlwinm	r0,r0,0,17,15	/* clear MSR_EE in r0 */
@@ -370,9 +370,9 @@ _GLOBAL(local_irq_disable)
 	nop
 	nop
 	nop
-_GLOBAL(local_irq_disable_end)
+_GLOBAL(__raw_local_irq_disable_end)
 
-_GLOBAL(local_irq_enable)
+_GLOBAL(__raw_local_irq_enable)
 	mfmsr	r3		/* Get current state */
 	ori	r3,r3,MSR_EE	/* Turn on 'EE' bit */
 	SYNC			/* Some chip revs have problems here... */
@@ -399,7 +399,7 @@ _GLOBAL(local_irq_enable)
 	nop
 	nop
 	nop
-_GLOBAL(local_irq_enable_end)
+_GLOBAL(__raw_local_irq_enable_end)
 
 /*
  * complement mask on the msr then "or" some values on.
Index: linux/arch/ppc/kernel/ppc_ksyms.c
===================================================================
--- linux.orig/arch/ppc/kernel/ppc_ksyms.c
+++ linux/arch/ppc/kernel/ppc_ksyms.c
@@ -291,9 +291,11 @@ EXPORT_SYMBOL(console_drivers);
 EXPORT_SYMBOL(xmon);
 EXPORT_SYMBOL(xmon_printf);
 #endif
-EXPORT_SYMBOL(__up);
-EXPORT_SYMBOL(__down);
-EXPORT_SYMBOL(__down_interruptible);
+#ifdef CONFIG_ASM_SEMAPHORES
+EXPORT_SYMBOL(__compat_up);
+EXPORT_SYMBOL(__compat_down);
+EXPORT_SYMBOL(__compat_down_interruptible);
+#endif
 
 #if defined(CONFIG_KGDB) || defined(CONFIG_XMON)
 extern void (*debugger)(struct pt_regs *regs);
Index: linux/arch/ppc/kernel/process.c
===================================================================
--- linux.orig/arch/ppc/kernel/process.c
+++ linux/arch/ppc/kernel/process.c
@@ -37,6 +37,8 @@
 #include <linux/kallsyms.h>
 #include <linux/mqueue.h>
 #include <linux/hardirq.h>
+#include <linux/init_task.h>
+#include <linux/fs_struct.h>
 
 #include <asm/pgtable.h>
 #include <asm/uaccess.h>
@@ -52,8 +54,8 @@ struct task_struct *last_task_used_math 
 struct task_struct *last_task_used_altivec = NULL;
 struct task_struct *last_task_used_spe = NULL;
 
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
+static struct fs_struct init_fs = INIT_FS(init_fs);
+static struct files_struct init_files = INIT_FILES(init_files);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
@@ -241,7 +243,7 @@ struct task_struct *__switch_to(struct t
 	unsigned long s;
 	struct task_struct *last;
 
-	local_irq_save(s);
+	raw_local_irq_save(s);
 #ifdef CHECK_STACK
 	check_stack(prev);
 	check_stack(new);
@@ -302,7 +304,7 @@ struct task_struct *__switch_to(struct t
 	new_thread = &new->thread;
 	old_thread = &current->thread;
 	last = _switch(old_thread, new_thread);
-	local_irq_restore(s);
+	raw_local_irq_restore(s);
 	return last;
 }
 
Index: linux/arch/ppc/kernel/semaphore.c
===================================================================
--- linux.orig/arch/ppc/kernel/semaphore.c
+++ linux/arch/ppc/kernel/semaphore.c
@@ -29,7 +29,7 @@
  *	sem->count = tmp;
  *	return old_count;
  */
-static inline int __sem_update_count(struct semaphore *sem, int incr)
+static inline int __sem_update_count(struct compat_semaphore *sem, int incr)
 {
 	int old_count, tmp;
 
@@ -48,7 +48,7 @@ static inline int __sem_update_count(str
 	return old_count;
 }
 
-void __up(struct semaphore *sem)
+void __compat_up(struct compat_semaphore *sem)
 {
 	/*
 	 * Note that we incremented count in up() before we came here,
@@ -70,7 +70,7 @@ void __up(struct semaphore *sem)
  * Thus it is only when we decrement count from some value > 0
  * that we have actually got the semaphore.
  */
-void __sched __down(struct semaphore *sem)
+void __sched __compat_down(struct compat_semaphore *sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -100,7 +100,7 @@ void __sched __down(struct semaphore *se
 	wake_up(&sem->wait);
 }
 
-int __sched __down_interruptible(struct semaphore * sem)
+int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -129,3 +129,8 @@ int __sched __down_interruptible(struct 
 	wake_up(&sem->wait);
 	return retval;
 }
+
+int compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
Index: linux/arch/ppc/kernel/signal.c
===================================================================
--- linux.orig/arch/ppc/kernel/signal.c
+++ linux/arch/ppc/kernel/signal.c
@@ -705,6 +705,14 @@ int do_signal(sigset_t *oldset, struct p
 	unsigned long frame, newsp;
 	int signr, ret;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	local_irq_enable();
+	preempt_check_resched();
+#endif
+
 	if (try_to_freeze()) {
 		signr = 0;
 		if (!signal_pending(current))
Index: linux/arch/ppc/kernel/smp-tbsync.c
===================================================================
--- linux.orig/arch/ppc/kernel/smp-tbsync.c
+++ linux/arch/ppc/kernel/smp-tbsync.c
@@ -49,7 +49,7 @@ smp_generic_take_timebase( void )
 {
 	int cmd, tbl, tbu;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	while( !running )
 		;
 	rmb();
@@ -78,7 +78,7 @@ smp_generic_take_timebase( void )
 		}
 		enter_contest( tbsync->mark, -1 );
 	}
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 static int __devinit
@@ -88,7 +88,7 @@ start_contest( int cmd, int offset, int 
 
 	tbsync->cmd = cmd;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	for( i=-3; i<num; ) {
 		tbl = get_tbl() + 400;
 		tbsync->tbu = tbu = get_tbu();
@@ -114,7 +114,7 @@ start_contest( int cmd, int offset, int 
 		if( i++ > 0 )
 			score += tbsync->race_result;
 	}
-	local_irq_enable();
+	raw_local_irq_enable();
 	return score;
 }
 
Index: linux/arch/ppc/kernel/smp.c
===================================================================
--- linux.orig/arch/ppc/kernel/smp.c
+++ linux/arch/ppc/kernel/smp.c
@@ -138,6 +138,16 @@ void smp_send_reschedule(int cpu)
 	smp_message_pass(cpu, PPC_MSG_RESCHEDULE, 0, 0);
 }
 
+/*
+ * Send a 'reschedule' IPI to all other CPUs.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them.
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	smp_message_pass(MSG_ALL_BUT_SELF, PPC_MSG_RESCHEDULE, 0, 0);
+}
+
 #ifdef CONFIG_XMON
 void smp_send_xmon_break(int cpu)
 {
@@ -147,7 +157,7 @@ void smp_send_xmon_break(int cpu)
 
 static void stop_this_cpu(void *dummy)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1)
 		;
 }
@@ -162,7 +172,7 @@ void smp_send_stop(void)
  * static memory requirements. It also looks cleaner.
  * Stolen from the i386 version.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 static struct call_data_struct {
 	void (*func) (void *info);
@@ -197,7 +207,7 @@ int smp_call_function(void (*func) (void
 	if (num_online_cpus() <= 1)
 		return 0;
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	WARN_ON(raw_irqs_disabled());
 	return __smp_call_function(func, info, wait, MSG_ALL_BUT_SELF);
 }
 
@@ -357,7 +367,7 @@ int __devinit start_secondary(void *unus
 	cpu_set(cpu, cpu_online_map);
 	spin_unlock(&call_lock);
 
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	cpu_idle();
 	return 0;
Index: linux/arch/ppc/kernel/temp.c
===================================================================
--- linux.orig/arch/ppc/kernel/temp.c
+++ linux/arch/ppc/kernel/temp.c
@@ -142,7 +142,7 @@ static void tau_timeout(void * info)
 	int shrink;
 
 	/* disabling interrupts *should* be okay */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	cpu = smp_processor_id();
 
 #ifndef CONFIG_TAU_INT
@@ -185,7 +185,7 @@ static void tau_timeout(void * info)
 	 */
 	mtspr(SPRN_THRM3, THRM3_SITV(500*60) | THRM3_E);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void tau_timeout_smp(unsigned long unused)
Index: linux/arch/ppc/kernel/time.c
===================================================================
--- linux.orig/arch/ppc/kernel/time.c
+++ linux/arch/ppc/kernel/time.c
@@ -66,10 +66,8 @@
 
 #include <asm/time.h>
 
-/* XXX false sharing with below? */
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
+unsigned long cpu_khz;   /* Timebase ticks per ms; set by the platform *_calibrate_decr() code */
+EXPORT_SYMBOL(cpu_khz);
 
 unsigned long disarm_decr[NR_CPUS];
 
@@ -91,7 +89,7 @@ extern unsigned long wall_jiffies;
 /* used for timezone offset */
 static long timezone_offset;
 
-DEFINE_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(rtc_lock);
 
 EXPORT_SYMBOL(rtc_lock);
 
@@ -109,7 +107,7 @@ static inline int tb_delta(unsigned *jif
 }
 
 #ifdef CONFIG_SMP
-unsigned long profile_pc(struct pt_regs *regs)
+unsigned long notrace profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
 
Index: linux/arch/ppc/kernel/traps.c
===================================================================
--- linux.orig/arch/ppc/kernel/traps.c
+++ linux/arch/ppc/kernel/traps.c
@@ -72,7 +72,7 @@ void (*debugger_fault_handler)(struct pt
  * Trap & Exception support
  */
 
-DEFINE_SPINLOCK(die_lock);
+DEFINE_RAW_SPINLOCK(die_lock);
 
 void die(const char * str, struct pt_regs * fp, long err)
 {
@@ -113,6 +113,10 @@ void _exception(int signr, struct pt_reg
 		debugger(regs);
 		die("Exception in kernel mode", regs, signr);
 	}
+#ifdef CONFIG_PREEMPT_RT
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
 	info.si_signo = signr;
 	info.si_errno = 0;
 	info.si_code = code;
Index: linux/arch/ppc/lib/locks.c
===================================================================
--- linux.orig/arch/ppc/lib/locks.c
+++ linux/arch/ppc/lib/locks.c
@@ -43,7 +43,7 @@ static inline unsigned long __spin_trylo
 	return ret;
 }
 
-void _raw_spin_lock(spinlock_t *lock)
+void __raw_spin_lock(raw_spinlock_t *lock)
 {
 	int cpu = smp_processor_id();
 	unsigned int stuck = INIT_STUCK;
@@ -63,9 +63,9 @@ void _raw_spin_lock(spinlock_t *lock)
 	lock->owner_pc = (unsigned long)__builtin_return_address(0);
 	lock->owner_cpu = cpu;
 }
-EXPORT_SYMBOL(_raw_spin_lock);
+EXPORT_SYMBOL(__raw_spin_lock);
 
-int _raw_spin_trylock(spinlock_t *lock)
+int __raw_spin_trylock(raw_spinlock_t *lock)
 {
 	if (__spin_trylock(&lock->lock))
 		return 0;
@@ -73,9 +73,9 @@ int _raw_spin_trylock(spinlock_t *lock)
 	lock->owner_pc = (unsigned long)__builtin_return_address(0);
 	return 1;
 }
-EXPORT_SYMBOL(_raw_spin_trylock);
+EXPORT_SYMBOL(__raw_spin_trylock);
 
-void _raw_spin_unlock(spinlock_t *lp)
+void __raw_spin_unlock(raw_spinlock_t *lp)
 {
   	if ( !lp->lock )
 		printk("_spin_unlock(%p): no lock cpu %d curr PC %p %s/%d\n",
@@ -89,13 +89,13 @@ void _raw_spin_unlock(spinlock_t *lp)
 	wmb();
 	lp->lock = 0;
 }
-EXPORT_SYMBOL(_raw_spin_unlock);
+EXPORT_SYMBOL(__raw_spin_unlock);
 
 /*
  * For rwlocks, zero is unlocked, -1 is write-locked,
  * positive is read-locked.
  */
-static __inline__ int __read_trylock(rwlock_t *rw)
+static __inline__ int __read_trylock(raw_rwlock_t *rw)
 {
 	signed int tmp;
 
@@ -115,13 +115,13 @@ static __inline__ int __read_trylock(rwl
 	return tmp;
 }
 
-int _raw_read_trylock(rwlock_t *rw)
+int __raw_read_trylock(raw_rwlock_t *rw)
 {
 	return __read_trylock(rw) > 0;
 }
-EXPORT_SYMBOL(_raw_read_trylock);
+EXPORT_SYMBOL(__raw_read_trylock);
 
-void _raw_read_lock(rwlock_t *rw)
+void __raw_read_lock(raw_rwlock_t *rw)
 {
 	unsigned int stuck;
 
@@ -136,9 +136,9 @@ void _raw_read_lock(rwlock_t *rw)
 		}
 	}
 }
-EXPORT_SYMBOL(_raw_read_lock);
+EXPORT_SYMBOL(__raw_read_lock);
 
-void _raw_read_unlock(rwlock_t *rw)
+void __raw_read_unlock(raw_rwlock_t *rw)
 {
 	if ( rw->lock == 0 )
 		printk("_read_unlock(): %s/%d (nip %08lX) lock %d\n",
@@ -147,9 +147,9 @@ void _raw_read_unlock(rwlock_t *rw)
 	wmb();
 	atomic_dec((atomic_t *) &(rw)->lock);
 }
-EXPORT_SYMBOL(_raw_read_unlock);
+EXPORT_SYMBOL(__raw_read_unlock);
 
-void _raw_write_lock(rwlock_t *rw)
+void __raw_write_lock(raw_rwlock_t *rw)
 {
 	unsigned int stuck;
 
@@ -165,18 +165,18 @@ void _raw_write_lock(rwlock_t *rw)
 	}
 	wmb();
 }
-EXPORT_SYMBOL(_raw_write_lock);
+EXPORT_SYMBOL(__raw_write_lock);
 
-int _raw_write_trylock(rwlock_t *rw)
+int __raw_write_trylock(raw_rwlock_t *rw)
 {
 	if (cmpxchg(&rw->lock, 0, -1) != 0)
 		return 0;
 	wmb();
 	return 1;
 }
-EXPORT_SYMBOL(_raw_write_trylock);
+EXPORT_SYMBOL(__raw_write_trylock);
 
-void _raw_write_unlock(rwlock_t *rw)
+void __raw_write_unlock(raw_rwlock_t *rw)
 {
 	if (rw->lock >= 0)
 		printk("_write_lock(): %s/%d (nip %08lX) lock %d\n",
@@ -185,6 +185,6 @@ void _raw_write_unlock(rwlock_t *rw)
 	wmb();
 	rw->lock = 0;
 }
-EXPORT_SYMBOL(_raw_write_unlock);
+EXPORT_SYMBOL(__raw_write_unlock);
 
 #endif
Index: linux/arch/ppc/mm/fault.c
===================================================================
--- linux.orig/arch/ppc/mm/fault.c
+++ linux/arch/ppc/mm/fault.c
@@ -92,7 +92,7 @@ static int store_updates_sp(struct pt_re
  * the error_code parameter is ESR for a data fault, 0 for an instruction
  * fault.
  */
-int do_page_fault(struct pt_regs *regs, unsigned long address,
+int notrace do_page_fault(struct pt_regs *regs, unsigned long address,
 		  unsigned long error_code)
 {
 	struct vm_area_struct * vma;
Index: linux/arch/ppc/mm/init.c
===================================================================
--- linux.orig/arch/ppc/mm/init.c
+++ linux/arch/ppc/mm/init.c
@@ -56,7 +56,7 @@
 #endif
 #define MAX_LOW_MEM	CONFIG_LOWMEM_SIZE
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 unsigned long total_memory;
 unsigned long total_lowmem;
Index: linux/arch/ppc/platforms/4xx/xilinx_ml300.c
===================================================================
--- linux.orig/arch/ppc/platforms/4xx/xilinx_ml300.c
+++ linux/arch/ppc/platforms/4xx/xilinx_ml300.c
@@ -62,7 +62,7 @@ static volatile unsigned *powerdown_base
 static void
 xilinx_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	out_be32(powerdown_base, XPAR_POWER_0_POWERDOWN_VALUE);
 	while (1) ;
 }
Index: linux/arch/ppc/platforms/apus_setup.c
===================================================================
--- linux.orig/arch/ppc/platforms/apus_setup.c
+++ linux/arch/ppc/platforms/apus_setup.c
@@ -282,6 +282,7 @@ void apus_calibrate_decr(void)
 	       freq/1000000, freq%1000000);
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 
 	__bus_speed = bus_speed;
 	__speed_test_failed = speed_test_failed;
@@ -480,7 +481,7 @@ void cache_clear(__u32 addr, int length)
 void
 apus_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	APUS_WRITE(APUS_REG_LOCK,
 		   REGLOCK_BLACKMAGICK1|REGLOCK_BLACKMAGICK2);
@@ -598,7 +599,7 @@ int __debug_serinit( void )
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* turn off Rx and Tx interrupts */
 	custom.intena = IF_RBF | IF_TBE;
@@ -606,7 +607,7 @@ int __debug_serinit( void )
 	/* clear any pending interrupt */
 	custom.intreq = IF_RBF | IF_TBE;
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	/*
 	 * set the appropriate directions for the modem control flags,
Index: linux/arch/ppc/platforms/chestnut.c
===================================================================
--- linux.orig/arch/ppc/platforms/chestnut.c
+++ linux/arch/ppc/platforms/chestnut.c
@@ -455,7 +455,7 @@ chestnut_restart(char *cmd)
 {
 	volatile ulong i = 10000000;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
         /*
          * Set CPLD Reg 3 bit 0 to 1 to allow MPP signals on reset to work
@@ -474,7 +474,7 @@ chestnut_restart(char *cmd)
 static void
 chestnut_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for (;;);
 	/* NOTREACHED */
 }
Index: linux/arch/ppc/platforms/chrp_smp.c
===================================================================
--- linux.orig/arch/ppc/platforms/chrp_smp.c
+++ linux/arch/ppc/platforms/chrp_smp.c
@@ -57,7 +57,7 @@ smp_chrp_setup_cpu(int cpu_nr)
 		do_openpic_setup_cpu();
 }
 
-static DEFINE_SPINLOCK(timebase_lock);
+static DEFINE_RAW_SPINLOCK(timebase_lock);
 static unsigned int timebase_upper = 0, timebase_lower = 0;
 
 void __devinit
Index: linux/arch/ppc/platforms/chrp_time.c
===================================================================
--- linux.orig/arch/ppc/platforms/chrp_time.c
+++ linux/arch/ppc/platforms/chrp_time.c
@@ -188,4 +188,5 @@ void __init chrp_calibrate_decr(void)
  	       freq/1000000, freq%1000000);
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 }
Index: linux/arch/ppc/platforms/cpci690.c
===================================================================
--- linux.orig/arch/ppc/platforms/cpci690.c
+++ linux/arch/ppc/platforms/cpci690.c
@@ -321,7 +321,7 @@ cpci690_reset_board(void)
 {
 	u32	i = 10000;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	out_8((cpci690_br_base + CPCI690_BR_SW_RESET), 0x11);
 
 	while (i != 0) i++;
Index: linux/arch/ppc/platforms/ev64260.c
===================================================================
--- linux.orig/arch/ppc/platforms/ev64260.c
+++ linux/arch/ppc/platforms/ev64260.c
@@ -445,7 +445,7 @@ ev64260_platform_notify(struct device *d
 static void
 ev64260_reset_board(void *addr)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* disable and invalidate the L2 cache */
 	_set_L2CR(0);
@@ -513,7 +513,7 @@ ev64260_restart(char *cmd)
 static void
 ev64260_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1);
 	/* NOTREACHED */
 }
@@ -552,6 +552,7 @@ ev64260_calibrate_decr(void)
 
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 
 	return;
 }
Index: linux/arch/ppc/platforms/gemini_setup.c
===================================================================
--- linux.orig/arch/ppc/platforms/gemini_setup.c
+++ linux/arch/ppc/platforms/gemini_setup.c
@@ -302,7 +302,7 @@ void __init gemini_init_l2(void)
 void
 gemini_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	/* make a clean restart, not via the MPIC */
 	_gemini_reboot();
 	for(;;);
@@ -461,6 +461,7 @@ void __init gemini_calibrate_decr(void)
 	divisor = 4;
 	tb_ticks_per_jiffy = freq / HZ / divisor;
 	tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;
 }
 
 unsigned long __init gemini_find_end_of_memory(void)
Index: linux/arch/ppc/platforms/hdpu.c
===================================================================
--- linux.orig/arch/ppc/platforms/hdpu.c
+++ linux/arch/ppc/platforms/hdpu.c
@@ -473,7 +473,7 @@ static void hdpu_reset_board(void)
 
 	hdpu_cpustate_set(CPUSTATE_KERNEL_MAJOR | CPUSTATE_KERNEL_RESET);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Clear all the LEDs */
 	mv64x60_write(&bh, MV64x60_GPP_VALUE_CLR, ((1 << 4) |
@@ -515,7 +515,7 @@ static void hdpu_restart(char *cmd)
 
 static void hdpu_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	hdpu_cpustate_set(CPUSTATE_KERNEL_MAJOR | CPUSTATE_KERNEL_HALT);
 
Index: linux/arch/ppc/platforms/lopec.c
===================================================================
--- linux.orig/arch/ppc/platforms/lopec.c
+++ linux/arch/ppc/platforms/lopec.c
@@ -162,7 +162,7 @@ lopec_restart(char *cmd)
 	reg |= 0x80;
 	*((unsigned char *) LOPEC_SYSSTAT1) = reg;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	while(1);
 #undef LOPEC_SYSSTAT1
 }
@@ -170,7 +170,7 @@ lopec_restart(char *cmd)
 static void
 lopec_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while(1);
 }
 
Index: linux/arch/ppc/platforms/mvme5100.c
===================================================================
--- linux.orig/arch/ppc/platforms/mvme5100.c
+++ linux/arch/ppc/platforms/mvme5100.c
@@ -266,7 +266,7 @@ mvme5100_map_io(void)
 static void
 mvme5100_reset_board(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Set exception prefix high - to the firmware */
 	_nmask_and_or_msr(0, MSR_IP);
@@ -290,7 +290,7 @@ mvme5100_restart(char *cmd)
 static void
 mvme5100_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1);
 }
 
Index: linux/arch/ppc/platforms/pal4_setup.c
===================================================================
--- linux.orig/arch/ppc/platforms/pal4_setup.c
+++ linux/arch/ppc/platforms/pal4_setup.c
@@ -81,7 +81,7 @@ pal4_show_cpuinfo(struct seq_file *m)
 static void
 pal4_restart(char *cmd)
 {
-        local_irq_disable();
+        raw_local_irq_disable();
         __asm__ __volatile__("lis  3,0xfff0\n \
                               ori  3,3,0x100\n \
                               mtspr 26,3\n \
@@ -95,7 +95,7 @@ pal4_restart(char *cmd)
 static void
 pal4_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
Index: linux/arch/ppc/platforms/pmac_cpufreq.c
===================================================================
--- linux.orig/arch/ppc/platforms/pmac_cpufreq.c
+++ linux/arch/ppc/platforms/pmac_cpufreq.c
@@ -285,7 +285,7 @@ static int __pmac pmu_set_cpu_speed(int 
 	asm volatile("mtdec %0" : : "r" (0x7fffffff));
 
 	/* We can now disable MSR_EE */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* Giveup the FPU & vec */
 	enable_kernel_fp();
@@ -341,7 +341,7 @@ static int __pmac pmu_set_cpu_speed(int 
  	openpic_set_priority(pic_prio);
 
 	/* Let interrupts flow again ... */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 #ifdef DEBUG_FREQ
 	debug_calc_bogomips();
Index: linux/arch/ppc/platforms/pmac_feature.c
===================================================================
--- linux.orig/arch/ppc/platforms/pmac_feature.c
+++ linux/arch/ppc/platforms/pmac_feature.c
@@ -63,7 +63,7 @@ extern struct device_node *k2_skiplist[2
  * We use a single global lock to protect accesses. Each driver has
  * to take care of its own locking
  */
-static DEFINE_SPINLOCK(feature_lock  __pmacdata);
+static DEFINE_RAW_SPINLOCK(feature_lock  __pmacdata);
 
 #define LOCK(flags)	spin_lock_irqsave(&feature_lock, flags);
 #define UNLOCK(flags)	spin_unlock_irqrestore(&feature_lock, flags);
Index: linux/arch/ppc/platforms/pmac_nvram.c
===================================================================
--- linux.orig/arch/ppc/platforms/pmac_nvram.c
+++ linux/arch/ppc/platforms/pmac_nvram.c
@@ -80,7 +80,7 @@ static volatile unsigned char *nvram_dat
 static int nvram_mult, is_core_99;
 static int core99_bank = 0;
 static int nvram_partitions[3];
-static DEFINE_SPINLOCK(nv_lock);
+static DEFINE_RAW_SPINLOCK(nv_lock);
 
 extern int pmac_newworld;
 extern int system_running;
Index: linux/arch/ppc/platforms/pmac_pic.c
===================================================================
--- linux.orig/arch/ppc/platforms/pmac_pic.c
+++ linux/arch/ppc/platforms/pmac_pic.c
@@ -68,7 +68,7 @@ static int max_irqs __pmacdata;
 static int max_real_irqs __pmacdata;
 static u32 level_mask[4] __pmacdata;
 
-static DEFINE_SPINLOCK(pmac_pic_lock __pmacdata);
+static DEFINE_RAW_SPINLOCK(pmac_pic_lock __pmacdata);
 
 
 #define GATWICK_IRQ_POOL_SIZE        10
Index: linux/arch/ppc/platforms/pmac_smp.c
===================================================================
--- linux.orig/arch/ppc/platforms/pmac_smp.c
+++ linux/arch/ppc/platforms/pmac_smp.c
@@ -500,8 +500,8 @@ static void __devinit smp_core99_kick_cp
 		return;
 	if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu", 0x346);
 
-	local_irq_save(flags);
-	local_irq_disable();
+	raw_local_irq_save(flags);
+	raw_local_irq_disable();
 
 	/* Save reset vector */
 	save_vector = *vector;
@@ -529,7 +529,7 @@ static void __devinit smp_core99_kick_cp
 	*vector = save_vector;
 	flush_icache_range((unsigned long) vector, (unsigned long) vector + 4);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	if (ppc_md.progress) ppc_md.progress("smp_core99_kick_cpu done", 0x347);
 }
 
@@ -571,7 +571,7 @@ void smp_core99_take_timebase(void)
 		mb();
 
 	/* set our stuff the same as the primary */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	set_dec(1);
 	set_tb(pri_tb_hi, pri_tb_lo);
 	last_jiffy_stamp(smp_processor_id()) = pri_tb_stamp;
@@ -580,7 +580,7 @@ void smp_core99_take_timebase(void)
 	/* tell the primary we're done */
        	sec_tb_reset = 0;
 	mb();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* not __init, called in sleep/wakeup code */
@@ -600,7 +600,7 @@ void smp_core99_give_timebase(void)
 	/* freeze the timebase and read it */
 	/* disable interrupts so the timebase is disabled for the
 	   shortest possible time */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 4);
 	pmac_call_feature(PMAC_FTR_READ_GPIO, NULL, core99_tb_gpio, 0);
 	mb();
@@ -624,7 +624,7 @@ void smp_core99_give_timebase(void)
 	/* Now, restart the timebase by leaving the GPIO to an open collector */
        	pmac_call_feature(PMAC_FTR_WRITE_GPIO, NULL, core99_tb_gpio, 0);
         pmac_call_feature(PMAC_FTR_READ_GPIO, NULL, core99_tb_gpio, 0);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 
Index: linux/arch/ppc/platforms/pmac_time.c
===================================================================
--- linux.orig/arch/ppc/platforms/pmac_time.c
+++ linux/arch/ppc/platforms/pmac_time.c
@@ -197,6 +197,7 @@ via_calibrate_decr(void)
 
 	tb_ticks_per_jiffy = (dstart - dend) / ((6 * HZ)/100);
 	tb_to_us = mulhwu_scale_factor(dstart - dend, 60000);
+	cpu_khz = (dstart - dend) / 60;
 
 	printk(KERN_INFO "via_calibrate_decr: ticks per jiffy = %u (%u ticks)\n",
 	       tb_ticks_per_jiffy, dstart - dend);
@@ -288,4 +289,5 @@ pmac_calibrate_decr(void)
 	       freq/1000000, freq%1000000);
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 }
Index: linux/arch/ppc/platforms/powerpmc250.c
===================================================================
--- linux.orig/arch/ppc/platforms/powerpmc250.c
+++ linux/arch/ppc/platforms/powerpmc250.c
@@ -166,12 +166,13 @@ powerpmc250_calibrate_decr(void)
 
 	tb_ticks_per_jiffy = freq / (HZ * divisor);
 	tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;
 }
 
 static void
 powerpmc250_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	/* Hard reset */
 	writeb(0x11, 0xfe000332);
 	while(1);
@@ -180,7 +181,7 @@ powerpmc250_restart(char *cmd)
 static void
 powerpmc250_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1);
 }
 
Index: linux/arch/ppc/platforms/pplus.c
===================================================================
--- linux.orig/arch/ppc/platforms/pplus.c
+++ linux/arch/ppc/platforms/pplus.c
@@ -607,7 +607,7 @@ static void pplus_restart(char *cmd)
 {
 	unsigned long i = 10000;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* set VIA IDE controller into native mode */
 	pplus_set_VIA_IDE_native();
Index: linux/arch/ppc/platforms/prep_setup.c
===================================================================
--- linux.orig/arch/ppc/platforms/prep_setup.c
+++ linux/arch/ppc/platforms/prep_setup.c
@@ -458,7 +458,7 @@ static void __prep
 prep_restart(char *cmd)
 {
 #define PREP_SP92	0x92	/* Special Port 92 */
-	local_irq_disable(); /* no interrupts */
+	raw_local_irq_disable(); /* no interrupts */
 
 	/* set exception prefix high - to the prom */
 	_nmask_and_or_msr(0, MSR_IP);
@@ -476,7 +476,7 @@ prep_restart(char *cmd)
 static void __prep
 prep_halt(void)
 {
-	local_irq_disable(); /* no interrupts */
+	raw_local_irq_disable(); /* no interrupts */
 
 	/* set exception prefix high - to the prom */
 	_nmask_and_or_msr(0, MSR_IP);
@@ -544,7 +544,7 @@ prep_sig750_poweroff(void)
 {
 	/* tweak the power manager found in most IBM PRePs (except Thinkpads) */
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	/* set exception prefix high - to the prom */
 	_nmask_and_or_msr(0, MSR_IP);
 
@@ -938,6 +938,7 @@ prep_calibrate_decr(void)
 					(freq/divisor)/1000000,
 					(freq/divisor)%1000000);
 			tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000);
+			cpu_khz = (freq / divisor) / 1000;
 			tb_ticks_per_jiffy = freq / HZ / divisor;
 		}
 	}
Index: linux/arch/ppc/platforms/prpmc750.c
===================================================================
--- linux.orig/arch/ppc/platforms/prpmc750.c
+++ linux/arch/ppc/platforms/prpmc750.c
@@ -271,18 +271,19 @@ static void __init prpmc750_calibrate_de
 
 	tb_ticks_per_jiffy = freq / (HZ * divisor);
 	tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;
 }
 
 static void prpmc750_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	writeb(PRPMC750_MODRST_MASK, PRPMC750_MODRST_REG);
 	while (1) ;
 }
 
 static void prpmc750_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1) ;
 }
 
Index: linux/arch/ppc/platforms/prpmc800.c
===================================================================
--- linux.orig/arch/ppc/platforms/prpmc800.c
+++ linux/arch/ppc/platforms/prpmc800.c
@@ -330,6 +330,7 @@ static void __init prpmc800_calibrate_de
 		tb_ticks_per_second = 100000000 / 4;
 		tb_ticks_per_jiffy = tb_ticks_per_second / HZ;
 		tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000);
+		cpu_khz = tb_ticks_per_second / 1000;
 		return;
 	}
 
@@ -370,13 +371,14 @@ static void __init prpmc800_calibrate_de
 	tb_ticks_per_second = (tbl_end - tbl_start) * 2;
 	tb_ticks_per_jiffy = tb_ticks_per_second / HZ;
 	tb_to_us = mulhwu_scale_factor(tb_ticks_per_second, 1000000);
+	cpu_khz = tb_ticks_per_second / 1000;
 }
 
 static void prpmc800_restart(char *cmd)
 {
 	ulong temp;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	temp = in_be32((uint *) HARRIER_MISC_CSR_REG);
 	temp |= HARRIER_RSTOUT;
 	out_be32((uint *) HARRIER_MISC_CSR_REG, temp);
@@ -385,7 +387,7 @@ static void prpmc800_restart(char *cmd)
 
 static void prpmc800_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1) ;
 }
 
Index: linux/arch/ppc/platforms/radstone_ppc7d.c
===================================================================
--- linux.orig/arch/ppc/platforms/radstone_ppc7d.c
+++ linux/arch/ppc/platforms/radstone_ppc7d.c
@@ -175,7 +175,7 @@ static void ppc7d_power_off(void)
 {
 	u32 data;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Ensure that internal MV643XX watchdog is disabled.
 	 * The Disco watchdog uses MPP17 on this hardware.
Index: linux/arch/ppc/platforms/sandpoint.c
===================================================================
--- linux.orig/arch/ppc/platforms/sandpoint.c
+++ linux/arch/ppc/platforms/sandpoint.c
@@ -544,7 +544,7 @@ sandpoint_map_io(void)
 static void
 sandpoint_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Set exception prefix high - to the firmware */
 	_nmask_and_or_msr(0, MSR_IP);
@@ -558,7 +558,7 @@ sandpoint_restart(char *cmd)
 static void
 sandpoint_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);	/* No way to shut power off with software */
 	/* NOTREACHED */
 }
Index: linux/arch/ppc/platforms/sbc82xx.c
===================================================================
--- linux.orig/arch/ppc/platforms/sbc82xx.c
+++ linux/arch/ppc/platforms/sbc82xx.c
@@ -68,7 +68,7 @@ static void sbc82xx_time_init(void)
 
 static volatile char *sbc82xx_i8259_map;
 static char sbc82xx_i8259_mask = 0xff;
-static DEFINE_SPINLOCK(sbc82xx_i8259_lock);
+static DEFINE_RAW_SPINLOCK(sbc82xx_i8259_lock);
 
 static void sbc82xx_i8259_mask_and_ack_irq(unsigned int irq_nr)
 {
Index: linux/arch/ppc/platforms/spruce.c
===================================================================
--- linux.orig/arch/ppc/platforms/spruce.c
+++ linux/arch/ppc/platforms/spruce.c
@@ -150,6 +150,7 @@ spruce_calibrate_decr(void)
 	freq = SPRUCE_BUS_SPEED;
 	tb_ticks_per_jiffy = freq / HZ / divisor;
 	tb_to_us = mulhwu_scale_factor(freq/divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;
 }
 
 static int
@@ -236,7 +237,7 @@ spruce_setup_arch(void)
 static void
 spruce_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* SRR0 has system reset vector, SRR1 has default MSR value */
 	/* rfi restores MSR from SRR1 and sets the PC to the SRR0 value */
Index: linux/arch/ppc/syslib/cpm2_common.c
===================================================================
--- linux.orig/arch/ppc/syslib/cpm2_common.c
+++ linux/arch/ppc/syslib/cpm2_common.c
@@ -114,7 +114,7 @@ cpm2_fastbrg(uint brg, uint rate, int di
 /*
  * dpalloc / dpfree bits.
  */
-static spinlock_t cpm_dpmem_lock;
+static raw_spinlock_t cpm_dpmem_lock;
 /* 16 blocks should be enough to satisfy all requests
  * until the memory subsystem goes up... */
 static rh_block_t cpm_boot_dpmem_rh_block[16];
Index: linux/arch/ppc/syslib/ibm440gx_common.c
===================================================================
--- linux.orig/arch/ppc/syslib/ibm440gx_common.c
+++ linux/arch/ppc/syslib/ibm440gx_common.c
@@ -157,7 +157,7 @@ void __init ibm440gx_l2c_enable(void){
 		return;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	asm volatile ("sync" ::: "memory");
 
 	/* Disable SRAM */
@@ -201,7 +201,7 @@ void __init ibm440gx_l2c_enable(void){
 	mtdcr(DCRN_L2C0_CFG, r);
 
 	asm volatile ("sync; isync" ::: "memory");
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* Disable L2 cache */
@@ -209,7 +209,7 @@ void __init ibm440gx_l2c_disable(void){
 	u32 r;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	asm volatile ("sync" ::: "memory");
 
 	/* Disable L2C mode */
@@ -228,7 +228,7 @@ void __init ibm440gx_l2c_disable(void){
 	      SRAM_SBCR_BAS3 | SRAM_SBCR_BS_64KB | SRAM_SBCR_BU_RW);
 
 	asm volatile ("sync; isync" ::: "memory");
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void __init ibm440gx_l2c_setup(struct ibm44x_clocks* p)
Index: linux/arch/ppc/syslib/ibm44x_common.c
===================================================================
--- linux.orig/arch/ppc/syslib/ibm44x_common.c
+++ linux/arch/ppc/syslib/ibm44x_common.c
@@ -60,6 +60,7 @@ void __init ibm44x_calibrate_decr(unsign
 {
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 
 	/* Set the time base to zero */
 	mtspr(SPRN_TBWL, 0);
@@ -76,19 +77,19 @@ extern void abort(void);
 
 static void ibm44x_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	abort();
 }
 
 static void ibm44x_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
 static void ibm44x_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
Index: linux/arch/ppc/syslib/m8260_pci_erratum9.c
===================================================================
--- linux.orig/arch/ppc/syslib/m8260_pci_erratum9.c
+++ linux/arch/ppc/syslib/m8260_pci_erratum9.c
@@ -132,7 +132,7 @@ idma_pci9_read(u8 *dst, u8 *src, int byt
 	volatile idma_bd_t *bd = &idma_dpram->bd;
 	volatile cpm2_map_t *immap = cpm2_immr;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* initialize IDMA parameter RAM for this transfer */
 	if (sinc)
@@ -161,7 +161,7 @@ idma_pci9_read(u8 *dst, u8 *src, int byt
 	/* wait for transfer to complete */
 	while(bd->flags & IDMA_BD_V);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return;
 }
@@ -184,7 +184,7 @@ idma_pci9_write(u8 *dst, u8 *src, int by
 	volatile idma_bd_t *bd = &idma_dpram->bd;
 	volatile cpm2_map_t *immap = cpm2_immr;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* initialize IDMA parameter RAM for this transfer */
 	if (dinc)
@@ -213,7 +213,7 @@ idma_pci9_write(u8 *dst, u8 *src, int by
 	/* wait for transfer to complete */
 	while(bd->flags & IDMA_BD_V);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	return;
 }
Index: linux/arch/ppc/syslib/m8260_setup.c
===================================================================
--- linux.orig/arch/ppc/syslib/m8260_setup.c
+++ linux/arch/ppc/syslib/m8260_setup.c
@@ -78,6 +78,7 @@ m8260_calibrate_decr(void)
         divisor = 4;
         tb_ticks_per_jiffy = freq / HZ / divisor;
 	tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;
 }
 
 /* The 8260 has an internal 1-second timer update register that
@@ -128,7 +129,7 @@ m8260_restart(char *cmd)
 static void
 m8260_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1);
 }
 
Index: linux/arch/ppc/syslib/m8xx_setup.c
===================================================================
--- linux.orig/arch/ppc/syslib/m8xx_setup.c
+++ linux/arch/ppc/syslib/m8xx_setup.c
@@ -159,6 +159,7 @@ void __init m8xx_calibrate_decr(void)
         printk("Decrementer Frequency = %d/%d\n", freq, divisor);
         tb_ticks_per_jiffy = freq / HZ / divisor;
 	tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000);
+	cpu_khz = (freq / divisor) / 1000;
 
 	/* Perform some more timer/timebase initialization.  This used
 	 * to be done elsewhere, but other changes caused it to get
@@ -234,7 +235,7 @@ m8xx_restart(char *cmd)
 {
 	__volatile__ unsigned char dummy;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	((immap_t *)IMAP_ADDR)->im_clkrst.car_plprcr |= 0x00000080;
 
 	/* Clear the ME bit in MSR to cause checkstop on machine check
Index: linux/arch/ppc/syslib/mpc52xx_setup.c
===================================================================
--- linux.orig/arch/ppc/syslib/mpc52xx_setup.c
+++ linux/arch/ppc/syslib/mpc52xx_setup.c
@@ -40,7 +40,7 @@ mpc52xx_restart(char *cmd)
 {
 	struct mpc52xx_gpt __iomem *gpt0 = MPC52xx_VA(MPC52xx_GPTx_OFFSET(0));
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Turn on the watchdog and wait for it to expire. It effectively
 	  does a reset */
@@ -53,7 +53,7 @@ mpc52xx_restart(char *cmd)
 void
 mpc52xx_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	while (1);
 }
@@ -214,6 +214,7 @@ mpc52xx_calibrate_decr(void)
 
 	tb_ticks_per_jiffy = xlbfreq / HZ / divisor;
 	tb_to_us = mulhwu_scale_factor(xlbfreq / divisor, 1000000);
+	cpu_khz = (xlbfreq / divisor) / 1000;
 }
 
 int mpc52xx_match_psc_function(int psc_idx, const char *func)
Index: linux/arch/ppc/syslib/ocp.c
===================================================================
--- linux.orig/arch/ppc/syslib/ocp.c
+++ linux/arch/ppc/syslib/ocp.c
@@ -45,11 +45,11 @@
 #include <linux/pm.h>
 #include <linux/bootmem.h>
 #include <linux/device.h>
+#include <linux/rwsem.h>
 
 #include <asm/io.h>
 #include <asm/ocp.h>
 #include <asm/errno.h>
-#include <asm/rwsem.h>
 #include <asm/semaphore.h>
 
 //#define DBG(x)	printk x
Index: linux/arch/ppc/syslib/open_pic.c
===================================================================
--- linux.orig/arch/ppc/syslib/open_pic.c
+++ linux/arch/ppc/syslib/open_pic.c
@@ -528,7 +528,7 @@ void openpic_reset_processor_phys(u_int 
 }
 
 #if defined(CONFIG_SMP) || defined(CONFIG_PM)
-static DEFINE_SPINLOCK(openpic_setup_lock);
+static DEFINE_RAW_SPINLOCK(openpic_setup_lock);
 #endif
 
 #ifdef CONFIG_SMP
Index: linux/arch/ppc/syslib/open_pic2.c
===================================================================
--- linux.orig/arch/ppc/syslib/open_pic2.c
+++ linux/arch/ppc/syslib/open_pic2.c
@@ -382,7 +382,7 @@ static void openpic2_set_spurious(u_int 
 			   vec);
 }
 
-static DEFINE_SPINLOCK(openpic2_setup_lock);
+static DEFINE_RAW_SPINLOCK(openpic2_setup_lock);
 
 /*
  *  Initialize a timer interrupt (and disable it)
Index: linux/arch/ppc/syslib/ppc4xx_setup.c
===================================================================
--- linux.orig/arch/ppc/syslib/ppc4xx_setup.c
+++ linux/arch/ppc/syslib/ppc4xx_setup.c
@@ -142,7 +142,7 @@ static void
 ppc4xx_power_off(void)
 {
 	printk("System Halted\n");
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1) ;
 }
 
@@ -150,7 +150,7 @@ static void
 ppc4xx_halt(void)
 {
 	printk("System Halted\n");
-	local_irq_disable();
+	raw_local_irq_disable();
 	while (1) ;
 }
 
@@ -173,6 +173,7 @@ ppc4xx_calibrate_decr(void)
 	freq = bip->bi_tbfreq;
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 
 	/* Set the time base to zero.
 	   ** At 200 Mhz, time base will rollover in ~2925 years.
Index: linux/arch/ppc/syslib/ppc83xx_setup.c
===================================================================
--- linux.orig/arch/ppc/syslib/ppc83xx_setup.c
+++ linux/arch/ppc/syslib/ppc83xx_setup.c
@@ -137,7 +137,7 @@ mpc83xx_restart(char *cmd)
 
 	reg = ioremap(BCSR_PHYS_ADDR, BCSR_SIZE);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/*
 	 * Unlock the BCSR bits so a PRST will update the contents.
@@ -166,14 +166,14 @@ mpc83xx_restart(char *cmd)
 void
 mpc83xx_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
 void
 mpc83xx_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
Index: linux/arch/ppc/syslib/ppc85xx_setup.c
===================================================================
--- linux.orig/arch/ppc/syslib/ppc85xx_setup.c
+++ linux/arch/ppc/syslib/ppc85xx_setup.c
@@ -59,6 +59,7 @@ mpc85xx_calibrate_decr(void)
         divisor = 8;
         tb_ticks_per_jiffy = freq / divisor / HZ;
         tb_to_us = mulhwu_scale_factor(freq / divisor, 1000000);
+        cpu_khz = (freq / divisor) / 1000;
 
 	/* Set the time base to zero */
 	mtspr(SPRN_TBWL, 0);
@@ -114,21 +115,21 @@ mpc85xx_early_serial_map(void)
 void
 mpc85xx_restart(char *cmd)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	abort();
 }
 
 void
 mpc85xx_power_off(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
 void
 mpc85xx_halt(void)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	for(;;);
 }
 
Index: linux/arch/ppc/syslib/prom.c
===================================================================
--- linux.orig/arch/ppc/syslib/prom.c
+++ linux/arch/ppc/syslib/prom.c
@@ -1397,7 +1397,7 @@ print_properties(struct device_node *np)
 }
 #endif
 
-static DEFINE_SPINLOCK(rtas_lock);
+static DEFINE_RAW_SPINLOCK(rtas_lock);
 
 /* this can be called after setup -- Cort */
 int __openfirmware
Index: linux/arch/ppc/syslib/todc_time.c
===================================================================
--- linux.orig/arch/ppc/syslib/todc_time.c
+++ linux/arch/ppc/syslib/todc_time.c
@@ -508,6 +508,7 @@ todc_calibrate_decr(void)
 
 	tb_ticks_per_jiffy = freq / HZ;
 	tb_to_us = mulhwu_scale_factor(freq, 1000000);
+	cpu_khz = freq / 1000;
 
 	return;
 }
Index: linux/arch/ppc/xmon/xmon.c
===================================================================
--- linux.orig/arch/ppc/xmon/xmon.c
+++ linux/arch/ppc/xmon/xmon.c
@@ -291,10 +291,10 @@ irqreturn_t
 xmon_irq(int irq, void *d, struct pt_regs *regs)
 {
 	unsigned long flags;
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	printf("Keyboard interrupt\n");
 	xmon(regs);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return IRQ_HANDLED;
 }
 
Index: linux/arch/ppc64/kernel/time.c
===================================================================
--- linux.orig/arch/ppc64/kernel/time.c
+++ linux/arch/ppc64/kernel/time.c
@@ -68,10 +68,6 @@
 #include <asm/systemcfg.h>
 #include <asm/firmware.h>
 
-u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /* keep track of when we need to update the rtc */
 time_t last_rtc_update;
 extern int piranha_simulator;
Index: linux/arch/s390/kernel/time.c
===================================================================
--- linux.orig/arch/s390/kernel/time.c
+++ linux/arch/s390/kernel/time.c
@@ -49,10 +49,6 @@
 
 #define TICK_SIZE tick
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 static ext_int_info_t ext_int_info_cc;
 static u64 init_timer_cc;
 static u64 jiffies_timer_cc;
Index: linux/arch/sh/kernel/time.c
===================================================================
--- linux.orig/arch/sh/kernel/time.c
+++ linux/arch/sh/kernel/time.c
@@ -56,10 +56,6 @@ extern unsigned long wall_jiffies;
 #define TICK_SIZE (tick_nsec / 1000)
 DEFINE_SPINLOCK(tmu0_lock);
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 /* XXX: Can we initialize this in a routine somewhere?  Dreamcast doesn't want
  * these routines anywhere... */
 #ifdef CONFIG_SH_RTC
Index: linux/arch/sh64/kernel/time.c
===================================================================
--- linux.orig/arch/sh64/kernel/time.c
+++ linux/arch/sh64/kernel/time.c
@@ -116,8 +116,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
 static unsigned long tmu_base, rtc_base;
 unsigned long cprc_base;
 
Index: linux/arch/sparc/kernel/time.c
===================================================================
--- linux.orig/arch/sparc/kernel/time.c
+++ linux/arch/sparc/kernel/time.c
@@ -45,10 +45,6 @@
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 DEFINE_SPINLOCK(rtc_lock);
 enum sparc_clock_type sp_clock_typ;
 DEFINE_SPINLOCK(mostek_lock);
Index: linux/arch/sparc64/kernel/time.c
===================================================================
--- linux.orig/arch/sparc64/kernel/time.c
+++ linux/arch/sparc64/kernel/time.c
@@ -55,10 +55,6 @@ unsigned long ds1287_regs = 0UL;
 
 extern unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 static void __iomem *mstk48t08_regs;
 static void __iomem *mstk48t59_regs;
 
Index: linux/arch/um/kernel/time.c
===================================================================
--- linux.orig/arch/um/kernel/time.c
+++ linux/arch/um/kernel/time.c
@@ -114,8 +114,8 @@ void time_init(void)
 	wall_to_monotonic.tv_nsec = -now.tv_nsec;
 }
 
-/* Declared in linux/time.h, which can't be included here */
-extern void clock_was_set(void);
+/* Defined in linux/ktimer.h, which can't be included here */
+#define clock_was_set()		do { } while (0)
 
 void do_gettimeofday(struct timeval *tv)
 {
Index: linux/arch/um/kernel/time_kern.c
===================================================================
--- linux.orig/arch/um/kernel/time_kern.c
+++ linux/arch/um/kernel/time_kern.c
@@ -22,10 +22,6 @@
 #include "mode.h"
 #include "os.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 int hz(void)
 {
 	return(HZ);
Index: linux/arch/v850/kernel/time.c
===================================================================
--- linux.orig/arch/v850/kernel/time.c
+++ linux/arch/v850/kernel/time.c
@@ -26,10 +26,6 @@
 
 #include "mach.h"
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
 #define TICK_SIZE	(tick_nsec / 1000)
 
 /*
Index: linux/arch/x86_64/Kconfig
===================================================================
--- linux.orig/arch/x86_64/Kconfig
+++ linux/arch/x86_64/Kconfig
@@ -24,6 +24,18 @@ config X86
 	bool
 	default y
 
+config GENERIC_TIME
+       bool
+       default y
+
+config GENERIC_TIME_VSYSCALL
+       bool
+       default y
+
+config PARANOID_GENERIC_TIME
+	default y
+	bool "Paranoid Timekeeping Checks"
+
 config SEMAPHORE_SLEEPERS
 	bool
 	default y
@@ -38,13 +50,6 @@ config ISA
 config SBUS
 	bool
 
-config RWSEM_GENERIC_SPINLOCK
-	bool
-	default y
-
-config RWSEM_XCHGADD_ALGORITHM
-	bool
-
 config GENERIC_CALIBRATE_DELAY
 	bool
 	default y
@@ -226,6 +231,14 @@ config SCHED_SMT
 
 source "kernel/Kconfig.preempt"
 
+config RWSEM_GENERIC_SPINLOCK
+	bool
+	depends on PREEMPT_RT
+	default y
+
+config RWSEM_XCHGADD_ALGORITHM
+	bool
+
 config K8_NUMA
        bool "K8 NUMA support"
        select NUMA
Index: linux/arch/x86_64/Kconfig.debug
===================================================================
--- linux.orig/arch/x86_64/Kconfig.debug
+++ linux/arch/x86_64/Kconfig.debug
@@ -33,6 +33,14 @@ config IOMMU_DEBUG
 	 options. See Documentation/x86_64/boot-options.txt for more
 	 details.
 
+config DEBUG_STACKOVERFLOW
+        bool "Check for stack overflows"
+        depends on DEBUG_KERNEL
+        default y
+        help
+          This option will cause messages to be printed if free stack space
+          drops below a certain limit.
+
 config KPROBES
 	bool "Kprobes"
 	depends on DEBUG_KERNEL
Index: linux/arch/x86_64/boot/compressed/misc.c
===================================================================
--- linux.orig/arch/x86_64/boot/compressed/misc.c
+++ linux/arch/x86_64/boot/compressed/misc.c
@@ -114,6 +114,7 @@ static char *vidmem = (char *)0xb8000;
 static int vidport;
 static int lines, cols;
 
+#define ZLIB_INFLATE_NO_INFLATE_LOCK
 #include "../../../../lib/inflate.c"
 
 static void *malloc(int size)
Index: linux/arch/x86_64/ia32/sys_ia32.c
===================================================================
--- linux.orig/arch/x86_64/ia32/sys_ia32.c
+++ linux/arch/x86_64/ia32/sys_ia32.c
@@ -456,6 +456,10 @@ sys32_settimeofday(struct compat_timeval
 	struct timespec kts;
 	struct timezone ktz;
 
+	int ret = timeofday_API_hacks(tv, tz);
+	if (ret != 1)
+		return ret;
+
  	if (tv) {
 		if (get_tv32(&ktv, tv))
 			return -EFAULT;
Index: linux/arch/x86_64/kernel/Makefile
===================================================================
--- linux.orig/arch/x86_64/kernel/Makefile
+++ linux/arch/x86_64/kernel/Makefile
@@ -29,7 +29,7 @@ obj-$(CONFIG_GART_IOMMU)	+= pci-gart.o a
 obj-$(CONFIG_DUMMY_IOMMU)	+= pci-nommu.o pci-dma.o
 obj-$(CONFIG_SWIOTLB)		+= swiotlb.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
-obj-$(CONFIG_X86_PM_TIMER)	+= pmtimer.o
+obj-$(CONFIG_SYSFS)		+= switch2poll.o
 
 obj-$(CONFIG_MODULES)		+= module.o
 
Index: linux/arch/x86_64/kernel/apic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/apic.c
+++ linux/arch/x86_64/kernel/apic.c
@@ -485,10 +485,9 @@ static int lapic_suspend(struct sys_devi
 	apic_pm_state.apic_tmict = apic_read(APIC_TMICT);
 	apic_pm_state.apic_tdcr = apic_read(APIC_TDCR);
 	apic_pm_state.apic_thmr = apic_read(APIC_LVTTHMR);
-	local_save_flags(flags);
-	local_irq_disable();
+	raw_local_irq_save(flags);
 	disable_local_APIC();
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
@@ -503,7 +502,7 @@ static int lapic_resume(struct sys_devic
 	/* XXX: Pavel needs this for S3 resume, but can't explain why */
 	set_fixmap_nocache(FIX_APIC_BASE, APIC_DEFAULT_PHYS_BASE);
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	rdmsr(MSR_IA32_APICBASE, l, h);
 	l &= ~MSR_IA32_APICBASE_BASE;
 	l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
@@ -526,7 +525,7 @@ static int lapic_resume(struct sys_devic
 	apic_write(APIC_LVTERR, apic_pm_state.apic_lvterr);
 	apic_write(APIC_ESR, 0);
 	apic_read(APIC_ESR);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return 0;
 }
 
@@ -680,7 +679,7 @@ static void setup_APIC_timer(unsigned in
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* For some reasons this doesn't work on Simics, so fake it for now */ 
 	if (!strstr(boot_cpu_data.x86_model_id, "Screwdriver")) { 
@@ -710,7 +709,7 @@ static void setup_APIC_timer(unsigned in
 
 	__setup_APIC_LVTT(clocks);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -767,7 +766,7 @@ void __init setup_boot_APIC_clock (void)
 	printk(KERN_INFO "Using local APIC timer interrupts.\n");
 	using_apic_timer = 1;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	calibration_result = calibrate_APIC_clock();
 	/*
@@ -775,14 +774,14 @@ void __init setup_boot_APIC_clock (void)
 	 */
 	setup_APIC_timer(calibration_result);
 
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 void __cpuinit setup_secondary_APIC_clock(void)
 {
-	local_irq_disable(); /* FIXME: Do we need this? --RR */
+	raw_local_irq_disable(); /* FIXME: Do we need this? --RR */
 	setup_APIC_timer(calibration_result);
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 void __cpuinit disable_APIC_timer(void)
Index: linux/arch/x86_64/kernel/early_printk.c
===================================================================
--- linux.orig/arch/x86_64/kernel/early_printk.c
+++ linux/arch/x86_64/kernel/early_printk.c
@@ -206,7 +206,7 @@ static int early_console_initialized = 0
 
 void early_printk(const char *fmt, ...)
 { 
-	char buf[512]; 
+	static char buf[512];
 	int n; 
 	va_list ap;
 
Index: linux/arch/x86_64/kernel/entry.S
===================================================================
--- linux.orig/arch/x86_64/kernel/entry.S
+++ linux/arch/x86_64/kernel/entry.S
@@ -48,6 +48,15 @@
 #define retint_kernel retint_restore_args
 #endif	
 	
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+# define CALL_TRACE_IRQS_ON \
+	push %rbp; \
+	mov %rsp, %rbp; \
+	call trace_irqs_on; \
+	leaveq
+#else
+# define CALL_TRACE_IRQS_ON
+#endif
 /*
  * C code is not supposed to know about undefined top of stack. Every time 
  * a C function with an pt_regs argument is called from the SYSCALL based 
@@ -230,8 +239,8 @@ sysret_check:		
 	/* edx:	work, edi: workmask */	
 sysret_careful:
 	CFI_RESTORE_STATE
-	bt $TIF_NEED_RESCHED,%edx
-	jnc sysret_signal
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx
+	jz sysret_signal
 	sti
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
@@ -252,7 +261,7 @@ sysret_signal:
 	leaq -ARGOFFSET(%rsp),%rdi # &pt_regs -> arg1
 	xorl %esi,%esi # oldset -> arg2
 	call ptregscall_common
-1:	movl $_TIF_NEED_RESCHED,%edi
+1:	movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi
 	jmp sysret_check
 	
 badsys:
@@ -319,8 +328,8 @@ int_with_check:
 	/* First do a reschedule test. */
 	/* edx:	work, edi: workmask */
 int_careful:
-	bt $TIF_NEED_RESCHED,%edx
-	jnc  int_very_careful
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx
+	jz int_very_careful
 	sti
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET 8
@@ -353,7 +362,7 @@ int_signal:
 	movq %rsp,%rdi		# &ptregs -> arg1
 	xorl %esi,%esi		# oldset -> arg2
 	call do_notify_resume
-1:	movl $_TIF_NEED_RESCHED,%edi	
+1:	movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi
 int_restore_rest:
 	RESTORE_REST
 	cli
@@ -554,8 +563,8 @@ bad_iret:
 	/* edi: workmask, edx: work */
 retint_careful:
 	CFI_RESTORE_STATE
-	bt    $TIF_NEED_RESCHED,%edx
-	jnc   retint_signal
+	testl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edx
+	jz    retint_signal
 	sti
 	pushq %rdi
 	CFI_ADJUST_CFA_OFFSET	8
@@ -577,7 +586,7 @@ retint_signal:
 	call do_notify_resume
 	RESTORE_REST
 	cli
-	movl $_TIF_NEED_RESCHED,%edi
+	movl $(_TIF_NEED_RESCHED|_TIF_NEED_RESCHED_DELAYED),%edi
 	GET_THREAD_INFO(%rcx)
 	jmp retint_check
 
@@ -593,6 +602,7 @@ retint_kernel:	
 	bt   $9,EFLAGS-ARGOFFSET(%rsp)	/* interrupts off? */
 	jnc  retint_restore_args
 	call preempt_schedule_irq
+	CALL_TRACE_IRQS_ON
 	jmp exit_intr
 #endif	
 	CFI_ENDPROC
@@ -1038,3 +1048,41 @@ ENTRY(call_softirq)
 	CFI_ADJUST_CFA_OFFSET -8
 	ret
 	CFI_ENDPROC
+
+#ifdef CONFIG_LATENCY_TRACE
+
+ENTRY(mcount)
+	cmpq $0, trace_enabled
+	jz out
+
+	push %rbp
+	mov %rsp,%rbp
+
+	push %r9
+	push %r8
+	push %rdi
+	push %rsi
+	push %rdx
+	push %rcx
+	push %rax
+
+	mov 0x0(%rbp),%rax
+	mov 0x8(%rbp),%rdi
+	mov 0x8(%rax),%rsi
+
+	call   __trace
+
+	pop %rax
+	pop %rcx
+	pop %rdx
+	pop %rsi
+	pop %rdi
+	pop %r8
+	pop %r9
+
+	leaveq
+out:
+	ret
+
+#endif
+
Index: linux/arch/x86_64/kernel/genapic_flat.c
===================================================================
--- linux.orig/arch/x86_64/kernel/genapic_flat.c
+++ linux/arch/x86_64/kernel/genapic_flat.c
@@ -50,8 +50,8 @@ static void flat_send_IPI_mask(cpumask_t
 	unsigned long cfg;
 	unsigned long flags;
 
-	local_save_flags(flags);
-	local_irq_disable();
+	raw_local_save_flags(flags);
+	raw_local_irq_disable();
 
 	/*
 	 * Wait for idle.
@@ -73,7 +73,7 @@ static void flat_send_IPI_mask(cpumask_t
 	 * Send the IPI. The write to APIC_ICR fires this off.
 	 */
 	apic_write(APIC_ICR, cfg);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static void flat_send_IPI_allbutself(int vector)
Index: linux/arch/x86_64/kernel/i8259.c
===================================================================
--- linux.orig/arch/x86_64/kernel/i8259.c
+++ linux/arch/x86_64/kernel/i8259.c
@@ -127,7 +127,7 @@ void (*interrupt[NR_IRQS])(void) = {
  * moves to arch independent land
  */
 
-DEFINE_SPINLOCK(i8259A_lock);
+DEFINE_RAW_SPINLOCK(i8259A_lock);
 
 static void end_8259A_irq (unsigned int irq)
 {
@@ -448,7 +448,7 @@ device_initcall(i8259A_init_sysfs);
  * IRQ2 is cascade interrupt to second interrupt controller
  */
 
-static struct irqaction irq2 = { no_action, 0, CPU_MASK_NONE, "cascade", NULL, NULL};
+static struct irqaction irq2 = { no_action, SA_NODELAY, CPU_MASK_NONE, "cascade", NULL, NULL};
 
 void __init init_ISA_irqs (void)
 {
Index: linux/arch/x86_64/kernel/init_task.c
===================================================================
--- linux.orig/arch/x86_64/kernel/init_task.c
+++ linux/arch/x86_64/kernel/init_task.c
@@ -10,8 +10,8 @@
 #include <asm/pgtable.h>
 #include <asm/desc.h>
 
-static struct fs_struct init_fs = INIT_FS;
-static struct files_struct init_files = INIT_FILES;
+static struct fs_struct init_fs = INIT_FS(init_fs);
+static struct files_struct init_files = INIT_FILES(init_files);
 static struct signal_struct init_signals = INIT_SIGNALS(init_signals);
 static struct sighand_struct init_sighand = INIT_SIGHAND(init_sighand);
 struct mm_struct init_mm = INIT_MM(init_mm);
Index: linux/arch/x86_64/kernel/io_apic.c
===================================================================
--- linux.orig/arch/x86_64/kernel/io_apic.c
+++ linux/arch/x86_64/kernel/io_apic.c
@@ -46,7 +46,7 @@ static int no_timer_check;
 
 int disable_timer_pin_1 __initdata;
 
-static DEFINE_SPINLOCK(ioapic_lock);
+static DEFINE_RAW_SPINLOCK(ioapic_lock);
 
 /*
  * # of IRQ routing registers
@@ -93,6 +93,9 @@ int vector_irq[NR_VECTORS] __read_mostly
 		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
 		reg ACTION;						\
 		io_apic_modify(entry->apic, reg);			\
+		 /* Force POST flush by reading: */			\
+		reg = io_apic_read(entry->apic, 0x10 + R + pin*2);	\
+									\
 		if (!entry->next)					\
 			break;						\
 		entry = irq_2_pin + entry->next;			\
@@ -156,10 +159,8 @@ static void add_pin_to_irq(unsigned int 
 	static void name##_IO_APIC_irq (unsigned int irq)		\
 	__DO_ACTION(R, ACTION, FINAL)
 
-DO_ACTION( __mask,             0, |= 0x00010000, io_apic_sync(entry->apic) )
-						/* mask = 1 */
-DO_ACTION( __unmask,           0, &= 0xfffeffff, )
-						/* mask = 0 */
+DO_ACTION( __mask,             0, |= 0x00010000, ) /* mask = 1 */
+DO_ACTION( __unmask,           0, &= 0xfffeffff, ) /* mask = 0 */
 
 static void mask_IO_APIC_irq (unsigned int irq)
 {
@@ -1273,7 +1274,7 @@ static int __init timer_irq_works(void)
 {
 	unsigned long t1 = jiffies;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 	/* Let ten ticks pass... */
 	mdelay((10 * 1000) / HZ);
 
@@ -1366,12 +1367,50 @@ static unsigned int startup_level_ioapic
 	return 0; /* don't check for pending */
 }
 
+/*
+ * In the preemptible case mask the IRQ first then handle it and ack it.
+ *
+ * (In the non-preemptible case we keep the IRQ unacked in the local APIC
+ * and don't need to do the masking, because the code executes atomically.)
+ */
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+	move_irq(irq);
+	mask_IO_APIC_irq(irq);
+	ack_APIC_irq();
+}
+
+static void end_level_ioapic_irq(unsigned int irq)
+{
+	if (!(irq_desc[irq].status & IRQ_INPROGRESS))
+		unmask_IO_APIC_irq(irq);
+}
+
+static void enable_level_ioapic_irq(unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+}
+
+#else /* !CONFIG_PREEMPT_HARDIRQS */
+
+static void mask_and_ack_level_ioapic_irq(unsigned int irq)
+{
+}
+
 static void end_level_ioapic_irq (unsigned int irq)
 {
 	move_irq(irq);
 	ack_APIC_irq();
 }
 
+static void enable_level_ioapic_irq(unsigned int irq)
+{
+	unmask_IO_APIC_irq(irq);
+}
+#endif /* !CONFIG_PREEMPT_HARDIRQS */
+
 #ifdef CONFIG_PCI_MSI
 static unsigned int startup_edge_ioapic_vector(unsigned int vector)
 {
@@ -1395,6 +1434,13 @@ static unsigned int startup_level_ioapic
 	return startup_level_ioapic_irq (irq);
 }
 
+static void mask_and_ack_level_ioapic_vector (unsigned int vector)
+{
+	int irq = vector_to_irq(vector);
+
+	mask_and_ack_level_ioapic_irq(irq);
+}
+
 static void end_level_ioapic_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
@@ -1403,6 +1449,11 @@ static void end_level_ioapic_vector (uns
 	end_level_ioapic_irq(irq);
 }
 
+static void enable_level_ioapic_vector(unsigned int vector)
+{
+	enable_level_ioapic_irq(vector_to_irq(vector));
+}
+
 static void mask_IO_APIC_vector (unsigned int vector)
 {
 	int irq = vector_to_irq(vector);
Index: linux/arch/x86_64/kernel/irq.c
===================================================================
--- linux.orig/arch/x86_64/kernel/irq.c
+++ linux/arch/x86_64/kernel/irq.c
@@ -129,9 +129,9 @@ void fixup_irqs(cpumask_t map)
 	}
 
 	/* That doesn't seem sufficient.  Give it 1ms. */
-	local_irq_enable();
+	raw_local_irq_enable();
 	mdelay(1);
-	local_irq_disable();
+	raw_local_irq_disable();
 }
 #endif
 
@@ -145,11 +145,11 @@ asmlinkage void do_softirq(void)
  	if (in_interrupt())
  		return;
 
- 	local_irq_save(flags);
+ 	raw_local_irq_save(flags);
  	pending = local_softirq_pending();
  	/* Switch to interrupt stack */
  	if (pending)
 		call_softirq();
- 	local_irq_restore(flags);
+ 	raw_local_irq_restore(flags);
 }
 EXPORT_SYMBOL(do_softirq);
Index: linux/arch/x86_64/kernel/machine_kexec.c
===================================================================
--- linux.orig/arch/x86_64/kernel/machine_kexec.c
+++ linux/arch/x86_64/kernel/machine_kexec.c
@@ -190,7 +190,7 @@ NORET_TYPE void machine_kexec(struct kim
 	relocate_new_kernel_t rnk;
 
 	/* Interrupts aren't acceptable while we reboot */
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Calculate the offsets */
 	page_list = image->head;
Index: linux/arch/x86_64/kernel/nmi.c
===================================================================
--- linux.orig/arch/x86_64/kernel/nmi.c
+++ linux/arch/x86_64/kernel/nmi.c
@@ -43,7 +43,7 @@
  * This is maintained separately from nmi_active because the NMI
  * watchdog may also be driven from the I/O APIC timer.
  */
-static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
+static DEFINE_RAW_SPINLOCK(lapic_nmi_owner_lock);
 static unsigned int lapic_nmi_owner;
 #define LAPIC_NMI_WATCHDOG	(1<<0)
 #define LAPIC_NMI_RESERVED	(1<<1)
@@ -127,7 +127,7 @@ void __cpuinit nmi_watchdog_default(void
 static __init void nmi_cpu_busy(void *data)
 {
 	volatile int *endflag = data;
-	local_irq_enable();
+	raw_local_irq_enable();
 	/* Intentionally don't use cpu_relax here. This is
 	   to make sure that the performance counter really ticks,
 	   even if there is a simulator or similar that catches the
@@ -156,7 +156,7 @@ int __init check_nmi_watchdog (void)
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++)
 		counts[cpu] = cpu_pda[cpu].__nmi_count; 
-	local_irq_enable();
+	raw_local_irq_enable();
 	mdelay((10*1000)/nmi_hz); // wait 10 ticks
 
 	for (cpu = 0; cpu < NR_CPUS; cpu++) {
@@ -466,12 +466,42 @@ void touch_nmi_watchdog (void)
  	touch_softlockup_watchdog();
 }
 
+int nmi_show_regs[NR_CPUS];
+
+void nmi_show_all_regs(void)
+{
+	int i;
+
+	if (nmi_watchdog == NMI_NONE)
+		return;
+	if (system_state != SYSTEM_RUNNING) {
+		printk("nmi_show_all_regs(): system state %d, not doing.\n",
+			system_state);
+		return;
+	}
+	/* flag every online CPU, then spin until each flag is no longer 1 */
+	for_each_online_cpu(i)
+		nmi_show_regs[i] = 1;
+	for_each_online_cpu(i)
+		while (nmi_show_regs[i] == 1)
+			barrier();
+}
+
+static DEFINE_RAW_SPINLOCK(nmi_print_lock);
+
 void nmi_watchdog_tick (struct pt_regs * regs, unsigned reason)
 {
 	int sum;
 	int touched = 0;
+	int cpu = safe_smp_processor_id();
 
 	sum = read_pda(apic_timer_irqs);
+	if (nmi_show_regs[cpu]) {
+		nmi_show_regs[cpu] = 0;
+		spin_lock(&nmi_print_lock);
+		show_regs(regs);
+		spin_unlock(&nmi_print_lock);
+	}
 	if (__get_cpu_var(nmi_touch)) {
 		__get_cpu_var(nmi_touch) = 0;
 		touched = 1;
@@ -483,6 +513,11 @@ void nmi_watchdog_tick (struct pt_regs *
 		 */
 		local_inc(&__get_cpu_var(alert_counter));
 		if (local_read(&__get_cpu_var(alert_counter)) == 5*nmi_hz) {
+			int i;
+
+			for (i = 0; i < NR_CPUS; i++)
+				nmi_show_regs[i] = 1;
+
 			if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT)
 							== NOTIFY_STOP) {
 				local_set(&__get_cpu_var(alert_counter), 0);
Index: linux/arch/x86_64/kernel/pmtimer.c
===================================================================
--- linux.orig/arch/x86_64/kernel/pmtimer.c
+++ /dev/null
@@ -1,101 +0,0 @@
-/* Ported over from i386 by AK, original copyright was:
- *
- * (C) Dominik Brodowski <linux@brodo.de> 2003
- *
- * Driver to use the Power Management Timer (PMTMR) available in some
- * southbridges as primary timing source for the Linux kernel.
- *
- * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
- * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
- *
- * This file is licensed under the GPL v2.
- *
- * Dropped all the hardware bug workarounds for now. Hopefully they
- * are not needed on 64bit chipsets.
- */
-
-#include <linux/jiffies.h>
-#include <linux/kernel.h>
-#include <linux/time.h>
-#include <linux/init.h>
-#include <linux/cpumask.h>
-#include <asm/io.h>
-#include <asm/proto.h>
-#include <asm/msr.h>
-#include <asm/vsyscall.h>
-
-/* The I/O port the PMTMR resides at.
- * The location is detected during setup_arch(),
- * in arch/i386/kernel/acpi/boot.c */
-u32 pmtmr_ioport;
-
-/* value of the Power timer at last timer interrupt */
-static u32 offset_delay;
-static u32 last_pmtmr_tick;
-
-#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
-
-static inline u32 cyc2us(u32 cycles)
-{
-	/* The Power Management Timer ticks at 3.579545 ticks per microsecond.
-	 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
-	 *
-	 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
-	 * easily be multiplied with 286 (=0x11E) without having to fear
-	 * u32 overflows.
-	 */
-	cycles *= 286;
-	return (cycles >> 10);
-}
-
-int pmtimer_mark_offset(void)
-{
-	static int first_run = 1;
-	unsigned long tsc;
-	u32 lost;
-
-	u32 tick = inl(pmtmr_ioport);
-	u32 delta;
-
-	delta = cyc2us((tick - last_pmtmr_tick) & ACPI_PM_MASK);
-
-	last_pmtmr_tick = tick;
-	monotonic_base += delta * NSEC_PER_USEC;
-
-	delta += offset_delay;
-
-	lost = delta / (USEC_PER_SEC / HZ);
-	offset_delay = delta % (USEC_PER_SEC / HZ);
-
-	rdtscll(tsc);
-	vxtime.last_tsc = tsc - offset_delay * cpu_khz;
-
-	/* don't calculate delay for first run,
-	   or if we've got less then a tick */
-	if (first_run || (lost < 1)) {
-		first_run = 0;
-		offset_delay = 0;
-	}
-
-	return lost - 1;
-}
-
-unsigned int do_gettimeoffset_pm(void)
-{
-	u32 now, offset, delta = 0;
-
-	offset = last_pmtmr_tick;
-	now = inl(pmtmr_ioport);
-	delta = (now - offset) & ACPI_PM_MASK;
-
-	return offset_delay + cyc2us(delta);
-}
-
-
-static int __init nopmtimer_setup(char *s)
-{
-	pmtmr_ioport = 0;
-	return 0;
-}
-
-__setup("nopmtimer", nopmtimer_setup);
Index: linux/arch/x86_64/kernel/process.c
===================================================================
--- linux.orig/arch/x86_64/kernel/process.c
+++ linux/arch/x86_64/kernel/process.c
@@ -36,6 +36,7 @@
 #include <linux/utsname.h>
 #include <linux/random.h>
 #include <linux/kprobes.h>
+#include <linux/spinlock.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -60,6 +61,12 @@ static atomic_t hlt_counter = ATOMIC_INI
 unsigned long boot_option_idle_override = 0;
 EXPORT_SYMBOL(boot_option_idle_override);
 
+DEFINE_SPINLOCK(pm_idle_switch_lock);
+EXPORT_SYMBOL_GPL(pm_idle_switch_lock);
+
+int pm_idle_locked = 0;
+EXPORT_SYMBOL_GPL(pm_idle_locked);
+
 /*
  * Powermanagement idle function, if any..
  */
@@ -87,12 +94,13 @@ EXPORT_SYMBOL(enable_hlt);
 void default_idle(void)
 {
 	if (!atomic_read(&hlt_counter)) {
-		local_irq_disable();
-		if (!need_resched())
-			safe_halt();
+		raw_local_irq_disable();
+		if (!need_resched() && !need_resched_delayed())
+			raw_safe_halt();
 		else
-			local_irq_enable();
-	}
+			raw_local_irq_enable();
+	} else
+		raw_local_irq_enable();
 }
 
 /*
@@ -100,11 +108,11 @@ void default_idle(void)
  * to poll the ->need_resched flag instead of waiting for the
  * cross-CPU IPI to arrive. Use this option with caution.
  */
-static void poll_idle (void)
+void poll_idle (void)
 {
 	int oldval;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	/*
 	 * Deal with another CPU just having chosen a thread to
@@ -120,7 +128,7 @@ static void poll_idle (void)
 			"rep; nop;"
 			"je 2b;"
 			: :
-			"i" (_TIF_NEED_RESCHED), 
+			"i" (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_DELAYED),
 			"m" (current_thread_info()->flags));
 		clear_thread_flag(TIF_POLLING_NRFLAG);
 	} else {
@@ -189,7 +197,9 @@ void cpu_idle (void)
 {
 	/* endless idle loop with no priority at all */
 	while (1) {
-		while (!need_resched()) {
+		BUG_ON(raw_irqs_disabled());
+
+		while (!need_resched() && !need_resched_delayed()) {
 			void (*idle)(void);
 
 			if (__get_cpu_var(cpu_idle_state))
@@ -201,10 +211,13 @@ void cpu_idle (void)
 				idle = default_idle;
 			if (cpu_is_offline(smp_processor_id()))
 				play_dead();
+			stop_critical_timing();
+			propagate_preempt_locks_value();
 			idle();
 		}
-
-		schedule();
+		raw_local_irq_disable();
+		__schedule();
+		raw_local_irq_enable();
 	}
 }
 
@@ -217,16 +230,16 @@ void cpu_idle (void)
  */
 static void mwait_idle(void)
 {
-	local_irq_enable();
+	raw_local_irq_enable();
 
-	if (!need_resched()) {
+	if (!need_resched() && !need_resched_delayed()) {
 		set_thread_flag(TIF_POLLING_NRFLAG);
 		do {
 			__monitor((void *)&current_thread_info()->flags, 0, 0);
-			if (need_resched())
+			if (need_resched() || need_resched_delayed())
 				break;
 			__mwait(0, 0);
-		} while (!need_resched());
+		} while (!need_resched() && !need_resched_delayed());
 		clear_thread_flag(TIF_POLLING_NRFLAG);
 	}
 }
@@ -315,7 +328,7 @@ void show_regs(struct pt_regs *regs)
 {
 	printk("CPU %d:", smp_processor_id());
 	__show_regs(regs);
-	show_trace(&regs->rsp);
+	show_trace(current, &regs->rsp);
 }
 
 /*
@@ -334,13 +347,14 @@ void exit_thread(void)
 	kprobe_flush_task(me);
 
 	if (me->thread.io_bitmap_ptr) { 
-		struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+		struct tss_struct *tss;
 
 		kfree(t->io_bitmap_ptr);
 		t->io_bitmap_ptr = NULL;
 		/*
 		 * Careful, clear this in the TSS too:
 		 */
+		tss = &per_cpu(init_tss, get_cpu());
 		memset(tss->io_bitmap, 0xff, t->io_bitmap_max);
 		t->io_bitmap_max = 0;
 		put_cpu();
Index: linux/arch/x86_64/kernel/reboot.c
===================================================================
--- linux.orig/arch/x86_64/kernel/reboot.c
+++ linux/arch/x86_64/kernel/reboot.c
@@ -98,7 +98,7 @@ void machine_shutdown(void)
 	smp_send_stop();
 #endif
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 #ifndef CONFIG_SMP
 	disable_local_APIC();
@@ -106,7 +106,7 @@ void machine_shutdown(void)
 
 	disable_IO_APIC();
 
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 void machine_emergency_restart(void)
Index: linux/arch/x86_64/kernel/signal.c
===================================================================
--- linux.orig/arch/x86_64/kernel/signal.c
+++ linux/arch/x86_64/kernel/signal.c
@@ -417,6 +417,13 @@ int do_signal(struct pt_regs *regs, sigs
 	siginfo_t info;
 	int signr;
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Fully-preemptible kernel does not need interrupts disabled:
+	 */
+	raw_local_irq_enable();
+	preempt_check_resched();
+#endif
 	/*
 	 * We want the common case to go fast, which
 	 * is why we may in certain cases get here from
Index: linux/arch/x86_64/kernel/smp.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smp.c
+++ linux/arch/x86_64/kernel/smp.c
@@ -299,10 +299,20 @@ void smp_send_reschedule(int cpu)
 }
 
 /*
+ * Send a 'reschedule' IPI to every CPU except the calling one.
+ * This is used when RT tasks are starving and other CPUs
+ * might be able to run them.
+ */
+void smp_send_reschedule_allbutself(void)
+{
+	send_IPI_allbutself(RESCHEDULE_VECTOR);
+}
+
+/*
  * Structure and data for smp_call_function(). This is designed to minimise
  * static memory requirements. It also looks cleaner.
  */
-static DEFINE_SPINLOCK(call_lock);
+static DEFINE_RAW_SPINLOCK(call_lock);
 
 struct call_data_struct {
 	void (*func) (void *info);
@@ -456,9 +466,9 @@ void smp_stop_cpu(void)
 	 * Remove this CPU:
 	 */
 	cpu_clear(smp_processor_id(), cpu_online_map);
-	local_irq_disable();
+	raw_local_irq_disable();
 	disable_local_APIC();
-	local_irq_enable(); 
+	raw_local_irq_enable();
 }
 
 static void smp_really_stop_cpu(void *dummy)
@@ -482,9 +492,9 @@ void smp_send_stop(void)
 	if (!nolock)
 		spin_unlock(&call_lock);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	disable_local_APIC();
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 /*
Index: linux/arch/x86_64/kernel/smpboot.c
===================================================================
--- linux.orig/arch/x86_64/kernel/smpboot.c
+++ linux/arch/x86_64/kernel/smpboot.c
@@ -198,7 +198,7 @@ static void __cpuinit smp_store_cpu_info
    latency and low latency is the primary objective here. -AK */
 #define no_cpu_relax() barrier()
 
-static __cpuinitdata DEFINE_SPINLOCK(tsc_sync_lock);
+static __cpuinitdata __DEFINE_RAW_SPINLOCK(tsc_sync_lock);
 static volatile __cpuinitdata unsigned long go[SLAVE + 1];
 static int notscsync __cpuinitdata;
 
@@ -214,7 +214,7 @@ static __cpuinit void sync_master(void *
 
 	go[MASTER] = 0;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	{
 		for (i = 0; i < NUM_ROUNDS*NUM_ITERS; ++i) {
 			while (!go[MASTER])
@@ -223,7 +223,7 @@ static __cpuinit void sync_master(void *
 			rdtscll(go[SLAVE]);
 		}
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -1022,7 +1022,7 @@ int __cpuinit __cpu_up(unsigned int cpu)
 	int err;
 	int apicid = cpu_present_to_apicid(cpu);
 
-	WARN_ON(irqs_disabled());
+	WARN_ON(raw_irqs_disabled());
 
 	Dprintk("++++++++++++++++++++=_---CPU UP  %u\n", cpu);
 
Index: linux/arch/x86_64/kernel/switch2poll.c
===================================================================
--- /dev/null
+++ linux/arch/x86_64/kernel/switch2poll.c
@@ -0,0 +1,141 @@
+/*
+ * Run-time switch between the installed pm_idle routine and poll_idle.
+ *
+ * An "idle" kobject is registered below kernel_subsys and carries a single
+ * "idle_poll" attribute.  Writing "1"/"on" installs poll_idle as pm_idle,
+ * writing "0"/"off" puts back the routine that was saved when polling was
+ * switched on.  Reading returns "on" or "off".
+ */
+#include <linux/module.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/spinlock.h>
+#include <linux/pm.h>
+
+extern void poll_idle (void);
+
+/* Declare a 0644 subsystem attribute wired to <name>_show / <name>_store. */
+#define KERNEL_ATTR_RW(_name) \
+static struct subsys_attribute _name##_attr = \
+	__ATTR(_name, 0644, _name##_show, _name##_store)
+
+static struct idlep_kobject
+{
+	struct kobject kobj;
+	int is_poll;		/* non-zero while this knob has poll_idle installed */
+	void (*idle)(void);	/* pm_idle value to put back on "off" */
+} idle_kobj;
+
+/* Report the knob state as "on" or "off". */
+static ssize_t idle_poll_show(struct subsystem *subsys, char *page)
+{
+	return sprintf(page, "%s\n", (idle_kobj.is_poll ? "on" : "off"));
+}
+
+/*
+ * Parse a write to idle_poll and switch pm_idle accordingly.
+ *
+ * Only the leading characters of buf are compared, so anything after a
+ * recognised keyword (such as a trailing newline) is ignored.  Returns len
+ * on success, -EBUSY while pm_idle_locked is set, and -EINVAL when buf is
+ * neither an "on" nor an "off" request.
+ */
+static ssize_t idle_poll_store(struct subsystem *subsys,
+			       const char *buf, size_t len)
+{
+	unsigned long flags;
+	ssize_t ret = len;
+
+	spin_lock_irqsave(&pm_idle_switch_lock, flags);
+
+	/*
+	 * If power management is handling the idle function,
+	 * then leave it be.
+	 */
+	if (pm_idle_locked) {
+		ret = -EBUSY;
+		goto out;
+	}
+
+	if (strncmp(buf,"1",1)==0 ||
+	    (len >=2 && strncmp(buf,"on",2)==0)) {
+		if (idle_kobj.is_poll != 1) {
+			idle_kobj.is_poll = 1;
+			boot_option_idle_override = 1;
+			idle_kobj.idle = pm_idle;
+			pm_idle = poll_idle;
+		}
+	} else if (strncmp(buf,"0",1)==0 ||
+		   (len >= 3 && strncmp(buf,"off",3)==0)) {
+		if (idle_kobj.is_poll != 0) {
+			boot_option_idle_override = 0;
+			idle_kobj.is_poll = 0;
+			pm_idle = idle_kobj.idle;
+		}
+	} else {
+		ret = -EINVAL;
+	}
+
+out:
+	spin_unlock_irqrestore(&pm_idle_switch_lock, flags);
+
+	return ret;
+}
+
+KERNEL_ATTR_RW(idle_poll);
+
+static struct attribute * idle_attrs[] = {
+	&idle_poll_attr.attr,
+	NULL
+};
+
+static struct attribute_group idle_attr_group = {
+	.attrs = idle_attrs,
+};
+
+/*
+ * Register the "idle" kobject and its idle_poll attribute.
+ *
+ * Always returns 0: a failure is only logged, the knob is simply absent.
+ */
+static int __init idle_poll_set_init(void)
+{
+	int err;
+
+	/*
+	 * If the default is already poll_idle then
+	 * don't even bother with this.
+	 */
+	if (pm_idle == poll_idle)
+		return 0;
+
+	memset(&idle_kobj, 0, sizeof(idle_kobj));
+
+	idle_kobj.is_poll = 0;
+	idle_kobj.idle = pm_idle;
+
+	err = kobject_set_name(&idle_kobj.kobj, "%s", "idle");
+	if (err)
+		goto out;
+
+	idle_kobj.kobj.parent = &kernel_subsys.kset.kobj;
+	err = kobject_register(&idle_kobj.kobj);
+	if (err)
+		goto out;
+
+	err = sysfs_create_group(&idle_kobj.kobj,
+				 &idle_attr_group);
+	if (err)
+		goto out_unregister;
+
+	return 0;
+
+out_unregister:
+	/* don't leave an attribute-less "idle" directory behind */
+	kobject_unregister(&idle_kobj.kobj);
+out:
+	printk(KERN_INFO "Problem setting up sysfs idle_poll\n");
+	return 0;
+}
+
+late_initcall(idle_poll_set_init);
Index: linux/arch/x86_64/kernel/time.c
===================================================================
--- linux.orig/arch/x86_64/kernel/time.c
+++ linux/arch/x86_64/kernel/time.c
@@ -38,29 +38,27 @@
 #include <asm/sections.h>
 #include <linux/cpufreq.h>
 #include <linux/hpet.h>
+#include <linux/timeofday.h>
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/apic.h>
 #endif
+#include <linux/clocksource.h>
+#include <asm/vsyscall.h>
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-
-EXPORT_SYMBOL(jiffies_64);
-
-#ifdef CONFIG_CPU_FREQ
-static void cpufreq_delayed_get(void);
-#endif
 extern void i8254_timer_resume(void);
 extern int using_apic_timer;
 
-DEFINE_SPINLOCK(rtc_lock);
-DEFINE_SPINLOCK(i8253_lock);
+DEFINE_RAW_SPINLOCK(rtc_lock);
+DEFINE_RAW_SPINLOCK(i8253_lock);
 
 static int nohpet __initdata = 0;
 static int notsc __initdata = 0;
 
 #undef HPET_HACK_ENABLE_DANGEROUS
 
-unsigned int cpu_khz;					/* TSC clocks / usec, not used here */
+unsigned int cpu_khz;					/* CPU clocks / msec */
+unsigned int tsc_khz;					/* TSC clocks / msec */
+unsigned long hpet_address;
 static unsigned long hpet_period;			/* fsecs / HPET clock */
 unsigned long hpet_tick;				/* HPET clocks / interrupt */
 static int hpet_use_timer;
@@ -83,107 +81,6 @@ static inline void rdtscll_sync(unsigned
 	rdtscll(*tsc);
 }
 
-/*
- * do_gettimeoffset() returns microseconds since last timer interrupt was
- * triggered by hardware. A memory read of HPET is slower than a register read
- * of TSC, but much more reliable. It's also synchronized to the timer
- * interrupt. Note that do_gettimeoffset() may return more than hpet_tick, if a
- * timer interrupt has happened already, but vxtime.trigger wasn't updated yet.
- * This is not a problem, because jiffies hasn't updated either. They are bound
- * together by xtime_lock.
- */
-
-static inline unsigned int do_gettimeoffset_tsc(void)
-{
-	unsigned long t;
-	unsigned long x;
-	rdtscll_sync(&t);
-	if (t < vxtime.last_tsc) t = vxtime.last_tsc; /* hack */
-	x = ((t - vxtime.last_tsc) * vxtime.tsc_quot) >> 32;
-	return x;
-}
-
-static inline unsigned int do_gettimeoffset_hpet(void)
-{
-	/* cap counter read to one tick to avoid inconsistencies */
-	unsigned long counter = hpet_readl(HPET_COUNTER) - vxtime.last;
-	return (min(counter,hpet_tick) * vxtime.quot) >> 32;
-}
-
-unsigned int (*do_gettimeoffset)(void) = do_gettimeoffset_tsc;
-
-/*
- * This version of gettimeofday() has microsecond resolution and better than
- * microsecond precision, as we're using at least a 10 MHz (usually 14.31818
- * MHz) HPET timer.
- */
-
-void do_gettimeofday(struct timeval *tv)
-{
-	unsigned long seq, t;
- 	unsigned int sec, usec;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-
-		sec = xtime.tv_sec;
-		usec = xtime.tv_nsec / 1000;
-
-		/* i386 does some correction here to keep the clock 
-		   monotonous even when ntpd is fixing drift.
-		   But they didn't work for me, there is a non monotonic
-		   clock anyways with ntp.
-		   I dropped all corrections now until a real solution can
-		   be found. Note when you fix it here you need to do the same
-		   in arch/x86_64/kernel/vsyscall.c and export all needed
-		   variables in vmlinux.lds. -AK */ 
-
-		t = (jiffies - wall_jiffies) * (1000000L / HZ) +
-			do_gettimeoffset();
-		usec += t;
-
-	} while (read_seqretry(&xtime_lock, seq));
-
-	tv->tv_sec = sec + usec / 1000000;
-	tv->tv_usec = usec % 1000000;
-}
-
-EXPORT_SYMBOL(do_gettimeofday);
-
-/*
- * settimeofday() first undoes the correction that gettimeofday would do
- * on the time, and then saves it. This is ugly, but has been like this for
- * ages already.
- */
-
-int do_settimeofday(struct timespec *tv)
-{
-	time_t wtm_sec, sec = tv->tv_sec;
-	long wtm_nsec, nsec = tv->tv_nsec;
-
-	if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC)
-		return -EINVAL;
-
-	write_seqlock_irq(&xtime_lock);
-
-	nsec -= do_gettimeoffset() * 1000 +
-		(jiffies - wall_jiffies) * (NSEC_PER_SEC/HZ);
-
-	wtm_sec  = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec);
-	wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec);
-
-	set_normalized_timespec(&xtime, sec, nsec);
-	set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
-
-	ntp_clear();
-
-	write_sequnlock_irq(&xtime_lock);
-	clock_was_set();
-	return 0;
-}
-
-EXPORT_SYMBOL(do_settimeofday);
-
 unsigned long profile_pc(struct pt_regs *regs)
 {
 	unsigned long pc = instruction_pointer(regs);
@@ -283,90 +180,8 @@ static void set_rtc_mmss(unsigned long n
 	spin_unlock(&rtc_lock);
 }
 
-
-/* monotonic_clock(): returns # of nanoseconds passed since time_init()
- *		Note: This function is required to return accurate
- *		time even in the absence of multiple timer ticks.
- */
-unsigned long long monotonic_clock(void)
-{
-	unsigned long seq;
- 	u32 last_offset, this_offset, offset;
-	unsigned long long base;
-
-	if (vxtime.mode == VXTIME_HPET) {
-		do {
-			seq = read_seqbegin(&xtime_lock);
-
-			last_offset = vxtime.last;
-			base = monotonic_base;
-			this_offset = hpet_readl(HPET_COUNTER);
-
-		} while (read_seqretry(&xtime_lock, seq));
-		offset = (this_offset - last_offset);
-		offset *=(NSEC_PER_SEC/HZ)/hpet_tick;
-		return base + offset;
-	}else{
-		do {
-			seq = read_seqbegin(&xtime_lock);
-
-			last_offset = vxtime.last_tsc;
-			base = monotonic_base;
-		} while (read_seqretry(&xtime_lock, seq));
-		sync_core();
-		rdtscll(this_offset);
-		offset = (this_offset - last_offset)*1000/cpu_khz; 
-		return base + offset;
-	}
-
-
-}
-EXPORT_SYMBOL(monotonic_clock);
-
-static noinline void handle_lost_ticks(int lost, struct pt_regs *regs)
-{
-    static long lost_count;
-    static int warned;
-
-    if (report_lost_ticks) {
-	    printk(KERN_WARNING "time.c: Lost %d timer "
-		   "tick(s)! ", lost);
-	    print_symbol("rip %s)\n", regs->rip);
-    }
-
-    if (lost_count == 1000 && !warned) {
-	    printk(KERN_WARNING
-		   "warning: many lost ticks.\n"
-		   KERN_WARNING "Your time source seems to be instable or "
-		   		"some driver is hogging interupts\n");
-	    print_symbol("rip %s\n", regs->rip);
-	    if (vxtime.mode == VXTIME_TSC && vxtime.hpet_address) {
-		    printk(KERN_WARNING "Falling back to HPET\n");
-		    vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
-		    vxtime.mode = VXTIME_HPET;
-		    do_gettimeoffset = do_gettimeoffset_hpet;
-	    }
-	    /* else should fall back to PIT, but code missing. */
-	    warned = 1;
-    } else
-	    lost_count++;
-
-#ifdef CONFIG_CPU_FREQ
-    /* In some cases the CPU can change frequency without us noticing
-       (like going into thermal throttle)
-       Give cpufreq a change to catch up. */
-    if ((lost_count+1) % 25 == 0) {
-	    cpufreq_delayed_get();
-    }
-#endif
-}
-
 static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
-	static unsigned long rtc_update = 0;
-	unsigned long tsc;
-	int delay, offset = 0, lost = 0;
-
 /*
  * Here we are in the timer irq handler. We have irqs locally disabled (so we
  * don't need spin_lock_irqsave()) but we don't know if the timer_bh is running
@@ -376,67 +191,6 @@ static irqreturn_t timer_interrupt(int i
 
 	write_seqlock(&xtime_lock);
 
-	if (vxtime.hpet_address)
-		offset = hpet_readl(HPET_COUNTER);
-
-	if (hpet_use_timer) {
-		/* if we're using the hpet timer functionality,
-		 * we can more accurately know the counter value
-		 * when the timer interrupt occured.
-		 */
-		offset = hpet_readl(HPET_T0_CMP) - hpet_tick;
-		delay = hpet_readl(HPET_COUNTER) - offset;
-	} else {
-		spin_lock(&i8253_lock);
-		outb_p(0x00, 0x43);
-		delay = inb_p(0x40);
-		delay |= inb(0x40) << 8;
-		spin_unlock(&i8253_lock);
-		delay = LATCH - 1 - delay;
-	}
-
-	rdtscll_sync(&tsc);
-
-	if (vxtime.mode == VXTIME_HPET) {
-		if (offset - vxtime.last > hpet_tick) {
-			lost = (offset - vxtime.last) / hpet_tick - 1;
-		}
-
-		monotonic_base += 
-			(offset - vxtime.last)*(NSEC_PER_SEC/HZ) / hpet_tick;
-
-		vxtime.last = offset;
-#ifdef CONFIG_X86_PM_TIMER
-	} else if (vxtime.mode == VXTIME_PMTMR) {
-		lost = pmtimer_mark_offset();
-#endif
-	} else {
-		offset = (((tsc - vxtime.last_tsc) *
-			   vxtime.tsc_quot) >> 32) - (USEC_PER_SEC / HZ);
-
-		if (offset < 0)
-			offset = 0;
-
-		if (offset > (USEC_PER_SEC / HZ)) {
-			lost = offset / (USEC_PER_SEC / HZ);
-			offset %= (USEC_PER_SEC / HZ);
-		}
-
-		monotonic_base += (tsc - vxtime.last_tsc)*1000000/cpu_khz ;
-
-		vxtime.last_tsc = tsc - vxtime.quot * delay / vxtime.tsc_quot;
-
-		if ((((tsc - vxtime.last_tsc) *
-		      vxtime.tsc_quot) >> 32) < offset)
-			vxtime.last_tsc = tsc -
-				(((long) offset << 32) / vxtime.tsc_quot) - 1;
-	}
-
-	if (lost > 0) {
-		handle_lost_ticks(lost, regs);
-		jiffies += lost;
-	}
-
 /*
  * Do the timer stuff.
  */
@@ -459,20 +213,6 @@ static irqreturn_t timer_interrupt(int i
 		smp_local_timer_interrupt(regs);
 #endif
 
-/*
- * If we have an externally synchronized Linux clock, then update CMOS clock
- * accordingly every ~11 minutes. set_rtc_mmss() will be called in the jiffy
- * closest to exactly 500 ms before the next second. If the update fails, we
- * don't care, as it'll be updated on the next turn, and the problem (time way
- * off) isn't likely to go away much sooner anyway.
- */
-
-	if (ntp_synced() && xtime.tv_sec > rtc_update &&
-		abs(xtime.tv_nsec - 500000000) <= tick_nsec / 2) {
-		set_rtc_mmss(xtime.tv_sec);
-		rtc_update = xtime.tv_sec + 660;
-	}
- 
 	write_sequnlock(&xtime_lock);
 
 	return IRQ_HANDLED;
@@ -481,9 +221,9 @@ static irqreturn_t timer_interrupt(int i
 static unsigned int cyc2ns_scale;
 #define CYC2NS_SCALE_FACTOR 10 /* 2^10, carefully chosen */
 
-static inline void set_cyc2ns_scale(unsigned long cpu_mhz)
+static inline void set_cyc2ns_scale(unsigned long cpu_khz)
 {
-	cyc2ns_scale = (1000 << CYC2NS_SCALE_FACTOR)/cpu_mhz;
+	cyc2ns_scale = (1000000 << CYC2NS_SCALE_FACTOR)/cpu_khz;
 }
 
 static inline unsigned long long cycles_2_ns(unsigned long long cyc)
@@ -513,6 +253,32 @@ unsigned long long sched_clock(void)
 	return cycles_2_ns(a);
 }
 
+/* code to compensate for TSC C3 stalls: */
+static u64 tsc_c3_offset;
+static int tsc_unstable;
+
+static inline int check_tsc_unstable(void)
+{
+	return tsc_unstable;	/* set by mark_tsc_unstable() */
+}
+static inline void mark_tsc_unstable(void)
+{
+	tsc_unstable = 1;	/* read back via check_tsc_unstable() */
+}
+
+/* Credit nsecs worth of TSC cycles (at tsc_khz) to tsc_c3_offset. */
+void tsc_c3_compensate(unsigned long nsecs)
+{
+	/* tsc_khz counts cycles per millisecond, i.e. per 1000000 ns */
+	tsc_c3_offset += ((u64)nsecs * tsc_khz) / 1000000;
+}
+EXPORT_SYMBOL_GPL(tsc_c3_compensate);
+
+static inline u64 tsc_read_c3_time(void)
+{
+	return tsc_c3_offset;	/* cycles added up by tsc_c3_compensate() */
+}
+
 unsigned long get_cmos_time(void)
 {
 	unsigned int timeout, year, mon, day, hour, min, sec;
@@ -573,6 +339,30 @@ unsigned long get_cmos_time(void)
 	return mktime(year, mon, day, hour, min, sec);
 }
 
+/* arch specific timeofday hooks: */
+u64 read_persistent_clock(void)
+{
+	return (u64)get_cmos_time() * NSEC_PER_SEC;	/* CMOS seconds -> ns */
+}
+
+void sync_persistent_clock(struct timespec ts)
+{
+	static unsigned long rtc_update = 0;
+	/*
+	 * If we have an externally synchronized Linux clock, then update
+	 * CMOS clock accordingly every ~11 minutes. set_rtc_mmss() will
+	 * be called in the jiffy closest to exactly 500 ms before the
+	 * next second. If the update fails, we don't care, as it'll be
+	 * updated on the next turn, and the problem (time way off) isn't
+	 * likely to go away much sooner anyway.
+	 */
+	if (ts.tv_sec > rtc_update &&
+		abs(ts.tv_nsec - 500000000) <= tick_nsec / 2) {
+		set_rtc_mmss(ts.tv_sec);	/* the same time the test above used */
+		rtc_update = ts.tv_sec + 660;	/* next update in 660 s at the earliest */
+	}
+}
+
 #ifdef CONFIG_CPU_FREQ
 
 /* Frequency scaling support. Adjust the TSC based timer when the cpu frequency
@@ -600,23 +390,6 @@ static void handle_cpufreq_delayed_get(v
 	cpufreq_delayed_issched = 0;
 }
 
-/* if we notice lost ticks, schedule a call to cpufreq_get() as it tries
- * to verify the CPU frequency the timing core thinks the CPU is running
- * at is still correct.
- */
-static void cpufreq_delayed_get(void)
-{
-	static int warned;
-	if (cpufreq_init && !cpufreq_delayed_issched) {
-		cpufreq_delayed_issched = 1;
-		if (!warned) {
-			warned = 1;
-			printk(KERN_DEBUG "Losing some ticks... checking if CPU frequency changed.\n");
-		}
-		schedule_work(&cpufreq_delayed_get_work);
-	}
-}
-
 static unsigned int  ref_freq = 0;
 static unsigned long loops_per_jiffy_ref = 0;
 
@@ -651,11 +424,14 @@ static int time_cpufreq_notifier(struct 
 		cpufreq_scale(loops_per_jiffy_ref, ref_freq, freq->new);
 
 		cpu_khz = cpufreq_scale(cpu_khz_ref, ref_freq, freq->new);
-		if (!(freq->flags & CPUFREQ_CONST_LOOPS))
+		if (!(freq->flags & CPUFREQ_CONST_LOOPS)) {
 			vxtime.tsc_quot = (1000L << 32) / cpu_khz;
+			tsc_khz = cpu_khz;
+		}
+
 	}
 	
-	set_cyc2ns_scale(cpu_khz_ref / 1000);
+	set_cyc2ns_scale(cpu_khz_ref);
 
 	return 0;
 }
@@ -690,18 +466,18 @@ static unsigned int __init hpet_calibrat
 	int tsc_now, hpet_now;
 	unsigned long flags;
 
-	local_irq_save(flags);
-	local_irq_disable();
+	raw_local_irq_save(flags);
+	raw_local_irq_disable();
 
 	hpet_start = hpet_readl(HPET_COUNTER);
 	rdtscl(tsc_start);
 
 	do {
-		local_irq_disable();
+		raw_local_irq_disable();
 		hpet_now = hpet_readl(HPET_COUNTER);
 		sync_core();
 		rdtscl(tsc_now);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	} while ((tsc_now - tsc_start) < TICK_COUNT &&
 		 (hpet_now - hpet_start) < TICK_COUNT);
 
@@ -879,7 +655,7 @@ int __init time_setup(char *str)
 }
 
 static struct irqaction irq0 = {
-	timer_interrupt, SA_INTERRUPT, CPU_MASK_NONE, "timer", NULL, NULL
+	timer_interrupt, SA_INTERRUPT | SA_NODELAY, CPU_MASK_NONE, "timer", NULL, NULL
 };
 
 extern void __init config_acpi_tables(void);
@@ -916,18 +692,12 @@ void __init time_init(void)
 	if (hpet_use_timer) {
 		cpu_khz = hpet_calibrate_tsc();
 		timename = "HPET";
-#ifdef CONFIG_X86_PM_TIMER
-	} else if (pmtmr_ioport) {
-		vxtime_hz = PM_TIMER_FREQUENCY;
-		timename = "PM";
-		pit_init();
-		cpu_khz = pit_calibrate_tsc();
-#endif
 	} else {
 		pit_init();
 		cpu_khz = pit_calibrate_tsc();
 		timename = "PIT";
 	}
+	tsc_khz = cpu_khz;
 
 	printk(KERN_INFO "time.c: Using %ld.%06ld MHz %s timer.\n",
 	       vxtime_hz / 1000000, vxtime_hz % 1000000, timename);
@@ -939,7 +709,7 @@ void __init time_init(void)
 	rdtscll_sync(&vxtime.last_tsc);
 	setup_irq(0, &irq0);
 
-	set_cyc2ns_scale(cpu_khz / 1000);
+	set_cyc2ns_scale(cpu_khz);
 
 #ifndef CONFIG_SMP
 	time_init_gtod();
@@ -969,31 +739,8 @@ static __init int unsynchronized_tsc(voi
  */
 void __init time_init_gtod(void)
 {
-	char *timetype;
-
 	if (unsynchronized_tsc())
-		notsc = 1;
-	if (vxtime.hpet_address && notsc) {
-		timetype = hpet_use_timer ? "HPET" : "PIT/HPET";
-		vxtime.last = hpet_readl(HPET_T0_CMP) - hpet_tick;
-		vxtime.mode = VXTIME_HPET;
-		do_gettimeoffset = do_gettimeoffset_hpet;
-#ifdef CONFIG_X86_PM_TIMER
-	/* Using PM for gettimeofday is quite slow, but we have no other
-	   choice because the TSC is too unreliable on some systems. */
-	} else if (pmtmr_ioport && !vxtime.hpet_address && notsc) {
-		timetype = "PM";
-		do_gettimeoffset = do_gettimeoffset_pm;
-		vxtime.mode = VXTIME_PMTMR;
-		sysctl_vsyscall = 0;
-		printk(KERN_INFO "Disabling vsyscall due to use of PM timer\n");
-#endif
-	} else {
-		timetype = hpet_use_timer ? "HPET/TSC" : "PIT/TSC";
-		vxtime.mode = VXTIME_TSC;
-	}
-
-	printk(KERN_INFO "time.c: Using %s based timekeeping.\n", timetype);
+		mark_tsc_unstable();
 }
 
 __setup("report_lost_ticks", time_setup);
@@ -1016,7 +763,6 @@ static int timer_suspend(struct sys_devi
 
 static int timer_resume(struct sys_device *dev)
 {
-	unsigned long flags;
 	unsigned long sec;
 	unsigned long ctime = get_cmos_time();
 	unsigned long sleep_length = (ctime - sleep_start) * HZ;
@@ -1027,10 +773,6 @@ static int timer_resume(struct sys_devic
 		i8254_timer_resume();
 
 	sec = ctime + clock_cmos_diff;
-	write_seqlock_irqsave(&xtime_lock,flags);
-	xtime.tv_sec = sec;
-	xtime.tv_nsec = 0;
-	write_sequnlock_irqrestore(&xtime_lock,flags);
 	jiffies += sleep_length;
 	wall_jiffies += sleep_length;
 	touch_softlockup_watchdog();
@@ -1125,11 +867,11 @@ int hpet_rtc_timer_init(void)
 	else
 		hpet_rtc_int_freq = DEFAULT_RTC_INT_FREQ;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	cnt = hpet_readl(HPET_COUNTER);
 	cnt += ((hpet_tick*HZ)/hpet_rtc_int_freq);
 	hpet_writel(cnt, HPET_T1_CMP);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	cfg = hpet_readl(HPET_T1_CFG);
 	cfg |= HPET_TN_ENABLE | HPET_TN_SETVAL | HPET_TN_32BIT;
@@ -1305,3 +1047,214 @@ static int __init notsc_setup(char *s)
 __setup("notsc", notsc_setup);
 
 
+/* clock-source code: */
+
+static unsigned long current_tsc_khz = 0;
+
+static int tsc_update_callback(void);
+
+#ifdef CONFIG_PARANOID_GENERIC_TIME
+/* This will hurt performance! */
+static DEFINE_RAW_SPINLOCK(checktsc_lock);
+static cycle_t last_tsc;
+
+static cycle_t read_tsc(void)
+{
+	unsigned long flags;
+	cycle_t ret;
+
+	spin_lock_irqsave(&checktsc_lock, flags);
+
+	rdtscll(ret);
+
+	if (ret < last_tsc)
+		printk("read_tsc: ACK! TSC went backward! Unsynced TSCs?\n");
+	last_tsc = ret;
+
+	spin_unlock_irqrestore(&checktsc_lock, flags);
+
+	return ret;
+}
+
+static cycle_t __vsyscall_fn vread_tsc(void* unused)
+{
+	cycle_t ret;
+
+	rdtscll(ret);
+
+	return ret;
+}
+
+static cycle_t read_tsc_c3(void)
+{
+	unsigned long flags;
+	cycle_t ret;
+
+	spin_lock_irqsave(&checktsc_lock, flags);
+
+	rdtscll(ret);
+	ret += tsc_read_c3_time();
+
+	if (ret < last_tsc)
+		printk("read_tsc_c3: ACK! TSC went backward! Unsynced TSCs?\n");
+	last_tsc = ret;
+
+	spin_unlock_irqrestore(&checktsc_lock, flags);
+
+	return ret;
+}
+
+#else /* CONFIG_PARANOID_GENERIC_TIME */
+
+static cycle_t read_tsc(void)
+{
+	cycle_t ret;
+
+	rdtscll(ret);
+
+	return ret;
+}
+
+static cycle_t __vsyscall_fn vread_tsc(void* unused)
+{
+	cycle_t ret;
+
+	rdtscll(ret);
+
+	return ret;
+}
+
+static cycle_t read_tsc_c3(void)
+{
+	cycle_t ret;
+
+	rdtscll(ret);
+
+	return ret + tsc_read_c3_time();
+}
+
+#endif /* CONFIG_PARANOID_GENERIC_TIME */
+
+static struct clocksource clocksource_tsc = {
+	.name			= "tsc",
+	.rating			= 300,
+	.read			= read_tsc,
+	.vread			= vread_tsc,
+	.mask			= (cycle_t)-1,
+	.mult			= 0, /* to be set */
+	.shift			= 22,
+	.update_callback	= tsc_update_callback,
+	.is_continuous		= 1,
+};
+
+static int tsc_update_callback(void)
+{
+	int change = 0;
+
+	/* check to see if we should switch to the safe clocksource: */
+	if (tsc_read_c3_time() &&
+		strncmp(clocksource_tsc.name, "c3tsc", 5)) {
+		printk("Falling back to C3 safe TSC\n");
+		clocksource_tsc.read = read_tsc_c3;
+		clocksource_tsc.vread = 0;
+		clocksource_tsc.name = "c3tsc";
+		change = 1;
+	}
+
+	if (clocksource_tsc.rating != 50 && check_tsc_unstable()) {
+		clocksource_tsc.rating = 50;
+		reselect_clocksource();
+		change = 1;
+	}
+
+	/* only update if tsc_khz has changed: */
+	if (current_tsc_khz != tsc_khz){
+		current_tsc_khz = tsc_khz;
+		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+							clocksource_tsc.shift);
+		change = 1;
+	}
+	return change;
+}
+
+static int __init init_tsc_clocksource(void)
+{
+	if (!notsc && tsc_khz) {
+		current_tsc_khz = tsc_khz;
+		clocksource_tsc.mult = clocksource_khz2mult(current_tsc_khz,
+							clocksource_tsc.shift);
+		register_clocksource(&clocksource_tsc);
+	}
+	return 0;
+}
+
+module_init(init_tsc_clocksource);
+
+
+#define HPET_MASK	0xFFFFFFFF
+#define HPET_SHIFT	22
+
+/* FSEC = 10^-15 NSEC = 10^-9 */
+#define FSEC_PER_NSEC	1000000
+
+static void *hpet_ptr;
+
+static cycle_t read_hpet(void)
+{
+	return (cycle_t)readl(hpet_ptr);
+}
+
+static cycle_t __vsyscall_fn vread_hpet(void* ptr)
+{
+	return (cycle_t)readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0);
+}
+
+struct clocksource clocksource_hpet = {
+	.name		= "hpet",
+	.rating		= 250,
+	.read		= read_hpet,
+	.vread		= vread_hpet,
+	.mask		= (cycle_t)HPET_MASK,
+	.mult		= 0, /* set below */
+	.shift		= HPET_SHIFT,
+	.is_continuous	= 1,
+};
+
+static int __init init_hpet_clocksource(void)
+{
+	unsigned long hpet_period;
+	void __iomem *hpet_base;
+	u64 tmp;
+
+	if (!hpet_address)
+		return -ENODEV;
+
+	/* map the HPET registers; bail out if the mapping fails: */
+	hpet_base = ioremap_nocache(hpet_address, HPET_MMAP_SIZE);
+	if (!hpet_base)
+		return -ENOMEM;
+	hpet_ptr = hpet_base + HPET_COUNTER;
+
+	/* calculate the frequency: */
+	hpet_period = readl(hpet_base + HPET_PERIOD);
+
+	/*
+	 * hpet period is in femto seconds per cycle, so convert it to
+	 * ns/cyc units approximated by mult/2^shift:
+	 *
+	 *  fsec/cyc * 1nsec/1000000fsec = nsec/cyc = mult/2^shift
+	 *  fsec/cyc * 1ns/1000000fsec * 2^shift = mult
+	 *  fsec/cyc * 2^shift * 1nsec/1000000fsec = mult
+	 *  (fsec/cyc << shift)/1000000 = mult
+	 *  (hpet_period << shift)/FSEC_PER_NSEC = mult
+	 */
+	tmp = (u64)hpet_period << HPET_SHIFT;
+	do_div(tmp, FSEC_PER_NSEC);
+	clocksource_hpet.mult = (u32)tmp;
+
+	register_clocksource(&clocksource_hpet);
+
+	return 0;
+}
+
+module_init(init_hpet_clocksource);
Index: linux/arch/x86_64/kernel/traps.c
===================================================================
--- linux.orig/arch/x86_64/kernel/traps.c
+++ linux/arch/x86_64/kernel/traps.c
@@ -88,7 +88,7 @@ int register_die_notifier(struct notifie
 static inline void conditional_sti(struct pt_regs *regs)
 {
 	if (regs->eflags & X86_EFLAGS_IF)
-		local_irq_enable();
+		raw_local_irq_enable();
 }
 
 static int kstack_depth_to_print = 10;
@@ -154,7 +154,7 @@ static unsigned long *in_exception_stack
  * severe exception (double fault, nmi, stack fault, debug, mce) hardware stack
  */
 
-void show_trace(unsigned long *stack)
+void show_trace(struct task_struct *task, unsigned long *stack)
 {
 	unsigned long addr;
 	const unsigned cpu = safe_smp_processor_id();
@@ -219,6 +219,7 @@ void show_trace(unsigned long *stack)
 	HANDLE_STACK (((long) stack & (THREAD_SIZE-1)) != 0);
 #undef HANDLE_STACK
 	printk("\n");
+	print_traces(task);
 }
 
 void show_stack(struct task_struct *tsk, unsigned long * rsp)
@@ -255,7 +256,7 @@ void show_stack(struct task_struct *tsk,
 		printk("%016lx ", *stack++);
 		touch_nmi_watchdog();
 	}
-	show_trace((unsigned long *)rsp);
+	show_trace(tsk, (unsigned long *)rsp);
 }
 
 /*
@@ -264,7 +265,7 @@ void show_stack(struct task_struct *tsk,
 void dump_stack(void)
 {
 	unsigned long dummy;
-	show_trace(&dummy);
+	show_trace(current, &dummy);
 }
 
 EXPORT_SYMBOL(dump_stack);
@@ -337,7 +338,7 @@ void out_of_line_bug(void)
 } 
 #endif
 
-static DEFINE_SPINLOCK(die_lock);
+static DEFINE_RAW_SPINLOCK(die_lock);
 static int die_owner = -1;
 
 unsigned long oops_begin(void)
@@ -346,7 +347,7 @@ unsigned long oops_begin(void)
 	unsigned long flags;
 
 	/* racy, but better than risking deadlock. */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	if (!spin_trylock(&die_lock)) { 
 		if (cpu == die_owner) 
 			/* nested oops. should stop eventually */;
Index: linux/arch/x86_64/kernel/vmlinux.lds.S
===================================================================
--- linux.orig/arch/x86_64/kernel/vmlinux.lds.S
+++ linux/arch/x86_64/kernel/vmlinux.lds.S
@@ -99,6 +99,18 @@ SECTIONS
   .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
   jiffies = VVIRT(.jiffies);
 
+  .vsyscall_fn :  AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
+  .vsyscall_data :  AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) }
+
+  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { *(.vsyscall_gtod_data) }
+  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+
+  .vsyscall_gtod_lock : AT(VLOAD(.vsyscall_gtod_lock)) { *(.vsyscall_gtod_lock) }
+  vsyscall_gtod_lock = VVIRT(.vsyscall_gtod_lock);
+
+  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
+  .vsyscall_data : AT(VLOAD(.vsyscall_data)) { *(.vsyscall_data) }
+
   .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { *(.vsyscall_1) }
   .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { *(.vsyscall_2) }
   .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { *(.vsyscall_3) }
Index: linux/arch/x86_64/kernel/vsyscall.c
===================================================================
--- linux.orig/arch/x86_64/kernel/vsyscall.c
+++ linux/arch/x86_64/kernel/vsyscall.c
@@ -19,6 +19,8 @@
  *  want per guest time just set the kernel.vsyscall64 sysctl to 0.
  */
 
+#include <linux/timeofday.h>
+#include <linux/clocksource.h>
 #include <linux/time.h>
 #include <linux/init.h>
 #include <linux/kernel.h>
@@ -27,20 +29,33 @@
 #include <linux/jiffies.h>
 #include <linux/sysctl.h>
 
+
 #include <asm/vsyscall.h>
 #include <asm/pgtable.h>
+#include <asm/unistd.h>
 #include <asm/page.h>
 #include <asm/fixmap.h>
 #include <asm/errno.h>
 #include <asm/io.h>
 
-#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr)))
+#define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) notrace
 #define force_inline __attribute__((always_inline)) inline
 
 int __sysctl_vsyscall __section_sysctl_vsyscall = 1;
-seqlock_t __xtime_lock __section_xtime_lock = SEQLOCK_UNLOCKED;
+raw_seqlock_t __xtime_lock __section_xtime_lock = RAW_SEQLOCK_UNLOCKED;
 
-#include <asm/unistd.h>
+struct vsyscall_gtod_data_t {
+	struct timeval wall_time_tv;
+	struct timezone sys_tz;
+	cycle_t offset_base;
+	struct clocksource clock;
+};
+
+extern struct vsyscall_gtod_data_t vsyscall_gtod_data;
+struct vsyscall_gtod_data_t __vsyscall_gtod_data __section_vsyscall_gtod_data;
+
+extern raw_seqlock_t vsyscall_gtod_lock;
+raw_seqlock_t __vsyscall_gtod_lock __section_vsyscall_gtod_lock = RAW_SEQLOCK_UNLOCKED;
 
 static force_inline void timeval_normalize(struct timeval * tv)
 {
@@ -53,40 +68,66 @@ static force_inline void timeval_normali
 	}
 }
 
-static force_inline void do_vgettimeofday(struct timeval * tv)
+/*
+ * XXX - this is ugly. gettimeofday() has a label in it so we can't
+ *       call it twice.
+ */
+static force_inline int syscall_gtod(struct timeval *tv, struct timezone *tz)
+{
+	int ret;
+
+	asm volatile("syscall"
+		: "=a" (ret)
+		: "0" (__NR_gettimeofday),"D" (tv),"S" (tz)
+		: __syscall_clobber);
+
+	return ret;
+}
+
+static force_inline void do_vgettimeofday(struct timeval *tv)
 {
-	long sequence, t;
-	unsigned long sec, usec;
+	cycle_t now, cycle_delta;
+	nsec_t nsec_delta;
+	unsigned long seq;
 
 	do {
-		sequence = read_seqbegin(&__xtime_lock);
-		
-		sec = __xtime.tv_sec;
-		usec = (__xtime.tv_nsec / 1000) +
-			(__jiffies - __wall_jiffies) * (1000000 / HZ);
-
-		if (__vxtime.mode != VXTIME_HPET) {
-			sync_core();
-			rdtscll(t);
-			if (t < __vxtime.last_tsc)
-				t = __vxtime.last_tsc;
-			usec += ((t - __vxtime.last_tsc) *
-				 __vxtime.tsc_quot) >> 32;
-			/* See comment in x86_64 do_gettimeofday. */
-		} else {
-			usec += ((readl((void *)fix_to_virt(VSYSCALL_HPET) + 0xf0) -
-				  __vxtime.last) * __vxtime.quot) >> 32;
+		seq = read_seqbegin(&__vsyscall_gtod_lock);
+		/* clock has no vread method: use the real syscall instead */
+		if (!__vsyscall_gtod_data.clock.vread) {
+			syscall_gtod(tv, NULL);
+			return;
 		}
-	} while (read_seqretry(&__xtime_lock, sequence));
 
-	tv->tv_sec = sec + usec / 1000000;
-	tv->tv_usec = usec % 1000000;
+		/* read the clocksource and calc cycle_delta */
+		now = __vsyscall_gtod_data.clock.vread(
+				__vsyscall_gtod_data.clock.vdata);
+
+		cycle_delta = (now - __vsyscall_gtod_data.offset_base)
+					& __vsyscall_gtod_data.clock.mask;
+
+		/* convert cycles to nsecs */
+		nsec_delta = cycle_delta * __vsyscall_gtod_data.clock.mult;
+		nsec_delta = nsec_delta >> __vsyscall_gtod_data.clock.shift;
+
+		/* add nsec offset to wall_time_tv */
+		*tv = __vsyscall_gtod_data.wall_time_tv;
+		do_div(nsec_delta, NSEC_PER_USEC);
+		tv->tv_usec += (unsigned long) nsec_delta;
+		/* normalize: tv_usec == USEC_PER_SEC must carry as well */
+		while (tv->tv_usec >= USEC_PER_SEC) {
+			tv->tv_sec += 1;
+			tv->tv_usec -= USEC_PER_SEC;
+		}
+	} while (read_seqretry(&__vsyscall_gtod_lock, seq));
 }
 
-/* RED-PEN may want to readd seq locking, but then the variable should be write-once. */
+/*
+ * RED-PEN may want to re-add seq locking, but then the variable should be
+ * write-once.
+ */
 static force_inline void do_get_tz(struct timezone * tz)
 {
-	*tz = __sys_tz;
+	*tz = __vsyscall_gtod_data.sys_tz;
 }
 
 static force_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -122,11 +163,16 @@ int __vsyscall(0) vgettimeofday(struct t
  * unlikely */
 time_t __vsyscall(1) vtime(time_t *t)
 {
+	struct timeval tv;
+
 	if (unlikely(!__sysctl_vsyscall))
 		return time_syscall(t);
-	else if (t)
-		*t = __xtime.tv_sec;		
-	return __xtime.tv_sec;
+
+	vgettimeofday(&tv, 0);
+	if (t)
+		*t = tv.tv_sec;
+
+	return tv.tv_sec;
 }
 
 long __vsyscall(2) venosys_0(void)
@@ -139,6 +185,38 @@ long __vsyscall(3) venosys_1(void)
 	return -ENOSYS;
 }
 
+struct clocksource *curr_clock;
+
+void arch_update_vsyscall_gtod(struct timespec wall_time, cycle_t offset_base,
+				struct clocksource *clock, int ntp_adj)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&vsyscall_gtod_lock, flags);
+
+	/* XXX - hackitty hack hack. this is terrible! */
+	if (curr_clock != clock)
+		curr_clock = clock;
+
+	/* save off wall time as timeval: */
+	vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time.tv_sec;
+	vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time.tv_nsec/1000;
+
+	/* save offset_base: */
+	vsyscall_gtod_data.offset_base = offset_base;
+
+	/* copy current clocksource: */
+	vsyscall_gtod_data.clock = *clock;
+
+	/* apply ntp adjustment to clocksource mult: */
+	vsyscall_gtod_data.clock.mult += ntp_adj;
+
+	/* save off current timezone: */
+	vsyscall_gtod_data.sys_tz = sys_tz;
+
+	write_sequnlock_irqrestore(&vsyscall_gtod_lock, flags);
+}
+
 #ifdef CONFIG_SYSCTL
 
 #define SYSCALL 0x050f
@@ -217,6 +295,7 @@ static int __init vsyscall_init(void)
 	BUG_ON((unsigned long) &vtime != VSYSCALL_ADDR(__NR_vtime));
 	BUG_ON((VSYSCALL_ADDR(0) != __fix_to_virt(VSYSCALL_FIRST_PAGE)));
 	map_vsyscall();
+	sysctl_vsyscall = 1;
 #ifdef CONFIG_SYSCTL
 	register_sysctl_table(kernel_root_table2, 0);
 #endif
Index: linux/arch/x86_64/kernel/x8664_ksyms.c
===================================================================
--- linux.orig/arch/x86_64/kernel/x8664_ksyms.c
+++ linux/arch/x86_64/kernel/x8664_ksyms.c
@@ -14,6 +14,7 @@
 #include <linux/syscalls.h>
 #include <linux/tty.h>
 #include <linux/ioctl32.h>
+#include <linux/mc146818rtc.h>
 
 #include <asm/semaphore.h>
 #include <asm/processor.h>
@@ -33,8 +34,6 @@
 #include <asm/tlbflush.h>
 #include <asm/kdebug.h>
 
-extern spinlock_t rtc_lock;
-
 #ifdef CONFIG_SMP
 extern void __write_lock_failed(rwlock_t *rw);
 extern void __read_lock_failed(rwlock_t *rw);
@@ -62,10 +61,12 @@ EXPORT_SYMBOL(pm_idle);
 EXPORT_SYMBOL(pm_power_off);
 EXPORT_SYMBOL(get_cmos_time);
 
-EXPORT_SYMBOL(__down_failed);
-EXPORT_SYMBOL(__down_failed_interruptible);
-EXPORT_SYMBOL(__down_failed_trylock);
-EXPORT_SYMBOL(__up_wakeup);
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+EXPORT_SYMBOL(__compat_down_failed);
+EXPORT_SYMBOL(__compat_down_failed_interruptible);
+EXPORT_SYMBOL(__compat_down_failed_trylock);
+EXPORT_SYMBOL(__compat_up_wakeup);
+#endif
 /* Networking helper routines. */
 EXPORT_SYMBOL(csum_partial_copy_nocheck);
 EXPORT_SYMBOL(ip_compute_csum);
Index: linux/arch/x86_64/lib/thunk.S
===================================================================
--- linux.orig/arch/x86_64/lib/thunk.S
+++ linux/arch/x86_64/lib/thunk.S
@@ -43,11 +43,13 @@
 	thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
 #endif	
 	thunk do_softirq_thunk,do_softirq
-	
-	thunk __down_failed,__down
-	thunk_retrax __down_failed_interruptible,__down_interruptible
-	thunk_retrax __down_failed_trylock,__down_trylock
-	thunk __up_wakeup,__up
+
+#ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
+	thunk __compat_down_failed,__compat_down
+	thunk_retrax __compat_down_failed_interruptible,__compat_down_interruptible
+	thunk_retrax __compat_down_failed_trylock,__compat_down_trylock
+	thunk __compat_up_wakeup,__compat_up
+#endif
 	
 	/* SAVE_ARGS below is used only for the .cfi directives it contains. */
 	CFI_STARTPROC
Index: linux/arch/x86_64/mm/fault.c
===================================================================
--- linux.orig/arch/x86_64/mm/fault.c
+++ linux/arch/x86_64/mm/fault.c
@@ -39,6 +39,7 @@ void bust_spinlocks(int yes)
 {
 	int loglevel_save = console_loglevel;
 	if (yes) {
+		stop_trace();
 		oops_in_progress = 1;
 	} else {
 #ifdef CONFIG_VT
@@ -327,7 +328,7 @@ asmlinkage void __kprobes do_page_fault(
 		return;
 
 	if (likely(regs->eflags & X86_EFLAGS_IF))
-		local_irq_enable();
+		raw_local_irq_enable();
 
 	if (unlikely(page_fault_trace))
 		printk("pagefault rip:%lx rsp:%lx cs:%lu ss:%lu address %lx error %lx\n",
Index: linux/arch/x86_64/mm/init.c
===================================================================
--- linux.orig/arch/x86_64/mm/init.c
+++ linux/arch/x86_64/mm/init.c
@@ -47,7 +47,7 @@ extern int swiotlb;
 
 extern char _stext[];
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /*
  * NOTE: pagetable_init alloc all the fixmap pagetables contiguous on the
Index: linux/arch/xtensa/kernel/time.c
===================================================================
--- linux.orig/arch/xtensa/kernel/time.c
+++ linux/arch/xtensa/kernel/time.c
@@ -29,9 +29,6 @@
 
 extern volatile unsigned long wall_jiffies;
 
-u64 jiffies_64 = INITIAL_JIFFIES;
-EXPORT_SYMBOL(jiffies_64);
-
 spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED;
 EXPORT_SYMBOL(rtc_lock);
 
Index: linux/drivers/Makefile
===================================================================
--- linux.orig/drivers/Makefile
+++ linux/drivers/Makefile
@@ -67,3 +67,4 @@ obj-$(CONFIG_INFINIBAND)	+= infiniband/
 obj-$(CONFIG_SGI_IOC4)		+= sn/
 obj-y				+= firmware/
 obj-$(CONFIG_CRYPTO)		+= crypto/
+obj-$(CONFIG_GENERIC_TIME)	+= clocksource/
Index: linux/drivers/acpi/events/evgpe.c
===================================================================
--- linux.orig/drivers/acpi/events/evgpe.c
+++ linux/drivers/acpi/events/evgpe.c
@@ -377,7 +377,7 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_x
 	struct acpi_gpe_register_info *gpe_register_info;
 	u32 status_reg;
 	u32 enable_reg;
-	u32 flags;
+	unsigned long flags;
 	acpi_status status;
 	struct acpi_gpe_block_info *gpe_block;
 	acpi_native_uint i;
Index: linux/drivers/acpi/events/evgpeblk.c
===================================================================
--- linux.orig/drivers/acpi/events/evgpeblk.c
+++ linux/drivers/acpi/events/evgpeblk.c
@@ -136,7 +136,7 @@ acpi_status acpi_ev_walk_gpe_list(ACPI_G
 	struct acpi_gpe_block_info *gpe_block;
 	struct acpi_gpe_xrupt_info *gpe_xrupt_info;
 	acpi_status status = AE_OK;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("ev_walk_gpe_list");
 
@@ -479,7 +479,7 @@ static struct acpi_gpe_xrupt_info *acpi_
 	struct acpi_gpe_xrupt_info *next_gpe_xrupt;
 	struct acpi_gpe_xrupt_info *gpe_xrupt;
 	acpi_status status;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("ev_get_gpe_xrupt_block");
 
@@ -553,7 +553,7 @@ static acpi_status
 acpi_ev_delete_gpe_xrupt(struct acpi_gpe_xrupt_info *gpe_xrupt)
 {
 	acpi_status status;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("ev_delete_gpe_xrupt");
 
@@ -610,7 +610,7 @@ acpi_ev_install_gpe_block(struct acpi_gp
 	struct acpi_gpe_block_info *next_gpe_block;
 	struct acpi_gpe_xrupt_info *gpe_xrupt_block;
 	acpi_status status;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("ev_install_gpe_block");
 
@@ -663,7 +663,7 @@ acpi_ev_install_gpe_block(struct acpi_gp
 acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block)
 {
 	acpi_status status;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("ev_install_gpe_block");
 
Index: linux/drivers/acpi/events/evxface.c
===================================================================
--- linux.orig/drivers/acpi/events/evxface.c
+++ linux/drivers/acpi/events/evxface.c
@@ -562,7 +562,7 @@ acpi_install_gpe_handler(acpi_handle gpe
 	struct acpi_gpe_event_info *gpe_event_info;
 	struct acpi_handler_info *handler;
 	acpi_status status;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("acpi_install_gpe_handler");
 
@@ -653,7 +653,7 @@ acpi_remove_gpe_handler(acpi_handle gpe_
 	struct acpi_gpe_event_info *gpe_event_info;
 	struct acpi_handler_info *handler;
 	acpi_status status;
-	u32 flags;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("acpi_remove_gpe_handler");
 
Index: linux/drivers/acpi/osl.c
===================================================================
--- linux.orig/drivers/acpi/osl.c
+++ linux/drivers/acpi/osl.c
@@ -729,14 +729,14 @@ void acpi_os_delete_lock(acpi_handle han
 acpi_status
 acpi_os_create_semaphore(u32 max_units, u32 initial_units, acpi_handle * handle)
 {
-	struct semaphore *sem = NULL;
+	struct compat_semaphore *sem = NULL;
 
 	ACPI_FUNCTION_TRACE("os_create_semaphore");
 
-	sem = acpi_os_allocate(sizeof(struct semaphore));
+	sem = acpi_os_allocate(sizeof(struct compat_semaphore));
 	if (!sem)
 		return_ACPI_STATUS(AE_NO_MEMORY);
-	memset(sem, 0, sizeof(struct semaphore));
+	memset(sem, 0, sizeof(struct compat_semaphore));
 
 	sema_init(sem, initial_units);
 
@@ -759,7 +759,7 @@ EXPORT_SYMBOL(acpi_os_create_semaphore);
 
 acpi_status acpi_os_delete_semaphore(acpi_handle handle)
 {
-	struct semaphore *sem = (struct semaphore *)handle;
+	struct compat_semaphore *sem = (struct compat_semaphore *)handle;
 
 	ACPI_FUNCTION_TRACE("os_delete_semaphore");
 
@@ -788,7 +788,7 @@ EXPORT_SYMBOL(acpi_os_delete_semaphore);
 acpi_status acpi_os_wait_semaphore(acpi_handle handle, u32 units, u16 timeout)
 {
 	acpi_status status = AE_OK;
-	struct semaphore *sem = (struct semaphore *)handle;
+	struct compat_semaphore *sem = (struct compat_semaphore *)handle;
 	int ret = 0;
 
 	ACPI_FUNCTION_TRACE("os_wait_semaphore");
@@ -870,7 +870,7 @@ EXPORT_SYMBOL(acpi_os_wait_semaphore);
  */
 acpi_status acpi_os_signal_semaphore(acpi_handle handle, u32 units)
 {
-	struct semaphore *sem = (struct semaphore *)handle;
+	struct compat_semaphore *sem = (struct compat_semaphore *)handle;
 
 	ACPI_FUNCTION_TRACE("os_signal_semaphore");
 
Index: linux/drivers/acpi/processor_idle.c
===================================================================
--- linux.orig/drivers/acpi/processor_idle.c
+++ linux/drivers/acpi/processor_idle.c
@@ -37,6 +37,7 @@
 #include <linux/acpi.h>
 #include <linux/dmi.h>
 #include <linux/moduleparam.h>
+#include <linux/spinlock.h>
 
 #include <asm/io.h>
 #include <asm/uaccess.h>
@@ -167,6 +168,7 @@ acpi_processor_power_activate(struct acp
 }
 
 static atomic_t c3_cpu_count;
+extern void tsc_c3_compensate(unsigned long nsecs);
 
 static void acpi_processor_idle(void)
 {
@@ -184,14 +186,14 @@ static void acpi_processor_idle(void)
 	 * Interrupts must be disabled during bus mastering calculations and
 	 * for C2/C3 transitions.
 	 */
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/*
 	 * Check whether we truly need to go idle, or should
 	 * reschedule:
 	 */
 	if (unlikely(need_resched())) {
-		local_irq_enable();
+		raw_local_irq_enable();
 		return;
 	}
 
@@ -253,7 +255,7 @@ static void acpi_processor_idle(void)
 		 *      issues (e.g. floppy DMA transfer overrun/underrun).
 		 */
 		if (pr->power.bm_activity & cx->demotion.threshold.bm) {
-			local_irq_enable();
+			raw_local_irq_enable();
 			next_state = cx->demotion.state;
 			goto end;
 		}
@@ -277,7 +279,7 @@ static void acpi_processor_idle(void)
 		if (pm_idle_save)
 			pm_idle_save();
 		else
-			safe_halt();
+			raw_safe_halt();
 		/*
 		 * TBD: Can't get time duration while in C1, as resumes
 		 *      go to an ISR rather than here.  Need to instrument
@@ -296,7 +298,7 @@ static void acpi_processor_idle(void)
 		/* Get end time (ticks) */
 		t2 = inl(acpi_fadt.xpm_tmr_blk.address);
 		/* Re-enable interrupts */
-		local_irq_enable();
+		raw_local_irq_enable();
 		/* Compute time (ticks) that we were actually asleep */
 		sleep_ticks =
 		    ticks_elapsed(t1, t2) - cx->latency_ticks - C2_OVERHEAD;
@@ -334,15 +336,20 @@ static void acpi_processor_idle(void)
 					  ACPI_MTX_DO_NOT_LOCK);
 		}
 
+#ifdef CONFIG_GENERIC_TIME
+		/* compensate for TSC pause */
+		tsc_c3_compensate((u32)(((u64)((t2-t1)&0xFFFFFF)*286070)>>10));
+#endif
+
 		/* Re-enable interrupts */
-		local_irq_enable();
+		raw_local_irq_enable();
 		/* Compute time (ticks) that we were actually asleep */
 		sleep_ticks =
 		    ticks_elapsed(t1, t2) - cx->latency_ticks - C3_OVERHEAD;
 		break;
 
 	default:
-		local_irq_enable();
+		raw_local_irq_enable();
 		return;
 	}
 
@@ -421,7 +428,7 @@ static void acpi_processor_idle(void)
 	if (pm_idle_save)
 		pm_idle_save();
 	else
-		safe_halt();
+		raw_safe_halt();
 	return;
 }
 
@@ -986,6 +993,7 @@ int acpi_processor_power_init(struct acp
 	static int first_run = 0;
 	struct proc_dir_entry *entry = NULL;
 	unsigned int i;
+	unsigned long flags;
 
 	ACPI_FUNCTION_TRACE("acpi_processor_power_init");
 
@@ -1019,6 +1027,7 @@ int acpi_processor_power_init(struct acp
 	 * Note that we use previously set idle handler will be used on
 	 * platforms that only support C1.
 	 */
+	spin_lock_irqsave(&pm_idle_switch_lock, flags);
 	if ((pr->flags.power) && (!boot_option_idle_override)) {
 		printk(KERN_INFO PREFIX "CPU%d (power states:", pr->id);
 		for (i = 1; i <= pr->power.count; i++)
@@ -1030,8 +1039,13 @@ int acpi_processor_power_init(struct acp
 		if (pr->id == 0) {
 			pm_idle_save = pm_idle;
 			pm_idle = acpi_processor_idle;
+			/*
+			 * Don't allow switching of the pm_idle to poll.
+			 */
+			pm_idle_locked = 1;
 		}
 	}
+	spin_unlock_irqrestore(&pm_idle_switch_lock, flags);
 
 	/* 'power' [R] */
 	entry = create_proc_entry(ACPI_PROCESSOR_FILE_POWER,
@@ -1074,5 +1088,7 @@ int acpi_processor_power_exit(struct acp
 		cpu_idle_wait();
 	}
 
+	pm_idle_locked = 0;
+
 	return_VALUE(0);
 }
Index: linux/drivers/acpi/processor_throttling.c
===================================================================
--- linux.orig/drivers/acpi/processor_throttling.c
+++ linux/drivers/acpi/processor_throttling.c
@@ -69,7 +69,7 @@ static int acpi_processor_get_throttling
 
 	duty_mask <<= pr->throttling.duty_offset;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	value = inl(pr->throttling.address);
 
@@ -87,7 +87,7 @@ static int acpi_processor_get_throttling
 
 	pr->throttling.state = state;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
 			  "Throttling state is T%d (%d%% throttling applied)\n",
@@ -131,7 +131,7 @@ int acpi_processor_set_throttling(struct
 		duty_mask = ~duty_mask;
 	}
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/*
 	 * Disable throttling by writing a 0 to bit 4.  Note that we must
@@ -158,7 +158,7 @@ int acpi_processor_set_throttling(struct
 
 	pr->throttling.state = state;
 
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	ACPI_DEBUG_PRINT((ACPI_DB_INFO,
 			  "Throttling state set to T%d (%d%%)\n", state,
Index: linux/drivers/acpi/sleep/main.c
===================================================================
--- linux.orig/drivers/acpi/sleep/main.c
+++ linux/drivers/acpi/sleep/main.c
@@ -82,7 +82,7 @@ static int acpi_pm_enter(suspend_state_t
 			return error;
 	}
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	acpi_enable_wakeup_device(acpi_state);
 	switch (pm_state) {
 	case PM_SUSPEND_STANDBY:
@@ -105,7 +105,7 @@ static int acpi_pm_enter(suspend_state_t
 	default:
 		return -EINVAL;
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	printk(KERN_DEBUG "Back to C!\n");
 
 	/* restore processor state
Index: linux/drivers/acpi/sleep/poweroff.c
===================================================================
--- linux.orig/drivers/acpi/sleep/poweroff.c
+++ linux/drivers/acpi/sleep/poweroff.c
@@ -46,7 +46,7 @@ void acpi_power_off(void)
 {
 	/* acpi_sleep_prepare(ACPI_STATE_S5) should have already been called */
 	printk("%s called\n", __FUNCTION__);
-	local_irq_disable();
+	raw_local_irq_disable();
 	/* Some SMP machines only can poweroff in boot CPU */
 	acpi_enter_sleep_state(ACPI_STATE_S5);
 }
Index: linux/drivers/atm/atmtcp.c
===================================================================
--- linux.orig/drivers/atm/atmtcp.c
+++ linux/drivers/atm/atmtcp.c
@@ -368,7 +368,7 @@ static struct atm_dev atmtcp_control_dev
 	.ops		= &atmtcp_c_dev_ops,
 	.type		= "atmtcp",
 	.number		= 999,
-	.lock		= SPIN_LOCK_UNLOCKED
+	.lock		= SPIN_LOCK_UNLOCKED(atmtcp_control_dev.lock)
 };
 
 
Index: linux/drivers/base/class.c
===================================================================
--- linux.orig/drivers/base/class.c
+++ linux/drivers/base/class.c
@@ -520,8 +520,10 @@ int class_device_add(struct class_device
 		class_name = make_class_name(class_dev);
 		sysfs_create_link(&class_dev->kobj,
 				  &class_dev->dev->kobj, "device");
+		/*
 		sysfs_create_link(&class_dev->dev->kobj, &class_dev->kobj,
 				  class_name);
+		*/
 	}
 
 	/* notify any interfaces this device is now here */
@@ -618,7 +620,9 @@ void class_device_del(struct class_devic
 	if (class_dev->dev) {
 		class_name = make_class_name(class_dev);
 		sysfs_remove_link(&class_dev->kobj, "device");
+		/*
 		sysfs_remove_link(&class_dev->dev->kobj, class_name);
+		*/
 	}
 	if (class_dev->devt_attr)
 		class_device_remove_file(class_dev, class_dev->devt_attr);
Index: linux/drivers/block/cfq-iosched.c
===================================================================
--- linux.orig/drivers/block/cfq-iosched.c
+++ linux/drivers/block/cfq-iosched.c
@@ -1382,10 +1382,9 @@ static void cfq_exit_single_io_context(s
 {
 	struct cfq_data *cfqd = cic->cfqq->cfqd;
 	request_queue_t *q = cfqd->queue;
+	unsigned long flags;
 
-	WARN_ON(!irqs_disabled());
-
-	spin_lock(q->queue_lock);
+	spin_lock_irqsave(q->queue_lock, flags);
 
 	if (unlikely(cic->cfqq == cfqd->active_queue)) {
 		__cfq_slice_expired(cfqd, cic->cfqq, 0);
@@ -1394,7 +1393,7 @@ static void cfq_exit_single_io_context(s
 
 	cfq_put_queue(cic->cfqq);
 	cic->cfqq = NULL;
-	spin_unlock(q->queue_lock);
+	spin_unlock_irqrestore(q->queue_lock, flags);
 }
 
 /*
@@ -1405,9 +1404,6 @@ static void cfq_exit_io_context(struct c
 {
 	struct cfq_io_context *__cic;
 	struct list_head *entry;
-	unsigned long flags;
-
-	local_irq_save(flags);
 
 	/*
 	 * put the reference this task is holding to the various queues
@@ -1418,7 +1414,6 @@ static void cfq_exit_io_context(struct c
 	}
 
 	cfq_exit_single_io_context(cic);
-	local_irq_restore(flags);
 }
 
 static struct cfq_io_context *
Index: linux/drivers/block/ll_rw_blk.c
===================================================================
--- linux.orig/drivers/block/ll_rw_blk.c
+++ linux/drivers/block/ll_rw_blk.c
@@ -29,6 +29,7 @@
 #include <linux/swap.h>
 #include <linux/writeback.h>
 #include <linux/blkdev.h>
+#include <linux/interrupt.h>
 
 /*
  * for max sense size
@@ -1413,7 +1414,7 @@ static int ll_merge_requests_fn(request_
  */
 void blk_plug_device(request_queue_t *q)
 {
-	WARN_ON(!irqs_disabled());
+	WARN_ON_NONRT(!irqs_disabled());
 
 	/*
 	 * don't plug a stopped queue, it must be paired with blk_start_queue()
@@ -1434,7 +1435,7 @@ EXPORT_SYMBOL(blk_plug_device);
  */
 int blk_remove_plug(request_queue_t *q)
 {
-	WARN_ON(!irqs_disabled());
+	WARN_ON_NONRT(!irqs_disabled());
 
 	if (!test_and_clear_bit(QUEUE_FLAG_PLUGGED, &q->queue_flags))
 		return 0;
@@ -3369,13 +3370,17 @@ void exit_io_context(void)
 	unsigned long flags;
 	struct io_context *ioc;
 
-	local_irq_save(flags);
 	task_lock(current);
+	/*
+	 * CHECKME: what does this protect against - can interrupt
+	 *          contexts access current->io_context?
+	 */
+	local_irq_save_nort(flags);
 	ioc = current->io_context;
 	current->io_context = NULL;
 	ioc->task = NULL;
+	local_irq_restore_nort(flags);
 	task_unlock(current);
-	local_irq_restore(flags);
 
 	if (ioc->aic && ioc->aic->exit)
 		ioc->aic->exit(ioc->aic);
Index: linux/drivers/block/loop.c
===================================================================
--- linux.orig/drivers/block/loop.c
+++ linux/drivers/block/loop.c
@@ -514,12 +514,12 @@ static int loop_make_request(request_que
 	lo->lo_pending++;
 	loop_add_bio(lo, old_bio);
 	spin_unlock_irq(&lo->lo_lock);
-	up(&lo->lo_bh_mutex);
+	complete(&lo->lo_bh_done);
 	return 0;
 
 out:
 	if (lo->lo_pending == 0)
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 	bio_io_error(old_bio, old_bio->bi_size);
 	return 0;
@@ -580,23 +580,20 @@ static int loop_thread(void *data)
 	lo->lo_pending = 1;
 
 	/*
-	 * up sem, we are running
+	 * complete it, we are running
 	 */
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 
 	for (;;) {
 		int pending;
 
-		/*
-		 * interruptible just to not contribute to load avg
-		 */
-		if (down_interruptible(&lo->lo_bh_mutex))
+		if (wait_for_completion_interruptible(&lo->lo_bh_done))
 			continue;
 
 		spin_lock_irq(&lo->lo_lock);
 
 		/*
-		 * could be upped because of tear-down, not pending work
+		 * could be completed because of tear-down, not pending work
 		 */
 		if (unlikely(!lo->lo_pending)) {
 			spin_unlock_irq(&lo->lo_lock);
@@ -619,7 +616,7 @@ static int loop_thread(void *data)
 			break;
 	}
 
-	up(&lo->lo_sem);
+	complete(&lo->lo_done);
 	return 0;
 }
 
@@ -830,7 +827,7 @@ static int loop_set_fd(struct loop_devic
 	set_blocksize(bdev, lo_blocksize);
 
 	kernel_thread(loop_thread, lo, CLONE_KERNEL);
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 	return 0;
 
  out_putf:
@@ -896,10 +893,10 @@ static int loop_clr_fd(struct loop_devic
 	lo->lo_state = Lo_rundown;
 	lo->lo_pending--;
 	if (!lo->lo_pending)
-		up(&lo->lo_bh_mutex);
+		complete(&lo->lo_bh_done);
 	spin_unlock_irq(&lo->lo_lock);
 
-	down(&lo->lo_sem);
+	wait_for_completion(&lo->lo_done);
 
 	lo->lo_backing_file = NULL;
 
@@ -1276,8 +1273,8 @@ static int __init loop_init(void)
 		if (!lo->lo_queue)
 			goto out_mem4;
 		init_MUTEX(&lo->lo_ctl_mutex);
-		init_MUTEX_LOCKED(&lo->lo_sem);
-		init_MUTEX_LOCKED(&lo->lo_bh_mutex);
+		init_completion(&lo->lo_done);
+		init_completion(&lo->lo_bh_done);
 		lo->lo_number = i;
 		spin_lock_init(&lo->lo_lock);
 		disk->major = LOOP_MAJOR;
Index: linux/drivers/block/paride/pseudo.h
===================================================================
--- linux.orig/drivers/block/paride/pseudo.h
+++ linux/drivers/block/paride/pseudo.h
@@ -43,7 +43,7 @@ static unsigned long ps_timeout;
 static int ps_tq_active = 0;
 static int ps_nice = 0;
 
-static DEFINE_SPINLOCK(ps_spinlock __attribute__((unused)));
+static __attribute__((unused)) DEFINE_SPINLOCK(ps_spinlock);
 
 static DECLARE_WORK(ps_tq, ps_tq_int, NULL);
 
Index: linux/drivers/block/sx8.c
===================================================================
--- linux.orig/drivers/block/sx8.c
+++ linux/drivers/block/sx8.c
@@ -27,6 +27,7 @@
 #include <linux/time.h>
 #include <linux/hdreg.h>
 #include <linux/dma-mapping.h>
+#include <linux/completion.h>
 #include <asm/io.h>
 #include <asm/semaphore.h>
 #include <asm/uaccess.h>
@@ -280,7 +281,7 @@ struct carm_host {
 
 	struct work_struct		fsm_task;
 
-	struct semaphore		probe_sem;
+	struct completion		probe_comp;
 };
 
 struct carm_response {
@@ -1342,7 +1343,7 @@ static void carm_fsm_task (void *_data)
 	}
 
 	case HST_PROBE_FINISHED:
-		up(&host->probe_sem);
+		complete(&host->probe_comp);
 		break;
 
 	case HST_ERROR:
@@ -1618,7 +1619,7 @@ static int carm_init_one (struct pci_dev
 	host->flags = pci_dac ? FL_DAC : 0;
 	spin_lock_init(&host->lock);
 	INIT_WORK(&host->fsm_task, carm_fsm_task, host);
-	init_MUTEX_LOCKED(&host->probe_sem);
+	init_completion(&host->probe_comp);
 
 	for (i = 0; i < ARRAY_SIZE(host->req); i++)
 		host->req[i].tag = i;
@@ -1687,8 +1688,8 @@ static int carm_init_one (struct pci_dev
 	if (rc)
 		goto err_out_free_irq;
 
-	DPRINTK("waiting for probe_sem\n");
-	down(&host->probe_sem);
+	DPRINTK("waiting for probe_comp\n");
+	wait_for_completion(&host->probe_comp);
 
 	printk(KERN_INFO "%s: pci %s, ports %d, io %lx, irq %u, major %d\n",
 	       host->name, pci_name(pdev), (int) CARM_MAX_PORTS,
Index: linux/drivers/char/Kconfig
===================================================================
--- linux.orig/drivers/char/Kconfig
+++ linux/drivers/char/Kconfig
@@ -711,6 +711,45 @@ config RTC
 	  To compile this driver as a module, choose M here: the
 	  module will be called rtc.
 
+config RTC_HISTOGRAM
+	bool "Real Time Clock Histogram Support"
+	default n
+	depends on RTC
+	---help---
+	  If you say Y here then the kernel will track the delivery and
+	  wakeup latency of /dev/rtc using tasks and will report a
+	  histogram to the kernel log when the application closes /dev/rtc.
+
+config BLOCKER
+	tristate "Priority Inheritance Debugging (Blocker) Device Support"
+	default y
+	---help---
+	  If you say Y here then a device will be created that the userspace
+	  pi_test suite uses to test and measure kernel locking primitives.
+
+config LPPTEST
+	tristate "Parallel Port Based Latency Measurement Device"
+	depends on !PARPORT && X86
+	default y
+	---help---
+	  If you say Y here then a device will be created that the userspace
+	  testlpp utility uses to measure IRQ latencies of a target system
+	  from an independent measurement system.
+
+	  NOTE: this code assumes x86 PCs and that the parallel port is
+	  bidirectional and is on IRQ 7.
+
+	  To use the device, both the target and the source system need to
+	  run a kernel with CONFIG_LPPTEST enabled. To measure latencies,
+	  use the scripts/testlpp utility in your kernel source directory,
+	  and run it (as root) on the source system - it will start printing
+	  out the latencies it took to get a response from the target system:
+
+	    Latency of response: 12.2 usecs (121265 cycles)
+
+	  Then generate various workloads on the target system to see how
+	  (worst-case-) latencies are impacted.
+
 config SGI_DS1286
 	tristate "SGI DS1286 RTC support"
 	depends on SGI_IP22
Index: linux/drivers/char/Makefile
===================================================================
--- linux.orig/drivers/char/Makefile
+++ linux/drivers/char/Makefile
@@ -57,6 +57,8 @@ obj-$(CONFIG_R3964) += n_r3964.o
 obj-$(CONFIG_APPLICOM) += applicom.o
 obj-$(CONFIG_SONYPI) += sonypi.o
 obj-$(CONFIG_RTC) += rtc.o
+obj-$(CONFIG_BLOCKER) += blocker.o
+obj-$(CONFIG_LPPTEST) += lpptest.o
 obj-$(CONFIG_HPET) += hpet.o
 obj-$(CONFIG_GEN_RTC) += genrtc.o
 obj-$(CONFIG_EFI_RTC) += efirtc.o
Index: linux/drivers/char/blocker.c
===================================================================
--- /dev/null
+++ linux/drivers/char/blocker.c
@@ -0,0 +1,143 @@
+/*
+ * priority inheritance testing device
+ *
+ * Registers the "blocker" misc device.  Its ioctls make the calling task
+ * take a configurable number of nested spinlocks and busy-loop while
+ * holding them.
+ */
+
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/spinlock.h>
+#include <asm/rtc.h>
+
+#define BLOCKER_MINOR		221
+
+/* ioctl commands: BLOCK_IOCTL locks and spins, BLOCK_SET_DEPTH sets nesting */
+#define BLOCK_IOCTL		4245
+#define BLOCK_SET_DEPTH		4246
+
+#define MAX_LOCK_DEPTH		10
+
+/* Busy-wait by calling get_cycles() 'loops' times. */
+void loop(int loops)
+{
+	int i;
+
+	for (i = 0; i < loops; i++)
+		get_cycles();
+}
+
+static spinlock_t blocker_lock[MAX_LOCK_DEPTH];
+
+/* Nesting depth, set via BLOCK_SET_DEPTH; may change while ioctls run. */
+static unsigned int lock_depth = 1;
+
+/*
+ * Take 'max' of the blocker locks, spin for 'args' iterations, then drop
+ * the locks again.  RT tasks use the full configured depth; other tasks
+ * derive a depth in 1..lock_depth from their pid.
+ */
+void do_the_lock_and_loop(unsigned int args)
+{
+	/*
+	 * Read lock_depth exactly once: BLOCK_SET_DEPTH can change it
+	 * concurrently (even to 0), and re-reading it between the "> 1"
+	 * check and the modulo below could divide by zero.
+	 */
+	unsigned int depth = *(volatile unsigned int *)&lock_depth;
+	int i, max;
+
+	if (rt_task(current))
+		max = depth;
+	else if (depth > 1)
+		max = (current->pid % depth) + 1;
+	else
+		max = 1;
+
+	/* Always lock from the top down */
+	for (i = max-1; i >= 0; i--)
+		spin_lock(&blocker_lock[i]);
+	loop(args);
+	for (i = 0; i < max; i++)
+		spin_unlock(&blocker_lock[i]);
+}
+
+/* open() handler: only logs the call. */
+static int blocker_open(struct inode *in, struct file *file)
+{
+	printk(KERN_INFO "blocker_open called\n");
+
+	return 0;
+}
+
+/*
+ * ioctl handler.  BLOCK_IOCTL runs the lock-and-spin test with 'args'
+ * iterations; BLOCK_SET_DEPTH sets the nesting depth (0..MAX_LOCK_DEPTH).
+ * Returns 0 on success, -EINVAL for a bad depth or an unknown command.
+ */
+static long blocker_ioctl(struct file *file,
+			  unsigned int cmd, unsigned long args)
+{
+	switch(cmd) {
+	case BLOCK_IOCTL:
+		do_the_lock_and_loop(args);
+		return 0;
+	case BLOCK_SET_DEPTH:
+		/* a depth of MAX_LOCK_DEPTH uses the whole lock array */
+		if (args > MAX_LOCK_DEPTH)
+			return -EINVAL;
+		lock_depth = args;
+		return 0;
+	default:
+		return -EINVAL;
+	}
+}
+
+static struct file_operations blocker_fops = {
+	.owner		= THIS_MODULE,
+	.llseek		= no_llseek,
+	.unlocked_ioctl = blocker_ioctl,
+	.open		= blocker_open,
+};
+
+static struct miscdevice blocker_dev =
+{
+	BLOCKER_MINOR,
+	"blocker",
+	&blocker_fops
+};
+
+/* Module init: set up the locks, then register the misc device. */
+static int __init blocker_init(void)
+{
+	int i;
+
+	/*
+	 * Initialize the locks before the device becomes visible: once
+	 * misc_register() succeeds, an ioctl may already be taking them.
+	 */
+	for (i = 0; i < MAX_LOCK_DEPTH; i++)
+		spin_lock_init(blocker_lock + i);
+
+	if (misc_register(&blocker_dev))
+		return -ENODEV;
+
+	return 0;
+}
+
+/* Module exit: unregister the misc device. */
+void __exit blocker_exit(void)
+{
+	printk(KERN_INFO "blocker device uninstalled\n");
+	misc_deregister(&blocker_dev);
+}
+
+module_init(blocker_init);
+module_exit(blocker_exit);
+
+MODULE_LICENSE("GPL");
+
Index: linux/drivers/char/epca.c
===================================================================
--- linux.orig/drivers/char/epca.c
+++ linux/drivers/char/epca.c
@@ -80,7 +80,7 @@ static int invalid_lilo_config;
 /* The ISA boards do window flipping into the same spaces so its only sane
    with a single lock. It's still pretty efficient */
 
-static spinlock_t epca_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(epca_lock);
 
 /* -----------------------------------------------------------------------
 	MAXBOARDS is typically 12, but ISA and EISA cards are restricted to 
Index: linux/drivers/char/hangcheck-timer.c
===================================================================
--- linux.orig/drivers/char/hangcheck-timer.c
+++ linux/drivers/char/hangcheck-timer.c
@@ -49,6 +49,7 @@
 #include <linux/delay.h>
 #include <asm/uaccess.h>
 #include <linux/sysrq.h>
+#include <linux/timeofday.h>
 
 
 #define VERSION_STR "0.9.0"
@@ -130,8 +131,12 @@ __setup("hcheck_dump_tasks", hangcheck_p
 #endif
 
 #ifdef HAVE_MONOTONIC
+#ifndef CONFIG_GENERIC_TIME
 extern unsigned long long monotonic_clock(void);
 #else
+#define monotonic_clock() ktime_to_ns(get_monotonic_clock())
+#endif
+#else
 static inline unsigned long long monotonic_clock(void)
 {
 # ifdef __s390__
Index: linux/drivers/char/ipmi/ipmi_si_intf.c
===================================================================
--- linux.orig/drivers/char/ipmi/ipmi_si_intf.c
+++ linux/drivers/char/ipmi/ipmi_si_intf.c
@@ -52,7 +52,7 @@
 #include <linux/pci.h>
 #include <linux/ioport.h>
 #include <asm/irq.h>
-#ifdef CONFIG_HIGH_RES_TIMERS
+#ifdef CONFIG_HIGH_RES_TIMERS_OLD
 #include <linux/hrtime.h>
 # if defined(schedule_next_int)
 /* Old high-res timer code, do translations. */
@@ -785,7 +785,7 @@ static int initialized = 0;
 /* Must be called with interrupts off and with the si_lock held. */
 static void si_restart_short_timer(struct smi_info *smi_info)
 {
-#if defined(CONFIG_HIGH_RES_TIMERS)
+#if defined(CONFIG_HIGH_RES_TIMERS_OLD)
 	unsigned long flags;
 	unsigned long jiffies_now;
 	unsigned long seq;
@@ -855,13 +855,13 @@ static void smi_timeout(unsigned long da
 	/* If the state machine asks for a short delay, then shorten
            the timer timeout. */
 	if (smi_result == SI_SM_CALL_WITH_DELAY) {
-#if defined(CONFIG_HIGH_RES_TIMERS)
+#if defined(CONFIG_HIGH_RES_TIMERS_OLD)
 		unsigned long seq;
 #endif
 		spin_lock_irqsave(&smi_info->count_lock, flags);
 		smi_info->short_timeouts++;
 		spin_unlock_irqrestore(&smi_info->count_lock, flags);
-#if defined(CONFIG_HIGH_RES_TIMERS)
+#if defined(CONFIG_HIGH_RES_TIMERS_OLD)
 		do {
 			seq = read_seqbegin_irqsave(&xtime_lock, flags);
 			smi_info->si_timer.expires = jiffies;
@@ -877,7 +877,7 @@ static void smi_timeout(unsigned long da
 		smi_info->long_timeouts++;
 		spin_unlock_irqrestore(&smi_info->count_lock, flags);
 		smi_info->si_timer.expires = jiffies + SI_TIMEOUT_JIFFIES;
-#if defined(CONFIG_HIGH_RES_TIMERS)
+#if defined(CONFIG_HIGH_RES_TIMERS_OLD)
 		smi_info->si_timer.arch_cycle_expires = 0;
 #endif
 	}
Index: linux/drivers/char/ipmi/ipmi_watchdog.c
===================================================================
--- linux.orig/drivers/char/ipmi/ipmi_watchdog.c
+++ linux/drivers/char/ipmi/ipmi_watchdog.c
@@ -366,7 +366,8 @@ static void panic_halt_ipmi_set_timeout(
    when both messages are free. */
 static atomic_t heartbeat_tofree = ATOMIC_INIT(0);
 static DECLARE_MUTEX(heartbeat_lock);
-static DECLARE_MUTEX_LOCKED(heartbeat_wait_lock);
+/* PREEMPT_RT: should be a completion instead */
+static COMPAT_DECLARE_MUTEX_LOCKED(heartbeat_wait_lock);
 static void heartbeat_free_smi(struct ipmi_smi_msg *msg)
 {
     if (atomic_dec_and_test(&heartbeat_tofree))
Index: linux/drivers/char/lpptest.c
===================================================================
--- /dev/null
+++ linux/drivers/char/lpptest.c
@@ -0,0 +1,177 @@
+/*
+ * /dev/lpptest device: test IRQ handling latencies over parallel port
+ *
+ *      Copyright (C) 2005 Thomas Gleixner, Ingo Molnar
+ *
+ * licensed under the GPL
+ *
+ * You need to have CONFIG_PARPORT disabled for this device, it is a
+ * completely self-contained device that assumes sole ownership of the
+ * parallel port.
+ */
+#include <linux/config.h>
+#include <linux/sched.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/fs.h>
+#include <linux/delay.h>
+#include <asm/uaccess.h>
+#include <asm/io.h>
+#include <asm/rtc.h>
+
+#define LPPTEST_CHAR_MAJOR 245
+#define LPPTEST_DEVICE_NAME "lpptest"
+
+#define LPPTEST_IRQ 7
+
+#define LPPTEST_TEST    _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long)
+#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long)
+#define LPPTEST_ENABLE  _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long)
+
+static char dev_id[] = "lpptest";
+
+/* Writes to port 0x37a: initialize the port, switch its IRQ on or off */
+#define INIT_PORT()	outb(0x04, 0x37a)
+#define ENABLE_IRQ()	outb(0x10, 0x37a)
+#define DISABLE_IRQ()	outb(0, 0x37a)
+
+/* Byte written to port 0x378 by the handler; inverted on every interrupt. */
+static unsigned char out = 0x5a;
+
+/*
+ * Interrupt handler. Flip all bits of the reply byte and write it out.
+ */
+static irqreturn_t lpptest_irq (int irq, void *dev_id, struct pt_regs *regs)
+{
+	out ^= 0xff;
+	outb(out, 0x378);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Measure one response: with interrupts off, write 0x08 to port 0x378 and
+ * poll port 0x379 until its value changes.  Returns the elapsed cycles,
+ * or 0 if nothing changed within the polling limit.
+ */
+static cycles_t test_response(void)
+{
+	cycles_t now, end;
+	unsigned char in;
+	int timeout = 0;
+
+	raw_local_irq_disable();
+	in = inb(0x379);
+	inb(0x378);
+	outb(0x08, 0x378);
+	now = get_cycles();
+	while(1) {
+		if (inb(0x379) != in)
+			break;
+		if (timeout++ > 1000000) {
+			outb(0x00, 0x378);
+			raw_local_irq_enable();
+
+			return 0;
+		}
+	}
+	end = get_cycles();
+	outb(0x00, 0x378);
+	raw_local_irq_enable();
+
+	return end - now;
+}
+
+static int lpptest_open(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+static int lpptest_close(struct inode *inode, struct file *file)
+{
+	return 0;
+}
+
+/*
+ * ioctl handler: LPPTEST_DISABLE/LPPTEST_ENABLE switch the port IRQ off/on,
+ * LPPTEST_TEST runs test_response() and copies the cycle count to the user
+ * pointer in ioctl_param.  Returns 0, -EFAULT if the copy fails, or -EINVAL
+ * for an unknown command.
+ */
+int lpptest_ioctl(struct inode *inode, struct file *file, unsigned int ioctl_num, unsigned long ioctl_param)
+{
+	int retval = 0;
+
+	switch (ioctl_num) {
+
+	case LPPTEST_DISABLE:
+		DISABLE_IRQ();
+		break;
+
+	case LPPTEST_ENABLE:
+		ENABLE_IRQ();
+		break;
+
+	case LPPTEST_TEST: {
+
+		cycles_t diff = test_response();
+		if (copy_to_user((void __user *)ioctl_param, &diff, sizeof(diff)))
+			retval = -EFAULT;
+		break;
+	}
+	default: retval = -EINVAL;
+	}
+
+	return retval;
+}
+
+static struct file_operations lpptest_dev_fops = {
+	/* pin the module while the device is open */
+	.owner = THIS_MODULE,
+	.ioctl = lpptest_ioctl,
+	.open = lpptest_open,
+	.release = lpptest_close,
+};
+
+/* Module init: register the char device, claim the IRQ, enable the port. */
+static int __init lpptest_init (void)
+{
+	if (register_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME, &lpptest_dev_fops))
+	{
+		printk(KERN_NOTICE "Can't allocate major number %d for lpptest.\n",
+		       LPPTEST_CHAR_MAJOR);
+		return -EAGAIN;
+	}
+
+	if (request_irq (LPPTEST_IRQ, lpptest_irq, 0, "lpptest", dev_id)) {
+		printk (KERN_WARNING "lpptest: irq %d in use. Unload parport module!\n", LPPTEST_IRQ);
+		unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME);
+		return -EAGAIN;
+	}
+	irq_desc[LPPTEST_IRQ].status |= IRQ_NODELAY;
+	irq_desc[LPPTEST_IRQ].action->flags |= SA_NODELAY | SA_INTERRUPT;
+
+	INIT_PORT();
+	ENABLE_IRQ();
+
+	return 0;
+}
+module_init (lpptest_init);
+
+/* Module exit: disable the port IRQ, then release the IRQ and the device. */
+static void __exit lpptest_exit (void)
+{
+	DISABLE_IRQ();
+
+	free_irq(LPPTEST_IRQ, dev_id);
+	unregister_chrdev(LPPTEST_CHAR_MAJOR, LPPTEST_DEVICE_NAME);
+}
+module_exit (lpptest_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_DESCRIPTION("lpp test module");
+
Index: linux/drivers/char/random.c
===================================================================
--- linux.orig/drivers/char/random.c
+++ linux/drivers/char/random.c
@@ -417,7 +417,7 @@ static struct entropy_store input_pool =
 	.poolinfo = &poolinfo_table[0],
 	.name = "input",
 	.limit = 1,
-	.lock = SPIN_LOCK_UNLOCKED,
+	.lock = SPIN_LOCK_UNLOCKED(input_pool.lock),
 	.pool = input_pool_data
 };
 
@@ -426,7 +426,7 @@ static struct entropy_store blocking_poo
 	.name = "blocking",
 	.limit = 1,
 	.pull = &input_pool,
-	.lock = SPIN_LOCK_UNLOCKED,
+	.lock = SPIN_LOCK_UNLOCKED(blocking_pool.lock),
 	.pool = blocking_pool_data
 };
 
@@ -434,7 +434,7 @@ static struct entropy_store nonblocking_
 	.poolinfo = &poolinfo_table[1],
 	.name = "nonblocking",
 	.pull = &input_pool,
-	.lock = SPIN_LOCK_UNLOCKED,
+	.lock = SPIN_LOCK_UNLOCKED(nonblocking_pool.lock),
 	.pool = nonblocking_pool_data
 };
 
@@ -581,8 +581,11 @@ static void add_timer_randomness(struct 
 	preempt_disable();
 	/* if over the trickle threshold, use only 1 in 4096 samples */
 	if (input_pool.entropy_count > trickle_thresh &&
-	    (__get_cpu_var(trickle_count)++ & 0xfff))
-		goto out;
+	    (__get_cpu_var(trickle_count)++ & 0xfff)) {
+		preempt_enable();
+		return;
+	}
+	preempt_enable();
 
 	sample.jiffies = jiffies;
 	sample.cycles = get_cycles();
@@ -627,9 +630,6 @@ static void add_timer_randomness(struct 
 
 	if(input_pool.entropy_count >= random_read_wakeup_thresh)
 		wake_up_interruptible(&random_read_wait);
-
-out:
-	preempt_enable();
 }
 
 extern void add_input_randomness(unsigned int type, unsigned int code,
Index: linux/drivers/char/rtc.c
===================================================================
--- linux.orig/drivers/char/rtc.c
+++ linux/drivers/char/rtc.c
@@ -84,10 +84,36 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 
+#ifdef CONFIG_MIPS
+# include <asm/time.h>
+#endif
+
 #if defined(__i386__)
 #include <asm/hpet.h>
 #endif
 
+#ifdef CONFIG_RTC_HISTOGRAM
+
+static cycles_t last_interrupt_time;
+
+#include <asm/timex.h>
+
+#define CPU_MHZ		(cpu_khz / 1000)
+
+#define HISTSIZE	10000
+static int histogram[HISTSIZE];
+
+static int rtc_state;
+
+enum rtc_states {
+	S_STARTUP,		/* First round - let the application start */
+	S_IDLE,			/* Waiting for an interrupt */
+	S_WAITING_FOR_READ,	/* Signal delivered. waiting for rtc_read() */
+	S_READ_MISSED,		/* Signal delivered, read() deadline missed */
+};
+
+#endif
+
 #ifdef __sparc__
 #include <linux/pci.h>
 #include <asm/ebus.h>
@@ -205,7 +231,146 @@ static inline unsigned char rtc_is_updat
 	return uip;
 }
 
+#ifndef RTC_IRQ
+# undef CONFIG_RTC_HISTOGRAM
+#endif
+
+/* Reset the latency histogram state; called from rtc_open(). */
+static inline void rtc_open_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	int i;
+
+	last_interrupt_time = 0;
+	rtc_state = S_STARTUP;
+	rtc_irq_data = 0;
+
+	for (i = 0; i < HISTSIZE; i++)
+		histogram[i] = 0;
+#endif
+}
+
+/*
+ * RTC interrupt notification.  Without CONFIG_RTC_HISTOGRAM this only
+ * sends SIGIO; with it, rtc_state decides whether SIGIO is sent and the
+ * interrupt time is recorded for the latency histogram.
+ */
+static inline void rtc_wake_event(void)
+{
+#ifndef CONFIG_RTC_HISTOGRAM
+	kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+#else
+	if (!(rtc_status & RTC_IS_OPEN))
+		return;
+
+	switch (rtc_state) {
+	/* Startup */
+	case S_STARTUP:
+		kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+		break;
+	/* Waiting for an interrupt */
+	case S_IDLE:
+		kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+		last_interrupt_time = get_cycles();
+		rtc_state = S_WAITING_FOR_READ;
+		break;
+
+	/* Signal has been delivered. waiting for rtc_read() */
+	case S_WAITING_FOR_READ:
+		/*
+		 * Well foo.  The usermode application didn't
+		 * schedule and read in time.
+		 */
+		last_interrupt_time = get_cycles();
+		rtc_state = S_READ_MISSED;
+		printk("Read missed before next interrupt\n");
+		break;
+	/* Signal has been delivered, read() deadline was missed */
+	case S_READ_MISSED:
+		/* Nothing to do but wait for the application to read the rtc */
+		last_interrupt_time = get_cycles();
+		break;
+	}
+#endif
+}
+
+/*
+ * rtc_read() hook: add the interrupt-to-read latency to the histogram.
+ */
+static inline void rtc_read_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	cycles_t now = get_cycles();
+
+	switch (rtc_state) {
+	/* Startup */
+	case S_STARTUP:
+		rtc_state = S_IDLE;
+		break;
+
+	/* Waiting for an interrupt */
+	case S_IDLE:
+		printk("bug in rtc_read(): called in state S_IDLE!\n");
+		break;
+	/* Signal has been delivered and the read came in time: well done */
+	case S_WAITING_FOR_READ:
+		/* fall through */
+	/* Signal has been delivered, read() deadline was missed */
+	case S_READ_MISSED:
+		if (!last_interrupt_time)
+			printk("bug in rtc_read(): last_interrupt_time = 0\n");
+		rtc_state = S_IDLE;
+		/* CPU_MHZ is 0 when cpu_khz < 1000: avoid dividing by zero */
+		if (!CPU_MHZ)
+			break;
+		{
+			cycles_t latency = now - last_interrupt_time;
+			unsigned long delta;	/* Microseconds */
+
+			delta = latency;
+			delta /= CPU_MHZ;
+
+			if (delta > 1000 * 1000) {
+				printk("rtc: eek\n");
+			} else {
+				unsigned long slot = delta;
+				if (slot >= HISTSIZE)
+					slot = HISTSIZE - 1;
+				histogram[slot]++;
+				if (delta > 2000)
+					printk("wow!  That was a "
+							"%lu millisec bump\n",
+						delta / 1000);
+			}
+		}
+		break;
+	}
+#endif
+}
+
+/* Log the collected latency histogram, if any; called from rtc_release(). */
+static inline void rtc_close_event(void)
+{
+#ifdef CONFIG_RTC_HISTOGRAM
+	int i = 0;
+	unsigned long total = 0;
+
+	for (i = 0; i < HISTSIZE; i++)
+		total += histogram[i];
+	if (!total)
+		return;
+
+	printk("\nrtc latency histogram of {%s/%d, %lu samples}:\n",
+		current->comm, current->pid, total);
+	for (i = 0; i < HISTSIZE; i++) {
+		if (histogram[i])
+			printk("%d %d\n", i, histogram[i]);
+	}
+#endif
+}
+
 #ifdef RTC_IRQ
+
 /*
  *	A very tiny interrupt handler. It runs with SA_INTERRUPT set,
  *	but there is possibility of conflicting with the set_rtc_mmss()
@@ -218,6 +383,8 @@ static inline unsigned char rtc_is_updat
 
 irqreturn_t rtc_interrupt(int irq, void *dev_id, struct pt_regs *regs)
 {
+	int mod;
+
 	/*
 	 *	Can be an alarm interrupt, update complete interrupt,
 	 *	or a periodic interrupt. We store the status in the
@@ -239,19 +406,22 @@ irqreturn_t rtc_interrupt(int irq, void 
 		rtc_irq_data |= (CMOS_READ(RTC_INTR_FLAGS) & 0xF0);
 	}
 
+	mod = 0;
 	if (rtc_status & RTC_TIMER_ON)
-		mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
+		mod = 1;
 
 	spin_unlock (&rtc_lock);
+	if (mod)
+		mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
 
 	/* Now do the rest of the actions */
 	spin_lock(&rtc_task_lock);
 	if (rtc_callback)
 		rtc_callback->func(rtc_callback->private_data);
 	spin_unlock(&rtc_task_lock);
-	wake_up_interruptible(&rtc_wait);	
 
-	kill_fasync (&rtc_async_queue, SIGIO, POLL_IN);
+	rtc_wake_event();
+	wake_up_interruptible(&rtc_wait);
 
 	return IRQ_HANDLED;
 }
@@ -355,6 +525,8 @@ static ssize_t rtc_read(struct file *fil
 		schedule();
 	} while (1);
 
+	rtc_read_event();
+
 	if (count < sizeof(unsigned long))
 		retval = put_user(data, (unsigned int __user *)buf) ?: sizeof(int); 
 	else
@@ -405,8 +577,8 @@ static int rtc_do_ioctl(unsigned int cmd
 		if (rtc_status & RTC_TIMER_ON) {
 			spin_lock_irq (&rtc_lock);
 			rtc_status &= ~RTC_TIMER_ON;
-			del_timer(&rtc_irq_timer);
 			spin_unlock_irq (&rtc_lock);
+			del_timer(&rtc_irq_timer);
 		}
 		return 0;
 	}
@@ -424,9 +596,9 @@ static int rtc_do_ioctl(unsigned int cmd
 		if (!(rtc_status & RTC_TIMER_ON)) {
 			spin_lock_irq (&rtc_lock);
 			rtc_irq_timer.expires = jiffies + HZ/rtc_freq + 2*HZ/100;
-			add_timer(&rtc_irq_timer);
 			rtc_status |= RTC_TIMER_ON;
 			spin_unlock_irq (&rtc_lock);
+			add_timer(&rtc_irq_timer);
 		}
 		set_rtc_irq_bit(RTC_PIE);
 		return 0;
@@ -584,6 +756,11 @@ static int rtc_do_ioctl(unsigned int cmd
 		save_freq_select = CMOS_READ(RTC_FREQ_SELECT);
 		CMOS_WRITE((save_freq_select|RTC_DIV_RESET2), RTC_FREQ_SELECT);
 
+		/*
+		 * Make CMOS date writes nonpreemptible even on PREEMPT_RT.
+		 * There's a limit to everything! =B-)
+		 */
+		preempt_disable();
 #ifdef CONFIG_MACH_DECSTATION
 		CMOS_WRITE(real_yrs, RTC_DEC_YEAR);
 #endif
@@ -593,6 +770,7 @@ static int rtc_do_ioctl(unsigned int cmd
 		CMOS_WRITE(hrs, RTC_HOURS);
 		CMOS_WRITE(min, RTC_MINUTES);
 		CMOS_WRITE(sec, RTC_SECONDS);
+		preempt_enable();
 
 		CMOS_WRITE(save_control, RTC_CONTROL);
 		CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
@@ -690,6 +868,7 @@ static int rtc_open(struct inode *inode,
 	if(rtc_status & RTC_IS_OPEN)
 		goto out_busy;
 
+	rtc_open_event();
 	rtc_status |= RTC_IS_OPEN;
 
 	rtc_irq_data = 0;
@@ -711,6 +890,7 @@ static int rtc_release(struct inode *ino
 {
 #ifdef RTC_IRQ
 	unsigned char tmp;
+	int del;
 
 	if (rtc_has_irq == 0)
 		goto no_irq;
@@ -729,11 +909,14 @@ static int rtc_release(struct inode *ino
 		CMOS_WRITE(tmp, RTC_CONTROL);
 		CMOS_READ(RTC_INTR_FLAGS);
 	}
+	del = 0;
 	if (rtc_status & RTC_TIMER_ON) {
 		rtc_status &= ~RTC_TIMER_ON;
-		del_timer(&rtc_irq_timer);
+		del = 1;
 	}
 	spin_unlock_irq(&rtc_lock);
+	if (del)
+		del_timer(&rtc_irq_timer);
 
 	if (file->f_flags & FASYNC) {
 		rtc_fasync (-1, file, 0);
@@ -745,6 +928,7 @@ no_irq:
 	rtc_irq_data = 0;
 	rtc_status &= ~RTC_IS_OPEN;
 	spin_unlock_irq (&rtc_lock);
+	rtc_close_event();
 	return 0;
 }
 
@@ -809,6 +993,7 @@ int rtc_unregister(rtc_task_t *task)
 	return -EIO;
 #else
 	unsigned char tmp;
+	int del;
 
 	spin_lock_irq(&rtc_lock);
 	spin_lock(&rtc_task_lock);
@@ -828,12 +1013,15 @@ int rtc_unregister(rtc_task_t *task)
 		CMOS_WRITE(tmp, RTC_CONTROL);
 		CMOS_READ(RTC_INTR_FLAGS);
 	}
+	del = 0;
 	if (rtc_status & RTC_TIMER_ON) {
 		rtc_status &= ~RTC_TIMER_ON;
-		del_timer(&rtc_irq_timer);
+		del = 1;
 	}
 	rtc_status &= ~RTC_IS_OPEN;
 	spin_unlock(&rtc_task_lock);
+	if (del)
+		del_timer(&rtc_irq_timer);
 	spin_unlock_irq(&rtc_lock);
 	return 0;
 #endif
@@ -1092,6 +1280,7 @@ module_exit(rtc_exit);
 static void rtc_dropped_irq(unsigned long data)
 {
 	unsigned long freq;
+	int mod;
 
 	spin_lock_irq (&rtc_lock);
 
@@ -1101,8 +1290,9 @@ static void rtc_dropped_irq(unsigned lon
 	}
 
 	/* Just in case someone disabled the timer from behind our back... */
+	mod = 0;
 	if (rtc_status & RTC_TIMER_ON)
-		mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
+		mod = 1;
 
 	rtc_irq_data += ((rtc_freq/HZ)<<8);
 	rtc_irq_data &= ~0xff;
@@ -1111,6 +1301,8 @@ static void rtc_dropped_irq(unsigned lon
 	freq = rtc_freq;
 
 	spin_unlock_irq(&rtc_lock);
+	if (mod)
+		mod_timer(&rtc_irq_timer, jiffies + HZ/rtc_freq + 2*HZ/100);
 
 	printk(KERN_WARNING "rtc: lost some interrupts at %ldHz.\n", freq);
 
Index: linux/drivers/char/s3c2410-rtc.c
===================================================================
--- linux.orig/drivers/char/s3c2410-rtc.c
+++ linux/drivers/char/s3c2410-rtc.c
@@ -22,6 +22,7 @@
 #include <linux/init.h>
 #include <linux/device.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/rtc.h>
 #include <linux/bcd.h>
 
Index: linux/drivers/char/specialix.c
===================================================================
--- linux.orig/drivers/char/specialix.c
+++ linux/drivers/char/specialix.c
@@ -2491,7 +2491,7 @@ static int __init specialix_init(void)
 #endif
 	
 	for (i = 0; i < SX_NBOARD; i++)
-		sx_board[i].lock = SPIN_LOCK_UNLOCKED;
+		spin_lock_init(&sx_board[i].lock);
 
 	if (sx_init_drivers()) {
 		func_exit();
Index: linux/drivers/char/sx.c
===================================================================
--- linux.orig/drivers/char/sx.c
+++ linux/drivers/char/sx.c
@@ -2321,7 +2321,7 @@ static int sx_init_portstructs (int nboa
 #ifdef NEW_WRITE_LOCKING
 			port->gs.port_write_sem = MUTEX;
 #endif
-			port->gs.driver_lock = SPIN_LOCK_UNLOCKED;
+			spin_lock_init(&port->gs.driver_lock);
 			/*
 			 * Initializing wait queue
 			 */
Index: linux/drivers/char/sysrq.c
===================================================================
--- linux.orig/drivers/char/sysrq.c
+++ linux/drivers/char/sysrq.c
@@ -114,7 +114,7 @@ static struct sysrq_key_op sysrq_crashdu
 static void sysrq_handle_reboot(int key, struct pt_regs *pt_regs,
 				struct tty_struct *tty) 
 {
-	local_irq_enable();
+	raw_local_irq_enable();
 	emergency_restart();
 }
 
@@ -169,6 +169,38 @@ static struct sysrq_key_op sysrq_showreg
 	.enable_mask	= SYSRQ_ENABLE_DUMP,
 };
 
+#ifdef CONFIG_DEBUG_DEADLOCKS
+
+static void sysrq_handle_showlocks(int key, struct pt_regs *pt_regs,
+				   struct tty_struct *tty)
+{
+	show_all_locks();
+}
+
+static struct sysrq_key_op sysrq_showlocks_op = {
+	.handler	= sysrq_handle_showlocks,
+	.help_msg	= "show-all-locks(D)",
+	.action_msg	= "Show Locks Held",
+};
+
+#endif
+
+#if defined(__i386__)
+
+static void sysrq_handle_showallregs(int key, struct pt_regs *pt_regs,
+				     struct tty_struct *tty)
+{
+	nmi_show_all_regs();
+}
+
+static struct sysrq_key_op sysrq_showallregs_op = {
+	.handler	= sysrq_handle_showallregs,
+	.help_msg	= "showalLcpupc",
+	.action_msg	= "Show Regs On All CPUs",
+};
+
+#endif
+
 
 static void sysrq_handle_showstate(int key, struct pt_regs *pt_regs,
 				   struct tty_struct *tty) 
@@ -294,7 +326,11 @@ static struct sysrq_key_op *sysrq_key_ta
 #else
 /* c */	NULL,
 #endif
+#ifdef CONFIG_DEBUG_DEADLOCKS
+/* d */ &sysrq_showlocks_op,
+#else
 /* d */ NULL,
+#endif
 /* e */	&sysrq_term_op,
 /* f */	&sysrq_moom_op,
 /* g */	NULL,
@@ -306,7 +342,11 @@ static struct sysrq_key_op *sysrq_key_ta
 #else
 /* k */	NULL,
 #endif
+#if defined(__i386__)
+/* l */	&sysrq_showallregs_op,
+#else
 /* l */	NULL,
+#endif
 /* m */	&sysrq_showmem_op,
 /* n */	&sysrq_unrt_op,
 /* o */	NULL, /* This will often be registered
Index: linux/drivers/char/tty_io.c
===================================================================
--- linux.orig/drivers/char/tty_io.c
+++ linux/drivers/char/tty_io.c
@@ -224,6 +224,7 @@ static int check_tty_count(struct tty_st
 		printk(KERN_WARNING "Warning: dev (%s) tty->count(%d) "
 				    "!= #fd's(%d) in %s\n",
 		       tty->name, tty->count, count, routine);
+		dump_stack();
 		return count;
        }	
 #endif
@@ -867,8 +868,8 @@ static void do_tty_hangup(void *data)
 				p->signal->tty = NULL;
 			if (!p->signal->leader)
 				continue;
-			send_group_sig_info(SIGHUP, SEND_SIG_PRIV, p);
-			send_group_sig_info(SIGCONT, SEND_SIG_PRIV, p);
+			group_send_sig_info(SIGHUP, SEND_SIG_PRIV, p);
+			group_send_sig_info(SIGCONT, SEND_SIG_PRIV, p);
 			if (tty->pgrp > 0)
 				p->signal->tty_old_pgrp = tty->pgrp;
 		} while_each_task_pid(tty->session, PIDTYPE_SID, p);
Index: linux/drivers/char/watchdog/cpu5wdt.c
===================================================================
--- linux.orig/drivers/char/watchdog/cpu5wdt.c
+++ linux/drivers/char/watchdog/cpu5wdt.c
@@ -28,6 +28,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 #include <linux/timer.h>
+#include <linux/completion.h>
 #include <asm/io.h>
 #include <asm/uaccess.h>
 
@@ -56,7 +57,7 @@ static int ticks = 10000;
 /* some device data */
 
 static struct {
-	struct semaphore stop;
+	struct completion stop;
 	volatile int running;
 	struct timer_list timer;
 	volatile int queue;
@@ -84,7 +85,7 @@ static void cpu5wdt_trigger(unsigned lon
 	}
 	else {
 		/* ticks doesn't matter anyway */
-		up(&cpu5wdt_device.stop);
+		complete(&cpu5wdt_device.stop);
 	}
 
 }
@@ -238,7 +239,7 @@ static int __devinit cpu5wdt_init(void)
 	if ( !val )
 		printk(KERN_INFO PFX "sorry, was my fault\n");
 
-	init_MUTEX_LOCKED(&cpu5wdt_device.stop);
+	init_completion(&cpu5wdt_device.stop);
 	cpu5wdt_device.queue = 0;
 
 	clear_bit(0, &cpu5wdt_device.inuse);
@@ -268,7 +269,7 @@ static void __devexit cpu5wdt_exit(void)
 {
 	if ( cpu5wdt_device.queue ) {
 		cpu5wdt_device.queue = 0;
-		down(&cpu5wdt_device.stop);
+		wait_for_completion(&cpu5wdt_device.stop);
 	}
 
 	misc_deregister(&cpu5wdt_misc);
Index: linux/drivers/clocksource/Makefile
===================================================================
--- /dev/null
+++ linux/drivers/clocksource/Makefile
@@ -0,0 +1,3 @@
+#XXX doesn't boot! obj-$(CONFIG_X86) += tsc-interp.o
+obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o
+obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o
Index: linux/drivers/clocksource/acpi_pm.c
===================================================================
--- /dev/null
+++ linux/drivers/clocksource/acpi_pm.c
@@ -0,0 +1,175 @@
+/*
+ * linux/drivers/clocksource/acpi_pm.c
+ *
+ * This file contains the ACPI PM based clocksource.
+ *
+ * This code was largely moved from the i386 timer_pm.c file
+ * which was (C) Dominik Brodowski <linux@brodo.de> 2003
+ * and contained the following comments:
+ *
+ * Driver to use the Power Management Timer (PMTMR) available in some
+ * southbridges as primary timing source for the Linux kernel.
+ *
+ * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
+ * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
+ *
+ * This file is licensed under the GPL v2.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/errno.h>
+#include <linux/init.h>
+#include <asm/io.h>
+
+/* Nominal PMTMR frequency, in ticks per second */
+#define PMTMR_TICKS_PER_SEC 3579545
+
+#if (defined(CONFIG_X86) && (!defined(CONFIG_X86_64)))
+# include "mach_timer.h"
+/* Number of PMTMR ticks expected during the calibration run */
+# define PMTMR_EXPECTED_RATE ((PMTMR_TICKS_PER_SEC*CALIBRATE_TIME_MSEC)/1000)
+#endif
+
+/*
+ * The I/O port the PMTMR resides at.
+ * The location is detected during setup_arch(),
+ * in arch/i386/acpi/boot.c
+ */
+extern u32 acpi_pmtmr_ioport;
+extern int acpi_pmtmr_buggy;
+
+#define ACPI_PM_MASK 0xFFFFFF /* limit it to 24 bits */
+
+/* Read the PMTMR counter from its I/O port. */
+static inline u32 read_pmtmr(void)
+{
+	/* mask the output to 24 bits */
+	return inl(acpi_pmtmr_ioport) & ACPI_PM_MASK;
+}
+
+/*
+ * Read routine installed when acpi_pmtmr_buggy is set: samples the
+ * counter three times per attempt and returns the second sample.
+ */
+static cycle_t acpi_pm_read_verified(void)
+{
+	u32 v1 = 0, v2 = 0, v3 = 0;
+
+	/*
+	 * It has been reported that on various broken chipsets
+	 * (ICH4, PIIX4 and PIIX4E) the ACPI PM clock source is not
+	 * latched, so it must be read multiple times to ensure a
+	 * safe value is read. Retry while the three samples are in
+	 * an order that an up-counter wrapping at most once cannot
+	 * produce:
+	 */
+	do {
+		v1 = read_pmtmr();
+		v2 = read_pmtmr();
+		v3 = read_pmtmr();
+	} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1)
+			|| (v3 > v1 && v3 < v2));
+
+	return (cycle_t)v2;
+}
+
+/* Default read routine: a single masked read of the counter. */
+static cycle_t acpi_pm_read(void)
+{
+	return (cycle_t)read_pmtmr();
+}
+
+struct clocksource clocksource_acpi_pm = {
+	.name		= "acpi_pm",
+	.rating		= 200,
+	.read		= acpi_pm_read,
+	.mask		= (cycle_t)ACPI_PM_MASK,
+	.mult		= 0, /* calculated in init_acpi_pm_clocksource() */
+	.shift		= 22,
+	.is_continuous	= 1,
+};
+
+#if defined(CONFIG_X86) && !defined(CONFIG_X86_64)
+/*
+ * Some boards have the PMTMR running way too fast. We check
+ * the PMTMR rate against PIT channel 2 to catch these cases.
+ *
+ * Returns 0 if the measured rate is acceptable, -1 otherwise.
+ */
+static int __init verify_pmtmr_rate(void)
+{
+	unsigned long count, delta;
+	u32 value1, value2;
+
+	mach_prepare_counter();
+	value1 = read_pmtmr();
+	mach_countup(&count);
+	value2 = read_pmtmr();
+	/* masking keeps the difference correct across a counter wrap */
+	delta = (value2 - value1) & ACPI_PM_MASK;
+
+	/* check that the PMTMR delta is within 5% of what we expect: */
+	if (delta < (PMTMR_EXPECTED_RATE * 19) / 20 ||
+	    delta > (PMTMR_EXPECTED_RATE * 21) / 20) {
+		printk(KERN_INFO "PM-Timer running at invalid rate: %lu%% of normal - aborting.\n", 100UL * delta / PMTMR_EXPECTED_RATE);
+		return -1;
+	}
+
+	return 0;
+}
+#else
+# define verify_pmtmr_rate() (0)
+#endif
+
+/*
+ * Sanity-check the PM timer and, if it passes, register it as a
+ * clocksource.
+ *
+ * Returns 0 on success, -ENODEV if no PMTMR port is known, the
+ * counter never changes or its rate check fails, and -EINVAL if
+ * the counter steps backwards without looking like a wrap.
+ */
+static int __init init_acpi_pm_clocksource(void)
+{
+	u32 value1, value2;
+	unsigned int i;
+
+	if (!acpi_pmtmr_ioport)
+		return -ENODEV;
+
+	clocksource_acpi_pm.mult = clocksource_hz2mult(PMTMR_TICKS_PER_SEC,
+						clocksource_acpi_pm.shift);
+
+	/* "verify" this timing source: */
+	value1 = read_pmtmr();
+	for (i = 0; i < 10000; i++) {
+		value2 = read_pmtmr();
+		if (value2 == value1)
+			continue;
+		if (value2 > value1)
+			goto pm_good;
+		/* a small value below value1 is taken as a counter wrap */
+		if ((value2 < value1) && ((value2) < 0xFFF))
+			goto pm_good;
+		printk(KERN_INFO "PM-Timer had inconsistent results: 0x%x, 0x%x - aborting.\n", value1, value2);
+		return -EINVAL;
+	}
+	printk(KERN_INFO "PM-Timer had no reasonable result: 0x%x - aborting.\n", value1);
+	return -ENODEV;
+
+pm_good:
+	if (verify_pmtmr_rate() != 0)
+		return -ENODEV;
+
+	/* check to see if pmtmr is known buggy: */
+	if (acpi_pmtmr_buggy) {
+		clocksource_acpi_pm.read = acpi_pm_read_verified;
+		clocksource_acpi_pm.rating = 110;
+	}
+
+	register_clocksource(&clocksource_acpi_pm);
+
+	return 0;
+}
+
+module_init(init_acpi_pm_clocksource);
Index: linux/drivers/clocksource/cyclone.c
===================================================================
--- /dev/null
+++ linux/drivers/clocksource/cyclone.c
@@ -0,0 +1,132 @@
+#include <linux/clocksource.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+
+#include <asm/pgtable.h>
+#include <asm/io.h>
+
+#include "mach_timer.h"
+
+#define CYCLONE_CBAR_ADDR	0xFEB00CD0	/* base address ptr */
+#define CYCLONE_PMCC_OFFSET	0x51A0		/* offset to control register */
+#define CYCLONE_MPCS_OFFSET	0x51A8		/* offset to select register */
+#define CYCLONE_MPMC_OFFSET	0x51D0		/* offset to count register */
+#define CYCLONE_TIMER_FREQ	99780000	/* 100Mhz, but not really */
+#define CYCLONE_TIMER_MASK	0xFFFFFFFF	/* 32 bit mask */
+
+int use_cyclone = 0;
+static void __iomem *cyclone_ptr;	/* mapped count register, set by init */
+
+/* Read 32 bits from the mapped Cyclone count register. */
+static cycle_t read_cyclone(void)
+{
+	return (cycle_t)readl(cyclone_ptr);
+}
+
+struct clocksource clocksource_cyclone = {
+	.name		= "cyclone",
+	.rating		= 250,
+	.read		= read_cyclone,
+	.mask		= (cycle_t)CYCLONE_TIMER_MASK,
+	/* mult/shift are replaced in init_cyclone_clocksource() */
+	.mult		= 10,
+	.shift		= 0,
+	.is_continuous	= 1,
+};
+
+/*
+ * Locate the Cyclone registers through CBAR, start the counter and
+ * register it as a clocksource.
+ *
+ * Returns 0 on success and -ENODEV if use_cyclone is not set, a
+ * register cannot be mapped, CBAR reads as zero, or the counter
+ * does not advance.
+ */
+static int __init init_cyclone_clocksource(void)
+{
+	unsigned long base;	/* saved value from CBAR */
+	unsigned long offset;
+	u32 __iomem* volatile cyclone_timer;	/* Cyclone MPMC0 register */
+	u32 __iomem* reg;
+	int i;
+
+	/* make sure we're on a summit box: */
+	if (!use_cyclone)
+		return -ENODEV;
+
+	printk(KERN_INFO "Summit chipset: Starting Cyclone Counter.\n");
+
+	/* find base address: */
+	offset = CYCLONE_CBAR_ADDR;
+	reg = ioremap_nocache(offset, sizeof(u32));
+	if (!reg) {
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR register.\n");
+		return -ENODEV;
+	}
+	/* even on 64bit systems, this is only 32bits: */
+	base = readl(reg);
+	/* unmap before checking the value so the error path does not leak */
+	iounmap(reg);
+	if (!base) {
+		printk(KERN_ERR "Summit chipset: Could not find valid CBAR value.\n");
+		return -ENODEV;
+	}
+
+	/* setup PMCC: */
+	offset = base + CYCLONE_PMCC_OFFSET;
+	reg = ioremap_nocache(offset, sizeof(u32));
+	if (!reg) {
+		printk(KERN_ERR "Summit chipset: Could not find valid PMCC register.\n");
+		return -ENODEV;
+	}
+	writel(0x00000001, reg);
+	iounmap(reg);
+
+	/* setup MPCS: */
+	offset = base + CYCLONE_MPCS_OFFSET;
+	reg = ioremap_nocache(offset, sizeof(u32));
+	if (!reg) {
+		printk(KERN_ERR "Summit chipset: Could not find valid MPCS register.\n");
+		return -ENODEV;
+	}
+	writel(0x00000001, reg);
+	iounmap(reg);
+
+	/* map in cyclone_timer: */
+	offset = base + CYCLONE_MPMC_OFFSET;
+	cyclone_timer = ioremap_nocache(offset, sizeof(u64));
+	if (!cyclone_timer) {
+		printk(KERN_ERR "Summit chipset: Could not find valid MPMC register.\n");
+		return -ENODEV;
+	}
+
+	/* quick test to make sure it's ticking: */
+	for (i = 0; i < 3; i++) {
+		u32 old = readl(cyclone_timer);
+		int stall = 100;
+
+		while (stall--)
+			barrier();
+
+		if (readl(cyclone_timer) == old) {
+			printk(KERN_ERR "Summit chipset: Counter not counting! DISABLED\n");
+			iounmap(cyclone_timer);
+			cyclone_timer = NULL;
+			return -ENODEV;
+		}
+	}
+	cyclone_ptr = cyclone_timer;
+
+	/* sort out mult/shift values: */
+	clocksource_cyclone.shift = 22;
+	clocksource_cyclone.mult = clocksource_hz2mult(CYCLONE_TIMER_FREQ,
+						clocksource_cyclone.shift);
+
+	register_clocksource(&clocksource_cyclone);
+
+	return 0;
+}
+
+module_init(init_cyclone_clocksource);
Index: linux/drivers/clocksource/tsc-interp.c
===================================================================
--- /dev/null
+++ linux/drivers/clocksource/tsc-interp.c
@@ -0,0 +1,111 @@
+/*
+ * TSC-Jiffies Interpolation clocksource
+ *	Example interpolation clocksource.
+ * TODO:
+ *	o per-cpu TSC offsets
+ */
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/threads.h>
+#include <linux/timer.h>
+#include <linux/timex.h>
+#include <linux/init.h>
+#include <linux/smp.h>
+
+static unsigned long current_tsc_khz = 0;
+
+static DECLARE_RAW_SEQLOCK(tsc_interp_lock);
+static unsigned long tsc_then;
+static unsigned long jiffies_then;
+struct timer_list tsc_interp_timer;
+
+static unsigned long mult, shift;
+
+#define NSEC_PER_JIFFY	((((unsigned long long)NSEC_PER_SEC)<<8)/ACTHZ)
+#define SHIFT_VAL	22
+
+static cycle_t read_tsc_interp(void);
+static void tsc_interp_update_callback(void);
+
+static struct clocksource clocksource_tsc_interp = {
+	.name			= "tsc-interp",
+	.rating			= 150,
+	.type			= CLOCKSOURCE_FUNCTION,
+	.read_fnct		= read_tsc_interp,
+	.mask			= (cycle_t)((1ULL<<32)-1),
+	.mult			= 1<<SHIFT_VAL,
+	.shift			= SHIFT_VAL,
+	.update_callback	= tsc_interp_update_callback,
+};
+
+static void tsc_interp_sync(unsigned long unused)
+{
+	unsigned long jiffies_now;
+	cycle_t tsc_now;
+
+	do {
+		jiffies_now = jiffies;
+		rdtscll(tsc_now);
+	} while (jiffies_now != jiffies);
+
+	write_seqlock(&tsc_interp_lock);
+	jiffies_then = jiffies_now;
+	tsc_then = tsc_now;
+	write_sequnlock(&tsc_interp_lock);
+
+	mod_timer(&tsc_interp_timer, jiffies+1);
+}
+
+static cycle_t read_tsc_interp(void)
+{
+	unsigned long seq, jiffs_now, jiffs_then;
+	cycle_t ret, now, then;
+
+	do {
+		seq = read_seqbegin(&tsc_interp_lock);
+
+		jiffs_now = jiffies;
+		jiffs_then = jiffies_then;
+		then = tsc_then;
+
+	} while (read_seqretry(&tsc_interp_lock, seq));
+
+	rdtscll(now);
+	ret = (cycle_t)jiffs_then * NSEC_PER_JIFFY;
+	if (jiffs_then == jiffs_now)
+		ret += min((cycle_t)NSEC_PER_JIFFY,
+				(cycle_t)((now - then)*mult) >> shift);
+	else
+		ret += (cycle_t)(jiffs_now - jiffs_then)*NSEC_PER_JIFFY;
+
+	return ret;
+}
+
+static void tsc_interp_update_callback(void)
+{
+	/* only update if tsc_khz has changed: */
+	if (current_tsc_khz != tsc_khz) {
+		current_tsc_khz = tsc_khz;
+		mult = clocksource_khz2mult(current_tsc_khz, shift);
+	}
+}
+
+static int __init init_tsc_interp_clocksource(void)
+{
+	/* TSC initialization is done in arch/i386/kernel/tsc.c */
+	if (cpu_has_tsc && tsc_khz) {
+		current_tsc_khz = tsc_khz;
+		shift = SHIFT_VAL;
+		mult = clocksource_khz2mult(current_tsc_khz, shift);
+		/* setup periodic soft-timer: */
+		init_timer(&tsc_interp_timer);
+		tsc_interp_timer.function = tsc_interp_sync;
+		tsc_interp_timer.expires = jiffies;
+		add_timer(&tsc_interp_timer);
+
+		register_clocksource(&clocksource_tsc_interp);
+	}
+	return 0;
+}
+
+module_init(init_tsc_interp_clocksource);
Index: linux/drivers/cpufreq/cpufreq.c
===================================================================
--- linux.orig/drivers/cpufreq/cpufreq.c
+++ linux/drivers/cpufreq/cpufreq.c
@@ -605,7 +605,8 @@ static int cpufreq_add_dev (struct sys_d
 	policy->cpu = cpu;
 	policy->cpus = cpumask_of_cpu(cpu);
 
-	init_MUTEX_LOCKED(&policy->lock);
+	init_MUTEX(&policy->lock);
+	down(&policy->lock);
 	init_completion(&policy->kobj_unregister);
 	INIT_WORK(&policy->update, handle_update, (void *)(long)cpu);
 
@@ -614,6 +615,7 @@ static int cpufreq_add_dev (struct sys_d
 	 */
 	ret = cpufreq_driver->init(policy);
 	if (ret) {
+		up(&policy->lock);
 		dprintk("initialization failed\n");
 		goto err_out;
 	}
@@ -626,8 +628,10 @@ static int cpufreq_add_dev (struct sys_d
 	strlcpy(policy->kobj.name, "cpufreq", KOBJ_NAME_LEN);
 
 	ret = kobject_register(&policy->kobj);
-	if (ret)
+	if (ret) {
+		up(&policy->lock);
 		goto err_out_driver_exit;
+	}
 
 	/* set up files for this cpu device */
 	drv_attr = cpufreq_driver->attr;
Index: linux/drivers/i2c/busses/i2c-pxa.c
===================================================================
--- linux.orig/drivers/i2c/busses/i2c-pxa.c
+++ linux/drivers/i2c/busses/i2c-pxa.c
@@ -925,7 +925,7 @@ static struct i2c_algorithm i2c_pxa_algo
 };
 
 static struct pxa_i2c i2c_pxa = {
-	.lock	= SPIN_LOCK_UNLOCKED,
+	.lock	= SPIN_LOCK_UNLOCKED(i2c_pxa.lock),
 	.wait	= __WAIT_QUEUE_HEAD_INITIALIZER(i2c_pxa.wait),
 	.adap	= {
 		.owner		= THIS_MODULE,
Index: linux/drivers/i2c/busses/i2c-s3c2410.c
===================================================================
--- linux.orig/drivers/i2c/busses/i2c-s3c2410.c
+++ linux/drivers/i2c/busses/i2c-s3c2410.c
@@ -573,7 +573,7 @@ static struct i2c_algorithm s3c24xx_i2c_
 };
 
 static struct s3c24xx_i2c s3c24xx_i2c = {
-	.lock	= SPIN_LOCK_UNLOCKED,
+	.lock	= SPIN_LOCK_UNLOCKED(s3c24xx_i2c.lock),
 	.wait	= __WAIT_QUEUE_HEAD_INITIALIZER(s3c24xx_i2c.wait),
 	.adap	= {
 		.name			= "s3c2410-i2c",
Index: linux/drivers/i2c/chips/tps65010.c
===================================================================
--- linux.orig/drivers/i2c/chips/tps65010.c
+++ linux/drivers/i2c/chips/tps65010.c
@@ -25,6 +25,7 @@
 #include <linux/init.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/device.h>
 #include <linux/i2c.h>
 #include <linux/delay.h>
@@ -33,7 +34,6 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
-#include <asm/irq.h>
 #include <asm/mach-types.h>
 
 #include <asm/arch/gpio.h>
Index: linux/drivers/ide/ide-floppy.c
===================================================================
--- linux.orig/drivers/ide/ide-floppy.c
+++ linux/drivers/ide/ide-floppy.c
@@ -838,7 +838,7 @@ static ide_startstop_t idefloppy_pc_intr
 			"transferred\n", pc->actually_transferred);
 		clear_bit(PC_DMA_IN_PROGRESS, &pc->flags);
 
-		local_irq_enable();
+		local_irq_enable_nort();
 
 		if (status.b.check || test_bit(PC_DMA_ERROR, &pc->flags)) {
 			/* Error detected */
@@ -1670,9 +1670,9 @@ static int idefloppy_get_format_progress
 		atapi_status_t status;
 		unsigned long flags;
 
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		status.all = HWIF(drive)->INB(IDE_STATUS_REG);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 
 		progress_indication = !status.b.dsc ? 0 : 0x10000;
 	}
Index: linux/drivers/ide/ide-io.c
===================================================================
--- linux.orig/drivers/ide/ide-io.c
+++ linux/drivers/ide/ide-io.c
@@ -636,7 +636,7 @@ static ide_startstop_t drive_cmd_intr (i
 	u8 stat = hwif->INB(IDE_STATUS_REG);
 	int retries = 10;
 
-	local_irq_enable();
+	local_irq_enable_nort();
 	if ((stat & DRQ_STAT) && args && args[3]) {
 		u8 io_32bit = drive->io_32bit;
 		drive->io_32bit = 0;
@@ -1107,7 +1107,7 @@ static void ide_do_request (ide_hwgroup_
 	ide_get_lock(ide_intr, hwgroup);
 
 	/* caller must own ide_lock */
-	BUG_ON(!irqs_disabled());
+	BUG_ON_NONRT(!irqs_disabled());
 
 	while (!hwgroup->busy) {
 		hwgroup->busy = 1;
@@ -1219,8 +1219,7 @@ static void ide_do_request (ide_hwgroup_
 		 */
 		if (masked_irq != IDE_NO_IRQ && hwif->irq != masked_irq)
 			disable_irq_nosync(hwif->irq);
-		spin_unlock(&ide_lock);
-		local_irq_enable();
+		spin_unlock_irq(&ide_lock);
 			/* allow other IRQs while we start this request */
 		startstop = start_request(drive, rq);
 		spin_lock_irq(&ide_lock);
@@ -1368,7 +1367,7 @@ void ide_timer_expiry (unsigned long dat
 #endif /* DISABLE_IRQ_NOSYNC */
 			/* local CPU only,
 			 * as if we were handling an interrupt */
-			local_irq_disable();
+			local_irq_disable_nort();
 			if (hwgroup->polling) {
 				startstop = handler(drive);
 			} else if (drive_is_ready(drive)) {
@@ -1565,7 +1564,7 @@ irqreturn_t ide_intr (int irq, void *dev
 	spin_unlock(&ide_lock);
 
 	if (drive->unmask)
-		local_irq_enable();
+		local_irq_enable_nort();
 	/* service this interrupt, may set handler for next interrupt */
 	startstop = handler(drive);
 	spin_lock_irq(&ide_lock);
Index: linux/drivers/ide/ide-iops.c
===================================================================
--- linux.orig/drivers/ide/ide-iops.c
+++ linux/drivers/ide/ide-iops.c
@@ -246,10 +246,10 @@ static void ata_input_data(ide_drive_t *
 	if (io_32bit) {
 		if (io_32bit & 2) {
 			unsigned long flags;
-			local_irq_save(flags);
+			local_irq_save_nort(flags);
 			ata_vlb_sync(drive, IDE_NSECTOR_REG);
 			hwif->INSL(IDE_DATA_REG, buffer, wcount);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 		} else
 			hwif->INSL(IDE_DATA_REG, buffer, wcount);
 	} else {
@@ -268,10 +268,10 @@ static void ata_output_data(ide_drive_t 
 	if (io_32bit) {
 		if (io_32bit & 2) {
 			unsigned long flags;
-			local_irq_save(flags);
+			local_irq_save_nort(flags);
 			ata_vlb_sync(drive, IDE_NSECTOR_REG);
 			hwif->OUTSL(IDE_DATA_REG, buffer, wcount);
-			local_irq_restore(flags);
+			local_irq_restore_nort(flags);
 		} else
 			hwif->OUTSL(IDE_DATA_REG, buffer, wcount);
 	} else {
@@ -570,12 +570,12 @@ int ide_wait_stat (ide_startstop_t *star
 				if (!(stat & BUSY_STAT))
 					break;
 
-				local_irq_restore(flags);
+				local_irq_restore_nort(flags);
 				*startstop = ide_error(drive, "status timeout", stat);
 				return 1;
 			}
 		}
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 	/*
 	 * Allow status to settle, then read it again.
@@ -733,17 +733,15 @@ int ide_driveid_update (ide_drive_t *dri
 		printk("%s: CHECK for good STATUS\n", drive->name);
 		return 0;
 	}
-	local_irq_save(flags);
-	SELECT_MASK(drive, 0);
 	id = kmalloc(SECTOR_WORDS*4, GFP_ATOMIC);
-	if (!id) {
-		local_irq_restore(flags);
+	if (!id)
 		return 0;
-	}
+	local_irq_save_nort(flags);
+	SELECT_MASK(drive, 0);
 	ata_input_data(drive, id, SECTOR_WORDS);
 	(void) hwif->INB(IDE_STATUS_REG);	/* clear drive IRQ */
-	local_irq_enable();
-	local_irq_restore(flags);
+	local_irq_enable_nort();
+	local_irq_restore_nort(flags);
 	ide_fix_driveid(id);
 	if (id) {
 		drive->id->dma_ultra = id->dma_ultra;
@@ -823,7 +821,7 @@ int ide_config_drive_speed (ide_drive_t 
 			if (time_after(jiffies, timeout))
 				break;
 		}
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	}
 
 	/*
@@ -1249,6 +1247,7 @@ int ide_wait_not_busy(ide_hwif_t *hwif, 
 		 */
 		if (stat == 0xff)
 			return -ENODEV;
+		touch_softlockup_watchdog();
 	}
 	return -EBUSY;
 }
Index: linux/drivers/ide/ide-lib.c
===================================================================
--- linux.orig/drivers/ide/ide-lib.c
+++ linux/drivers/ide/ide-lib.c
@@ -447,15 +447,16 @@ EXPORT_SYMBOL_GPL(ide_set_xfer_rate);
 
 static void ide_dump_opcode(ide_drive_t *drive)
 {
+	unsigned long flags;
 	struct request *rq;
 	u8 opcode = 0;
 	int found = 0;
 
-	spin_lock(&ide_lock);
+	spin_lock_irqsave(&ide_lock, flags);
 	rq = NULL;
 	if (HWGROUP(drive))
 		rq = HWGROUP(drive)->rq;
-	spin_unlock(&ide_lock);
+	spin_unlock_irqrestore(&ide_lock, flags);
 	if (!rq)
 		return;
 	if (rq->flags & (REQ_DRIVE_CMD | REQ_DRIVE_TASK)) {
@@ -483,10 +484,8 @@ static void ide_dump_opcode(ide_drive_t 
 static u8 ide_dump_ata_status(ide_drive_t *drive, const char *msg, u8 stat)
 {
 	ide_hwif_t *hwif = HWIF(drive);
-	unsigned long flags;
 	u8 err = 0;
 
-	local_irq_set(flags);
 	printk("%s: %s: status=0x%02x { ", drive->name, msg, stat);
 	if (stat & BUSY_STAT)
 		printk("Busy ");
@@ -546,7 +545,7 @@ static u8 ide_dump_ata_status(ide_drive_
 		printk("\n");
 	}
 	ide_dump_opcode(drive);
-	local_irq_restore(flags);
+
 	return err;
 }
 
@@ -561,14 +560,12 @@ static u8 ide_dump_ata_status(ide_drive_
 
 static u8 ide_dump_atapi_status(ide_drive_t *drive, const char *msg, u8 stat)
 {
-	unsigned long flags;
-
 	atapi_status_t status;
 	atapi_error_t error;
 
 	status.all = stat;
 	error.all = 0;
-	local_irq_set(flags);
+
 	printk("%s: %s: status=0x%02x { ", drive->name, msg, stat);
 	if (status.b.bsy)
 		printk("Busy ");
@@ -594,7 +591,7 @@ static u8 ide_dump_atapi_status(ide_driv
 		printk("}\n");
 	}
 	ide_dump_opcode(drive);
-	local_irq_restore(flags);
+
 	return error.all;
 }
 
Index: linux/drivers/ide/ide-probe.c
===================================================================
--- linux.orig/drivers/ide/ide-probe.c
+++ linux/drivers/ide/ide-probe.c
@@ -184,7 +184,7 @@ static inline void do_identify (ide_driv
 	hwif->ata_input_data(drive, id, SECTOR_WORDS);
 
 	drive->id_read = 1;
-	local_irq_enable();
+	local_irq_enable_nort();
 	ide_fix_driveid(id);
 
 #if defined (CONFIG_SCSI_EATA_DMA) || defined (CONFIG_SCSI_EATA_PIO) || defined (CONFIG_SCSI_EATA)
@@ -362,14 +362,14 @@ static int actual_try_to_identify (ide_d
 		unsigned long flags;
 
 		/* local CPU only; some systems need this */
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		/* drive returned ID */
 		do_identify(drive, cmd);
 		/* drive responded with ID */
 		rc = 0;
 		/* clear drive IRQ */
 		(void) hwif->INB(IDE_STATUS_REG);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 	} else {
 		/* drive refused ID */
 		rc = 2;
@@ -656,7 +656,7 @@ static void hwif_release_dev (struct dev
 {
 	ide_hwif_t *hwif = container_of(dev, ide_hwif_t, gendev);
 
-	up(&hwif->gendev_rel_sem);
+	complete(&hwif->gendev_rel_comp);
 }
 
 static void hwif_register (ide_hwif_t *hwif)
@@ -842,7 +842,7 @@ static void probe_hwif(ide_hwif_t *hwif)
 		} while ((stat & BUSY_STAT) && time_after(timeout, jiffies));
 
 	}
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 	/*
 	 * Use cached IRQ number. It might be (and is...) changed by probe
 	 * code above
@@ -1328,7 +1328,7 @@ static void drive_release_dev (struct de
 	drive->queue = NULL;
 	spin_unlock_irq(&ide_lock);
 
-	up(&drive->gendev_rel_sem);
+	complete(&drive->gendev_rel_comp);
 }
 
 /*
Index: linux/drivers/ide/ide-taskfile.c
===================================================================
--- linux.orig/drivers/ide/ide-taskfile.c
+++ linux/drivers/ide/ide-taskfile.c
@@ -227,7 +227,7 @@ ide_startstop_t task_no_data_intr (ide_d
 	ide_hwif_t *hwif	= HWIF(drive);
 	u8 stat;
 
-	local_irq_enable();
+	local_irq_enable_nort();
 	if (!OK_STAT(stat = hwif->INB(IDE_STATUS_REG),READY_STAT,BAD_STAT)) {
 		return ide_error(drive, "task_no_data_intr", stat);
 		/* calls ide_end_drive_cmd */
@@ -279,7 +279,7 @@ static void ide_pio_sector(ide_drive_t *
 	offset %= PAGE_SIZE;
 
 #ifdef CONFIG_HIGHMEM
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 #endif
 	buf = kmap_atomic(page, KM_BIO_SRC_IRQ) + offset;
 
@@ -299,7 +299,7 @@ static void ide_pio_sector(ide_drive_t *
 
 	kunmap_atomic(buf, KM_BIO_SRC_IRQ);
 #ifdef CONFIG_HIGHMEM
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 #endif
 }
 
@@ -457,7 +457,7 @@ ide_startstop_t pre_task_out_intr (ide_d
 	}
 
 	if (!drive->unmask)
-		local_irq_disable();
+		local_irq_disable_nort();
 
 	ide_set_handler(drive, &task_out_intr, WAIT_WORSTCASE, NULL);
 	ide_pio_datablock(drive, rq, 1);
Index: linux/drivers/ide/ide.c
===================================================================
--- linux.orig/drivers/ide/ide.c
+++ linux/drivers/ide/ide.c
@@ -222,7 +222,7 @@ static void init_hwif_data(ide_hwif_t *h
 	hwif->mwdma_mask = 0x80;	/* disable all mwdma */
 	hwif->swdma_mask = 0x80;	/* disable all swdma */
 
-	sema_init(&hwif->gendev_rel_sem, 0);
+	init_completion(&hwif->gendev_rel_comp);
 
 	default_hwif_iops(hwif);
 	default_hwif_transport(hwif);
@@ -245,7 +245,7 @@ static void init_hwif_data(ide_hwif_t *h
 		drive->is_flash			= 0;
 		drive->vdma			= 0;
 		INIT_LIST_HEAD(&drive->list);
-		sema_init(&drive->gendev_rel_sem, 0);
+		init_completion(&drive->gendev_rel_comp);
 	}
 }
 
@@ -602,7 +602,7 @@ void ide_unregister(unsigned int index)
 		}
 		spin_unlock_irq(&ide_lock);
 		device_unregister(&drive->gendev);
-		down(&drive->gendev_rel_sem);
+		wait_for_completion(&drive->gendev_rel_comp);
 		spin_lock_irq(&ide_lock);
 	}
 	hwif->present = 0;
@@ -662,7 +662,7 @@ void ide_unregister(unsigned int index)
 	/* More messed up locking ... */
 	spin_unlock_irq(&ide_lock);
 	device_unregister(&hwif->gendev);
-	down(&hwif->gendev_rel_sem);
+	wait_for_completion(&hwif->gendev_rel_comp);
 
 	/*
 	 * Remove us from the kernel's knowledge
@@ -1049,15 +1049,13 @@ int ide_spin_wait_hwgroup (ide_drive_t *
 	spin_lock_irq(&ide_lock);
 
 	while (hwgroup->busy) {
-		unsigned long lflags;
 		spin_unlock_irq(&ide_lock);
-		local_irq_set(lflags);
+
 		if (time_after(jiffies, timeout)) {
-			local_irq_restore(lflags);
 			printk(KERN_ERR "%s: channel busy\n", drive->name);
 			return -EBUSY;
 		}
-		local_irq_restore(lflags);
+
 		spin_lock_irq(&ide_lock);
 	}
 	return 0;
Index: linux/drivers/ide/pci/alim15x3.c
===================================================================
--- linux.orig/drivers/ide/pci/alim15x3.c
+++ linux/drivers/ide/pci/alim15x3.c
@@ -296,7 +296,6 @@ static void ali15x3_tune_drive (ide_driv
 	struct pci_dev *dev = hwif->pci_dev;
 	int s_time, a_time, c_time;
 	u8 s_clc, a_clc, r_clc;
-	unsigned long flags;
 	int bus_speed = system_bus_clock();
 	int port = hwif->channel ? 0x5c : 0x58;
 	int portFIFO = hwif->channel ? 0x55 : 0x54;
@@ -323,7 +322,6 @@ static void ali15x3_tune_drive (ide_driv
 		if (r_clc >= 16)
 			r_clc = 0;
 	}
-	local_irq_save(flags);
 	
 	/* 
 	 * PIO mode => ATA FIFO on, ATAPI FIFO off
@@ -345,7 +343,6 @@ static void ali15x3_tune_drive (ide_driv
 	
 	pci_write_config_byte(dev, port, s_clc);
 	pci_write_config_byte(dev, port+drive->select.b.unit+2, (a_clc << 4) | r_clc);
-	local_irq_restore(flags);
 
 	/*
 	 * setup   active  rec
@@ -585,7 +582,6 @@ static int ali15x3_dma_setup(ide_drive_t
   
 static unsigned int __devinit init_chipset_ali15x3 (struct pci_dev *dev, const char *name)
 {
-	unsigned long flags;
 	u8 tmpbyte;
 	struct pci_dev *north = pci_find_slot(0, PCI_DEVFN(0,0));
 
@@ -601,7 +597,6 @@ static unsigned int __devinit init_chips
 	}
 #endif  /* defined(DISPLAY_ALI_TIMINGS) && defined(CONFIG_PROC_FS) */
 
-	local_irq_save(flags);
 
 	if (m5229_revision < 0xC2) {
 		/*
@@ -614,7 +609,6 @@ static unsigned int __devinit init_chips
 		 * clear bit 7
 		 */
 		pci_write_config_byte(dev, 0x4b, tmpbyte & 0x7F);
-		local_irq_restore(flags);
 		return 0;
 	}
 
@@ -639,7 +633,6 @@ static unsigned int __devinit init_chips
 	 * 0:0.0 so if we didn't find one we know what is cooking.
 	 */
 	if (north && north->vendor != PCI_VENDOR_ID_AL) {
-		local_irq_restore(flags);
 	        return 0;
 	}
 
@@ -662,7 +655,6 @@ static unsigned int __devinit init_chips
 			pci_write_config_byte(isa_dev, 0x79, tmpbyte | 0x02);
 		}
 	}
-	local_irq_restore(flags);
 	return 0;
 }
 
@@ -683,10 +675,8 @@ static unsigned int __devinit ata66_ali1
 	unsigned int ata66	= 0;
 	u8 cable_80_pin[2]	= { 0, 0 };
 
-	unsigned long flags;
 	u8 tmpbyte;
 
-	local_irq_save(flags);
 
 	if (m5229_revision >= 0xC2) {
 		/*
@@ -736,7 +726,6 @@ static unsigned int __devinit ata66_ali1
 
 	pci_write_config_byte(dev, 0x53, tmpbyte);
 
-	local_irq_restore(flags);
 
 	return(ata66);
 }
Index: linux/drivers/ide/pci/hpt366.c
===================================================================
--- linux.orig/drivers/ide/pci/hpt366.c
+++ linux/drivers/ide/pci/hpt366.c
@@ -1481,7 +1481,6 @@ static void __devinit init_dma_hpt366(id
 	u8 dma_new	= 0, dma_old = 0;
 	u8 primary	= hwif->channel ? 0x4b : 0x43;
 	u8 secondary	= hwif->channel ? 0x4f : 0x47;
-	unsigned long flags;
 
 	if (!dmabase)
 		return;
@@ -1493,8 +1492,6 @@ static void __devinit init_dma_hpt366(id
 
 	dma_old = hwif->INB(dmabase+2);
 
-	local_irq_save(flags);
-
 	dma_new = dma_old;
 	pci_read_config_byte(hwif->pci_dev, primary, &masterdma);
 	pci_read_config_byte(hwif->pci_dev, secondary, &slavedma);
@@ -1504,8 +1501,6 @@ static void __devinit init_dma_hpt366(id
 	if (dma_new != dma_old)
 		hwif->OUTB(dma_new, dmabase+2);
 
-	local_irq_restore(flags);
-
 	ide_setup_dma(hwif, dmabase, 8);
 }
 
Index: linux/drivers/ide/setup-pci.c
===================================================================
--- linux.orig/drivers/ide/setup-pci.c
+++ linux/drivers/ide/setup-pci.c
@@ -665,8 +665,11 @@ static int do_ide_setup_pci_device(struc
 {
 	static ata_index_t ata_index = { .b = { .low = 0xff, .high = 0xff } };
 	int tried_config = 0;
+	unsigned long flags;
 	int pciirq, ret;
 
+	spin_lock_irqsave(&ide_lock, flags);
+
 	ret = ide_setup_pci_controller(dev, d, noisy, &tried_config);
 	if (ret < 0)
 		goto out;
@@ -721,6 +724,8 @@ static int do_ide_setup_pci_device(struc
 	*index = ata_index;
 	ide_pci_setup_ports(dev, d, pciirq, index);
 out:
+	spin_unlock_irqrestore(&ide_lock, flags);
+
 	return ret;
 }
 
Index: linux/drivers/ieee1394/ieee1394_types.h
===================================================================
--- linux.orig/drivers/ieee1394/ieee1394_types.h
+++ linux/drivers/ieee1394/ieee1394_types.h
@@ -19,7 +19,7 @@ struct hpsb_tlabel_pool {
 	spinlock_t lock;
 	u8 next;
 	u32 allocations;
-	struct semaphore count;
+	struct compat_semaphore count;
 };
 
 #define HPSB_TPOOL_INIT(_tp)			\
Index: linux/drivers/ieee1394/nodemgr.c
===================================================================
--- linux.orig/drivers/ieee1394/nodemgr.c
+++ linux/drivers/ieee1394/nodemgr.c
@@ -114,7 +114,7 @@ struct host_info {
 	struct hpsb_host *host;
 	struct list_head list;
 	struct completion exited;
-	struct semaphore reset_sem;
+	struct compat_semaphore reset_sem;
 	int pid;
 	char daemon_name[15];
 	int kill_me;
Index: linux/drivers/ieee1394/raw1394-private.h
===================================================================
--- linux.orig/drivers/ieee1394/raw1394-private.h
+++ linux/drivers/ieee1394/raw1394-private.h
@@ -29,7 +29,7 @@ struct file_info {
 
         struct list_head req_pending;
         struct list_head req_complete;
-        struct semaphore complete_sem;
+        struct compat_semaphore complete_sem;
         spinlock_t reqlists_lock;
         wait_queue_head_t poll_wait_complete;
 
Index: linux/drivers/input/gameport/gameport.c
===================================================================
--- linux.orig/drivers/input/gameport/gameport.c
+++ linux/drivers/input/gameport/gameport.c
@@ -21,6 +21,7 @@
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/kthread.h>
+#include <linux/interrupt.h>
 
 /*#include <asm/io.h>*/
 
@@ -100,12 +101,12 @@ static int gameport_measure_speed(struct
 	tx = 1 << 30;
 
 	for(i = 0; i < 50; i++) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		GET_TIME(t1);
 		for (t = 0; t < 50; t++) gameport_read(gameport);
 		GET_TIME(t2);
 		GET_TIME(t3);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		udelay(i * 10);
 		if ((t = DELTA(t2,t1) - DELTA(t3,t2)) < tx) tx = t;
 	}
@@ -124,11 +125,11 @@ static int gameport_measure_speed(struct
 	tx = 1 << 30;
 
 	for(i = 0; i < 50; i++) {
-		local_irq_save(flags);
+		local_irq_save_nort(flags);
 		rdtscl(t1);
 		for (t = 0; t < 50; t++) gameport_read(gameport);
 		rdtscl(t2);
-		local_irq_restore(flags);
+		local_irq_restore_nort(flags);
 		udelay(i * 10);
 		if (t2 - t1 < tx) tx = t2 - t1;
 	}
Index: linux/drivers/input/serio/sa1111ps2.c
===================================================================
--- linux.orig/drivers/input/serio/sa1111ps2.c
+++ linux/drivers/input/serio/sa1111ps2.c
@@ -13,6 +13,7 @@
 #include <linux/serio.h>
 #include <linux/errno.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/ioport.h>
 #include <linux/delay.h>
 #include <linux/device.h>
Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.c
===================================================================
--- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.c
+++ linux/drivers/media/dvb/dvb-core/dvb_frontend.c
@@ -97,7 +97,7 @@ struct dvb_frontend_private {
 	struct dvb_device *dvbdev;
 	struct dvb_frontend_parameters parameters;
 	struct dvb_fe_events events;
-	struct semaphore sem;
+	struct compat_semaphore sem;
 	struct list_head list_head;
 	wait_queue_head_t wait_queue;
 	pid_t thread_pid;
Index: linux/drivers/media/dvb/dvb-core/dvb_frontend.h
===================================================================
--- linux.orig/drivers/media/dvb/dvb-core/dvb_frontend.h
+++ linux/drivers/media/dvb/dvb-core/dvb_frontend.h
@@ -86,7 +86,7 @@ struct dvb_fe_events {
 	int			  eventr;
 	int			  overflow;
 	wait_queue_head_t	  wait_queue;
-	struct semaphore	  sem;
+	struct compat_semaphore	  sem;
 };
 
 struct dvb_frontend {
Index: linux/drivers/media/video/zr36120_i2c.c
===================================================================
--- linux.orig/drivers/media/video/zr36120_i2c.c
+++ linux/drivers/media/video/zr36120_i2c.c
@@ -120,7 +120,7 @@ struct i2c_bus zoran_i2c_bus_template =
 	I2C_BUSID_ZORAN,
 	NULL,
 
-	SPIN_LOCK_UNLOCKED,
+	SPIN_LOCK_UNLOCKED(zoran_i2c_bus_template.lock),
 
 	attach_inform,
 	detach_inform,
Index: linux/drivers/message/i2o/exec-osm.c
===================================================================
--- linux.orig/drivers/message/i2o/exec-osm.c
+++ linux/drivers/message/i2o/exec-osm.c
@@ -204,7 +204,7 @@ static int i2o_msg_post_wait_complete(st
 {
 	struct i2o_exec_wait *wait, *tmp;
 	unsigned long flags;
-	static spinlock_t lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(lock);
 	int rc = 1;
 
 	/*
Index: linux/drivers/misc/ibmasm/module.c
===================================================================
--- linux.orig/drivers/misc/ibmasm/module.c
+++ linux/drivers/misc/ibmasm/module.c
@@ -85,7 +85,7 @@ static int __devinit ibmasm_init_one(str
 	}
 	memset(sp, 0, sizeof(struct service_processor));
 
-	sp->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&sp->lock);
 	INIT_LIST_HEAD(&sp->command_queue);
 
 	pci_set_drvdata(pdev, (void *)sp);
Index: linux/drivers/net/3c527.c
===================================================================
--- linux.orig/drivers/net/3c527.c
+++ linux/drivers/net/3c527.c
@@ -182,7 +182,7 @@ struct mc32_local 
 
 	u16 rx_ring_tail;       /* index to rx de-queue end */ 
 
-	struct semaphore cmd_mutex;    /* Serialises issuing of execute commands */
+	struct compat_semaphore cmd_mutex;    /* Serialises issuing of execute commands */
         struct completion execution_cmd; /* Card has completed an execute command */
 	struct completion xceiver_cmd;   /* Card has completed a tx or rx command */
 };
Index: linux/drivers/net/3c59x.c
===================================================================
--- linux.orig/drivers/net/3c59x.c
+++ linux/drivers/net/3c59x.c
@@ -956,9 +956,9 @@ static void poll_vortex(struct net_devic
 	struct vortex_private *vp = netdev_priv(dev);
 	unsigned long flags;
 	local_save_flags(flags);
-	local_irq_disable();
+	local_irq_disable_nort();
 	(vp->full_bus_master_rx ? boomerang_interrupt:vortex_interrupt)(dev->irq,dev,NULL);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 } 
 #endif
 
@@ -2004,13 +2004,17 @@ static void vortex_tx_timeout(struct net
 			/*
 			 * Block interrupts because vortex_interrupt does a bare spin_lock()
 			 */
+#ifndef CONFIG_PREEMPT_RT
 			unsigned long flags;
 			local_irq_save(flags);
+#endif
 			if (vp->full_bus_master_tx)
 				boomerang_interrupt(dev->irq, dev, NULL);
 			else
 				vortex_interrupt(dev->irq, dev, NULL);
+#ifndef CONFIG_PREEMPT_RT
 			local_irq_restore(flags);
+#endif
 		}
 	}
 
Index: linux/drivers/net/8139too.c
===================================================================
--- linux.orig/drivers/net/8139too.c
+++ linux/drivers/net/8139too.c
@@ -2128,10 +2128,10 @@ static int rtl8139_poll(struct net_devic
 		 * Order is important since data can get interrupted
 		 * again when we think we are done.
 		 */
-		local_irq_disable();
+		raw_local_irq_disable();
 		RTL_W16_F(IntrMask, rtl8139_intr_mask);
 		__netif_rx_complete(dev);
-		local_irq_enable();
+		raw_local_irq_enable();
 	}
 	spin_unlock(&tp->rx_lock);
 
Index: linux/drivers/net/e1000/e1000_main.c
===================================================================
--- linux.orig/drivers/net/e1000/e1000_main.c
+++ linux/drivers/net/e1000/e1000_main.c
@@ -2262,10 +2262,10 @@ e1000_xmit_frame(struct sk_buff *skb, st
 	if(adapter->pcix_82544)
 		count += nr_frags;
 
- 	local_irq_save(flags); 
+ 	local_irq_save_nort(flags);
  	if (!spin_trylock(&adapter->tx_lock)) { 
  		/* Collision - tell upper layer to requeue */ 
- 		local_irq_restore(flags); 
+ 		local_irq_restore_nort(flags);
  		return NETDEV_TX_LOCKED; 
  	} 
 	if(adapter->hw.tx_pkt_filtering && (adapter->hw.mac_type == e1000_82573) )
Index: linux/drivers/net/hamradio/6pack.c
===================================================================
--- linux.orig/drivers/net/hamradio/6pack.c
+++ linux/drivers/net/hamradio/6pack.c
@@ -124,7 +124,7 @@ struct sixpack {
 	struct timer_list	tx_t;
 	struct timer_list	resync_t;
 	atomic_t		refcnt;
-	struct semaphore	dead_sem;
+	struct compat_semaphore	dead_sem;
 	spinlock_t		lock;
 };
 
Index: linux/drivers/net/hamradio/mkiss.c
===================================================================
--- linux.orig/drivers/net/hamradio/mkiss.c
+++ linux/drivers/net/hamradio/mkiss.c
@@ -85,7 +85,7 @@ struct mkiss {
 #define CRC_MODE_SMACK  2
 
 	atomic_t		refcnt;
-	struct semaphore	dead_sem;
+	struct compat_semaphore	dead_sem;
 };
 
 /*---------------------------------------------------------------------------*/
@@ -622,7 +622,7 @@ static void ax_setup(struct net_device *
  * best way to fix this is to use a rwlock in the tty struct, but for now we
  * use a single global rwlock for all ttys in ppp line discipline.
  */
-static rwlock_t disc_data_lock = RW_LOCK_UNLOCKED;
+static DEFINE_RWLOCK(disc_data_lock);
 
 static struct mkiss *mkiss_get(struct tty_struct *tty)
 {
Index: linux/drivers/net/netconsole.c
===================================================================
--- linux.orig/drivers/net/netconsole.c
+++ linux/drivers/net/netconsole.c
@@ -75,10 +75,19 @@ static void write_msg(struct console *co
 		return;
 
 	local_irq_save(flags);
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * A bit hairy. Netconsole uses mutexes (indirectly) and
+	 * thus must have interrupts enabled:
+	 */
+	local_irq_enable();
+#endif
 
 	for(left = len; left; ) {
 		frag = min(left, MAX_PRINT_CHUNK);
+		WARN_ON_RT(irqs_disabled());
 		netpoll_send_udp(&np, msg, frag);
+		WARN_ON_RT(irqs_disabled());
 		msg += frag;
 		left -= frag;
 	}
Index: linux/drivers/net/ns83820.c
===================================================================
--- linux.orig/drivers/net/ns83820.c
+++ linux/drivers/net/ns83820.c
@@ -1014,8 +1014,6 @@ static void do_tx_done(struct net_device
 	struct ns83820 *dev = PRIV(ndev);
 	u32 cmdsts, tx_done_idx, *desc;
 
-	spin_lock_irq(&dev->tx_lock);
-
 	dprintk("do_tx_done(%p)\n", ndev);
 	tx_done_idx = dev->tx_done_idx;
 	desc = dev->tx_descs + (tx_done_idx * DESC_SIZE);
@@ -1071,7 +1069,6 @@ static void do_tx_done(struct net_device
 		netif_start_queue(ndev);
 		netif_wake_queue(ndev);
 	}
-	spin_unlock_irq(&dev->tx_lock);
 }
 
 static void ns83820_cleanup_tx(struct ns83820 *dev)
@@ -1372,7 +1369,9 @@ static void ns83820_do_isr(struct net_de
 	 * work has accumulated
 	 */
 	if ((ISR_TXDESC | ISR_TXIDLE | ISR_TXOK | ISR_TXERR) & isr) {
+		spin_lock_irq(&dev->tx_lock);
 		do_tx_done(ndev);
+		spin_unlock_irq(&dev->tx_lock);
 
 		/* Disable TxOk if there are no outstanding tx packets.
 		 */
@@ -1457,7 +1456,7 @@ static void ns83820_tx_timeout(struct ne
         u32 tx_done_idx, *desc;
 	unsigned long flags;
 
-	local_irq_save(flags);
+	spin_lock_irqsave(&dev->tx_lock, flags);
 
 	tx_done_idx = dev->tx_done_idx;
 	desc = dev->tx_descs + (tx_done_idx * DESC_SIZE);
@@ -1484,7 +1483,7 @@ static void ns83820_tx_timeout(struct ne
 		ndev->name,
 		tx_done_idx, dev->tx_free_idx, le32_to_cpu(desc[DESC_CMDSTS]));
 
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&dev->tx_lock, flags);
 }
 
 static void ns83820_tx_watch(unsigned long data)
Index: linux/drivers/net/plip.c
===================================================================
--- linux.orig/drivers/net/plip.c
+++ linux/drivers/net/plip.c
@@ -229,7 +229,10 @@ struct net_local {
 	                              struct hh_cache *hh);
 	spinlock_t lock;
 	atomic_t kill_timer;
-	struct semaphore killed_timer_sem;
+	/*
+	 * PREEMPT_RT: this isn't a mutex, it should be a struct completion.
+	 */
+	struct compat_semaphore killed_timer_sem;
 };
 
 static inline void enable_parport_interrupts (struct net_device *dev)
Index: linux/drivers/net/ppp_async.c
===================================================================
--- linux.orig/drivers/net/ppp_async.c
+++ linux/drivers/net/ppp_async.c
@@ -65,7 +65,7 @@ struct asyncppp {
 	struct tasklet_struct tsk;
 
 	atomic_t	refcnt;
-	struct semaphore dead_sem;
+	struct compat_semaphore dead_sem;
 	struct ppp_channel chan;	/* interface to generic ppp layer */
 	unsigned char	obuf[OBUFSIZE];
 };
Index: linux/drivers/net/ppp_synctty.c
===================================================================
--- linux.orig/drivers/net/ppp_synctty.c
+++ linux/drivers/net/ppp_synctty.c
@@ -70,7 +70,7 @@ struct syncppp {
 	struct tasklet_struct tsk;
 
 	atomic_t	refcnt;
-	struct semaphore dead_sem;
+	struct compat_semaphore dead_sem;
 	struct ppp_channel chan;	/* interface to generic ppp layer */
 };
 
Index: linux/drivers/net/smc91x.c
===================================================================
--- linux.orig/drivers/net/smc91x.c
+++ linux/drivers/net/smc91x.c
@@ -74,6 +74,7 @@ static const char version[] =
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
 #include <linux/crc32.h>
@@ -1998,7 +1999,7 @@ static int __init smc_probe(struct net_d
       	if (retval)
       		goto err_out;
 
-	set_irq_type(dev->irq, SMC_IRQ_TRIGGER_TYPE);
+	SMC_SET_IRQ_TYPE(dev->irq, SMC_IRQ_TRIGGER_TYPE);
 
 #ifdef SMC_USE_PXA_DMA
 	{
Index: linux/drivers/net/smc91x.h
===================================================================
--- linux.orig/drivers/net/smc91x.h
+++ linux/drivers/net/smc91x.h
@@ -90,7 +90,7 @@
 			__l--;						\
 		}							\
 	} while (0)
-#define set_irq_type(irq, type)
+#define SMC_SET_IRQ_TYPE(irq, type)
 
 #elif defined(CONFIG_SA1100_PLEB)
 /* We can only do 16-bit reads and writes in the static memory space. */
@@ -109,7 +109,7 @@
 #define SMC_outw(v, a, r)	outw(v, (a) + (r))
 #define SMC_outsw(a, r, p, l)	outsw((a) + (r), p, l)
 
-#define set_irq_type(irq, type) do {} while (0)
+#define SMC_SET_IRQ_TYPE(irq, type) do {} while (0)
 
 #elif defined(CONFIG_SA1100_ASSABET)
 
@@ -209,7 +209,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, 
 #define SMC_insw(a, r, p, l)	insw((a) + (r) - 0xa0000000, p, l)
 #define SMC_outsw(a, r, p, l)	outsw((a) + (r) - 0xa0000000, p, l)
 
-#define set_irq_type(irq, type)	do {} while(0)
+#define SMC_SET_IRQ_TYPE(irq, type)	do {} while(0)
 
 #elif	defined(CONFIG_ISA)
 
@@ -237,7 +237,7 @@ SMC_outw(u16 val, void __iomem *ioaddr, 
 #define SMC_insw(a, r, p, l)	insw((a) + (r) - 0xa0000000, p, l)
 #define SMC_outsw(a, r, p, l)	outsw((a) + (r) - 0xa0000000, p, l)
 
-#define set_irq_type(irq, type)	do {} while(0)
+#define SMC_SET_IRQ_TYPE(irq, type)	do {} while(0)
 
 #define RPC_LSA_DEFAULT		RPC_LED_TX_RX
 #define RPC_LSB_DEFAULT		RPC_LED_100_10
@@ -310,6 +310,10 @@ static inline void SMC_outsw (unsigned l
 
 #endif
 
+#ifndef SMC_SET_IRQ_TYPE
+#define SMC_SET_IRQ_TYPE set_irq_type
+#endif
+
 #ifndef	SMC_IRQ_TRIGGER_TYPE
 #define	SMC_IRQ_TRIGGER_TYPE	IRQT_RISING
 #endif
Index: linux/drivers/net/tulip/tulip_core.c
===================================================================
--- linux.orig/drivers/net/tulip/tulip_core.c
+++ linux/drivers/net/tulip/tulip_core.c
@@ -1811,6 +1811,7 @@ static void __devexit tulip_remove_one (
 	pci_iounmap(pdev, tp->base_addr);
 	free_netdev (dev);
 	pci_release_regions (pdev);
+	pci_disable_device (pdev);
 	pci_set_drvdata (pdev, NULL);
 
 	/* pci_power_off (pdev, -1); */
Index: linux/drivers/oprofile/buffer_sync.c
===================================================================
--- linux.orig/drivers/oprofile/buffer_sync.c
+++ linux/drivers/oprofile/buffer_sync.c
@@ -43,13 +43,16 @@ static void process_task_mortuary(void);
  * list for processing. Only after two full buffer syncs
  * does the task eventually get freed, because by then
  * we are sure we will not reference it again.
+ * Can be invoked from softirq via RCU callback due to
+ * call_rcu() of the task struct, hence the _irqsave.
  */
 static int task_free_notify(struct notifier_block * self, unsigned long val, void * data)
 {
+	unsigned long flags;
 	struct task_struct * task = data;
-	spin_lock(&task_mortuary);
+	spin_lock_irqsave(&task_mortuary, flags);
 	list_add(&task->tasks, &dying_tasks);
-	spin_unlock(&task_mortuary);
+	spin_unlock_irqrestore(&task_mortuary, flags);
 	return NOTIFY_OK;
 }
 
@@ -431,25 +434,22 @@ static void increment_tail(struct oprofi
  */
 static void process_task_mortuary(void)
 {
-	struct list_head * pos;
-	struct list_head * pos2;
+	unsigned long flags;
+	LIST_HEAD(local_dead_tasks);
 	struct task_struct * task;
+	struct task_struct * ttask;
 
-	spin_lock(&task_mortuary);
+	spin_lock_irqsave(&task_mortuary, flags);
 
-	list_for_each_safe(pos, pos2, &dead_tasks) {
-		task = list_entry(pos, struct task_struct, tasks);
-		list_del(&task->tasks);
-		free_task(task);
-	}
+	list_splice_init(&dead_tasks, &local_dead_tasks);
+	list_splice_init(&dying_tasks, &dead_tasks);
 
-	list_for_each_safe(pos, pos2, &dying_tasks) {
-		task = list_entry(pos, struct task_struct, tasks);
+	spin_unlock_irqrestore(&task_mortuary, flags);
+
+	list_for_each_entry_safe(task, ttask, &local_dead_tasks, tasks) {
 		list_del(&task->tasks);
-		list_add_tail(&task->tasks, &dead_tasks);
+		free_task(task);
 	}
-
-	spin_unlock(&task_mortuary);
 }
 
 
Index: linux/drivers/oprofile/oprofilefs.c
===================================================================
--- linux.orig/drivers/oprofile/oprofilefs.c
+++ linux/drivers/oprofile/oprofilefs.c
@@ -21,7 +21,7 @@
 
 #define OPROFILEFS_MAGIC 0x6f70726f
 
-DEFINE_SPINLOCK(oprofilefs_lock);
+DEFINE_RAW_SPINLOCK(oprofilefs_lock);
 
 static struct inode * oprofilefs_get_inode(struct super_block * sb, int mode)
 {
Index: linux/drivers/pci/hotplug/cpci_hotplug_core.c
===================================================================
--- linux.orig/drivers/pci/hotplug/cpci_hotplug_core.c
+++ linux/drivers/pci/hotplug/cpci_hotplug_core.c
@@ -60,8 +60,8 @@ static int slots;
 static atomic_t extracting;
 int cpci_debug;
 static struct cpci_hp_controller *controller;
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore thread_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore thread_exit;		/* guard ensure thread has exited before calling it quits */
 static int thread_finished = 1;
 
 static int enable_slot(struct hotplug_slot *slot);
Index: linux/drivers/pci/hotplug/cpqphp_ctrl.c
===================================================================
--- linux.orig/drivers/pci/hotplug/cpqphp_ctrl.c
+++ linux/drivers/pci/hotplug/cpqphp_ctrl.c
@@ -45,8 +45,8 @@ static int configure_new_function(struct
 			u8 behind_bridge, struct resource_lists *resources);
 static void interrupt_event_handler(struct controller *ctrl);
 
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
 static int event_finished;
 static unsigned long pushbutton_pending;	/* = 0 */
 
Index: linux/drivers/pci/hotplug/ibmphp_hpc.c
===================================================================
--- linux.orig/drivers/pci/hotplug/ibmphp_hpc.c
+++ linux/drivers/pci/hotplug/ibmphp_hpc.c
@@ -104,7 +104,7 @@ static int tid_poll;
 static struct semaphore sem_hpcaccess;	// lock access to HPC
 static struct semaphore semOperations;	// lock all operations and
 					// access to data structures
-static struct semaphore sem_exit;	// make sure polling thread goes away
+static struct compat_semaphore sem_exit;	// make sure polling thread goes away
 //----------------------------------------------------------------------------
 // local function prototypes
 //----------------------------------------------------------------------------
Index: linux/drivers/pci/hotplug/pciehp_ctrl.c
===================================================================
--- linux.orig/drivers/pci/hotplug/pciehp_ctrl.c
+++ linux/drivers/pci/hotplug/pciehp_ctrl.c
@@ -48,8 +48,8 @@ static int configure_new_function( struc
 	u8 behind_bridge, struct resource_lists *resources, u8 bridge_bus, u8 bridge_dev);
 static void interrupt_event_handler(struct controller *ctrl);
 
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
 static int event_finished;
 static unsigned long pushbutton_pending;	/* = 0 */
 static unsigned long surprise_rm_pending;	/* = 0 */
Index: linux/drivers/pci/hotplug/shpchp_ctrl.c
===================================================================
--- linux.orig/drivers/pci/hotplug/shpchp_ctrl.c
+++ linux/drivers/pci/hotplug/shpchp_ctrl.c
@@ -47,8 +47,8 @@ static int configure_new_function( struc
 	u8 behind_bridge, struct resource_lists *resources, u8 bridge_bus, u8 bridge_dev);
 static void interrupt_event_handler(struct controller *ctrl);
 
-static struct semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
-static struct semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
+static struct compat_semaphore event_semaphore;	/* mutex for process loop (up if something to process) */
+static struct compat_semaphore event_exit;		/* guard ensure thread has exited before calling it quits */
 static int event_finished;
 static unsigned long pushbutton_pending;	/* = 0 */
 
Index: linux/drivers/pcmcia/soc_common.c
===================================================================
--- linux.orig/drivers/pcmcia/soc_common.c
+++ linux/drivers/pcmcia/soc_common.c
@@ -39,6 +39,7 @@
 #include <linux/timer.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <linux/spinlock.h>
 #include <linux/cpufreq.h>
 
Index: linux/drivers/s390/char/vmlogrdr.c
===================================================================
--- linux.orig/drivers/s390/char/vmlogrdr.c
+++ linux/drivers/s390/char/vmlogrdr.c
@@ -145,7 +145,7 @@ static struct vmlogrdr_priv_t sys_ser[] 
 	  .recording_name = "EREP",
 	  .minor_num      = 0,
 	  .buffer_free    = 1,
-	  .priv_lock      = SPIN_LOCK_UNLOCKED,
+	  .priv_lock      = SPIN_LOCK_UNLOCKED(sys_ser[0].priv_lock),
 	  .autorecording  = 1,
 	  .autopurge      = 1,
 	},
@@ -154,7 +154,7 @@ static struct vmlogrdr_priv_t sys_ser[] 
 	  .recording_name = "ACCOUNT",
 	  .minor_num      = 1,
 	  .buffer_free    = 1,
-	  .priv_lock      = SPIN_LOCK_UNLOCKED,
+	  .priv_lock      = SPIN_LOCK_UNLOCKED(sys_ser[1].priv_lock),
 	  .autorecording  = 1,
 	  .autopurge      = 1,
 	},
@@ -163,7 +163,7 @@ static struct vmlogrdr_priv_t sys_ser[] 
 	  .recording_name = "SYMPTOM",
 	  .minor_num      = 2,
 	  .buffer_free    = 1,
-	  .priv_lock      = SPIN_LOCK_UNLOCKED,
+	  .priv_lock      = SPIN_LOCK_UNLOCKED(sys_ser[2].priv_lock),
 	  .autorecording  = 1,
 	  .autopurge      = 1,
 	}
Index: linux/drivers/s390/cio/cmf.c
===================================================================
--- linux.orig/drivers/s390/cio/cmf.c
+++ linux/drivers/s390/cio/cmf.c
@@ -297,7 +297,7 @@ struct cmb_area {
 };
 
 static struct cmb_area cmb_area = {
-	.lock = SPIN_LOCK_UNLOCKED,
+	.lock = SPIN_LOCK_UNLOCKED(cmb_area.lock),
 	.list = LIST_HEAD_INIT(cmb_area.list),
 	.num_channels  = 1024,
 };
Index: linux/drivers/sbus/char/cpwatchdog.c
===================================================================
--- linux.orig/drivers/sbus/char/cpwatchdog.c
+++ linux/drivers/sbus/char/cpwatchdog.c
@@ -155,7 +155,7 @@ struct wd_device {
 };
 
 static struct wd_device wd_dev = { 
-		0, SPIN_LOCK_UNLOCKED, 0, 0, 0, 0,
+		0, SPIN_LOCK_UNLOCKED(wd_dev.lock), 0, 0, 0, 0,
 };
 
 static struct timer_list wd_timer;
Index: linux/drivers/scsi/aacraid/aacraid.h
===================================================================
--- linux.orig/drivers/scsi/aacraid/aacraid.h
+++ linux/drivers/scsi/aacraid/aacraid.h
@@ -731,7 +731,7 @@ struct aac_fib_context {
 	u32			unique;		// unique value representing this context
 	ulong			jiffies;	// used for cleanup - dmb changed to ulong
 	struct list_head	next;		// used to link context's into a linked list
-	struct semaphore 	wait_sem;	// this is used to wait for the next fib to arrive.
+	struct compat_semaphore	wait_sem;	// this is used to wait for the next fib to arrive.
 	int			wait;		// Set to true when thread is in WaitForSingleObject
 	unsigned long		count;		// total number of FIBs on FibList
 	struct list_head	fib_list;	// this holds fibs and their attachd hw_fibs
@@ -800,7 +800,7 @@ struct fib {
 	 *	This is the event the sendfib routine will wait on if the
 	 *	caller did not pass one and this is synch io.
 	 */
-	struct semaphore 	event_wait;
+	struct compat_semaphore	event_wait;
 	spinlock_t		event_lock;
 
 	u32			done;	/* gets set to 1 when fib is complete */
Index: linux/drivers/scsi/aic7xxx/aic79xx_osm.h
===================================================================
--- linux.orig/drivers/scsi/aic7xxx/aic79xx_osm.h
+++ linux/drivers/scsi/aic7xxx/aic79xx_osm.h
@@ -391,7 +391,7 @@ struct ahd_platform_data {
 	spinlock_t		 spin_lock;
 	u_int			 qfrozen;
 	struct timer_list	 reset_timer;
-	struct semaphore	 eh_sem;
+	struct compat_semaphore	 eh_sem;
 	struct Scsi_Host        *host;		/* pointer to scsi host */
 #define AHD_LINUX_NOIRQ	((uint32_t)~0)
 	uint32_t		 irq;		/* IRQ for this adapter */
Index: linux/drivers/scsi/aic7xxx/aic7xxx_osm.h
===================================================================
--- linux.orig/drivers/scsi/aic7xxx/aic7xxx_osm.h
+++ linux/drivers/scsi/aic7xxx/aic7xxx_osm.h
@@ -395,7 +395,7 @@ struct ahc_platform_data {
 	spinlock_t		 spin_lock;
 	u_int			 qfrozen;
 	struct timer_list	 reset_timer;
-	struct semaphore	 eh_sem;
+	struct compat_semaphore	 eh_sem;
 	struct Scsi_Host        *host;		/* pointer to scsi host */
 #define AHC_LINUX_NOIRQ	((uint32_t)~0)
 	uint32_t		 irq;		/* IRQ for this adapter */
Index: linux/drivers/scsi/ncr53c8xx.c
===================================================================
--- linux.orig/drivers/scsi/ncr53c8xx.c
+++ linux/drivers/scsi/ncr53c8xx.c
@@ -3481,8 +3481,8 @@ static int ncr_queue_command (struct ncb
 	**----------------------------------------------------
 	*/
 	if (np->settle_time && cmd->timeout_per_command >= HZ) {
-		u_long tlimit = ktime_get(cmd->timeout_per_command - HZ);
-		if (ktime_dif(np->settle_time, tlimit) > 0)
+		u_long tlimit = jiffies + cmd->timeout_per_command - HZ;
+		if (time_after(np->settle_time, tlimit))
 			np->settle_time = tlimit;
 	}
 
@@ -3516,7 +3516,7 @@ static int ncr_queue_command (struct ncb
 		**	Force ordered tag if necessary to avoid timeouts 
 		**	and to preserve interactivity.
 		*/
-		if (lp && ktime_exp(lp->tags_stime)) {
+		if (lp && time_after(jiffies, lp->tags_stime)) {
 			if (lp->tags_smap) {
 				order = M_ORDERED_TAG;
 				if ((DEBUG_FLAGS & DEBUG_TAGS)||bootverbose>2){ 
@@ -3524,7 +3524,7 @@ static int ncr_queue_command (struct ncb
 						"ordered tag forced.\n");
 				}
 			}
-			lp->tags_stime = ktime_get(3*HZ);
+			lp->tags_stime = jiffies + 3*HZ;
 			lp->tags_smap = lp->tags_umap;
 		}
 
@@ -3669,7 +3669,7 @@ static int ncr_queue_command (struct ncb
 	/*
 	**	select
 	*/
-	cp->phys.select.sel_id		= sdev->id;
+	cp->phys.select.sel_id		= sdev_id(sdev);
 	cp->phys.select.sel_scntl3	= tp->wval;
 	cp->phys.select.sel_sxfer	= tp->sval;
 	/*
@@ -3792,7 +3792,7 @@ static int ncr_reset_scsi_bus(struct ncb
 	u32 term;
 	int retv = 0;
 
-	np->settle_time	= ktime_get(settle_delay * HZ);
+	np->settle_time	= jiffies + settle_delay * HZ;
 
 	if (bootverbose > 1)
 		printk("%s: resetting, "
@@ -4820,7 +4820,7 @@ static void ncr_set_sync_wide_status (st
 	*/
 	for (cp = np->ccb; cp; cp = cp->link_ccb) {
 		if (!cp->cmd) continue;
-		if (cp->cmd->device->id != target) continue;
+		if (scmd_id(cp->cmd) != target) continue;
 #if 0
 		cp->sync_status = tp->sval;
 		cp->wide_status = tp->wval;
@@ -4844,7 +4844,7 @@ static void ncr_setsync (struct ncb *np,
 	u_char target = INB (nc_sdid) & 0x0f;
 	u_char idiv;
 
-	BUG_ON(target != (cmd->device->id & 0xf));
+	BUG_ON(target != (scmd_id(cmd) & 0xf));
 
 	tp = &np->target[target];
 
@@ -4902,7 +4902,7 @@ static void ncr_setwide (struct ncb *np,
 	u_char	scntl3;
 	u_char	sxfer;
 
-	BUG_ON(target != (cmd->device->id & 0xf));
+	BUG_ON(target != (scmd_id(cmd) & 0xf));
 
 	tp = &np->target[target];
 	tp->widedone  =  wide+1;
@@ -5044,7 +5044,7 @@ static void ncr_setup_tags (struct ncb *
 
 static void ncr_timeout (struct ncb *np)
 {
-	u_long	thistime = ktime_get(0);
+	u_long	thistime = jiffies;
 
 	/*
 	**	If release process in progress, let's go
@@ -5057,7 +5057,7 @@ static void ncr_timeout (struct ncb *np)
 		return;
 	}
 
-	np->timer.expires = ktime_get(SCSI_NCR_TIMER_INTERVAL);
+	np->timer.expires = jiffies + SCSI_NCR_TIMER_INTERVAL;
 	add_timer(&np->timer);
 
 	/*
@@ -5336,8 +5336,8 @@ void ncr_exception (struct ncb *np)
 	**=========================================================
 	*/
 
-	if (ktime_exp(np->regtime)) {
-		np->regtime = ktime_get(10*HZ);
+	if (time_after(jiffies, np->regtime)) {
+		np->regtime = jiffies + 10*HZ;
 		for (i = 0; i<sizeof(np->regdump); i++)
 			((char*)&np->regdump)[i] = INB_OFF(i);
 		np->regdump.nc_dstat = dstat;
@@ -5453,7 +5453,7 @@ static int ncr_int_sbmc (struct ncb *np)
 		**	Suspend command processing for 1 second and 
 		**	reinitialize all except the chip.
 		*/
-		np->settle_time	= ktime_get(1*HZ);
+		np->settle_time	= jiffies + HZ;
 		ncr_init (np, 0, bootverbose ? "scsi mode change" : NULL, HS_RESET);
 		return 1;
 	}
@@ -6923,7 +6923,7 @@ static struct lcb *ncr_setup_lcb (struct
 		for (i = 0 ; i < MAX_TAGS ; i++)
 			lp->cb_tags[i] = i;
 		lp->maxnxs = MAX_TAGS;
-		lp->tags_stime = ktime_get(3*HZ);
+		lp->tags_stime = jiffies + 3*HZ;
 		ncr_setup_tags (np, sdev);
 	}
 
Index: linux/drivers/scsi/qla2xxx/qla_def.h
===================================================================
--- linux.orig/drivers/scsi/qla2xxx/qla_def.h
+++ linux/drivers/scsi/qla2xxx/qla_def.h
@@ -2416,7 +2416,7 @@ typedef struct scsi_qla_host {
 	spinlock_t	mbx_reg_lock;   /* Mbx Cmd Register Lock */
 
 	struct semaphore mbx_cmd_sem;	/* Serialialize mbx access */
-	struct semaphore mbx_intr_sem;  /* Used for completion notification */
+	struct compat_semaphore mbx_intr_sem;  /* Used for completion notification */
 
 	uint32_t	mbx_flags;
 #define  MBX_IN_PROGRESS	BIT_0
Index: linux/drivers/scsi/qla2xxx/qla_os.c
===================================================================
--- linux.orig/drivers/scsi/qla2xxx/qla_os.c
+++ linux/drivers/scsi/qla2xxx/qla_os.c
@@ -2123,12 +2123,13 @@ qla2x00_free_sp_pool( scsi_qla_host_t *h
 static int
 qla2x00_do_dpc(void *data)
 {
-	DECLARE_MUTEX_LOCKED(sem);
+	DECLARE_MUTEX(sem);
 	scsi_qla_host_t *ha;
 	fc_port_t	*fcport;
 	uint8_t		status;
 	uint16_t	next_loopid;
 
+	down(&sem);
 	ha = (scsi_qla_host_t *)data;
 
 	lock_kernel();
Index: linux/drivers/scsi/scsi.c
===================================================================
--- linux.orig/drivers/scsi/scsi.c
+++ linux/drivers/scsi/scsi.c
@@ -772,10 +772,10 @@ void __scsi_done(struct scsi_cmnd *cmd)
 	 * It is a per-CPU queue, so we just disable local interrupts
 	 * and need no spinlock.
 	 */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	list_add_tail(&cmd->eh_entry, &__get_cpu_var(scsi_done_q));
 	raise_softirq_irqoff(SCSI_SOFTIRQ);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /**
@@ -792,9 +792,9 @@ static void scsi_softirq(struct softirq_
 	int disposition;
 	LIST_HEAD(local_q);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	list_splice_init(&__get_cpu_var(scsi_done_q), &local_q);
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	while (!list_empty(&local_q)) {
 		struct scsi_cmnd *cmd = list_entry(local_q.next,
@@ -1286,11 +1286,11 @@ static int scsi_cpu_notify(struct notifi
 	switch(action) {
 	case CPU_DEAD:
 		/* Drain scsi_done_q. */
-		local_irq_disable();
+		raw_local_irq_disable();
 		list_splice_init(&per_cpu(scsi_done_q, cpu),
 				 &__get_cpu_var(scsi_done_q));
 		raise_softirq_irqoff(SCSI_SOFTIRQ);
-		local_irq_enable();
+		raw_local_irq_enable();
 		break;
 	default:
 		break;
Index: linux/drivers/scsi/scsi_error.c
===================================================================
--- linux.orig/drivers/scsi/scsi_error.c
+++ linux/drivers/scsi/scsi_error.c
@@ -1647,6 +1647,12 @@ int scsi_error_handler(void *data)
 
 	__set_current_state(TASK_RUNNING);
 
+	/*
+	 * There's a good chance that the loop will exit in the
+	 * TASK_INTERRUPTIBLE state.
+	 */
+	__set_current_state(TASK_RUNNING);
+
 	SCSI_LOG_ERROR_RECOVERY(1, printk("Error handler scsi_eh_%d"
 					  " exiting\n",shost->host_no));
 
Index: linux/drivers/scsi/sym53c8xx_defs.h
===================================================================
--- linux.orig/drivers/scsi/sym53c8xx_defs.h
+++ linux/drivers/scsi/sym53c8xx_defs.h
@@ -281,19 +281,6 @@
 #endif
 
 /*
-**	These simple macros limit expression involving 
-**	kernel time values (jiffies) to some that have 
-**	chance not to be too much incorrect. :-)
-*/
-#define ktime_get(o)		(jiffies + (u_long) o)
-#define ktime_exp(b)		((long)(jiffies) - (long)(b) >= 0)
-#define ktime_dif(a, b)		((long)(a) - (long)(b))
-/* These ones are not used in this driver */
-#define ktime_add(a, o)		((a) + (u_long)(o))
-#define ktime_sub(a, o)		((a) - (u_long)(o))
-
-
-/*
  *  IO functions definition for big/little endian CPU support.
  *  For now, the NCR is only supported in little endian addressing mode, 
  */
Index: linux/drivers/serial/cpm_uart/cpm_uart_core.c
===================================================================
--- linux.orig/drivers/serial/cpm_uart/cpm_uart_core.c
+++ linux/drivers/serial/cpm_uart/cpm_uart_core.c
@@ -909,7 +909,7 @@ struct uart_cpm_port cpm_uart_ports[UART
 			.irq		= SMC1_IRQ,
 			.ops		= &cpm_uart_pops,
 			.iotype		= SERIAL_IO_MEM,
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SMC1].port.lock),
 		},
 		.flags = FLAG_SMC,
 		.tx_nrfifos = TX_NUM_FIFO,
@@ -923,7 +923,7 @@ struct uart_cpm_port cpm_uart_ports[UART
 			.irq		= SMC2_IRQ,
 			.ops		= &cpm_uart_pops,
 			.iotype		= SERIAL_IO_MEM,
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SMC2].port.lock),
 		},
 		.flags = FLAG_SMC,
 		.tx_nrfifos = TX_NUM_FIFO,
@@ -940,7 +940,7 @@ struct uart_cpm_port cpm_uart_ports[UART
 			.irq		= SCC1_IRQ,
 			.ops		= &cpm_uart_pops,
 			.iotype		= SERIAL_IO_MEM,
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC1].port.lock),
 		},
 		.tx_nrfifos = TX_NUM_FIFO,
 		.tx_fifosize = TX_BUF_SIZE,
@@ -954,7 +954,7 @@ struct uart_cpm_port cpm_uart_ports[UART
 			.irq		= SCC2_IRQ,
 			.ops		= &cpm_uart_pops,
 			.iotype		= SERIAL_IO_MEM,
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC2].port.lock),
 		},
 		.tx_nrfifos = TX_NUM_FIFO,
 		.tx_fifosize = TX_BUF_SIZE,
@@ -968,7 +968,7 @@ struct uart_cpm_port cpm_uart_ports[UART
 			.irq		= SCC3_IRQ,
 			.ops		= &cpm_uart_pops,
 			.iotype		= SERIAL_IO_MEM,
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC3].port.lock),
 		},
 		.tx_nrfifos = TX_NUM_FIFO,
 		.tx_fifosize = TX_BUF_SIZE,
@@ -982,7 +982,7 @@ struct uart_cpm_port cpm_uart_ports[UART
 			.irq		= SCC4_IRQ,
 			.ops		= &cpm_uart_pops,
 			.iotype		= SERIAL_IO_MEM,
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(cpm_uart_ports[UART_SCC4].port.lock),
 		},
 		.tx_nrfifos = TX_NUM_FIFO,
 		.tx_fifosize = TX_BUF_SIZE,
Index: linux/drivers/serial/s3c2410.c
===================================================================
--- linux.orig/drivers/serial/s3c2410.c
+++ linux/drivers/serial/s3c2410.c
@@ -966,7 +966,7 @@ static struct uart_driver s3c24xx_uart_d
 static struct s3c24xx_uart_port s3c24xx_serial_ports[NR_PORTS] = {
 	[0] = {
 		.port = {
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(s3c24xx_serial_ports[0].port.lock),
 			.iotype		= UPIO_MEM,
 			.irq		= IRQ_S3CUART_RX0,
 			.uartclk	= 0,
@@ -978,7 +978,7 @@ static struct s3c24xx_uart_port s3c24xx_
 	},
 	[1] = {
 		.port = {
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(s3c24xx_serial_ports[1].port.lock),
 			.iotype		= UPIO_MEM,
 			.irq		= IRQ_S3CUART_RX1,
 			.uartclk	= 0,
@@ -992,7 +992,7 @@ static struct s3c24xx_uart_port s3c24xx_
 
 	[2] = {
 		.port = {
-			.lock		= SPIN_LOCK_UNLOCKED,
+			.lock		= SPIN_LOCK_UNLOCKED(s3c24xx_serial_ports[2].port.lock),
 			.iotype		= UPIO_MEM,
 			.irq		= IRQ_S3CUART_RX2,
 			.uartclk	= 0,
Index: linux/drivers/usb/core/devio.c
===================================================================
--- linux.orig/drivers/usb/core/devio.c
+++ linux/drivers/usb/core/devio.c
@@ -284,10 +284,11 @@ static void async_completed(struct urb *
         struct async *as = (struct async *)urb->context;
         struct dev_state *ps = as->ps;
 	struct siginfo sinfo;
+	unsigned long flags;
 
-        spin_lock(&ps->lock);
-        list_move_tail(&as->asynclist, &ps->async_completed);
-        spin_unlock(&ps->lock);
+	spin_lock_irqsave(&ps->lock, flags);
+	list_move_tail(&as->asynclist, &ps->async_completed);
+	spin_unlock_irqrestore(&ps->lock, flags);
 	if (as->signr) {
 		sinfo.si_signo = as->signr;
 		sinfo.si_errno = as->urb->status;
Index: linux/drivers/usb/core/hcd.c
===================================================================
--- linux.orig/drivers/usb/core/hcd.c
+++ linux/drivers/usb/core/hcd.c
@@ -506,13 +506,11 @@ error:
 	}
 
 	/* any errors get returned through the urb completion */
-	local_irq_save (flags);
-	spin_lock (&urb->lock);
+	spin_lock_irqsave(&urb->lock, flags);
 	if (urb->status == -EINPROGRESS)
 		urb->status = status;
-	spin_unlock (&urb->lock);
+	spin_unlock_irqrestore(&urb->lock, flags);
 	usb_hcd_giveback_urb (hcd, urb, NULL);
-	local_irq_restore (flags);
 	return 0;
 }
 
@@ -540,8 +538,7 @@ void usb_hcd_poll_rh_status(struct usb_h
 	if (length > 0) {
 
 		/* try to complete the status urb */
-		local_irq_save (flags);
-		spin_lock(&hcd_root_hub_lock);
+		spin_lock_irqsave(&hcd_root_hub_lock, flags);
 		urb = hcd->status_urb;
 		if (urb) {
 			spin_lock(&urb->lock);
@@ -557,14 +554,13 @@ void usb_hcd_poll_rh_status(struct usb_h
 			spin_unlock(&urb->lock);
 		} else
 			length = 0;
-		spin_unlock(&hcd_root_hub_lock);
+		spin_unlock_irqrestore(&hcd_root_hub_lock, flags);
 
 		/* local irqs are always blocked in completions */
 		if (length > 0)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
 		else
 			hcd->poll_pending = 1;
-		local_irq_restore (flags);
 	}
 
 	/* The USB 2.0 spec says 256 ms.  This is close enough and won't
@@ -647,17 +643,15 @@ static int usb_rh_urb_dequeue (struct us
 	} else {				/* Status URB */
 		if (!hcd->uses_new_polling)
 			del_timer_sync (&hcd->rh_timer);
-		local_irq_disable ();
-		spin_lock (&hcd_root_hub_lock);
+		spin_lock_irq(&hcd_root_hub_lock);
 		if (urb == hcd->status_urb) {
 			hcd->status_urb = NULL;
 			urb->hcpriv = NULL;
 		} else
 			urb = NULL;		/* wasn't fully queued */
-		spin_unlock (&hcd_root_hub_lock);
+		spin_unlock_irq(&hcd_root_hub_lock);
 		if (urb)
 			usb_hcd_giveback_urb (hcd, urb, NULL);
-		local_irq_enable ();
 	}
 
 	return 0;
@@ -1367,15 +1361,13 @@ hcd_endpoint_disable (struct usb_device 
 	WARN_ON (!HC_IS_RUNNING (hcd->state) && hcd->state != HC_STATE_HALT &&
 			udev->state != USB_STATE_NOTATTACHED);
 
-	local_irq_disable ();
-
 	/* FIXME move most of this into message.c as part of its
 	 * endpoint disable logic
 	 */
 
 	/* ep is already gone from udev->ep_{in,out}[]; no more submits */
 rescan:
-	spin_lock (&hcd_data_lock);
+	spin_lock_irq(&hcd_data_lock);
 	list_for_each_entry (urb, &ep->urb_list, urb_list) {
 		int	tmp;
 
@@ -1388,13 +1380,13 @@ rescan:
 		if (urb->status != -EINPROGRESS)
 			continue;
 		usb_get_urb (urb);
-		spin_unlock (&hcd_data_lock);
+		spin_unlock_irq(&hcd_data_lock);
 
-		spin_lock (&urb->lock);
+		spin_lock_irq(&urb->lock);
 		tmp = urb->status;
 		if (tmp == -EINPROGRESS)
 			urb->status = -ESHUTDOWN;
-		spin_unlock (&urb->lock);
+		spin_unlock_irq(&urb->lock);
 
 		/* kick hcd unless it's already returning this */
 		if (tmp == -EINPROGRESS) {
@@ -1417,8 +1409,7 @@ rescan:
 		/* list contents may have changed */
 		goto rescan;
 	}
-	spin_unlock (&hcd_data_lock);
-	local_irq_enable ();
+	spin_unlock_irq(&hcd_data_lock);
 
 	/* synchronize with the hardware, so old configuration state
 	 * clears out immediately (and will be freed).
Index: linux/drivers/usb/core/message.c
===================================================================
--- linux.orig/drivers/usb/core/message.c
+++ linux/drivers/usb/core/message.c
@@ -224,8 +224,9 @@ static void sg_clean (struct usb_sg_requ
 static void sg_complete (struct urb *urb, struct pt_regs *regs)
 {
 	struct usb_sg_request	*io = (struct usb_sg_request *) urb->context;
+	unsigned long flags;
 
-	spin_lock (&io->lock);
+	spin_lock_irqsave (&io->lock, flags);
 
 	/* In 2.5 we require hcds' endpoint queues not to progress after fault
 	 * reports, until the completion callback (this!) returns.  That lets
@@ -259,7 +260,7 @@ static void sg_complete (struct urb *urb
 		 * unlink pending urbs so they won't rx/tx bad data.
 		 * careful: unlink can sometimes be synchronous...
 		 */
-		spin_unlock (&io->lock);
+		spin_unlock_irqrestore (&io->lock, flags);
 		for (i = 0, found = 0; i < io->entries; i++) {
 			if (!io->urbs [i] || !io->urbs [i]->dev)
 				continue;
@@ -274,7 +275,7 @@ static void sg_complete (struct urb *urb
 			} else if (urb == io->urbs [i])
 				found = 1;
 		}
-		spin_lock (&io->lock);
+		spin_lock_irqsave (&io->lock, flags);
 	}
 	urb->dev = NULL;
 
@@ -284,7 +285,7 @@ static void sg_complete (struct urb *urb
 	if (!io->count)
 		complete (&io->complete);
 
-	spin_unlock (&io->lock);
+	spin_unlock_irqrestore (&io->lock, flags);
 }
 
 
Index: linux/drivers/usb/net/usbnet.c
===================================================================
--- linux.orig/drivers/usb/net/usbnet.c
+++ linux/drivers/usb/net/usbnet.c
@@ -822,6 +822,8 @@ static void tx_complete (struct urb *urb
 
 	urb->dev = NULL;
 	entry->state = tx_done;
+	spin_lock_rt(&dev->txq.lock);
+	spin_unlock_rt(&dev->txq.lock);
 	defer_bh(dev, skb, &dev->txq);
 }
 
Index: linux/drivers/usb/storage/usb.c
===================================================================
--- linux.orig/drivers/usb/storage/usb.c
+++ linux/drivers/usb/storage/usb.c
@@ -319,6 +319,7 @@ static int usb_stor_control_thread(void 
 		if (test_bit(US_FLIDX_DISCONNECTING, &us->flags)) {
 			US_DEBUGP("-- exiting\n");
 			up(&(us->dev_semaphore));
+			up(&us->sema);
 			break;
 		}
 
Index: linux/drivers/usb/storage/usb.h
===================================================================
--- linux.orig/drivers/usb/storage/usb.h
+++ linux/drivers/usb/storage/usb.h
@@ -172,7 +172,7 @@ struct us_data {
 	dma_addr_t		iobuf_dma;
 
 	/* mutual exclusion and synchronization structures */
-	struct semaphore	sema;		 /* to sleep thread on	    */
+	struct compat_semaphore	sema;		 /* to sleep thread on	    */
 	struct completion	notify;		 /* thread begin/end	    */
 	wait_queue_head_t	delay_wait;	 /* wait during scan, reset */
 
Index: linux/drivers/video/backlight/corgi_bl.c
===================================================================
--- linux.orig/drivers/video/backlight/corgi_bl.c
+++ linux/drivers/video/backlight/corgi_bl.c
@@ -28,7 +28,7 @@ static int corgibl_powermode = FB_BLANK_
 static int current_intensity = 0;
 static int corgibl_limit = 0;
 static void (*corgibl_mach_set_intensity)(int intensity);
-static spinlock_t bl_lock = SPIN_LOCK_UNLOCKED;
+static DEFINE_SPINLOCK(bl_lock);
 static struct backlight_properties corgibl_data;
 
 static void corgibl_send_intensity(int intensity)
Index: linux/drivers/video/console/fbcon.c
===================================================================
--- linux.orig/drivers/video/console/fbcon.c
+++ linux/drivers/video/console/fbcon.c
@@ -1067,7 +1067,6 @@ static void fbcon_clear(struct vc_data *
 {
 	struct fb_info *info = registered_fb[con2fb_map[vc->vc_num]];
 	struct fbcon_ops *ops = info->fbcon_par;
-
 	struct display *p = &fb_display[vc->vc_num];
 	u_int y_break;
 
@@ -1096,10 +1095,11 @@ static void fbcon_putcs(struct vc_data *
 	struct display *p = &fb_display[vc->vc_num];
 	struct fbcon_ops *ops = info->fbcon_par;
 
-	if (!fbcon_is_inactive(vc, info))
+	if (!fbcon_is_inactive(vc, info)) {
 		ops->putcs(vc, info, s, count, real_y(p, ypos), xpos,
 			   get_color(vc, info, scr_readw(s), 1),
 			   get_color(vc, info, scr_readw(s), 0));
+	}
 }
 
 static void fbcon_putc(struct vc_data *vc, int c, int ypos, int xpos)
@@ -2846,6 +2846,7 @@ static const struct consw fb_con = {
 	.con_screen_pos 	= fbcon_screen_pos,
 	.con_getxy 		= fbcon_getxy,
 	.con_resize             = fbcon_resize,
+	.con_preemptible 	= 1,
 };
 
 static struct notifier_block fbcon_event_notifier = {
Index: linux/drivers/video/console/vgacon.c
===================================================================
--- linux.orig/drivers/video/console/vgacon.c
+++ linux/drivers/video/console/vgacon.c
@@ -53,7 +53,7 @@
 #include <video/vga.h>
 #include <asm/io.h>
 
-static DEFINE_SPINLOCK(vga_lock);
+static DEFINE_RAW_SPINLOCK(vga_lock);
 static int cursor_size_lastfrom;
 static int cursor_size_lastto;
 static struct vgastate state;
Index: linux/drivers/video/fbmon.c
===================================================================
--- linux.orig/drivers/video/fbmon.c
+++ linux/drivers/video/fbmon.c
@@ -317,8 +317,12 @@ static int edid_is_monitor_block(unsigne
 static void calc_mode_timings(int xres, int yres, int refresh,
 			      struct fb_videomode *mode)
 {
-	struct fb_var_screeninfo var;
-	struct fb_info info;
+	/* FIXME: ugly hack: statics (under fb_lock) reduce stack footprint */
+	static struct fb_var_screeninfo var;
+	static struct fb_info info;
+	static DECLARE_MUTEX(fb_lock);
+
+	down(&fb_lock);
 	
 	memset(&var, 0, sizeof(struct fb_var_screeninfo));
 	var.xres = xres;
@@ -337,6 +341,7 @@ static void calc_mode_timings(int xres, 
 	mode->vsync_len = var.vsync_len;
 	mode->vmode = 0;
 	mode->sync = 0;
+	up(&fb_lock);
 }
 
 static int get_est_timing(unsigned char *block, struct fb_videomode *mode)
Index: linux/fs/aio.c
===================================================================
--- linux.orig/fs/aio.c
+++ linux/fs/aio.c
@@ -566,13 +566,15 @@ static void use_mm(struct mm_struct *mm)
 	tsk->flags |= PF_BORROWED_MM;
 	active_mm = tsk->active_mm;
 	atomic_inc(&mm->mm_count);
-	tsk->mm = mm;
-	tsk->active_mm = mm;
+	local_irq_disable(); // FIXME
 	/*
 	 * Note that on UML this *requires* PF_BORROWED_MM to be set, otherwise
 	 * it won't work. Update it accordingly if you change it here
 	 */
 	activate_mm(active_mm, mm);
+	tsk->mm = mm;
+	tsk->active_mm = mm;
+	local_irq_enable();
 	task_unlock(tsk);
 
 	mmdrop(active_mm);
Index: linux/fs/block_dev.c
===================================================================
--- linux.orig/fs/block_dev.c
+++ linux/fs/block_dev.c
@@ -667,14 +667,32 @@ int blkdev_get(struct block_device *bdev
 	 * For now, block device ->open() routine must _not_
 	 * examine anything in 'inode' argument except ->i_rdev.
 	 */
-	struct file fake_file = {};
-	struct dentry fake_dentry = {};
-	fake_file.f_mode = mode;
-	fake_file.f_flags = flags;
-	fake_file.f_dentry = &fake_dentry;
-	fake_dentry.d_inode = bdev->bd_inode;
-
-	return do_open(bdev, &fake_file);
+	struct file *fake_file;
+	struct dentry *fake_dentry;
+	int err = -ENOMEM;
+
+	fake_file = kmalloc(sizeof(*fake_file), GFP_KERNEL);
+	if (!fake_file)
+		goto out;
+	memset(fake_file, 0, sizeof(*fake_file));
+
+	fake_dentry = kmalloc(sizeof(*fake_dentry), GFP_KERNEL);
+	if (!fake_dentry)
+		goto out_free_file;
+	memset(fake_dentry, 0, sizeof(*fake_dentry));
+
+	fake_file->f_mode = mode;
+	fake_file->f_flags = flags;
+	fake_file->f_dentry = fake_dentry;
+	fake_dentry->d_inode = bdev->bd_inode;
+
+	err = do_open(bdev, fake_file);
+
+	kfree(fake_dentry);
+out_free_file:
+	kfree(fake_file);
+out:
+	return err;
 }
 
 EXPORT_SYMBOL(blkdev_get);
Index: linux/fs/buffer.c
===================================================================
--- linux.orig/fs/buffer.c
+++ linux/fs/buffer.c
@@ -538,8 +538,7 @@ static void end_buffer_async_read(struct
 	 * decide that the page is now completely done.
 	 */
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -552,8 +551,7 @@ static void end_buffer_async_read(struct
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 
 	/*
 	 * If none of the buffers had errors and they are all
@@ -565,8 +563,7 @@ static void end_buffer_async_read(struct
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
@@ -600,8 +597,7 @@ void end_buffer_async_write(struct buffe
 	}
 
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 
 	clear_buffer_async_write(bh);
 	unlock_buffer(bh);
@@ -613,14 +609,12 @@ void end_buffer_async_write(struct buffe
 		}
 		tmp = tmp->b_this_page;
 	}
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	end_page_writeback(page);
 	return;
 
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
@@ -1336,9 +1330,9 @@ struct bh_lru {
 
 static DEFINE_PER_CPU(struct bh_lru, bh_lrus) = {{ NULL }};
 
-#ifdef CONFIG_SMP
-#define bh_lru_lock()	local_irq_disable()
-#define bh_lru_unlock()	local_irq_enable()
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)
+#define bh_lru_lock()	raw_local_irq_disable()
+#define bh_lru_unlock()	raw_local_irq_enable()
 #else
 #define bh_lru_lock()	preempt_disable()
 #define bh_lru_unlock()	preempt_enable()
@@ -3060,6 +3054,8 @@ EXPORT_SYMBOL(alloc_buffer_head);
 void free_buffer_head(struct buffer_head *bh)
 {
 	BUG_ON(!list_empty(&bh->b_assoc_buffers));
+	BUG_ON(spin_is_locked(&bh->b_uptodate_lock));
+	BUG_ON(spin_is_locked(&bh->b_state_lock));
 	kmem_cache_free(bh_cachep, bh);
 	get_cpu_var(bh_accounting).nr--;
 	recalc_bh_state();
@@ -3076,6 +3072,8 @@ init_buffer_head(void *data, kmem_cache_
 
 		memset(bh, 0, sizeof(*bh));
 		INIT_LIST_HEAD(&bh->b_assoc_buffers);
+		spin_lock_init(&bh->b_uptodate_lock);
+		spin_lock_init(&bh->b_state_lock);
 	}
 }
 
Index: linux/fs/compat.c
===================================================================
--- linux.orig/fs/compat.c
+++ linux/fs/compat.c
@@ -268,7 +268,7 @@ out:
 
 #define IOCTL_HASHSIZE 256
 static struct ioctl_trans *ioctl32_hash_table[IOCTL_HASHSIZE];
-static DECLARE_RWSEM(ioctl32_sem);
+static COMPAT_DECLARE_RWSEM(ioctl32_sem);
 
 extern struct ioctl_trans ioctl_start[];
 extern int ioctl_table_size;
Index: linux/fs/dcache.c
===================================================================
--- linux.orig/fs/dcache.c
+++ linux/fs/dcache.c
@@ -33,6 +33,7 @@
 #include <linux/seqlock.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include <linux/futex.h>
 
 /* #define DCACHE_DEBUG 1 */
 
@@ -40,7 +41,7 @@ int sysctl_vfs_cache_pressure = 100;
 EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure);
 
  __cacheline_aligned_in_smp DEFINE_SPINLOCK(dcache_lock);
-static seqlock_t rename_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+static DECLARE_SEQLOCK(rename_lock);
 
 EXPORT_SYMBOL(dcache_lock);
 
@@ -161,6 +162,8 @@ repeat:
 		return;
 	}
 
+	futex_free_robust_list(dentry->d_inode);
+
 	/*
 	 * AV: ->d_delete() is _NOT_ allowed to block now.
 	 */
Index: linux/fs/devfs/base.c
===================================================================
--- linux.orig/fs/devfs/base.c
+++ linux/fs/devfs/base.c
@@ -826,7 +826,7 @@ struct fs_info {		/*  This structure is 
 	wait_queue_head_t revalidate_wait_queue;	/*  Wake when devfsd sleeps    */
 };
 
-static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED };
+static struct fs_info fs_info = {.devfsd_buffer_lock = SPIN_LOCK_UNLOCKED(fs_info.devfsd_buffer_lock) };
 static kmem_cache_t *devfsd_buf_cache;
 #ifdef CONFIG_DEVFS_DEBUG
 static unsigned int devfs_debug_init __initdata = DEBUG_NONE;
Index: linux/fs/dnotify.c
===================================================================
--- linux.orig/fs/dnotify.c
+++ linux/fs/dnotify.c
@@ -162,7 +162,7 @@ void dnotify_parent(struct dentry *dentr
 
 	spin_lock(&dentry->d_lock);
 	parent = dentry->d_parent;
-	if (parent->d_inode->i_dnotify_mask & event) {
+	if (unlikely(parent->d_inode->i_dnotify_mask & event)) {
 		dget(parent);
 		spin_unlock(&dentry->d_lock);
 		__inode_dir_notify(parent->d_inode, event);
Index: linux/fs/exec.c
===================================================================
--- linux.orig/fs/exec.c
+++ linux/fs/exec.c
@@ -48,6 +48,7 @@
 #include <linux/syscalls.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/delay.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -566,11 +567,16 @@ static int exec_mmap(struct mm_struct *m
 		}
 	}
 	task_lock(tsk);
+
+	raw_local_irq_disable();
 	active_mm = tsk->active_mm;
+	activate_mm(active_mm, mm);
 	tsk->mm = mm;
 	tsk->active_mm = mm;
-	activate_mm(active_mm, mm);
+	raw_local_irq_enable();
+
 	task_unlock(tsk);
+
 	arch_pick_mmap_layout(mm);
 	if (old_mm) {
 		up_read(&old_mm->mmap_sem);
@@ -593,6 +599,7 @@ static inline int de_thread(struct task_
 	struct signal_struct *sig = tsk->signal;
 	struct sighand_struct *newsighand, *oldsighand = tsk->sighand;
 	spinlock_t *lock = &oldsighand->siglock;
+	struct task_struct *leader = NULL;
 	int count;
 
 	/*
@@ -645,9 +652,12 @@ static inline int de_thread(struct task_
 		 * synchronize with any firing (by calling del_timer_sync)
 		 * before we can safely let the old group leader die.
 		 */
-		sig->real_timer.data = (unsigned long)current;
-		if (del_timer_sync(&sig->real_timer))
-			add_timer(&sig->real_timer);
+		sig->real_timer.data = current;
+		spin_unlock_irq(lock);
+		if (ktimer_cancel(&sig->real_timer))
+			ktimer_start(&sig->real_timer, NULL,
+				     KTIMER_RESTART|KTIMER_NOCHECK);
+		spin_lock_irq(lock);
 	}
 	while (atomic_read(&sig->count) > count) {
 		sig->group_exit_task = current;
@@ -659,7 +669,6 @@ static inline int de_thread(struct task_
 	}
 	sig->group_exit_task = NULL;
 	sig->notify_count = 0;
-	sig->real_timer.data = (unsigned long)current;
 	spin_unlock_irq(lock);
 
 	/*
@@ -668,17 +677,18 @@ static inline int de_thread(struct task_
 	 * and to assume its PID:
 	 */
 	if (!thread_group_leader(current)) {
-		struct task_struct *leader = current->group_leader, *parent;
+		struct task_struct *parent;
 		struct dentry *proc_dentry1, *proc_dentry2;
-		unsigned long exit_state, ptrace;
+		unsigned long ptrace;
 
 		/*
 		 * Wait for the thread group leader to be a zombie.
 		 * It should already be zombie at this point, most
 		 * of the time.
 		 */
+		leader = current->group_leader;
 		while (leader->exit_state != EXIT_ZOMBIE)
-			yield();
+			msleep(1);
 
 		spin_lock(&leader->proc_lock);
 		spin_lock(&current->proc_lock);
@@ -727,16 +737,15 @@ static inline int de_thread(struct task_
 		list_del(&current->tasks);
 		list_add_tail(&current->tasks, &init_task.tasks);
 		current->exit_signal = SIGCHLD;
-		exit_state = leader->exit_state;
+
+		BUG_ON(leader->exit_state != EXIT_ZOMBIE);
+		leader->exit_state = EXIT_DEAD;
 
 		write_unlock_irq(&tasklist_lock);
 		spin_unlock(&leader->proc_lock);
 		spin_unlock(&current->proc_lock);
 		proc_pid_flush(proc_dentry1);
 		proc_pid_flush(proc_dentry2);
-
-		BUG_ON(exit_state != EXIT_ZOMBIE);
-		release_task(leader);
         }
 
 	/*
@@ -746,8 +755,11 @@ static inline int de_thread(struct task_
 	sig->flags = 0;
 
 no_thread_group:
-	BUG_ON(atomic_read(&sig->count) != 1);
 	exit_itimers(sig);
+	if (leader)
+		release_task(leader);
+
+	BUG_ON(atomic_read(&sig->count) != 1);
 
 	if (atomic_read(&oldsighand->count) == 1) {
 		/*
@@ -777,7 +789,7 @@ no_thread_group:
 		write_unlock_irq(&tasklist_lock);
 
 		if (atomic_dec_and_test(&oldsighand->count))
-			kmem_cache_free(sighand_cachep, oldsighand);
+			sighand_free(oldsighand);
 	}
 
 	BUG_ON(!thread_group_leader(current));
@@ -1426,9 +1438,6 @@ static void coredump_wait(struct mm_stru
 	mm->core_waiters++; /* let other threads block */
 	mm->core_startup_done = &startup_done;
 
-	/* give other threads a chance to run: */
-	yield();
-
 	zap_threads(mm);
 	if (--mm->core_waiters) {
 		up_write(&mm->mmap_sem);
Index: linux/fs/fcntl.c
===================================================================
--- linux.orig/fs/fcntl.c
+++ linux/fs/fcntl.c
@@ -461,7 +461,8 @@ static void send_sigio_to_task(struct ta
 				break;
 		/* fall-through: fall back on the old plain SIGIO signal */
 		case 0:
-			send_group_sig_info(SIGIO, SEND_SIG_PRIV, p);
+			// we hold the tasklist lock already:
+			group_send_sig_info(SIGIO, SEND_SIG_PRIV, p);
 	}
 }
 
@@ -495,7 +496,7 @@ static void send_sigurg_to_task(struct t
                                 struct fown_struct *fown)
 {
 	if (sigio_perm(p, fown, SIGURG))
-		send_group_sig_info(SIGURG, SEND_SIG_PRIV, p);
+		group_send_sig_info(SIGURG, SEND_SIG_PRIV, p);
 }
 
 int send_sigurg(struct fown_struct *fown)
Index: linux/fs/inode.c
===================================================================
--- linux.orig/fs/inode.c
+++ linux/fs/inode.c
@@ -22,6 +22,7 @@
 #include <linux/cdev.h>
 #include <linux/bootmem.h>
 #include <linux/inotify.h>
+#include <linux/futex.h>
 
 /*
  * This is needed for the following functions:
@@ -207,6 +208,7 @@ void inode_init_once(struct inode *inode
 	INIT_LIST_HEAD(&inode->inotify_watches);
 	sema_init(&inode->inotify_sem, 1);
 #endif
+	futex_init_inode(inode);
 }
 
 EXPORT_SYMBOL(inode_init_once);
Index: linux/fs/jbd/transaction.c
===================================================================
--- linux.orig/fs/jbd/transaction.c
+++ linux/fs/jbd/transaction.c
@@ -1483,7 +1483,7 @@ void __journal_temp_unlink_buffer(struct
 	transaction_t *transaction;
 	struct buffer_head *bh = jh2bh(jh);
 
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
 	transaction = jh->b_transaction;
 	if (transaction)
 		assert_spin_locked(&transaction->t_journal->j_list_lock);
@@ -1928,7 +1928,7 @@ void __journal_file_buffer(struct journa
 	int was_dirty = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
 	assert_spin_locked(&transaction->t_journal->j_list_lock);
 
 	J_ASSERT_JH(jh, jh->b_jlist < BJ_Types);
@@ -2017,7 +2017,7 @@ void __journal_refile_buffer(struct jour
 	int was_dirty;
 	struct buffer_head *bh = jh2bh(jh);
 
-	J_ASSERT_JH(jh, jbd_is_locked_bh_state(bh));
+	J_ASSERT_JH_SMP(jh, jbd_is_locked_bh_state(bh));
 	if (jh->b_transaction)
 		assert_spin_locked(&jh->b_transaction->t_journal->j_list_lock);
 
Index: linux/fs/lockd/svc.c
===================================================================
--- linux.orig/fs/lockd/svc.c
+++ linux/fs/lockd/svc.c
@@ -49,7 +49,7 @@ static pid_t			nlmsvc_pid;
 int				nlmsvc_grace_period;
 unsigned long			nlmsvc_timeout;
 
-static DECLARE_MUTEX_LOCKED(lockd_start);
+static DECLARE_WAIT_QUEUE_HEAD(lockd_start);
 static DECLARE_WAIT_QUEUE_HEAD(lockd_exit);
 
 /*
@@ -112,7 +112,7 @@ lockd(struct svc_rqst *rqstp)
 	 * Let our maker know we're running.
 	 */
 	nlmsvc_pid = current->pid;
-	up(&lockd_start);
+	wake_up(&lockd_start);
 
 	daemonize("lockd");
 
@@ -263,8 +263,15 @@ lockd_up(void)
 			"lockd_up: create thread failed, error=%d\n", error);
 		goto destroy_and_out;
 	}
-	down(&lockd_start);
-
+	/*
+	 * Wait up to HZ for the lockd thread to set nlmsvc_pid, but since
+	 * we're holding the lockd semaphore, we can't wait around forever ...
+	 */
+	if (wait_event_interruptible_timeout(lockd_start,
+					     nlmsvc_pid != 0, HZ) <= 0) {
+		printk(KERN_WARNING
+			"lockd_up: lockd failed to start\n");
+	}
 	/*
 	 * Note: svc_serv structures have an initial use count of 1,
 	 * so we exit through here on both success and failure.
@@ -304,16 +311,12 @@ lockd_down(void)
 	 * Wait for the lockd process to exit, but since we're holding
 	 * the lockd semaphore, we can't wait around forever ...
 	 */
-	clear_thread_flag(TIF_SIGPENDING);
-	interruptible_sleep_on_timeout(&lockd_exit, HZ);
-	if (nlmsvc_pid) {
+	/* Uninterruptible: a pending signal must not cut the wait short. */
+	if (wait_event_timeout(lockd_exit, nlmsvc_pid == 0, HZ) == 0) {
 		printk(KERN_WARNING 
 			"lockd_down: lockd failed to exit, clearing pid\n");
 		nlmsvc_pid = 0;
 	}
-	spin_lock_irq(&current->sighand->siglock);
-	recalc_sigpending();
-	spin_unlock_irq(&current->sighand->siglock);
 out:
 	up(&nlmsvc_sema);
 }
Index: linux/fs/nfsd/nfs4state.c
===================================================================
--- linux.orig/fs/nfsd/nfs4state.c
+++ linux/fs/nfsd/nfs4state.c
@@ -44,6 +44,7 @@
 #include <linux/mount.h>
 #include <linux/workqueue.h>
 #include <linux/smp_lock.h>
+#include <linux/spinlock.h>
 #include <linux/kthread.h>
 #include <linux/nfs4.h>
 #include <linux/nfsd/state.h>
@@ -122,7 +123,7 @@ static void release_stateid(struct nfs4_
  */
 
 /* recall_lock protects the del_recall_lru */
-static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED;
+static spinlock_t recall_lock = SPIN_LOCK_UNLOCKED(recall_lock);
 static struct list_head del_recall_lru;
 
 static void
Index: linux/fs/ntfs/aops.c
===================================================================
--- linux.orig/fs/ntfs/aops.c
+++ linux/fs/ntfs/aops.c
@@ -104,8 +104,7 @@ static void ntfs_end_buffer_async_read(s
 				"0x%llx.", (unsigned long long)bh->b_blocknr);
 	}
 	first = page_buffers(page);
-	local_irq_save(flags);
-	bit_spin_lock(BH_Uptodate_Lock, &first->b_state);
+	spin_lock_irqsave(&first->b_uptodate_lock, flags);
 	clear_buffer_async_read(bh);
 	unlock_buffer(bh);
 	tmp = bh;
@@ -120,8 +119,7 @@ static void ntfs_end_buffer_async_read(s
 		}
 		tmp = tmp->b_this_page;
 	} while (tmp != bh);
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	/*
 	 * If none of the buffers had errors then we can set the page uptodate,
 	 * but we first have to perform the post read mst fixups, if the
@@ -154,8 +152,7 @@ static void ntfs_end_buffer_async_read(s
 	unlock_page(page);
 	return;
 still_busy:
-	bit_spin_unlock(BH_Uptodate_Lock, &first->b_state);
-	local_irq_restore(flags);
+	spin_unlock_irqrestore(&first->b_uptodate_lock, flags);
 	return;
 }
 
Index: linux/fs/pipe.c
===================================================================
--- linux.orig/fs/pipe.c
+++ linux/fs/pipe.c
@@ -206,8 +206,14 @@ pipe_readv(struct file *filp, const stru
 		wake_up_interruptible(PIPE_WAIT(*inode));
 		kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT);
 	}
+	/*
+	 * Hack: we turn off atime updates for -RT kernels.
+	 * Who uses them on pipes anyway?
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	if (ret > 0)
 		file_accessed(filp);
+#endif
 	return ret;
 }
 
@@ -346,8 +352,14 @@ out:
 		wake_up_interruptible(PIPE_WAIT(*inode));
 		kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN);
 	}
+	/*
+	 * Hack: we turn off mtime/ctime updates for -RT kernels.
+	 * Who uses them on pipes anyway?
+	 */
+#ifndef CONFIG_PREEMPT_RT
 	if (ret > 0)
 		inode_update_time(inode, 1);	/* mtime and ctime */
+#endif
 	return ret;
 }
 
Index: linux/fs/proc/array.c
===================================================================
--- linux.orig/fs/proc/array.c
+++ linux/fs/proc/array.c
@@ -130,17 +130,19 @@ static inline char * task_name(struct ta
  */
 static const char *task_state_array[] = {
 	"R (running)",		/*  0 */
-	"S (sleeping)",		/*  1 */
-	"D (disk sleep)",	/*  2 */
-	"T (stopped)",		/*  4 */
-	"T (tracing stop)",	/*  8 */
-	"Z (zombie)",		/* 16 */
-	"X (dead)"		/* 32 */
+	"M (running-mutex)",	/*  1 */
+	"S (sleeping)",		/*  2 */
+	"D (disk sleep)",	/*  4 */
+	"T (stopped)",		/*  8 */
+	"T (tracing stop)",	/* 16 */
+	"Z (zombie)",		/* 32 */
+	"X (dead)"		/* 64 */
 };
 
 static inline const char * get_task_state(struct task_struct *tsk)
 {
 	unsigned int state = (tsk->state & (TASK_RUNNING |
+					    TASK_RUNNING_MUTEX |
 					    TASK_INTERRUPTIBLE |
 					    TASK_UNINTERRUPTIBLE |
 					    TASK_STOPPED |
@@ -330,7 +332,7 @@ static int do_task_stat(struct task_stru
 	unsigned long  min_flt = 0,  maj_flt = 0;
 	cputime_t cutime, cstime, utime, stime;
 	unsigned long rsslim = 0;
-	unsigned long it_real_value = 0;
+	DEFINE_KTIME(it_real_value);
 	struct task_struct *t;
 	char tcomm[sizeof(task->comm)];
 
@@ -386,7 +388,7 @@ static int do_task_stat(struct task_stru
 			utime = cputime_add(utime, task->signal->utime);
 			stime = cputime_add(stime, task->signal->stime);
 		}
-		it_real_value = task->signal->it_real_value;
+		it_real_value = task->signal->real_timer.expires;
 	}
 	ppid = pid_alive(task) ? task->group_leader->real_parent->tgid : 0;
 	read_unlock(&tasklist_lock);
@@ -435,7 +437,7 @@ static int do_task_stat(struct task_stru
 		priority,
 		nice,
 		num_threads,
-		jiffies_to_clock_t(it_real_value),
+		(long) ktime_to_clock_t(it_real_value),
 		start_time,
 		vsize,
 		mm ? get_mm_counter(mm, rss) : 0, /* you might want to shift this left 3 */
Index: linux/fs/proc/proc_misc.c
===================================================================
--- linux.orig/fs/proc/proc_misc.c
+++ linux/fs/proc/proc_misc.c
@@ -415,6 +415,42 @@ static int show_stat(struct seq_file *p,
 		nr_running(),
 		nr_iowait());
 
+#ifdef CONFIG_PREEMPT_RT
+	{
+		unsigned long nr_uninterruptible_cpu(int cpu);
+		extern int pi_walk, pi_null, pi_prio, pi_initialized;
+		extern int rt_overload_schedule,
+			   rt_overload_wakeup, rt_overload_pulled;
+		unsigned long rt_nr_running_cpu(int cpu);
+		extern atomic_t rt_overload;
+
+		int i;
+
+		seq_printf(p, "rt_overload_schedule: %d\n",
+					rt_overload_schedule);
+		seq_printf(p, "rt_overload_wakeup:   %d\n",
+					rt_overload_wakeup);
+		seq_printf(p, "rt_overload_pulled:   %d\n",
+					rt_overload_pulled);
+		seq_printf(p, "pi_null: %d\n", pi_null);
+		seq_printf(p, "pi_prio: %d\n", pi_prio);
+		seq_printf(p, "pi_walk: %d\n", pi_walk);
+		seq_printf(p, "pi_init: %d\n", pi_initialized);
+		seq_printf(p, "nr_running(): %ld\n",
+			nr_running());
+		seq_printf(p, "nr_uninterruptible(): %ld\n",
+			nr_uninterruptible());
+		for_each_cpu(i)
+			seq_printf(p, "nr_uninterruptible(%d): %ld\n",
+				i, nr_uninterruptible_cpu(i));
+		for_each_cpu(i)
+			seq_printf(p, "rt_nr_running(%d): %ld\n",
+				i, rt_nr_running_cpu(i));
+		seq_printf(p, "rt_overload: %d\n", atomic_read(&rt_overload));
+
+	}
+#endif
+
 	return 0;
 }
 
@@ -531,6 +567,20 @@ static int execdomains_read_proc(char *p
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
+#ifdef CONFIG_LATENCY_TRACE
+extern struct seq_operations latency_trace_op;
+static int latency_trace_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &latency_trace_op);
+}
+static struct file_operations proc_latency_trace_operations = {
+	.open		= latency_trace_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+#endif
+
 #ifdef CONFIG_MAGIC_SYSRQ
 /*
  * writing 'C' to /proc/sysrq-trigger is like sysrq-C
@@ -563,6 +613,48 @@ void create_seq_entry(char *name, mode_t
 		entry->proc_fops = f;
 }
 
+#ifdef CONFIG_RCU_STATS
+int rcu_read_proc(char *page, char **start, off_t off,
+		  int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_data(char *page);
+
+	len = rcu_read_proc_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_gp(char *page, char **start, off_t off,
+		     int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_gp_data(char *page);
+
+	len = rcu_read_proc_gp_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_ptrs(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_ptrs_data(char *page);
+
+	len = rcu_read_proc_ptrs_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+
+int rcu_read_proc_ctrs(char *page, char **start, off_t off,
+		       int count, int *eof, void *data)
+{
+	int len;
+	extern int rcu_read_proc_ctrs_data(char *page);
+
+	len = rcu_read_proc_ctrs_data(page);
+	return proc_calc_metrics(page, start, off, count, eof, len);
+}
+#endif /* #ifdef CONFIG_RCU_STATS */
+
 void __init proc_misc_init(void)
 {
 	struct proc_dir_entry *entry;
@@ -585,6 +677,12 @@ void __init proc_misc_init(void)
 		{"cmdline",	cmdline_read_proc},
 		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
+#ifdef CONFIG_RCU_STATS
+		{"rcustats",	rcu_read_proc},
+		{"rcugp",	rcu_read_proc_gp},
+		{"rcuptrs",	rcu_read_proc_ptrs},
+		{"rcuctrs",	rcu_read_proc_ctrs},
+#endif /* #ifdef CONFIG_RCU_STATS */
 		{NULL,}
 	};
 	for (p = simple_ones; p->name; p++)
@@ -611,6 +709,9 @@ void __init proc_misc_init(void)
 #ifdef CONFIG_SCHEDSTATS
 	create_seq_entry("schedstat", 0, &proc_schedstat_operations);
 #endif
+#ifdef CONFIG_LATENCY_TRACE
+	create_seq_entry("latency_trace", 0, &proc_latency_trace_operations);
+#endif
 #ifdef CONFIG_PROC_KCORE
 	proc_root_kcore = create_proc_entry("kcore", S_IRUSR, NULL);
 	if (proc_root_kcore) {
Index: linux/fs/proc/task_mmu.c
===================================================================
--- linux.orig/fs/proc/task_mmu.c
+++ linux/fs/proc/task_mmu.c
@@ -320,8 +320,10 @@ static void *m_start(struct seq_file *m,
 	vma = NULL;
 	if ((unsigned long)l < mm->map_count) {
 		vma = mm->mmap;
-		while (l-- && vma)
+		while (l-- && vma) {
 			vma = vma->vm_next;
+			cond_resched();
+		}
 		goto out;
 	}
 
Index: linux/fs/sysfs/dir.c
===================================================================
--- linux.orig/fs/sysfs/dir.c
+++ linux/fs/sysfs/dir.c
@@ -112,7 +112,11 @@ static int create_dir(struct kobject * k
 			}
 		}
 		if (error && (error != -EEXIST)) {
-			sysfs_put((*d)->d_fsdata);
+			struct sysfs_dirent *sd = (*d)->d_fsdata;
+			if (sd) {
+				list_del_init(&sd->s_sibling);
+				sysfs_put(sd);
+			}
 			d_drop(*d);
 		}
 		dput(*d);
Index: linux/fs/xfs/linux-2.6/mrlock.h
===================================================================
--- linux.orig/fs/xfs/linux-2.6/mrlock.h
+++ linux/fs/xfs/linux-2.6/mrlock.h
@@ -37,12 +37,12 @@
 enum { MR_NONE, MR_ACCESS, MR_UPDATE };
 
 typedef struct {
-	struct rw_semaphore	mr_lock;
-	int			mr_writer;
+	struct compat_rw_semaphore	mr_lock;
+	int				mr_writer;
 } mrlock_t;
 
 #define mrinit(mrp, name)	\
-	( (mrp)->mr_writer = 0, init_rwsem(&(mrp)->mr_lock) )
+	do { (mrp)->mr_writer = 0; init_rwsem(&(mrp)->mr_lock); } while (0)
 #define mrlock_init(mrp, t,n,s)	mrinit(mrp, n)
 #define mrfree(mrp)		do { } while (0)
 #define mraccess(mrp)		mraccessf(mrp, 0)
Index: linux/fs/xfs/linux-2.6/mutex.h
===================================================================
--- linux.orig/fs/xfs/linux-2.6/mutex.h
+++ linux/fs/xfs/linux-2.6/mutex.h
@@ -42,7 +42,7 @@
  * callers.
  */
 #define MUTEX_DEFAULT		0x0
-typedef struct semaphore	mutex_t;
+typedef struct compat_semaphore	mutex_t;
 
 #define mutex_init(lock, type, name)		sema_init(lock, 1)
 #define mutex_destroy(lock)			sema_init(lock, -99)
Index: linux/fs/xfs/linux-2.6/sema.h
===================================================================
--- linux.orig/fs/xfs/linux-2.6/sema.h
+++ linux/fs/xfs/linux-2.6/sema.h
@@ -41,7 +41,7 @@
  * sema_t structure just maps to struct semaphore in Linux kernel.
  */
 
-typedef struct semaphore sema_t;
+typedef struct compat_semaphore sema_t;
 
 #define init_sema(sp, val, c, d)	sema_init(sp, val)
 #define initsema(sp, val)		sema_init(sp, val)
Index: linux/fs/xfs/linux-2.6/xfs_aops.c
===================================================================
--- linux.orig/fs/xfs/linux-2.6/xfs_aops.c
+++ linux/fs/xfs/linux-2.6/xfs_aops.c
@@ -192,7 +192,7 @@ linvfs_unwritten_done(
 	int			uptodate)
 {
 	xfs_ioend_t		*ioend = bh->b_private;
-	static spinlock_t	unwritten_done_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(unwritten_done_lock);
 	unsigned long		flags;
 
 	ASSERT(buffer_unwritten(bh));
Index: linux/fs/xfs/linux-2.6/xfs_buf.c
===================================================================
--- linux.orig/fs/xfs/linux-2.6/xfs_buf.c
+++ linux/fs/xfs/linux-2.6/xfs_buf.c
@@ -962,7 +962,7 @@ int
 pagebuf_lock_value(
 	xfs_buf_t		*pb)
 {
-	return(atomic_read(&pb->pb_sema.count));
+	return !sem_is_locked(&pb->pb_sema);
 }
 #endif
 
Index: linux/fs/xfs/linux-2.6/xfs_buf.h
===================================================================
--- linux.orig/fs/xfs/linux-2.6/xfs_buf.h
+++ linux/fs/xfs/linux-2.6/xfs_buf.h
@@ -139,7 +139,7 @@ typedef int (*page_buf_bdstrat_t)(struct
 #define PB_PAGES	2
 
 typedef struct xfs_buf {
-	struct semaphore	pb_sema;	/* semaphore for lockables  */
+	struct compat_semaphore	pb_sema;	/* semaphore for lockables  */
 	unsigned long		pb_queuetime;	/* time buffer was queued   */
 	atomic_t		pb_pin_count;	/* pin count		    */
 	wait_queue_head_t	pb_waiters;	/* unpin waiters	    */
@@ -159,7 +159,7 @@ typedef struct xfs_buf {
 	page_buf_iodone_t	pb_iodone;	/* I/O completion function */
 	page_buf_relse_t	pb_relse;	/* releasing function */
 	page_buf_bdstrat_t	pb_strat;	/* pre-write function */
-	struct semaphore	pb_iodonesema;	/* Semaphore for I/O waiters */
+	struct compat_semaphore	pb_iodonesema;	/* Semaphore for I/O waiters */
 	void			*pb_fspriv;
 	void			*pb_fspriv2;
 	void			*pb_fspriv3;
Index: linux/fs/xfs/quota/xfs_qm.h
===================================================================
--- linux.orig/fs/xfs/quota/xfs_qm.h
+++ linux/fs/xfs/quota/xfs_qm.h
@@ -179,8 +179,8 @@ typedef struct xfs_dquot_acct {
 #define XFS_QM_IWARNLIMIT	5
 #define XFS_QM_RTBWARNLIMIT	5
 
-#define XFS_QM_LOCK(xqm)	(mutex_lock(&xqm##_lock, PINOD))
-#define XFS_QM_UNLOCK(xqm)	(mutex_unlock(&xqm##_lock))
+#define XFS_QM_LOCK(xqm)	mutex_lock(&xqm##_lock, PINOD)
+#define XFS_QM_UNLOCK(xqm)	mutex_unlock(&xqm##_lock)
 #define XFS_QM_HOLD(xqm)	((xqm)->qm_nrefs++)
 #define XFS_QM_RELE(xqm)	((xqm)->qm_nrefs--)
 
Index: linux/fs/xfs/quota/xfs_quota_priv.h
===================================================================
--- linux.orig/fs/xfs/quota/xfs_quota_priv.h
+++ linux/fs/xfs/quota/xfs_quota_priv.h
@@ -65,8 +65,8 @@
 #define XFS_QI_MPLNEXT(mp)	((mp)->m_quotainfo->qi_dqlist.qh_next)
 #define XFS_QI_MPLNDQUOTS(mp)	((mp)->m_quotainfo->qi_dqlist.qh_nelems)
 
-#define XQMLCK(h)			(mutex_lock(&((h)->qh_lock), PINOD))
-#define XQMUNLCK(h)			(mutex_unlock(&((h)->qh_lock)))
+#define XQMLCK(h)			mutex_lock(&((h)->qh_lock), PINOD)
+#define XQMUNLCK(h)			mutex_unlock(&((h)->qh_lock))
 #ifdef DEBUG
 struct xfs_dqhash;
 static inline int XQMISLCKD(struct xfs_dqhash *h)
Index: linux/fs/xfs/xfs_mount.h
===================================================================
--- linux.orig/fs/xfs/xfs_mount.h
+++ linux/fs/xfs/xfs_mount.h
@@ -340,7 +340,7 @@ typedef struct xfs_mount {
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* XFS_IN_MAXLEVELS */
 	struct xfs_perag	*m_perag;	/* per-ag accounting info */
-	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
+	struct compat_rw_semaphore m_peraglock;	/* lock for m_perag (pointer) */
 	sema_t			m_growlock;	/* growfs mutex */
 	int			m_fixedfsid[2];	/* unchanged for life of FS */
 	uint			m_dmevmask;	/* DMI events for this FS */
Index: linux/include/acpi/acpiosxf.h
===================================================================
--- linux.orig/include/acpi/acpiosxf.h
+++ linux/include/acpi/acpiosxf.h
@@ -57,7 +57,7 @@
 #define OSD_PRIORITY_MED            3
 #define OSD_PRIORITY_LO             4
 
-#define ACPI_NO_UNIT_LIMIT          ((u32) -1)
+#define ACPI_NO_UNIT_LIMIT          (INT_MAX/2)
 #define ACPI_MUTEX_SEM              1
 
 /* Functions for acpi_os_signal */
Index: linux/include/asm-arm/arch-ixp2000/system.h
===================================================================
--- linux.orig/include/asm-arm/arch-ixp2000/system.h
+++ linux/include/asm-arm/arch-ixp2000/system.h
@@ -19,7 +19,7 @@ static inline void arch_idle(void)
 
 static inline void arch_reset(char mode)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/*
 	 * Reset flash banking register so that we are pointing at
Index: linux/include/asm-arm/arch-shark/system.h
===================================================================
--- linux.orig/include/asm-arm/arch-shark/system.h
+++ linux/include/asm-arm/arch-shark/system.h
@@ -11,7 +11,7 @@
 static void arch_reset(char mode)
 {
 	short temp;
-	local_irq_disable();
+	raw_local_irq_disable();
 	/* Reset the Machine via pc[3] of the sequoia chipset */
 	outw(0x09,0x24);
 	temp=inw(0x26);
Index: linux/include/asm-arm/atomic.h
===================================================================
--- linux.orig/include/asm-arm/atomic.h
+++ linux/include/asm-arm/atomic.h
@@ -110,10 +110,10 @@ static inline int atomic_add_return(int 
 	unsigned long flags;
 	int val;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	val = v->counter;
 	v->counter = val += i;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 
 	return val;
 }
@@ -123,10 +123,10 @@ static inline int atomic_sub_return(int 
 	unsigned long flags;
 	int val;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	val = v->counter;
 	v->counter = val -= i;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 
 	return val;
 }
@@ -135,11 +135,46 @@ static inline void atomic_clear_mask(uns
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	*addr &= ~mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 }
 
+#ifndef CONFIG_SMP
+/*
+ * Atomic compare and exchange.
+ */
+#define __HAVE_ARCH_CMPXCHG	1
+
+extern unsigned long wrong_size_cmpxchg(volatile void *ptr);
+
+static inline unsigned long __cmpxchg(volatile void *ptr,
+				    unsigned long old,
+				    unsigned long new, int size)
+{
+	unsigned long flags, prev;
+	volatile unsigned long *p = ptr;
+
+	if (size == 4) {
+		__raw_local_irq_save(flags);
+		if ((prev = *p) == old)
+			*p = new;
+		__raw_local_irq_restore(flags);
+		return(prev);
+	} else
+		return wrong_size_cmpxchg(ptr);
+}
+
+#define cmpxchg(ptr,o,n)					  	\
+({									\
+     __typeof__(*(ptr)) _o_ = (o);					\
+     __typeof__(*(ptr)) _n_ = (n);					\
+     (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_,		\
+			   	 (unsigned long)_n_, sizeof(*(ptr)));	\
+})
+
+#endif
+
 #endif /* __LINUX_ARM_ARCH__ */
 
 #define atomic_add(i, v)	(void) atomic_add_return(i, v)
Index: linux/include/asm-arm/bitops.h
===================================================================
--- linux.orig/include/asm-arm/bitops.h
+++ linux/include/asm-arm/bitops.h
@@ -36,9 +36,9 @@ static inline void ____atomic_set_bit(un
 
 	p += bit >> 5;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	*p |= mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 }
 
 static inline void ____atomic_clear_bit(unsigned int bit, volatile unsigned long *p)
@@ -48,9 +48,9 @@ static inline void ____atomic_clear_bit(
 
 	p += bit >> 5;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	*p &= ~mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 }
 
 static inline void ____atomic_change_bit(unsigned int bit, volatile unsigned long *p)
@@ -60,9 +60,9 @@ static inline void ____atomic_change_bit
 
 	p += bit >> 5;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	*p ^= mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 }
 
 static inline int
@@ -74,10 +74,10 @@ ____atomic_test_and_set_bit(unsigned int
 
 	p += bit >> 5;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	res = *p;
 	*p = res | mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 
 	return res & mask;
 }
@@ -91,10 +91,10 @@ ____atomic_test_and_clear_bit(unsigned i
 
 	p += bit >> 5;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	res = *p;
 	*p = res & ~mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 
 	return res & mask;
 }
@@ -108,10 +108,10 @@ ____atomic_test_and_change_bit(unsigned 
 
 	p += bit >> 5;
 
-	local_irq_save(flags);
+	__raw_local_irq_save(flags);
 	res = *p;
 	*p = res ^ mask;
-	local_irq_restore(flags);
+	__raw_local_irq_restore(flags);
 
 	return res & mask;
 }
Index: linux/include/asm-arm/dma.h
===================================================================
--- linux.orig/include/asm-arm/dma.h
+++ linux/include/asm-arm/dma.h
@@ -21,7 +21,7 @@ typedef unsigned int dmamode_t;
 #define DMA_MODE_CASCADE 2
 #define DMA_AUTOINIT	 4
 
-extern spinlock_t  dma_spin_lock;
+extern raw_spinlock_t  dma_spin_lock;
 
 static inline unsigned long claim_dma_lock(void)
 {
Index: linux/include/asm-arm/dyntick.h
===================================================================
--- /dev/null
+++ linux/include/asm-arm/dyntick.h
@@ -0,0 +1,6 @@
+#ifndef _ASMARM_DYNTICK_H
+#define _ASMARM_DYNTICK_H
+
+#include <asm/mach/time.h>
+
+#endif /* _ASMARM_DYNTICK_H */
Index: linux/include/asm-arm/hw_irq.h
===================================================================
--- /dev/null
+++ linux/include/asm-arm/hw_irq.h
@@ -0,0 +1,9 @@
+/*
+ * Nothing to see here yet
+ */
+#ifndef _ARCH_ARM_HW_IRQ_H
+#define _ARCH_ARM_HW_IRQ_H
+
+#include <asm/mach/irq.h>
+
+#endif
Index: linux/include/asm-arm/irq.h
===================================================================
--- linux.orig/include/asm-arm/irq.h
+++ linux/include/asm-arm/irq.h
@@ -19,16 +19,10 @@
 #define NO_IRQ	((unsigned int)(-1))
 #endif
 
-struct irqaction;
-
-extern void disable_irq_nosync(unsigned int);
-extern void disable_irq(unsigned int);
-extern void enable_irq(unsigned int);
-
-#define __IRQT_FALEDGE	(1 << 0)
-#define __IRQT_RISEDGE	(1 << 1)
-#define __IRQT_LOWLVL	(1 << 2)
-#define __IRQT_HIGHLVL	(1 << 3)
+#define __IRQT_FALEDGE	IRQ_TYPE_EDGEL
+#define __IRQT_RISEDGE	IRQ_TYPE_EDGEH
+#define __IRQT_LOWLVL	IRQ_TYPE_LEVELL
+#define __IRQT_HIGHLVL	IRQ_TYPE_LEVELH
 
 #define IRQT_NOEDGE	(0)
 #define IRQT_RISING	(__IRQT_RISEDGE)
@@ -36,16 +30,9 @@ extern void enable_irq(unsigned int);
 #define IRQT_BOTHEDGE	(__IRQT_RISEDGE|__IRQT_FALEDGE)
 #define IRQT_LOW	(__IRQT_LOWLVL)
 #define IRQT_HIGH	(__IRQT_HIGHLVL)
-#define IRQT_PROBE	(1 << 4)
-
-int set_irq_type(unsigned int irq, unsigned int type);
-void disable_irq_wake(unsigned int irq);
-void enable_irq_wake(unsigned int irq);
-int setup_irq(unsigned int, struct irqaction *);
 
-struct irqaction;
-struct pt_regs;
-int handle_IRQ_event(unsigned int, struct pt_regs *, struct irqaction *);
+/* FIXME_TGLX */
+#define IRQT_PROBE	(1 << 7)
 
 #endif
 
Index: linux/include/asm-arm/mach/irq.h
===================================================================
--- linux.orig/include/asm-arm/mach/irq.h
+++ linux/include/asm-arm/mach/irq.h
@@ -10,94 +10,9 @@
 #ifndef __ASM_ARM_MACH_IRQ_H
 #define __ASM_ARM_MACH_IRQ_H
 
-struct irqdesc;
-struct pt_regs;
-struct seq_file;
-
-typedef void (*irq_handler_t)(unsigned int, struct irqdesc *, struct pt_regs *);
-typedef void (*irq_control_t)(unsigned int);
+#include <linux/irq.h>
 
-struct irqchip {
-	/*
-	 * Acknowledge the IRQ.
-	 * If this is a level-based IRQ, then it is expected to mask the IRQ
-	 * as well.
-	 */
-	void (*ack)(unsigned int);
-	/*
-	 * Mask the IRQ in hardware.
-	 */
-	void (*mask)(unsigned int);
-	/*
-	 * Unmask the IRQ in hardware.
-	 */
-	void (*unmask)(unsigned int);
-	/*
-	 * Ask the hardware to re-trigger the IRQ.
-	 * Note: This method _must_ _not_ call the interrupt handler.
-	 * If you are unable to retrigger the interrupt, do not
-	 * provide a function, or if you do, return non-zero.
-	 */
-	int (*retrigger)(unsigned int);
-	/*
-	 * Set the type of the IRQ.
-	 */
-	int (*set_type)(unsigned int, unsigned int);
-	/*
-	 * Set wakeup-enable on the selected IRQ
-	 */
-	int (*set_wake)(unsigned int, unsigned int);
-
-#ifdef CONFIG_SMP
-	/*
-	 * Route an interrupt to a CPU
-	 */
-	void (*set_cpu)(struct irqdesc *desc, unsigned int irq, unsigned int cpu);
-#endif
-};
-
-struct irqdesc {
-	irq_handler_t	handle;
-	struct irqchip	*chip;
-	struct irqaction *action;
-	struct list_head pend;
-	void		*chipdata;
-	void		*data;
-	unsigned int	disable_depth;
-
-	unsigned int	triggered: 1;		/* IRQ has occurred	      */
-	unsigned int	running  : 1;		/* IRQ is running             */
-	unsigned int	pending  : 1;		/* IRQ is pending	      */
-	unsigned int	probing  : 1;		/* IRQ in use for a probe     */
-	unsigned int	probe_ok : 1;		/* IRQ can be used for probe  */
-	unsigned int	valid    : 1;		/* IRQ claimable	      */
-	unsigned int	noautoenable : 1;	/* don't automatically enable IRQ */
-	unsigned int	unused   :25;
-
-	struct proc_dir_entry *procdir;
-
-#ifdef CONFIG_SMP
-	cpumask_t	affinity;
-	unsigned int	cpu;
-#endif
-
-	/*
-	 * IRQ lock detection
-	 */
-	unsigned int	lck_cnt;
-	unsigned int	lck_pc;
-	unsigned int	lck_jif;
-};
-
-extern struct irqdesc irq_desc[];
-
-/*
- * Helpful inline function for calling irq descriptor handlers.
- */
-static inline void desc_handle_irq(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs)
-{
-	desc->handle(irq, desc, regs);
-}
+struct seq_file;
 
 /*
  * This is internal.  Do not use it.
@@ -105,31 +20,52 @@ static inline void desc_handle_irq(unsig
 extern void (*init_arch_irq)(void);
 extern void init_FIQ(void);
 extern int show_fiq_list(struct seq_file *, void *);
-void __set_irq_handler(unsigned int irq, irq_handler_t, int);
+void __set_irq_handler(unsigned int irq, struct irq_type *, int);
 
 /*
  * External stuff.
  */
 #define set_irq_handler(irq,handler)		__set_irq_handler(irq,handler,0)
-#define set_irq_chained_handler(irq,handler)	__set_irq_handler(irq,handler,1)
-#define set_irq_data(irq,d)			do { irq_desc[irq].data = d; } while (0)
-#define set_irq_chipdata(irq,d)			do { irq_desc[irq].chipdata = d; } while (0)
-#define get_irq_chipdata(irq)			(irq_desc[irq].chipdata)
 
-void set_irq_chip(unsigned int irq, struct irqchip *);
+
+#define set_irq_chipdata(irq,d)			set_irq_chip_data(irq, d)
+#define get_irq_chipdata(irq)			get_irq_chip_data(irq)
+
 void set_irq_flags(unsigned int irq, unsigned int flags);
 
 #define IRQF_VALID	(1 << 0)
 #define IRQF_PROBE	(1 << 1)
 #define IRQF_NOAUTOEN	(1 << 2)
 
+/* ARM uses the retrigger functions in desc->chip or software retrigger */
+static inline void hw_resend_irq(struct irq_type *t, unsigned int i) {}
+
 /*
- * Built-in IRQ handlers.
+ * Hack alert. This is for easy migration, but should be changed in the source
  */
-void do_level_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs);
-void do_edge_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs);
-void do_simple_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs);
-void do_bad_IRQ(unsigned int irq, struct irqdesc *desc, struct pt_regs *regs);
-void dummy_mask_unmask_irq(unsigned int irq);
+#define do_level_IRQ	(&default_level_type)
+#define do_edge_IRQ	(&default_edge_type)
+#define do_simple_IRQ	(&default_simple_type)
+
+/* Hack to get around set_irq_chained_handler(nr,NULL) problem */
+#define irq_NULL_type no_irq_type
+#define set_irq_chained_handler(irq,handler) \
+	__set_irq_handler(irq,&irq_##handler##_type,1)
+
+#define DEFINE_IRQ_CHAINED_TYPE(function)		\
+struct irq_type irq_##function##_type = {		\
+	.typename = #function"-chained_type",		\
+	.handle_irq = function,				\
+}
+
+#define do_bad_IRQ(irq,desc,regs)			\
+do {							\
+	spin_lock(&(desc)->lock);			\
+	handle_bad_irq(irq, desc, regs);		\
+	spin_unlock(&(desc)->lock);			\
+} while(0)
+
+/* FIXME: ack_bad_irq() is a no-op stub on ARM for now */
+#define ack_bad_irq(irq) do {} while (0)
 
 #endif
Index: linux/include/asm-arm/pgalloc.h
===================================================================
--- linux.orig/include/asm-arm/pgalloc.h
+++ linux/include/asm-arm/pgalloc.h
@@ -102,7 +102,7 @@ static inline void __pmd_populate(pmd_t 
  *
  * Ensure that we always set both PMD entries.
  */
-static inline void
+static inline void notrace
 pmd_populate_kernel(struct mm_struct *mm, pmd_t *pmdp, pte_t *ptep)
 {
 	unsigned long pte_ptr = (unsigned long)ptep;
@@ -115,7 +115,7 @@ pmd_populate_kernel(struct mm_struct *mm
 	__pmd_populate(pmdp, __pa(pte_ptr) | _PAGE_KERNEL_TABLE);
 }
 
-static inline void
+static inline void notrace
 pmd_populate(struct mm_struct *mm, pmd_t *pmdp, struct page *ptep)
 {
 	__pmd_populate(pmdp, page_to_pfn(ptep) << PAGE_SHIFT | _PAGE_USER_TABLE);
Index: linux/include/asm-arm/semaphore.h
===================================================================
--- linux.orig/include/asm-arm/semaphore.h
+++ linux/include/asm-arm/semaphore.h
@@ -5,51 +5,65 @@
 #define __ASM_ARM_SEMAPHORE_H
 
 #include <linux/linkage.h>
+
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#endif
+
 #include <linux/spinlock.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define semaphore compat_semaphore
+#define __MUTEX_INITIALIZER(name) __COMPAT_MUTEX_INITIALIZER(name)
+#endif
+
 #include <asm/atomic.h>
 #include <asm/locks.h>
 
-struct semaphore {
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INIT(name, cnt)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, cnt)				\
 {								\
 	.count	= ATOMIC_INIT(cnt),				\
 	.wait	= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait),	\
 }
 
-#define __MUTEX_INITIALIZER(name) __SEMAPHORE_INIT(name,1)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name,1)
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count)	\
-	struct semaphore name = __SEMAPHORE_INIT(name,count)
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name,1)
-#define DECLARE_MUTEX_LOCKED(name)	__DECLARE_SEMAPHORE_GENERIC(name,0)
+#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1)
+#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0)
 
-static inline void sema_init(struct semaphore *sem, int val)
+static inline void compat_sema_init(struct compat_semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
 	sem->sleepers = 0;
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX(struct semaphore *sem)
+static inline void compat_init_MUTEX(struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED(struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED(struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-static inline int sema_count(struct semaphore *sem)
+static inline int compat_sema_count(struct compat_semaphore *sem)
 {
 	return atomic_read(&sem->count);
 }
@@ -62,16 +76,18 @@ asmlinkage int  __down_interruptible_fai
 asmlinkage int  __down_trylock_failed(void);
 asmlinkage void __up_wakeup(void);
 
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern int  __down_trylock(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
+extern void __compat_up(struct compat_semaphore *sem);
+extern int __compat_down_interruptible(struct compat_semaphore * sem);
+extern int __compat_down_trylock(struct compat_semaphore * sem);
+extern void __compat_down(struct compat_semaphore * sem);
+
+extern int compat_sem_is_locked(struct compat_semaphore *sem);
 
 /*
  * This is ugly, but we want the default case to fall through.
  * "__down" is the actual routine that waits...
  */
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 	__down_op(sem, __down_failed);
@@ -81,13 +97,13 @@ static inline void down(struct semaphore
  * This is ugly, but we want the default case to fall through.
  * "__down_interruptible" is the actual routine that waits...
  */
-static inline int down_interruptible (struct semaphore * sem)
+static inline int compat_down_interruptible (struct compat_semaphore * sem)
 {
 	might_sleep();
 	return __down_op_ret(sem, __down_interruptible_failed);
 }
 
-static inline int down_trylock(struct semaphore *sem)
+static inline int compat_down_trylock(struct compat_semaphore *sem)
 {
 	return __down_op_ret(sem, __down_trylock_failed);
 }
@@ -98,9 +114,10 @@ static inline int down_trylock(struct se
  * The default case (no contention) will result in NO
  * jumps for both down() and up().
  */
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	__up_op(sem, __up_wakeup);
 }
 
+#include <linux/semaphore.h>
 #endif
Index: linux/include/asm-arm/signal.h
===================================================================
--- linux.orig/include/asm-arm/signal.h
+++ linux/include/asm-arm/signal.h
@@ -114,7 +114,7 @@ typedef unsigned long sigset_t;
 #define SIGSTKSZ	8192
 
 #ifdef __KERNEL__
-#define SA_TIMER		0x40000000
+#define SA_TIMER		(0x40000000 | SA_NODELAY)
 #endif
 
 #include <asm-generic/signal.h>
Index: linux/include/asm-arm/system.h
===================================================================
--- linux.orig/include/asm-arm/system.h
+++ linux/include/asm-arm/system.h
@@ -4,6 +4,7 @@
 #ifdef __KERNEL__
 
 #include <linux/config.h>
+#include <asm/ptrace.h>
 
 #define CPU_ARCH_UNKNOWN	0
 #define CPU_ARCH_ARMv3		1
@@ -176,7 +177,7 @@ do {									\
  */
 #if __LINUX_ARM_ARCH__ >= 6
 
-#define local_irq_save(x)					\
+#define __raw_local_irq_save(x)					\
 	({							\
 	__asm__ __volatile__(					\
 	"mrs	%0, cpsr		@ local_irq_save\n"	\
@@ -184,17 +185,17 @@ do {									\
 	: "=r" (x) : : "memory", "cc");				\
 	})
 
-#define local_irq_enable()  __asm__("cpsie i	@ __sti" : : : "memory", "cc")
-#define local_irq_disable() __asm__("cpsid i	@ __cli" : : : "memory", "cc")
-#define local_fiq_enable()  __asm__("cpsie f	@ __stf" : : : "memory", "cc")
-#define local_fiq_disable() __asm__("cpsid f	@ __clf" : : : "memory", "cc")
+#define __raw_local_irq_enable()  __asm__("cpsie i	@ __sti" : : : "memory", "cc")
+#define __raw_local_irq_disable() __asm__("cpsid i	@ __cli" : : : "memory", "cc")
+#define __raw_local_fiq_enable()  __asm__("cpsie f	@ __stf" : : : "memory", "cc")
+#define __raw_local_fiq_disable() __asm__("cpsid f	@ __clf" : : : "memory", "cc")
 
 #else
 
 /*
  * Save the current interrupt enable state & disable IRQs
  */
-#define local_irq_save(x)					\
+#define __raw_local_irq_save(x)					\
 	({							\
 		unsigned long temp;				\
 		(void) (&temp == &x);				\
@@ -206,11 +207,11 @@ do {									\
 	:							\
 	: "memory", "cc");					\
 	})
-	
+
 /*
  * Enable IRQs
  */
-#define local_irq_enable()					\
+#define __raw_local_irq_enable()				\
 	({							\
 		unsigned long temp;				\
 	__asm__ __volatile__(					\
@@ -225,7 +226,7 @@ do {									\
 /*
  * Disable IRQs
  */
-#define local_irq_disable()					\
+#define __raw_local_irq_disable()				\
 	({							\
 		unsigned long temp;				\
 	__asm__ __volatile__(					\
@@ -272,7 +273,7 @@ do {									\
 /*
  * Save the current interrupt enable state.
  */
-#define local_save_flags(x)					\
+#define __raw_local_save_flags(x)				\
 	({							\
 	__asm__ __volatile__(					\
 	"mrs	%0, cpsr		@ local_save_flags"	\
@@ -282,20 +283,27 @@ do {									\
 /*
  * restore saved IRQ & FIQ state
  */
-#define local_irq_restore(x)					\
+#define __raw_local_irq_restore(x)				\
 	__asm__ __volatile__(					\
 	"msr	cpsr_c, %0		@ local_irq_restore\n"	\
 	:							\
 	: "r" (x)						\
 	: "memory", "cc")
 
-#define irqs_disabled()			\
-({					\
-	unsigned long flags;		\
-	local_save_flags(flags);	\
-	(int)(flags & PSR_I_BIT);	\
+#define __raw_irqs_disabled_flags(flags)	\
+({						\
+	(int)((flags) & PSR_I_BIT);		\
+})
+
+#define __raw_irqs_disabled()			\
+({						\
+	unsigned long flags;			\
+	__raw_local_save_flags(flags);		\
+	__raw_irqs_disabled_flags(flags);	\
 })
 
+#include <linux/rt_irq.h>
+
 #ifdef CONFIG_SMP
 
 #define smp_mb()		mb()
Index: linux/include/asm-arm/thread_info.h
===================================================================
--- linux.orig/include/asm-arm/thread_info.h
+++ linux/include/asm-arm/thread_info.h
@@ -128,6 +128,7 @@ extern void iwmmxt_task_release(struct t
 #define TIF_NOTIFY_RESUME	0
 #define TIF_SIGPENDING		1
 #define TIF_NEED_RESCHED	2
+#define TIF_NEED_RESCHED_DELAYED 3
 #define TIF_SYSCALL_TRACE	8
 #define TIF_POLLING_NRFLAG	16
 #define TIF_USING_IWMMXT	17
@@ -136,6 +137,7 @@ extern void iwmmxt_task_release(struct t
 #define _TIF_NOTIFY_RESUME	(1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1 << TIF_NEED_RESCHED)
+#define _TIF_NEED_RESCHED_DELAYED (1 << TIF_NEED_RESCHED_DELAYED)
 #define _TIF_SYSCALL_TRACE	(1 << TIF_SYSCALL_TRACE)
 #define _TIF_POLLING_NRFLAG	(1 << TIF_POLLING_NRFLAG)
 #define _TIF_USING_IWMMXT	(1 << TIF_USING_IWMMXT)
Index: linux/include/asm-arm/timex.h
===================================================================
--- linux.orig/include/asm-arm/timex.h
+++ linux/include/asm-arm/timex.h
@@ -16,9 +16,17 @@
 
 typedef unsigned long cycles_t;
 
+#ifndef mach_read_cycles
+ #define mach_read_cycles() (0)
+#ifdef CONFIG_LATENCY_TIMING
+ #define mach_cycles_to_usecs(d) (d)
+ #define mach_usecs_to_cycles(d) (d)
+#endif
+#endif
+
 static inline cycles_t get_cycles (void)
 {
-	return 0;
+	return mach_read_cycles();
 }
 
 #endif
Index: linux/include/asm-arm/tlb.h
===================================================================
--- linux.orig/include/asm-arm/tlb.h
+++ linux/include/asm-arm/tlb.h
@@ -32,15 +32,17 @@ struct mmu_gather {
 
 	unsigned int		flushes;
 	unsigned int		avoided_flushes;
+	int			cpu;
 };
 
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	int cpu = smp_processor_id();
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, cpu);
+	int cpu;
+	struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu);
+	tlb->cpu = cpu;
 
 	tlb->mm = mm;
 	tlb->freed = 0;
@@ -62,6 +64,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
 
 	if (tlb->fullmm)
 		flush_tlb_mm(mm);
+	put_cpu_var_locked(mmu_gathers, tlb->cpu);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
@@ -99,4 +102,6 @@ tlb_end_vma(struct mmu_gather *tlb, stru
 
 #define tlb_migrate_finish(mm)		do { } while (0)
 
+#define tlb_free(tlb) 			do { (tlb)->freed++; } while (0)	/* bump the gather's freed counter; argument parenthesized so any pointer expression works */
+
 #endif
Index: linux/include/asm-arm/tlbflush.h
===================================================================
--- linux.orig/include/asm-arm/tlbflush.h
+++ linux/include/asm-arm/tlbflush.h
@@ -240,6 +240,7 @@ static inline void local_flush_tlb_all(v
 	const int zero = 0;
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
+	preempt_disable();
 	if (tlb_flag(TLB_WB))
 		asm("mcr%? p15, 0, %0, c7, c10, 4" : : "r" (zero));
 
@@ -251,6 +252,7 @@ static inline void local_flush_tlb_all(v
 		asm("mcr%? p15, 0, %0, c8, c6, 0" : : "r" (zero));
 	if (tlb_flag(TLB_V4_I_FULL | TLB_V6_I_FULL))
 		asm("mcr%? p15, 0, %0, c8, c5, 0" : : "r" (zero));
+	preempt_enable();
 }
 
 static inline void local_flush_tlb_mm(struct mm_struct *mm)
@@ -259,6 +261,7 @@ static inline void local_flush_tlb_mm(st
 	const int asid = ASID(mm);
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
+	preempt_disable();
 	if (tlb_flag(TLB_WB))
 		asm("mcr%? p15, 0, %0, c7, c10, 4" : : "r" (zero));
 
@@ -279,6 +282,7 @@ static inline void local_flush_tlb_mm(st
 		asm("mcr%? p15, 0, %0, c8, c6, 2" : : "r" (asid));
 	if (tlb_flag(TLB_V6_I_ASID))
 		asm("mcr%? p15, 0, %0, c8, c5, 2" : : "r" (asid));
+	preempt_enable();
 }
 
 static inline void
@@ -287,6 +291,7 @@ local_flush_tlb_page(struct vm_area_stru
 	const int zero = 0;
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
+	preempt_disable();
 	uaddr = (uaddr & PAGE_MASK) | ASID(vma->vm_mm);
 
 	if (tlb_flag(TLB_WB))
@@ -311,6 +316,7 @@ local_flush_tlb_page(struct vm_area_stru
 		asm("mcr%? p15, 0, %0, c8, c6, 1" : : "r" (uaddr));
 	if (tlb_flag(TLB_V6_I_PAGE))
 		asm("mcr%? p15, 0, %0, c8, c5, 1" : : "r" (uaddr));
+	preempt_enable();
 }
 
 static inline void local_flush_tlb_kernel_page(unsigned long kaddr)
@@ -318,6 +324,7 @@ static inline void local_flush_tlb_kerne
 	const int zero = 0;
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
+	preempt_disable();
 	kaddr &= PAGE_MASK;
 
 	if (tlb_flag(TLB_WB))
@@ -340,6 +347,7 @@ static inline void local_flush_tlb_kerne
 		asm("mcr%? p15, 0, %0, c8, c6, 1" : : "r" (kaddr));
 	if (tlb_flag(TLB_V6_I_PAGE))
 		asm("mcr%? p15, 0, %0, c8, c5, 1" : : "r" (kaddr));
+	preempt_enable();
 }
 
 /*
@@ -360,21 +368,25 @@ static inline void flush_pmd_entry(pmd_t
 	const unsigned int zero = 0;
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
+	preempt_disable();
 	if (tlb_flag(TLB_DCLEAN))
 		asm("mcr%?	p15, 0, %0, c7, c10, 1	@ flush_pmd"
 			: : "r" (pmd));
 	if (tlb_flag(TLB_WB))
 		asm("mcr%?	p15, 0, %0, c7, c10, 4	@ flush_pmd"
 			: : "r" (zero));
+	preempt_enable();
 }
 
 static inline void clean_pmd_entry(pmd_t *pmd)
 {
 	const unsigned int __tlb_flag = __cpu_tlb_flags;
 
+	preempt_disable();
 	if (tlb_flag(TLB_DCLEAN))
 		asm("mcr%?	p15, 0, %0, c7, c10, 1	@ flush_pmd"
 			: : "r" (pmd));
+	preempt_enable();
 }
 
 #undef tlb_flag
Index: linux/include/asm-arm/unistd.h
===================================================================
--- linux.orig/include/asm-arm/unistd.h
+++ linux/include/asm-arm/unistd.h
@@ -526,6 +526,9 @@ type name(type1 arg1, type2 arg2, type3 
 #define __ARCH_WANT_SYS_SIGPENDING
 #define __ARCH_WANT_SYS_SIGPROCMASK
 #define __ARCH_WANT_SYS_RT_SIGACTION
+
+#define NR_syscalls	328
+
 #endif
 
 #ifdef __KERNEL_SYSCALLS__
Index: linux/include/asm-generic/bug.h
===================================================================
--- linux.orig/include/asm-generic/bug.h
+++ linux/include/asm-generic/bug.h
@@ -16,12 +16,12 @@
 #define BUG_ON(condition) do { if (unlikely((condition)!=0)) BUG(); } while(0)
 #endif
 
+extern void __WARN_ON(const char *func, const char *file, const int line);
+
 #ifndef HAVE_ARCH_WARN_ON
 #define WARN_ON(condition) do { \
-	if (unlikely((condition)!=0)) { \
-		printk("Badness in %s at %s:%d\n", __FUNCTION__, __FILE__, __LINE__); \
-		dump_stack(); \
-	} \
+	if (unlikely((condition)!=0)) \
+		__WARN_ON(__FUNCTION__, __FILE__, __LINE__); \
 } while (0)
 #endif
 
@@ -39,4 +39,26 @@
 #endif
 #endif
 
+#define WARN_ON_ONCE(condition)		\
+do {					\
+	static int warn_once = 1;	/* one flag per expansion of the macro */ \
+	/* short-circuit: once the flag is cleared, condition is no longer evaluated */ \
+	if (warn_once && (condition)) {	\
+		warn_once = 0;		\
+		WARN_ON(1);		\
+	}				\
+} while (0)
+
+#ifdef CONFIG_PREEMPT_RT
+# define BUG_ON_RT(c)			BUG_ON(c)
+# define BUG_ON_NONRT(c)		do { } while (0)	/* compiled out: c is not evaluated */
+# define WARN_ON_RT(condition)		WARN_ON(condition)
+# define WARN_ON_NONRT(condition)	do { } while (0)	/* compiled out: condition is not evaluated */
+#else
+# define BUG_ON_RT(c)			do { } while (0)	/* compiled out: c is not evaluated */
+# define BUG_ON_NONRT(c)		BUG_ON(c)
+# define WARN_ON_RT(condition)		do { } while (0)	/* compiled out: condition is not evaluated */
+# define WARN_ON_NONRT(condition)	WARN_ON(condition)
+#endif /* CONFIG_PREEMPT_RT */
+
 #endif
Index: linux/include/asm-generic/div64.h
===================================================================
--- linux.orig/include/asm-generic/div64.h
+++ linux/include/asm-generic/div64.h
@@ -55,4 +55,13 @@ extern uint32_t __div64_32(uint64_t *div
 
 #endif /* BITS_PER_LONG */
 
+#ifndef div_long_long_rem
+#define div_long_long_rem(dividend,divisor,remainder) \
+({	/* evaluates to the quotient; the remainder is stored through (remainder) */ \
+	u64 __dllr_quot = (dividend);	/* private name: cannot capture a caller's 'result' */ \
+	*(remainder) = do_div(__dllr_quot, (divisor));	/* do_div() is relied on to leave the quotient in its first argument */ \
+	__dllr_quot;					\
+})
+#endif /* div_long_long_rem */
+
 #endif /* _ASM_GENERIC_DIV64_H */
Index: linux/include/asm-generic/percpu.h
===================================================================
--- linux.orig/include/asm-generic/percpu.h
+++ linux/include/asm-generic/percpu.h
@@ -10,11 +10,23 @@ extern unsigned long __per_cpu_offset[NR
 /* Separate out the type, so (int[3], foo) works. */
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+    __attribute__((__section__(".data.percpu"))) spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED(per_cpu_lock__##name##_locked); \
+    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked
 
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset[cpu]))
 #define __get_cpu_var(var) per_cpu(var, smp_processor_id())
 
+#define per_cpu_lock(var, cpu) \
+	(*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset[cpu]))
+#define per_cpu_var_locked(var, cpu) \
+		(*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset[cpu]))
+#define __get_cpu_lock(var, cpu) \
+		per_cpu_lock(var, cpu)
+#define __get_cpu_var_locked(var, cpu) \
+		per_cpu_var_locked(var, cpu)
+
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
@@ -28,15 +40,25 @@ do {								\
 
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+    spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED(per_cpu_lock__##name##_locked); \
+    __typeof__(type) per_cpu__##name##_locked
 
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
+#define __get_cpu_lock(var, cpu)		per_cpu_lock__##var##_locked
+#define __get_cpu_var_locked(var, cpu)		per_cpu__##var##_locked
 
 #endif	/* SMP */
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
+#define DECLARE_PER_CPU_LOCKED(type, name) \
+    extern spinlock_t per_cpu_lock__##name##_locked; \
+    extern __typeof__(type) per_cpu__##name##_locked
 
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL(per_cpu__##var##_locked)
+#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL_GPL(per_cpu__##var##_locked)
 
 #endif /* _ASM_GENERIC_PERCPU_H_ */
Index: linux/include/asm-generic/timeofday.h
===================================================================
--- /dev/null
+++ linux/include/asm-generic/timeofday.h
@@ -0,0 +1,30 @@
+/*  linux/include/asm-generic/timeofday.h
+ *
+ *  This file contains the asm-generic interface
+ *  to the arch specific calls used by the time of day subsystem
+ */
+#ifndef _ASM_GENERIC_TIMEOFDAY_H
+#define _ASM_GENERIC_TIMEOFDAY_H
+#include <linux/types.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+#include <linux/timeofday.h>
+#include <linux/clocksource.h>
+
+#include <asm/div64.h>
+
+#ifdef CONFIG_GENERIC_TIME
+/* Required externs */
+extern nsec_t read_persistent_clock(void);
+extern void sync_persistent_clock(struct timespec ts);
+
+#ifdef CONFIG_GENERIC_TIME_VSYSCALL
+extern void arch_update_vsyscall_gtod(struct timespec wall_time,
+				cycle_t offset_base, struct clocksource* clock,
+				int ntp_adj);
+#else
+# define arch_update_vsyscall_gtod(x,y,z,w) do { } while(0)	/* no-op stub: arguments are not evaluated */
+#endif /* CONFIG_GENERIC_TIME_VSYSCALL */
+
+#endif /* CONFIG_GENERIC_TIME */
+#endif /* _ASM_GENERIC_TIMEOFDAY_H */
Index: linux/include/asm-generic/tlb.h
===================================================================
--- linux.orig/include/asm-generic/tlb.h
+++ linux/include/asm-generic/tlb.h
@@ -45,11 +45,12 @@ struct mmu_gather {
 	unsigned int		need_flush;/* Really unmapped some ptes? */
 	unsigned int		fullmm; /* non-zero means full mm flush */
 	unsigned long		freed;
+	int			cpu;
 	struct page *		pages[FREE_PTE_NR];
 };
 
 /* Users of the generic TLB shootdown code must declare this storage space. */
-DECLARE_PER_CPU(struct mmu_gather, mmu_gathers);
+DECLARE_PER_CPU_LOCKED(struct mmu_gather, mmu_gathers);
 
 /* tlb_gather_mmu
  *	Return a pointer to an initialized struct mmu_gather.
@@ -57,7 +58,9 @@ DECLARE_PER_CPU(struct mmu_gather, mmu_g
 static inline struct mmu_gather *
 tlb_gather_mmu(struct mm_struct *mm, unsigned int full_mm_flush)
 {
-	struct mmu_gather *tlb = &per_cpu(mmu_gathers, smp_processor_id());
+	int cpu;
+	struct mmu_gather *tlb = &get_cpu_var_locked(mmu_gathers, &cpu);
+	tlb->cpu = cpu;
 
 	tlb->mm = mm;
 
@@ -98,6 +101,7 @@ tlb_finish_mmu(struct mmu_gather *tlb, u
 		freed = rss;
 	add_mm_counter(mm, rss, -freed);
 	tlb_flush_mmu(tlb, start, end);
+	put_cpu_var_locked(mmu_gathers, tlb->cpu);
 
 	/* keep the page table cache within bounds */
 	check_pgt_cache();
@@ -109,6 +113,15 @@ tlb_is_full_mm(struct mmu_gather *tlb)
 	return tlb->fullmm;
 }
 
+/* tlb_free
+ *	this counts the number of pages we have to take off the RSS
+ *	at flush time.
+ */
+static inline void tlb_free(struct mmu_gather *tlb)
+{
+	tlb->freed++;
+}
+
 /* tlb_remove_page
  *	Must perform the equivalent to __free_pte(pte_get_and_clear(ptep)), while
  *	handling the additional races in SMP caused by other CPUs caching valid
Index: linux/include/asm-i386/acpi.h
===================================================================
--- linux.orig/include/asm-i386/acpi.h
+++ linux/include/asm-i386/acpi.h
@@ -52,8 +52,8 @@
 
 #define ACPI_ASM_MACROS
 #define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS()  local_irq_enable()
+#define ACPI_DISABLE_IRQS() local_irq_disable_nort()
+#define ACPI_ENABLE_IRQS()  local_irq_enable_nort()
 #define ACPI_FLUSH_CPU_CACHE()	wbinvd()
 
 
Index: linux/include/asm-i386/arch_hooks.h
===================================================================
--- linux.orig/include/asm-i386/arch_hooks.h
+++ linux/include/asm-i386/arch_hooks.h
@@ -14,7 +14,6 @@
 extern void init_ISA_irqs(void);
 extern void apic_intr_init(void);
 extern void smp_intr_init(void);
-extern irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs);
 
 /* these are the defined hooks */
 extern void intr_init_hook(void);
Index: linux/include/asm-i386/atomic.h
===================================================================
--- linux.orig/include/asm-i386/atomic.h
+++ linux/include/asm-i386/atomic.h
@@ -202,10 +202,10 @@ static __inline__ int atomic_add_return(
 
 #ifdef CONFIG_M386
 no_xadd: /* Legacy 386 processor */
-	local_irq_disable();
+	raw_local_irq_disable();
 	__i = atomic_read(v);
 	atomic_set(v, i + __i);
-	local_irq_enable();
+	raw_local_irq_enable();
 	return i + __i;
 #endif
 }
Index: linux/include/asm-i386/bitops.h
===================================================================
--- linux.orig/include/asm-i386/bitops.h
+++ linux/include/asm-i386/bitops.h
@@ -389,7 +389,7 @@ static inline int sched_find_first_bit(c
 		return __ffs(b[1]) + 32;
 	if (unlikely(b[2]))
 		return __ffs(b[2]) + 64;
-	if (b[3])
+	if (unlikely(b[3]))
 		return __ffs(b[3]) + 96;
 	return __ffs(b[4]) + 128;
 }
Index: linux/include/asm-i386/bug.h
===================================================================
--- linux.orig/include/asm-i386/bug.h
+++ linux/include/asm-i386/bug.h
@@ -13,10 +13,13 @@
 #define HAVE_ARCH_BUG
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 #define BUG()				\
+do {					\
+printk("BUG at %s:%d!\n", __FILE__, __LINE__); \
  __asm__ __volatile__(	"ud2\n"		\
 			"\t.word %c0\n"	\
 			"\t.long %c1\n"	\
-			 : : "i" (__LINE__), "i" (__FILE__))
+			 : : "i" (__LINE__), "i" (__FILE__)); \
+} while (0)
 #else
 #define BUG() __asm__ __volatile__("ud2\n")
 #endif
Index: linux/include/asm-i386/current.h
===================================================================
--- linux.orig/include/asm-i386/current.h
+++ linux/include/asm-i386/current.h
@@ -5,11 +5,16 @@
 
 struct task_struct;
 
-static inline struct task_struct * get_current(void)
+static inline struct task_struct * __current(void)
 {
-	return current_thread_info()->task;
+	return __current_thread_info()->task;
 }
- 
-#define current get_current()
+
+#ifndef CURRENT_PTR
+# define current __current()
+#else
+  extern struct task_struct * const ___current;
+# define current ___current
+#endif
 
 #endif /* !(_I386_CURRENT_H) */
Index: linux/include/asm-i386/delay.h
===================================================================
--- linux.orig/include/asm-i386/delay.h
+++ linux/include/asm-i386/delay.h
@@ -23,4 +23,6 @@ extern void __delay(unsigned long loops)
 	((n) > 20000 ? __bad_ndelay() : __const_udelay((n) * 5ul)) : \
 	__ndelay(n))
 
+void use_tsc_delay(void);
+
 #endif /* defined(_I386_DELAY_H) */
Index: linux/include/asm-i386/dma.h
===================================================================
--- linux.orig/include/asm-i386/dma.h
+++ linux/include/asm-i386/dma.h
@@ -135,7 +135,7 @@
 #define DMA_AUTOINIT	0x10
 
 
-extern spinlock_t  dma_spin_lock;
+extern spinlock_t dma_spin_lock;
 
 static __inline__ unsigned long claim_dma_lock(void)
 {
Index: linux/include/asm-i386/highmem.h
===================================================================
--- linux.orig/include/asm-i386/highmem.h
+++ linux/include/asm-i386/highmem.h
@@ -67,14 +67,40 @@ extern void * FASTCALL(kmap_high(struct 
 extern void FASTCALL(kunmap_high(struct page *page));
 
 void *kmap(struct page *page);
+extern void kunmap_virt(void *ptr);
+extern struct page *kmap_to_page(void *ptr);
 void kunmap(struct page *page);
-void *kmap_atomic(struct page *page, enum km_type type);
-void kunmap_atomic(void *kvaddr, enum km_type type);
-void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
-struct page *kmap_atomic_to_page(void *ptr);
+
+void *__kmap_atomic(struct page *page, enum km_type type);
+void __kunmap_atomic(void *kvaddr, enum km_type type);
+void *__kmap_atomic_pfn(unsigned long pfn, enum km_type type);
+struct page *__kmap_atomic_to_page(void *ptr);
 
 #define flush_cache_kmaps()	do { } while (0)
 
+/*
+ * on PREEMPT_RT kmap_atomic() is a wrapper that uses kmap():
+ */
+#ifdef CONFIG_PREEMPT_RT
+# ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+extern int preempt_locks;
+#  define kmap_atomic(page, type)	({ void *__page; if (preempt_locks) __page = kmap(page); else __page = __kmap_atomic(page, type); __page; })
+#  define kmap_atomic_pfn(pfn, type)	({ void *__page; if (preempt_locks) __page = kmap(pfn_to_page(pfn)); else __page = __kmap_atomic_pfn(pfn, type); __page; })
+#  define kunmap_atomic(kvaddr, type)	do { if (preempt_locks) kunmap_virt(kvaddr); else __kunmap_atomic(kvaddr, type); } while (0)
+#  define kmap_atomic_to_page(kvaddr)	({ struct page *__page; if (preempt_locks) __page = kmap_to_page(kvaddr); else __page = __kmap_atomic_to_page(kvaddr); __page; })
+# else
+#  define kmap_atomic(page, type)	kmap(page)
+#  define kmap_atomic_pfn(pfn, type)	kmap(pfn_to_page(pfn))
+#  define kunmap_atomic(kvaddr, type)	kunmap_virt(kvaddr)
+#  define kmap_atomic_to_page(kvaddr)	kmap_to_page(kvaddr)
+# endif
+#else
+# define kmap_atomic(page, type)	__kmap_atomic(page, type)
+# define kmap_atomic_pfn(pfn, type)	__kmap_atomic_pfn(pfn, type)
+# define kunmap_atomic(kvaddr, type)	__kunmap_atomic(kvaddr, type)
+# define kmap_atomic_to_page(kvaddr)	__kmap_atomic_to_page(kvaddr)
+#endif
+
 #endif /* __KERNEL__ */
 
 #endif /* _ASM_HIGHMEM_H */
Index: linux/include/asm-i386/i387.h
===================================================================
--- linux.orig/include/asm-i386/i387.h
+++ linux/include/asm-i386/i387.h
@@ -53,7 +53,7 @@ static inline void __save_init_fpu( stru
 }
 
 #define __unlazy_fpu( tsk ) do { \
-	if ((tsk)->thread_info->status & TS_USEDFPU) \
+	if (unlikely((tsk)->thread_info->status & TS_USEDFPU)) \
 		save_init_fpu( tsk ); \
 } while (0)
 
Index: linux/include/asm-i386/i8253.h
===================================================================
--- linux.orig/include/asm-i386/i8253.h
+++ linux/include/asm-i386/i8253.h
@@ -1,6 +1,6 @@
 #ifndef __ASM_I8253_H__
 #define __ASM_I8253_H__
 
-extern spinlock_t i8253_lock;
+extern raw_spinlock_t i8253_lock;
 
 #endif	/* __ASM_I8253_H__ */
Index: linux/include/asm-i386/i8259.h
===================================================================
--- linux.orig/include/asm-i386/i8259.h
+++ linux/include/asm-i386/i8259.h
@@ -7,7 +7,7 @@ extern unsigned int cached_irq_mask;
 #define cached_master_mask	(__byte(0, cached_irq_mask))
 #define cached_slave_mask	(__byte(1, cached_irq_mask))
 
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
 
 extern void init_8259A(int auto_eoi);
 extern void enable_8259A_irq(unsigned int irq);
Index: linux/include/asm-i386/io_apic.h
===================================================================
--- linux.orig/include/asm-i386/io_apic.h
+++ linux/include/asm-i386/io_apic.h
@@ -16,7 +16,6 @@
 #ifdef CONFIG_PCI_MSI
 static inline int use_pci_vector(void)	{return 1;}
 static inline void disable_edge_ioapic_vector(unsigned int vector) { }
-static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
 static inline void end_edge_ioapic_vector (unsigned int vector) { }
 #define startup_level_ioapic	startup_level_ioapic_vector
 #define shutdown_level_ioapic	mask_IO_APIC_vector
@@ -35,7 +34,6 @@ static inline void end_edge_ioapic_vecto
 #else
 static inline int use_pci_vector(void)	{return 0;}
 static inline void disable_edge_ioapic_irq(unsigned int irq) { }
-static inline void mask_and_ack_level_ioapic_irq(unsigned int irq) { }
 static inline void end_edge_ioapic_irq (unsigned int irq) { }
 #define startup_level_ioapic	startup_level_ioapic_irq
 #define shutdown_level_ioapic	mask_IO_APIC_irq
@@ -103,7 +101,7 @@ union IO_APIC_reg_03 {
  * # of IO-APICs and # of IRQ routing registers
  */
 extern int nr_ioapics;
-extern int nr_ioapic_registers[MAX_IO_APICS];
+extern int nr_ioapic_registers(int apic);
 
 enum ioapic_irq_destination_types {
 	dest_Fixed = 0,
@@ -160,31 +158,8 @@ extern struct mpc_config_intsrc mp_irqs[
 /* non-0 if default (table-less) MP configuration */
 extern int mpc_default_type;
 
-static inline unsigned int io_apic_read(unsigned int apic, unsigned int reg)
-{
-	*IO_APIC_BASE(apic) = reg;
-	return *(IO_APIC_BASE(apic)+4);
-}
-
-static inline void io_apic_write(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	*IO_APIC_BASE(apic) = reg;
-	*(IO_APIC_BASE(apic)+4) = value;
-}
-
-/*
- * Re-write a value: to be used for read-modify-write
- * cycles where the read already set up the index register.
- *
- * Older SiS APIC requires we rewrite the index regiser
- */
+extern void setup_IO_APIC_early(int ioapic);
 extern int sis_apic_bug;
-static inline void io_apic_modify(unsigned int apic, unsigned int reg, unsigned int value)
-{
-	if (sis_apic_bug)
-		*IO_APIC_BASE(apic) = reg;
-	*(IO_APIC_BASE(apic)+4) = value;
-}
 
 /* 1 if "noapic" boot option passed */
 extern int skip_ioapic_setup;
@@ -204,8 +179,11 @@ extern int io_apic_set_pci_routing (int 
 
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 
+extern void io_apic_timer_ack(void *);
+
 #else  /* !CONFIG_X86_IO_APIC */
 #define io_apic_assign_pci_irqs 0
+#define io_apic_timer_ack NULL
 #endif
 
 extern int assign_irq_vector(int irq);
Index: linux/include/asm-i386/mach-default/do_timer.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/do_timer.h
+++ linux/include/asm-i386/mach-default/do_timer.h
@@ -1,86 +1,2 @@
 /* defines for inline arch setup functions */
 
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/**
- * do_timer_interrupt_hook - hook into timer tick
- * @regs:	standard registers from interrupt
- *
- * Description:
- *	This hook is called immediately after the timer interrupt is ack'd.
- *	It's primary purpose is to allow architectures that don't possess
- *	individual per CPU clocks (like the CPU APICs supply) to broadcast the
- *	timer interrupt as a means of triggering reschedules etc.
- **/
-
-static inline void do_timer_interrupt_hook(struct pt_regs *regs)
-{
-	do_timer(regs);
-#ifndef CONFIG_SMP
-	update_process_times(user_mode(regs));
-#endif
-/*
- * In the SMP case we use the local APIC timer interrupt to do the
- * profiling, except when we simulate SMP mode on a uniprocessor
- * system, in that case we have to call the local interrupt handler.
- */
-#ifndef CONFIG_X86_LOCAL_APIC
-	profile_tick(CPU_PROFILING, regs);
-#else
-	if (!using_apic_timer)
-		smp_local_timer_interrupt(regs);
-#endif
-}
-
-
-/* you can safely undefine this if you don't have the Neptune chipset */
-
-#define BUGGY_NEPTUN_TIMER
-
-/**
- * do_timer_overflow - process a detected timer overflow condition
- * @count:	hardware timer interrupt count on overflow
- *
- * Description:
- *	This call is invoked when the jiffies count has not incremented but
- *	the hardware timer interrupt has.  It means that a timer tick interrupt
- *	came along while the previous one was pending, thus a tick was missed
- **/
-static inline int do_timer_overflow(int count)
-{
-	int i;
-
-	spin_lock(&i8259A_lock);
-	/*
-	 * This is tricky when I/O APICs are used;
-	 * see do_timer_interrupt().
-	 */
-	i = inb(0x20);
-	spin_unlock(&i8259A_lock);
-	
-	/* assumption about timer being IRQ0 */
-	if (i & 0x01) {
-		/*
-		 * We cannot detect lost timer interrupts ... 
-		 * well, that's why we call them lost, don't we? :)
-		 * [hmm, on the Pentium and Alpha we can ... sort of]
-		 */
-		count -= LATCH;
-	} else {
-#ifdef BUGGY_NEPTUN_TIMER
-		/*
-		 * for the Neptun bug we know that the 'latch'
-		 * command doesn't latch the high and low value
-		 * of the counter atomically. Thus we have to 
-		 * substract 256 from the counter 
-		 * ... funny, isnt it? :)
-		 */
-		
-		count -= 256;
-#else
-		printk("do_slow_gettimeoffset(): hardware timer problem?\n");
-#endif
-	}
-	return count;
-}
Index: linux/include/asm-i386/mach-default/irq_vectors.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/irq_vectors.h
+++ linux/include/asm-i386/mach-default/irq_vectors.h
@@ -63,7 +63,7 @@
  * levels. (0x80 is the syscall vector)
  */
 #define FIRST_DEVICE_VECTOR	0x31
-#define FIRST_SYSTEM_VECTOR	0xef
+#define FIRST_SYSTEM_VECTOR	0xee
 
 #define TIMER_IRQ 0
 
Index: linux/include/asm-i386/mach-default/mach_timer.h
===================================================================
--- linux.orig/include/asm-i386/mach-default/mach_timer.h
+++ linux/include/asm-i386/mach-default/mach_timer.h
@@ -15,7 +15,9 @@
 #ifndef _MACH_TIMER_H
 #define _MACH_TIMER_H
 
-#define CALIBRATE_LATCH	(5 * LATCH)
+#define CALIBRATE_TIME_MSEC 30 /* 30 msecs */
+#define CALIBRATE_LATCH	\
+	((CLOCK_TICK_RATE * CALIBRATE_TIME_MSEC + 1000/2)/1000)
 
 static inline void mach_prepare_counter(void)
 {
Index: linux/include/asm-i386/mach-summit/mach_mpparse.h
===================================================================
--- linux.orig/include/asm-i386/mach-summit/mach_mpparse.h
+++ linux/include/asm-i386/mach-summit/mach_mpparse.h
@@ -2,6 +2,7 @@
 #define __ASM_MACH_MPPARSE_H
 
 #include <mach_apic.h>
+#include <asm/tsc.h>
 
 extern int use_cyclone;
 
@@ -30,6 +31,7 @@ static inline int mps_oem_check(struct m
 			(!strncmp(productid, "VIGIL SMP", 9) 
 			 || !strncmp(productid, "EXA", 3)
 			 || !strncmp(productid, "RUTHLESS SMP", 12))){
+		mark_tsc_unstable();
 		use_cyclone = 1; /*enable cyclone-timer*/
 		setup_summit();
 		usb_early_handoff = 1;
@@ -44,6 +46,7 @@ static inline int acpi_madt_oem_check(ch
 	if (!strncmp(oem_id, "IBM", 3) &&
 	    (!strncmp(oem_table_id, "SERVIGIL", 8)
 	     || !strncmp(oem_table_id, "EXA", 3))){
+		mark_tsc_unstable();
 		use_cyclone = 1; /*enable cyclone-timer*/
 		setup_summit();
 		usb_early_handoff = 1;
Index: linux/include/asm-i386/mc146818rtc.h
===================================================================
--- linux.orig/include/asm-i386/mc146818rtc.h
+++ linux/include/asm-i386/mc146818rtc.h
@@ -65,11 +65,11 @@ static inline unsigned char current_lock
 #define lock_cmos_prefix(reg) \
 	do {					\
 		unsigned long cmos_flags;	\
-		local_irq_save(cmos_flags);	\
+		raw_local_irq_save(cmos_flags);	\
 		lock_cmos(reg)
 #define lock_cmos_suffix(reg) \
 		unlock_cmos();			\
-		local_irq_restore(cmos_flags);	\
+		raw_local_irq_restore(cmos_flags); \
 	} while (0)
 #else
 #define lock_cmos_prefix(reg) do {} while (0)
Index: linux/include/asm-i386/pgtable.h
===================================================================
--- linux.orig/include/asm-i386/pgtable.h
+++ linux/include/asm-i386/pgtable.h
@@ -34,7 +34,7 @@ extern unsigned long empty_zero_page[102
 extern pgd_t swapper_pg_dir[1024];
 extern kmem_cache_t *pgd_cache;
 extern kmem_cache_t *pmd_cache;
-extern spinlock_t pgd_lock;
+extern raw_spinlock_t pgd_lock;
 extern struct page *pgd_list;
 
 void pmd_ctor(void *, kmem_cache_t *, unsigned long);
Index: linux/include/asm-i386/rwsem.h
===================================================================
--- linux.orig/include/asm-i386/rwsem.h
+++ linux/include/asm-i386/rwsem.h
@@ -43,15 +43,15 @@
 
 struct rwsem_waiter;
 
-extern struct rw_semaphore *FASTCALL(rwsem_down_read_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_down_write_failed(struct rw_semaphore *sem));
-extern struct rw_semaphore *FASTCALL(rwsem_wake(struct rw_semaphore *));
-extern struct rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct rw_semaphore *sem));
+extern struct compat_rw_semaphore *FASTCALL(rwsem_down_read_failed(struct compat_rw_semaphore *sem));
+extern struct compat_rw_semaphore *FASTCALL(rwsem_down_write_failed(struct compat_rw_semaphore *sem));
+extern struct compat_rw_semaphore *FASTCALL(rwsem_wake(struct compat_rw_semaphore *));
+extern struct compat_rw_semaphore *FASTCALL(rwsem_downgrade_wake(struct compat_rw_semaphore *sem));
 
 /*
  * the semaphore definition
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	signed long		count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
@@ -76,13 +76,13 @@ struct rw_semaphore {
 #endif
 
 #define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \
+{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED((name).wait_lock), LIST_HEAD_INIT((name).wait_list) \
 	__RWSEM_DEBUG_INIT }
 
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name) \
+	struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-static inline void init_rwsem(struct rw_semaphore *sem)
+static inline void compat_init_rwsem(struct compat_rw_semaphore *sem)
 {
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	spin_lock_init(&sem->wait_lock);
@@ -95,7 +95,7 @@ static inline void init_rwsem(struct rw_
 /*
  * lock for reading
  */
-static inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning down_read\n\t"
@@ -120,7 +120,7 @@ LOCK_PREFIX	"  incl      (%%eax)\n\t" /*
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-static inline int __down_read_trylock(struct rw_semaphore *sem)
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	__s32 result, tmp;
 	__asm__ __volatile__(
@@ -143,7 +143,7 @@ LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
 /*
  * lock for writing
  */
-static inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -170,7 +170,7 @@ LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-static inline int __down_write_trylock(struct rw_semaphore *sem)
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	signed long ret = cmpxchg(&sem->count,
 				  RWSEM_UNLOCKED_VALUE, 
@@ -183,7 +183,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-static inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct compat_rw_semaphore *sem)
 {
 	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
 	__asm__ __volatile__(
@@ -209,7 +209,7 @@ LOCK_PREFIX	"  xadd      %%edx,(%%eax)\n
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning __up_write\n\t"
@@ -235,7 +235,7 @@ LOCK_PREFIX	"  xaddl     %%edx,(%%eax)\n
 /*
  * downgrade write lock to read lock
  */
-static inline void __downgrade_write(struct rw_semaphore *sem)
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning __downgrade_write\n\t"
@@ -260,7 +260,7 @@ LOCK_PREFIX	"  addl      %2,(%%eax)\n\t"
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 LOCK_PREFIX	"addl %1,%0"
@@ -271,7 +271,7 @@ LOCK_PREFIX	"addl %1,%0"
 /*
  * implement exchange and add functionality
  */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
 {
 	int tmp = delta;
 
Index: linux/include/asm-i386/semaphore.h
===================================================================
--- linux.orig/include/asm-i386/semaphore.h
+++ linux/include/asm-i386/semaphore.h
@@ -1,10 +1,9 @@
 #ifndef _I386_SEMAPHORE_H
 #define _I386_SEMAPHORE_H
 
+#include <linux/config.h>
 #include <linux/linkage.h>
 
-#ifdef __KERNEL__
-
 /*
  * SMP- and interrupt-safe semaphores..
  *
@@ -41,33 +40,40 @@
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.sleepers	= 0,						\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __MUTEX_INITIALIZER(name) \
-	__SEMAPHORE_INITIALIZER(name,1)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name,1)
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
-#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
+#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1)
+#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0)
 
-static inline void sema_init (struct semaphore *sem, int val)
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 /*
- *	*sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
+ *	*sem = (struct compat_semaphore)__COMPAT_SEMAPHORE_INITIALIZER((*sem),val);
  *
  * i'd rather use the more flexible initialization above, but sadly
  * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well.
@@ -77,27 +83,27 @@ static inline void sema_init (struct sem
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-fastcall void __down_failed(void /* special register calling convention */);
-fastcall int  __down_failed_interruptible(void  /* params in registers */);
-fastcall int  __down_failed_trylock(void  /* params in registers */);
-fastcall void __up_wakeup(void /* special register calling convention */);
+fastcall void __compat_down_failed(void /* special register calling convention */);
+fastcall int  __compat_down_failed_interruptible(void  /* params in registers */);
+fastcall int  __compat_down_failed_trylock(void  /* params in registers */);
+fastcall void __compat_up_wakeup(void /* special register calling convention */);
 
 /*
  * This is ugly, but we want the default case to fall through.
  * "__down_failed" is a special asm handler that calls the C
  * routine that actually waits. See arch/i386/kernel/semaphore.c
  */
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 	__asm__ __volatile__(
@@ -107,7 +113,7 @@ static inline void down(struct semaphore
 		"1:\n"
 		LOCK_SECTION_START("")
 		"2:\tlea %0,%%eax\n\t"
-		"call __down_failed\n\t"
+		"call __compat_down_failed\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=m" (sem->count)
@@ -119,7 +125,7 @@ static inline void down(struct semaphore
  * Interruptible try to acquire a semaphore.  If we obtained
  * it, return zero.  If we were interrupted, returns -EINTR
  */
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -132,7 +138,7 @@ static inline int down_interruptible(str
 		"1:\n"
 		LOCK_SECTION_START("")
 		"2:\tlea %1,%%eax\n\t"
-		"call __down_failed_interruptible\n\t"
+		"call __compat_down_failed_interruptible\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=a" (result), "=m" (sem->count)
@@ -145,7 +151,7 @@ static inline int down_interruptible(str
  * Non-blockingly attempt to down() a semaphore.
  * Returns zero if we acquired it
  */
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -157,7 +163,7 @@ static inline int down_trylock(struct se
 		"1:\n"
 		LOCK_SECTION_START("")
 		"2:\tlea %1,%%eax\n\t"
-		"call __down_failed_trylock\n\t"
+		"call __compat_down_failed_trylock\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=a" (result), "=m" (sem->count)
@@ -172,7 +178,7 @@ static inline int down_trylock(struct se
  * The default case (no contention) will result in NO
  * jumps for both down() and up().
  */
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	__asm__ __volatile__(
 		"# atomic up operation\n\t"
@@ -181,7 +187,7 @@ static inline void up(struct semaphore *
 		"1:\n"
 		LOCK_SECTION_START("")
 		"2:\tlea %0,%%eax\n\t"
-		"call __up_wakeup\n\t"
+		"call __compat_up_wakeup\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		".subsection 0\n"
@@ -190,5 +196,10 @@ static inline void up(struct semaphore *
 		:"memory","ax");
 }
 
-#endif
+extern int FASTCALL(compat_sem_is_locked(struct compat_semaphore *sem));
+
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+#include <linux/semaphore.h>
+
 #endif
Index: linux/include/asm-i386/spinlock.h
===================================================================
--- linux.orig/include/asm-i386/spinlock.h
+++ linux/include/asm-i386/spinlock.h
@@ -34,7 +34,7 @@
 
 #define __raw_spin_lock_string_flags \
 	"\n1:\t" \
-	"lock ; decb %0\n\t" \
+	LOCK_PREFIX "decb %0\n\t" \
 	"jns 4f\n\t" \
 	"2:\t" \
 	"testl $0x200, %1\n\t" \
@@ -48,21 +48,21 @@
 	"jmp 1b\n" \
 	"4:\n\t"
 
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static inline void __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 		__raw_spin_lock_string
 		:"=m" (lock->slock) : : "memory");
 }
 
-static inline void __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
+static inline void __raw_spin_lock_flags(__raw_spinlock_t *lock, unsigned long flags)
 {
 	__asm__ __volatile__(
 		__raw_spin_lock_string_flags
 		:"=m" (lock->slock) : "r" (flags) : "memory");
 }
 
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static inline int __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	char oldval;
 	__asm__ __volatile__(
@@ -86,7 +86,7 @@ static inline int __raw_spin_trylock(raw
 		:"=m" (lock->slock) : : "memory"
 
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 		__raw_spin_unlock_string
@@ -100,7 +100,7 @@ static inline void __raw_spin_unlock(raw
 		:"=q" (oldval), "=m" (lock->slock) \
 		:"0" (oldval) : "memory"
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	char oldval = 1;
 
@@ -147,17 +147,17 @@ static inline void __raw_spin_unlock(raw
  */
 #define __raw_write_can_lock(x)		((x)->lock == RW_LOCK_BIAS)
 
-static inline void __raw_read_lock(raw_rwlock_t *rw)
+static inline void __raw_read_lock(__raw_rwlock_t *rw)
 {
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void __raw_write_lock(raw_rwlock_t *rw)
+static inline void __raw_write_lock(__raw_rwlock_t *rw)
 {
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
+static inline int __raw_read_trylock(__raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	atomic_dec(count);
@@ -167,7 +167,7 @@ static inline int __raw_read_trylock(raw
 	return 0;
 }
 
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
+static inline int __raw_write_trylock(__raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
@@ -176,12 +176,12 @@ static inline int __raw_write_trylock(ra
 	return 0;
 }
 
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
+static inline void __raw_read_unlock(__raw_rwlock_t *rw)
 {
 	asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
 }
 
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
+static inline void __raw_write_unlock(__raw_rwlock_t *rw)
 {
 	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ", %0"
 				 : "=m" (rw->lock) : : "memory");
Index: linux/include/asm-i386/spinlock_types.h
===================================================================
--- linux.orig/include/asm-i386/spinlock_types.h
+++ linux/include/asm-i386/spinlock_types.h
@@ -7,13 +7,13 @@
 
 typedef struct {
 	volatile unsigned int slock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
 
 typedef struct {
 	volatile unsigned int lock;
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
 
Index: linux/include/asm-i386/system.h
===================================================================
--- linux.orig/include/asm-i386/system.h
+++ linux/include/asm-i386/system.h
@@ -459,24 +459,32 @@ struct alt_instr { 
 #define set_wmb(var, value) do { var = value; wmb(); } while (0)
 
 /* interrupt control.. */
-#define local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
-#define local_irq_restore(x) 	do { typecheck(unsigned long,x); __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
-#define local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
-#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+
+#define __raw_local_save_flags(x)	do { typecheck(unsigned long,x); __asm__ __volatile__("pushfl ; popl %0":"=g" (x): /* no input */); } while (0)
+#define __raw_local_irq_restore(x) 	do { __asm__ __volatile__("pushl %0 ; popfl": /* no output */ :"g" (x):"memory", "cc"); } while (0)
+#define __raw_local_irq_disable() 	do { __asm__ __volatile__("cli": : :"memory"); } while (0)
+#define __raw_local_irq_enable()	do { __asm__ __volatile__("sti": : :"memory"); } while (0)
 /* used in the idle loop; sti takes one instruction cycle to complete */
-#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
-/* used when interrupts are already enabled or to shutdown the processor */
-#define halt()			__asm__ __volatile__("hlt": : :"memory")
-
-#define irqs_disabled()			\
-({					\
-	unsigned long flags;		\
-	local_save_flags(flags);	\
-	!(flags & (1<<9));		\
+#define __raw_safe_halt()		do { trace_irqs_on(); __asm__ __volatile__("sti; hlt": : :"memory"); } while (0)
+#define halt()				__asm__ __volatile__("hlt": : :"memory")
+
+
+#define __raw_irqs_disabled_flags(flags)	\
+({	/* true when bit 9 (IF) of a saved EFLAGS value is clear */	\
+	!((flags) & (1<<9));			\
+})
+
+#define __raw_irqs_disabled()			\
+({						\
+	unsigned long flags;			\
+	__raw_local_save_flags(flags);		\
+	__raw_irqs_disabled_flags(flags);	\
 })
 
 /* For spinlocks etc */
-#define local_irq_save(x)	__asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory")
+#define __raw_local_irq_save(x)	do { __asm__ __volatile__("pushfl ; popl %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0)
+
+#include <linux/rt_irq.h>
 
 /*
  * disable hlt during certain critical i/o operations
Index: linux/include/asm-i386/thread_info.h
===================================================================
--- linux.orig/include/asm-i386/thread_info.h
+++ linux/include/asm-i386/thread_info.h
@@ -83,15 +83,28 @@ struct thread_info {
 #define init_thread_info	(init_thread_union.thread_info)
 #define init_stack		(init_thread_union.stack)
 
+#ifndef CONFIG_SMP
+// # define CURRENT_PTR
+#endif
 
 /* how to get the thread information struct from C */
-static inline struct thread_info *current_thread_info(void)
+static inline struct thread_info *__current_thread_info(void)
 {
 	struct thread_info *ti;
 	__asm__("andl %%esp,%0; ":"=r" (ti) : "0" (~(THREAD_SIZE - 1)));
 	return ti;
 }
 
+#ifndef CURRENT_PTR
+static inline struct thread_info *current_thread_info(void)
+{
+	return __current_thread_info();
+}
+#else
+extern struct thread_info * const current_ti;
+# define current_thread_info() current_ti
+#endif
+
 /* how to get the current stack pointer from C */
 register unsigned long current_stack_pointer asm("esp") __attribute_used__;
 
@@ -144,11 +157,14 @@ register unsigned long current_stack_poi
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		17
+#define TIF_NEED_RESCHED_DELAYED 18	/* reschedule on return to userspace */
+
 
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_SINGLESTEP		(1<<TIF_SINGLESTEP)
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_EMU	(1<<TIF_SYSCALL_EMU)
Index: linux/include/asm-i386/timeofday.h
===================================================================
--- /dev/null
+++ linux/include/asm-i386/timeofday.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_I386_TIMEOFDAY_H
+#define _ASM_I386_TIMEOFDAY_H
+#include <asm-generic/timeofday.h>
+#endif
Index: linux/include/asm-i386/timer.h
===================================================================
--- linux.orig/include/asm-i386/timer.h
+++ linux/include/asm-i386/timer.h
@@ -3,68 +3,10 @@
 #include <linux/init.h>
 #include <linux/pm.h>
 
-/**
- * struct timer_ops - used to define a timer source
- *
- * @name: name of the timer.
- * @init: Probes and initializes the timer. Takes clock= override 
- *        string as an argument. Returns 0 on success, anything else
- *        on failure.
- * @mark_offset: called by the timer interrupt.
- * @get_offset:  called by gettimeofday(). Returns the number of microseconds
- *               since the last timer interupt.
- * @monotonic_clock: returns the number of nanoseconds since the init of the
- *                   timer.
- * @delay: delays this many clock cycles.
- */
-struct timer_opts {
-	char* name;
-	void (*mark_offset)(void);
-	unsigned long (*get_offset)(void);
-	unsigned long long (*monotonic_clock)(void);
-	void (*delay)(unsigned long);
-	unsigned long (*read_timer)(void);
-	int (*suspend)(pm_message_t state);
-	int (*resume)(void);
-};
-
-struct init_timer_opts {
-	int (*init)(char *override);
-	struct timer_opts *opts;
-};
-
 #define TICK_SIZE (tick_nsec / 1000)
-
-extern struct timer_opts* __init select_timer(void);
-extern void clock_fallback(void);
 void setup_pit_timer(void);
-
 /* Modifiers for buggy PIT handling */
-
 extern int pit_latch_buggy;
-
-extern struct timer_opts *cur_timer;
-extern int timer_ack;
-
-/* list of externed timers */
-extern struct timer_opts timer_none;
-extern struct timer_opts timer_pit;
-extern struct init_timer_opts timer_pit_init;
-extern struct init_timer_opts timer_tsc_init;
-#ifdef CONFIG_X86_CYCLONE_TIMER
-extern struct init_timer_opts timer_cyclone_init;
-#endif
-
-extern unsigned long calibrate_tsc(void);
-extern unsigned long read_timer_tsc(void);
-extern void init_cpu_khz(void);
 extern int recalibrate_cpu_khz(void);
-#ifdef CONFIG_HPET_TIMER
-extern struct init_timer_opts timer_hpet_init;
-extern unsigned long calibrate_tsc_hpet(unsigned long *tsc_hpet_quotient_ptr);
-#endif
 
-#ifdef CONFIG_X86_PM_TIMER
-extern struct init_timer_opts timer_pmtmr_init;
-#endif
 #endif
Index: linux/include/asm-i386/timex.h
===================================================================
--- linux.orig/include/asm-i386/timex.h
+++ linux/include/asm-i386/timex.h
@@ -8,6 +8,7 @@
 
 #include <linux/config.h>
 #include <asm/processor.h>
+#include <asm/tsc.h>
 
 #ifdef CONFIG_X86_ELAN
 #  define CLOCK_TICK_RATE 1189200 /* AMD Elan has different frequency! */
@@ -16,40 +17,12 @@
 #endif
 
 
+extern int read_current_timer(unsigned long *timer_value);
 /*
- * Standard way to access the cycle counter on i586+ CPUs.
- * Currently only used on SMP.
- *
- * If you really have a SMP machine with i486 chips or older,
- * compile for that, and this will just always return zero.
- * That's ok, it just means that the nicer scheduling heuristics
- * won't work for you.
- *
- * We only use the low 32 bits, and we'd simply better make sure
- * that we reschedule before that wraps. Scheduling at least every
- * four billion cycles just basically sounds like a good idea,
- * regardless of how fast the machine is. 
+ * On an Athlon64 the cycles-based estimator is off by a
+ * factor of 2: udelay(100) takes 200 usecs. With the non-TSC
+ * based estimator the timings are precise. So turn it off.
  */
-typedef unsigned long long cycles_t;
-
-static inline cycles_t get_cycles (void)
-{
-	unsigned long long ret=0;
-
-#ifndef CONFIG_X86_TSC
-	if (!cpu_has_tsc)
-		return 0;
-#endif
-
-#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
-	rdtscll(ret);
-#endif
-	return ret;
-}
-
-extern unsigned int cpu_khz;
-
-extern int read_current_timer(unsigned long *timer_value);
 #define ARCH_HAS_READ_CURRENT_TIMER	1
 
 #endif
Index: linux/include/asm-i386/tlbflush.h
===================================================================
--- linux.orig/include/asm-i386/tlbflush.h
+++ linux/include/asm-i386/tlbflush.h
@@ -5,15 +5,32 @@
 #include <linux/mm.h>
 #include <asm/processor.h>
 
+/*
+ * TLB-flush needs to be nonpreemptible on PREEMPT_RT due to the
+ * following complex race scenario:
+ *
+ * if the current task is lazy-TLB and does a TLB flush and
+ * gets preempted after the movl %%cr3, %0 but before the
+ * movl %0, %%cr3 then its ->active_mm might change and it will
+ * install the wrong cr3 when it switches back. This is not a
+ * problem for the lazy-TLB task itself, but if the next task it
+ * switches to has an ->mm that is also the lazy-TLB task's
+ * new ->active_mm, then the scheduler will assume that cr3 is
+ * the new one, while we overwrote it with the old one. The result
+ * is the wrong cr3 in the new (non-lazy-TLB) task, which typically
+ * causes an infinite pagefault upon the next userspace access.
+ */
 #define __flush_tlb()							\
 	do {								\
 		unsigned int tmpreg;					\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movl %%cr3, %0;              \n"		\
 			"movl %0, %%cr3;  # flush TLB \n"		\
 			: "=r" (tmpreg)					\
 			:: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 /*
@@ -24,6 +41,7 @@
 	do {								\
 		unsigned int tmpreg, cr4, cr4_orig;			\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movl %%cr4, %2;  # turn off PGE     \n"	\
 			"movl %2, %1;                        \n"	\
@@ -35,6 +53,7 @@
 			: "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig)	\
 			: "i" (~X86_CR4_PGE)				\
 			: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 extern unsigned long pgkern_mask;
@@ -87,6 +106,13 @@ extern unsigned long pgkern_mask;
 
 static inline void flush_tlb_mm(struct mm_struct *mm)
 {
+	/*
+	 * This is safe on PREEMPT_RT because if we preempt
+	 * right after the check but before the __flush_tlb(),
+	 * and if ->active_mm changes, then we might miss a
+	 * TLB flush, but that TLB flush happened already when
+	 * ->active_mm was changed:
+	 */
 	if (mm == current->active_mm)
 		__flush_tlb();
 }
Index: linux/include/asm-i386/tsc.h
===================================================================
--- /dev/null
+++ linux/include/asm-i386/tsc.h
@@ -0,0 +1,50 @@
+/*
+ * linux/include/asm-i386/tsc.h
+ *
+ * i386 TSC related functions
+ */
+#ifndef _ASM_i386_TSC_H
+#define _ASM_i386_TSC_H
+
+#include <linux/config.h>
+#include <asm/processor.h>
+
+/*
+ * Standard way to access the cycle counter on i586+ CPUs.
+ * Currently only used on SMP.
+ *
+ * If you really have a SMP machine with i486 chips or older,
+ * compile for that, and this will just always return zero.
+ * That's ok, it just means that the nicer scheduling heuristics
+ * won't work for you.
+ *
+ * We only use the low 32 bits, and we'd simply better make sure
+ * that we reschedule before that wraps. Scheduling at least every
+ * four billion cycles just basically sounds like a good idea,
+ * regardless of how fast the machine is.
+ */
+typedef unsigned long long cycles_t;
+
+extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+
+static inline cycles_t get_cycles(void)
+{
+	unsigned long long ret = 0;
+
+#ifndef CONFIG_X86_TSC
+	if (!cpu_has_tsc)
+		return 0;
+#endif
+
+#if defined(CONFIG_X86_GENERIC) || defined(CONFIG_X86_TSC)
+	rdtscll(ret);
+#endif
+	return ret;
+}
+
+extern void tsc_init(void);
+extern void tsc_c3_compensate(unsigned long usecs);
+extern void mark_tsc_unstable(void);
+
+#endif
Index: linux/include/asm-i386/xor.h
===================================================================
--- linux.orig/include/asm-i386/xor.h
+++ linux/include/asm-i386/xor.h
@@ -862,7 +862,21 @@ static struct xor_block_template xor_blo
 #include <asm-generic/xor.h>
 
 #undef XOR_TRY_TEMPLATES
-#define XOR_TRY_TEMPLATES				\
+/*
+ * MMX/SSE ops disable preemption for long periods of time,
+ * so on PREEMPT_RT use the register-based ops only:
+ */
+#ifdef CONFIG_PREEMPT_RT
+# define XOR_TRY_TEMPLATES				\
+	do {						\
+		xor_speed(&xor_block_8regs);		\
+		xor_speed(&xor_block_8regs_p);		\
+		xor_speed(&xor_block_32regs);		\
+		xor_speed(&xor_block_32regs_p);		\
+	} while (0)
+# define XOR_SELECT_TEMPLATE(FASTEST) (FASTEST)
+#else
+# define XOR_TRY_TEMPLATES				\
 	do {						\
 		xor_speed(&xor_block_8regs);		\
 		xor_speed(&xor_block_8regs_p);		\
@@ -875,9 +889,10 @@ static struct xor_block_template xor_blo
 	                xor_speed(&xor_block_p5_mmx);	\
 	        }					\
 	} while (0)
-
 /* We force the use of the SSE xor block because it can write around L2.
    We may also be able to load into the L1 only depending on how the cpu
    deals with a load to a line that is being prefetched.  */
-#define XOR_SELECT_TEMPLATE(FASTEST) \
+# define XOR_SELECT_TEMPLATE(FASTEST) \
 	(cpu_has_xmm ? &xor_block_pIII_sse : FASTEST)
+#endif
+
Index: linux/include/asm-mips/asmmacro.h
===================================================================
--- linux.orig/include/asm-mips/asmmacro.h
+++ linux/include/asm-mips/asmmacro.h
@@ -18,14 +18,14 @@
 #include <asm/asmmacro-64.h>
 #endif
 
-	.macro	local_irq_enable reg=t0
+	.macro	mips_raw_local_irq_enable reg=t0
 	mfc0	\reg, CP0_STATUS
 	ori	\reg, \reg, 1
 	mtc0	\reg, CP0_STATUS
 	irq_enable_hazard
 	.endm
 
-	.macro	local_irq_disable reg=t0
+	.macro	mips_raw_local_irq_disable reg=t0
 	mfc0	\reg, CP0_STATUS
 	ori	\reg, \reg, 1
 	xori	\reg, \reg, 1
Index: linux/include/asm-mips/atomic.h
===================================================================
--- linux.orig/include/asm-mips/atomic.h
+++ linux/include/asm-mips/atomic.h
@@ -18,15 +18,20 @@
  * main big wrapper ...
  */
 #include <linux/config.h>
-#include <linux/spinlock.h>
 
 #ifndef _ASM_ATOMIC_H
 #define _ASM_ATOMIC_H
 
 #include <asm/cpu-features.h>
 #include <asm/war.h>
+#include <asm/types.h>
+
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+
+#include <linux/spinlock.h>
+extern raw_spinlock_t atomic_lock;
 
-extern spinlock_t atomic_lock;
+#endif
 
 typedef struct { volatile int counter; } atomic_t;
 
@@ -78,13 +83,16 @@ static __inline__ void atomic_add(int i,
 		"	beqz	%0, 1b					\n"
 		: "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter));
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
 		v->counter += i;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 }
 
 /*
@@ -116,13 +124,16 @@ static __inline__ void atomic_sub(int i,
 		"	beqz	%0, 1b					\n"
 		: "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter));
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
 		v->counter -= i;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 }
 
 /*
@@ -158,7 +169,9 @@ static __inline__ int atomic_add_return(
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
@@ -167,6 +180,7 @@ static __inline__ int atomic_add_return(
 		v->counter = result;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 
 	return result;
 }
@@ -201,7 +215,9 @@ static __inline__ int atomic_sub_return(
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
@@ -210,6 +226,7 @@ static __inline__ int atomic_sub_return(
 		v->counter = result;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 
 	return result;
 }
@@ -253,7 +270,9 @@ static __inline__ int atomic_sub_if_posi
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
@@ -263,6 +282,7 @@ static __inline__ int atomic_sub_if_posi
 			v->counter = result;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 
 	return result;
 }
@@ -383,13 +403,16 @@ static __inline__ void atomic64_add(long
 		"	beqz	%0, 1b					\n"
 		: "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter));
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
 		v->counter += i;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 }
 
 /*
@@ -421,13 +444,16 @@ static __inline__ void atomic64_sub(long
 		"	beqz	%0, 1b					\n"
 		: "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter));
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
 		v->counter -= i;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 }
 
 /*
@@ -463,7 +489,9 @@ static __inline__ long atomic64_add_retu
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
@@ -472,6 +500,7 @@ static __inline__ long atomic64_add_retu
 		v->counter = result;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 
 	return result;
 }
@@ -506,7 +535,9 @@ static __inline__ long atomic64_sub_retu
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
@@ -515,6 +546,7 @@ static __inline__ long atomic64_sub_retu
 		v->counter = result;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 
 	return result;
 }
@@ -558,7 +590,9 @@ static __inline__ long atomic64_sub_if_p
 		: "=&r" (result), "=&r" (temp), "=m" (v->counter)
 		: "Ir" (i), "m" (v->counter)
 		: "memory");
-	} else {
+	}
+#if !defined(CONFIG_NO_SPINLOCK) && !defined(CONFIG_PREEMPT_RT)
+	else {
 		unsigned long flags;
 
 		spin_lock_irqsave(&atomic_lock, flags);
@@ -568,6 +602,7 @@ static __inline__ long atomic64_sub_if_p
 			v->counter = result;
 		spin_unlock_irqrestore(&atomic_lock, flags);
 	}
+#endif
 
 	return result;
 }
Index: linux/include/asm-mips/bitops.h
===================================================================
--- linux.orig/include/asm-mips/bitops.h
+++ linux/include/asm-mips/bitops.h
@@ -86,6 +86,7 @@ static inline void set_bit(unsigned long
 		"	beqz	%0, 1b					\n"
 		: "=&r" (temp), "=m" (*m)
 		: "ir" (1UL << (nr & SZLONG_MASK)), "m" (*m));
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		volatile unsigned long *a = addr;
 		unsigned long mask;
@@ -96,6 +97,7 @@ static inline void set_bit(unsigned long
 		__bi_local_irq_save(flags);
 		*a |= mask;
 		__bi_local_irq_restore(flags);
+#endif
 	}
 }
 
@@ -146,6 +148,7 @@ static inline void clear_bit(unsigned lo
 		"	beqz	%0, 1b					\n"
 		: "=&r" (temp), "=m" (*m)
 		: "ir" (~(1UL << (nr & SZLONG_MASK))), "m" (*m));
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		volatile unsigned long *a = addr;
 		unsigned long mask;
@@ -156,6 +159,7 @@ static inline void clear_bit(unsigned lo
 		__bi_local_irq_save(flags);
 		*a &= ~mask;
 		__bi_local_irq_restore(flags);
+#endif
 	}
 }
 
@@ -208,6 +212,7 @@ static inline void change_bit(unsigned l
 		"	beqz	%0, 1b				\n"
 		: "=&r" (temp), "=m" (*m)
 		: "ir" (1UL << (nr & SZLONG_MASK)), "m" (*m));
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		volatile unsigned long *a = addr;
 		unsigned long mask;
@@ -218,6 +223,7 @@ static inline void change_bit(unsigned l
 		__bi_local_irq_save(flags);
 		*a ^= mask;
 		__bi_local_irq_restore(flags);
+#endif
 	}
 }
 
@@ -286,6 +292,7 @@ static inline int test_and_set_bit(unsig
 		: "memory");
 
 		return res != 0;
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		volatile unsigned long *a = addr;
 		unsigned long mask;
@@ -300,6 +307,7 @@ static inline int test_and_set_bit(unsig
 		__bi_local_irq_restore(flags);
 
 		return retval;
+#endif
 	}
 }
 
@@ -378,6 +386,7 @@ static inline int test_and_clear_bit(uns
 		: "memory");
 
 		return res != 0;
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		volatile unsigned long *a = addr;
 		unsigned long mask;
@@ -392,6 +401,7 @@ static inline int test_and_clear_bit(uns
 		__bi_local_irq_restore(flags);
 
 		return retval;
+#endif
 	}
 }
 
@@ -468,6 +478,7 @@ static inline int test_and_change_bit(un
 		: "memory");
 
 		return res != 0;
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		volatile unsigned long *a = addr;
 		unsigned long mask, retval;
@@ -481,6 +492,7 @@ static inline int test_and_change_bit(un
 		__bi_local_irq_restore(flags);
 
 		return retval;
+#endif
 	}
 }
 
Index: linux/include/asm-mips/hw_irq.h
===================================================================
--- linux.orig/include/asm-mips/hw_irq.h
+++ linux/include/asm-mips/hw_irq.h
@@ -10,6 +10,7 @@
 
 #include <linux/profile.h>
 #include <asm/atomic.h>
+#include <linux/rt_irq.h>
 
 extern void disable_8259A_irq(unsigned int irq);
 extern void enable_8259A_irq(unsigned int irq);
Index: linux/include/asm-mips/i8259.h
===================================================================
--- linux.orig/include/asm-mips/i8259.h
+++ linux/include/asm-mips/i8259.h
@@ -19,7 +19,7 @@
 
 #include <asm/io.h>
 
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
 
 extern void init_i8259_irqs(void);
 
Index: linux/include/asm-mips/interrupt.h
===================================================================
--- linux.orig/include/asm-mips/interrupt.h
+++ linux/include/asm-mips/interrupt.h
@@ -14,7 +14,7 @@
 #include <asm/hazards.h>
 
 __asm__ (
-	".macro\tlocal_irq_enable\n\t"
+	".macro\tmips_raw_local_irq_enable\n\t"
 	".set\tpush\n\t"
 	".set\treorder\n\t"
 	".set\tnoat\n\t"
@@ -26,10 +26,10 @@ __asm__ (
 	".set\tpop\n\t"
 	".endm");
 
-static inline void local_irq_enable(void)
+static inline void __raw_local_irq_enable(void)
 {
 	__asm__ __volatile__(
-		"local_irq_enable"
+		"mips_raw_local_irq_enable"
 		: /* no outputs */
 		: /* no inputs */
 		: "memory");
@@ -43,7 +43,7 @@ static inline void local_irq_enable(void
  * no nops at all.
  */
 __asm__ (
-	".macro\tlocal_irq_disable\n\t"
+	".macro\tmips_raw_local_irq_disable\n\t"
 	".set\tpush\n\t"
 	".set\tnoat\n\t"
 	"mfc0\t$1,$12\n\t"
@@ -55,30 +55,30 @@ __asm__ (
 	".set\tpop\n\t"
 	".endm");
 
-static inline void local_irq_disable(void)
+static inline void __raw_local_irq_disable(void)
 {
 	__asm__ __volatile__(
-		"local_irq_disable"
+		"mips_raw_local_irq_disable"
 		: /* no outputs */
 		: /* no inputs */
 		: "memory");
 }
 
 __asm__ (
-	".macro\tlocal_save_flags flags\n\t"
+	".macro\tmips_raw_local_save_flags flags\n\t"
 	".set\tpush\n\t"
 	".set\treorder\n\t"
 	"mfc0\t\\flags, $12\n\t"
 	".set\tpop\n\t"
 	".endm");
 
-#define local_save_flags(x)						\
+#define __raw_local_save_flags(x)					\
 __asm__ __volatile__(							\
-	"local_save_flags %0"						\
+	"mips_raw_local_save_flags %0"					\
 	: "=r" (x))
 
 __asm__ (
-	".macro\tlocal_irq_save result\n\t"
+	".macro\tmips_raw_local_irq_save result\n\t"
 	".set\tpush\n\t"
 	".set\treorder\n\t"
 	".set\tnoat\n\t"
@@ -91,15 +91,15 @@ __asm__ (
 	".set\tpop\n\t"
 	".endm");
 
-#define local_irq_save(x)						\
+#define __raw_local_irq_save(x)						\
 __asm__ __volatile__(							\
-	"local_irq_save\t%0"						\
+	"mips_raw_local_irq_save\t%0"					\
 	: "=r" (x)							\
 	: /* no inputs */						\
 	: "memory")
 
 __asm__ (
-	".macro\tlocal_irq_restore flags\n\t"
+	".macro\tmips_raw_local_irq_restore flags\n\t"
 	".set\tnoreorder\n\t"
 	".set\tnoat\n\t"
 	"mfc0\t$1, $12\n\t"
@@ -113,22 +113,28 @@ __asm__ (
 	".set\treorder\n\t"
 	".endm");
 
-#define local_irq_restore(flags)					\
+#define __raw_local_irq_restore(flags)					\
 do {									\
 	unsigned long __tmp1;						\
 									\
 	__asm__ __volatile__(						\
-		"local_irq_restore\t%0"					\
+		"mips_raw_local_irq_restore\t%0"				\
 		: "=r" (__tmp1)						\
 		: "0" (flags)						\
 		: "memory");						\
 } while(0)
 
-#define irqs_disabled()							\
+#define __raw_irqs_disabled()							\
 ({									\
 	unsigned long flags;						\
-	local_save_flags(flags);					\
+	__raw_local_save_flags(flags);					\
 	!(flags & 1);							\
 })
 
+#define __raw_irqs_disabled_flags(flags)	\
+({	/* true when bit 0 of flags is clear */	\
+	!((flags) & 1);				\
+})
+
 #endif /* _ASM_INTERRUPT_H */
+
Index: linux/include/asm-mips/io.h
===================================================================
--- linux.orig/include/asm-mips/io.h
+++ linux/include/asm-mips/io.h
@@ -16,6 +16,7 @@
 #include <linux/compiler.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/rt_irq.h>
 
 #include <asm/addrspace.h>
 #include <asm/bug.h>
@@ -349,11 +350,11 @@ static inline void pfx##out##bwlq##p(typ
 									\
 	__val = pfx##ioswab##bwlq(val);					\
 									\
-	if (sizeof(type) != sizeof(u64)) {				\
-		*__addr = __val;					\
-		slow;							\
-	} else								\
-		BUILD_BUG();						\
+	/* Really, we want this to be atomic */				\
+	BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));		\
+									\
+	*__addr = __val;						\
+	slow;								\
 }									\
 									\
 static inline type pfx##in##bwlq##p(unsigned long port)			\
@@ -364,13 +365,10 @@ static inline type pfx##in##bwlq##p(unsi
 	port = __swizzle_addr_##bwlq(port);				\
 	__addr = (void *)(mips_io_port_base + port);			\
 									\
-	if (sizeof(type) != sizeof(u64)) {				\
-		__val = *__addr;					\
-		slow;							\
-	} else {							\
-		__val = 0;						\
-		BUILD_BUG();						\
-	}								\
+	BUILD_BUG_ON(sizeof(type) > sizeof(unsigned long));		\
+									\
+	__val = *__addr;						\
+	slow;								\
 									\
 	return pfx##ioswab##bwlq(__val);				\
 }
Index: linux/include/asm-mips/linkage.h
===================================================================
--- linux.orig/include/asm-mips/linkage.h
+++ linux/include/asm-mips/linkage.h
@@ -1,6 +1,8 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
-/* Nothing to see here... */
+/* FASTCALL()/fastcall expand to nothing on MIPS */
+#define FASTCALL(x)	x
+#define fastcall
 
 #endif
Index: linux/include/asm-mips/m48t35.h
===================================================================
--- linux.orig/include/asm-mips/m48t35.h
+++ linux/include/asm-mips/m48t35.h
@@ -6,7 +6,7 @@
 
 #include <linux/spinlock.h>
 
-extern spinlock_t rtc_lock;
+extern raw_spinlock_t rtc_lock;
 
 struct m48t35_rtc {
 	volatile u8	pad[0x7ff8];    /* starts at 0x7ff8 */
Index: linux/include/asm-mips/mipsregs.h
===================================================================
--- linux.orig/include/asm-mips/mipsregs.h
+++ linux/include/asm-mips/mipsregs.h
@@ -688,7 +688,7 @@ do {									\
 	unsigned long long val;						\
 	unsigned long flags;						\
 									\
-	local_irq_save(flags);						\
+	raw_local_irq_save(flags);					\
 	if (sel == 0)							\
 		__asm__ __volatile__(					\
 			".set\tmips64\n\t"				\
@@ -707,7 +707,7 @@ do {									\
 			"dsrl\t%L0, %L0, 32\n\t"			\
 			".set\tmips0"					\
 			: "=r" (val));					\
-	local_irq_restore(flags);					\
+	raw_local_irq_restore(flags);					\
 									\
 	val;								\
 })
@@ -716,7 +716,7 @@ do {									\
 do {									\
 	unsigned long flags;						\
 									\
-	local_irq_save(flags);						\
+	raw_local_irq_save(flags);					\
 	if (sel == 0)							\
 		__asm__ __volatile__(					\
 			".set\tmips64\n\t"				\
@@ -737,7 +737,7 @@ do {									\
 			"dmtc0\t%L0, " #source ", " #sel "\n\t"		\
 			".set\tmips0"					\
 			: : "r" (val));					\
-	local_irq_restore(flags);					\
+	raw_local_irq_restore(flags);					\
 } while (0)
 
 #define read_c0_index()		__read_32bit_c0_register($0, 0)
Index: linux/include/asm-mips/mmu_context.h
===================================================================
--- linux.orig/include/asm-mips/mmu_context.h
+++ linux/include/asm-mips/mmu_context.h
@@ -117,7 +117,7 @@ static inline void switch_mm(struct mm_s
 	unsigned int cpu = smp_processor_id();
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* Check if our ASID is of an older version and thus invalid */
 	if ((cpu_context(cpu, next) ^ asid_cache(cpu)) & ASID_VERSION_MASK)
@@ -133,7 +133,7 @@ static inline void switch_mm(struct mm_s
 	cpu_clear(cpu, prev->cpu_vm_mask);
 	cpu_set(cpu, next->cpu_vm_mask);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -156,7 +156,7 @@ activate_mm(struct mm_struct *prev, stru
 	unsigned long flags;
 	unsigned int cpu = smp_processor_id();
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	/* Unconditionally get a new ASID.  */
 	get_new_mmu_context(next, cpu);
@@ -168,7 +168,7 @@ activate_mm(struct mm_struct *prev, stru
 	cpu_clear(cpu, prev->cpu_vm_mask);
 	cpu_set(cpu, next->cpu_vm_mask);
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /*
@@ -180,7 +180,7 @@ drop_mmu_context(struct mm_struct *mm, u
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	if (cpu_isset(cpu, mm->cpu_vm_mask))  {
 		get_new_mmu_context(mm, cpu);
@@ -190,7 +190,7 @@ drop_mmu_context(struct mm_struct *mm, u
 		cpu_context(cpu, mm) = 0;
 	}
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #endif /* _ASM_MMU_CONTEXT_H */
Index: linux/include/asm-mips/rwsem.h
===================================================================
--- /dev/null
+++ linux/include/asm-mips/rwsem.h
@@ -0,0 +1,176 @@
+/*
+ * include/asm-mips/rwsem.h: R/W semaphores for MIPS using the stuff
+ * in lib/rwsem.c.  Adapted largely from include/asm-ppc/rwsem.h
+ * by john.cooper@timesys.com
+ */
+
+#ifndef _MIPS_RWSEM_H
+#define _MIPS_RWSEM_H
+
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
+#ifdef __KERNEL__
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+/*
+ * the semaphore definition
+ */
+struct compat_rw_semaphore {
+	/* 32-bit so the (atomic_t *) casts and cmpxchg() agree on its size */
+	signed int		count;
+#define RWSEM_UNLOCKED_VALUE		0x00000000
+#define RWSEM_ACTIVE_BIAS		0x00000001
+#define RWSEM_ACTIVE_MASK		0x0000ffff
+#define RWSEM_WAITING_BIAS		(-0x00010000)
+#define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
+#define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
+	raw_spinlock_t		wait_lock;
+	struct list_head	wait_list;
+#if RWSEM_DEBUG
+	int			debug;
+#endif
+};
+
+/*
+ * initialisation
+ */
+#if RWSEM_DEBUG
+#define __RWSEM_DEBUG_INIT      , 0
+#else
+#define __RWSEM_DEBUG_INIT	/* */
+#endif
+
+#define __COMPAT_RWSEM_INITIALIZER(name) \
+	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
+	  LIST_HEAD_INIT((name).wait_list) \
+	  __RWSEM_DEBUG_INIT }
+
+#define COMPAT_DECLARE_RWSEM(name)		\
+	struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name)
+
+extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
+
+static inline void compat_init_rwsem(struct compat_rw_semaphore *sem)
+{
+	sem->count = RWSEM_UNLOCKED_VALUE;
+	spin_lock_init(&sem->wait_lock);
+	INIT_LIST_HEAD(&sem->wait_list);
+#if RWSEM_DEBUG
+	sem->debug = 0;
+#endif
+}
+
+/*
+ * lock for reading
+ */
+static inline void __down_read(struct compat_rw_semaphore *sem)
+{
+	if (atomic_inc_return((atomic_t *)(&sem->count)) > 0)
+		smp_wmb();
+	else
+		rwsem_down_read_failed(sem);
+}
+
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	while ((tmp = sem->count) >= 0) {
+		if (tmp == cmpxchg(&sem->count, tmp,
+				   tmp + RWSEM_ACTIVE_READ_BIAS)) {
+			smp_wmb();
+			return 1;
+		}
+	}
+	return 0;
+}
+
+/*
+ * lock for writing
+ */
+static inline void __down_write(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	tmp = atomic_add_return(RWSEM_ACTIVE_WRITE_BIAS,
+				(atomic_t *)(&sem->count));
+	if (tmp == RWSEM_ACTIVE_WRITE_BIAS)
+		smp_wmb();
+	else
+		rwsem_down_write_failed(sem);
+}
+
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	tmp = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
+		      RWSEM_ACTIVE_WRITE_BIAS);
+	smp_wmb();
+	return tmp == RWSEM_UNLOCKED_VALUE;
+}
+
+/*
+ * unlock after reading
+ */
+static inline void __up_read(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	smp_wmb();
+	tmp = atomic_dec_return((atomic_t *)(&sem->count));
+	if (tmp < -1 && (tmp & RWSEM_ACTIVE_MASK) == 0)
+		rwsem_wake(sem);
+}
+
+/*
+ * unlock after writing
+ */
+static inline void __up_write(struct compat_rw_semaphore *sem)
+{
+	smp_wmb();
+	if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
+			      (atomic_t *)(&sem->count)) < 0)
+		rwsem_wake(sem);
+}
+
+/*
+ * implement atomic add functionality
+ */
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
+{
+	atomic_add(delta, (atomic_t *)(&sem->count));
+}
+
+/*
+ * downgrade write lock to read lock
+ */
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
+{
+	int tmp;
+
+	smp_wmb();
+	tmp = atomic_add_return(-RWSEM_WAITING_BIAS, (atomic_t *)(&sem->count));
+	if (tmp < 0)
+		rwsem_downgrade_wake(sem);
+}
+
+/*
+ * implement exchange and add functionality
+ */
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
+{
+	smp_mb();
+	return atomic_add_return(delta, (atomic_t *)(&sem->count));
+}
+
+#endif /* __KERNEL__ */
+#endif /* _MIPS_RWSEM_H */
Index: linux/include/asm-mips/semaphore.h
===================================================================
--- linux.orig/include/asm-mips/semaphore.h
+++ linux/include/asm-mips/semaphore.h
@@ -24,12 +24,20 @@
 
 #ifdef __KERNEL__
 
-#include <asm/atomic.h>
-#include <asm/system.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+
+struct compat_semaphore {
 	/*
 	 * Note that any negative value of count is equivalent to 0,
 	 * but additionally indicates that some process(es) might be
@@ -39,42 +47,42 @@ struct semaphore {
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __MUTEX_INITIALIZER(name) \
-	__SEMAPHORE_INITIALIZER(name, 1)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name, 1)
 
-#define __DECLARE_SEMAPHORE_GENERIC(name, count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name, 1)
-#define DECLARE_MUTEX_LOCKED(name)	__DECLARE_SEMAPHORE_GENERIC(name, 0)
+#define COMPAT_DECLARE_MUTEX(name)		__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1)
+#define COMPAT_DECLARE_MUTEX_LOCKED(name)	__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 0)
 
-static inline void sema_init (struct semaphore *sem, int val)
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);	/* starts available (count 1) */
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);	/* starts held (count 0) */
 }
 
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
+extern void __compat_down(struct compat_semaphore * sem);
+extern int  __compat_down_interruptible(struct compat_semaphore * sem);
+extern void __compat_up(struct compat_semaphore * sem);
 
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 
@@ -82,31 +90,35 @@ static inline void down(struct semaphore
 	 * Try to get the semaphore, take the slow path if we fail.
 	 */
 	if (unlikely(atomic_dec_return(&sem->count) < 0))
-		__down(sem);
+		__compat_down(sem);
 }
 
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int ret = 0;
 
 	might_sleep();
 
 	if (unlikely(atomic_dec_return(&sem->count) < 0))
-		ret = __down_interruptible(sem);
+		ret = __compat_down_interruptible(sem);
 	return ret;
 }
 
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	return atomic_dec_if_positive(&sem->count) < 0;
 }
 
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	if (unlikely(atomic_inc_return(&sem->count) <= 0))
-		__up(sem);
+		__compat_up(sem);
 }
 
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+#include <linux/semaphore.h>
+
 #endif /* __KERNEL__ */
 
 #endif /* __ASM_SEMAPHORE_H */
Index: linux/include/asm-mips/system.h
===================================================================
--- linux.orig/include/asm-mips/system.h
+++ linux/include/asm-mips/system.h
@@ -196,6 +196,7 @@ static inline unsigned long __xchg_u32(v
 		: "=&r" (retval), "=m" (*m), "=&r" (dummy)
 		: "R" (*m), "Jr" (val)
 		: "memory");
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		unsigned long flags;
 
@@ -203,6 +204,7 @@ static inline unsigned long __xchg_u32(v
 		retval = *m;
 		*m = val;
 		local_irq_restore(flags);	/* implies memory barrier  */
+#endif
 	}
 
 	return retval;
@@ -317,6 +319,7 @@ static inline unsigned long __cmpxchg_u3
 		: "=&r" (retval), "=m" (*m)
 		: "R" (*m), "Jr" (old), "Jr" (new)
 		: "memory");
+#ifndef CONFIG_PREEMPT_RT
 	} else {
 		unsigned long flags;
 
@@ -325,6 +328,7 @@ static inline unsigned long __cmpxchg_u3
 		if (retval == old)
 			*m = new;
 		local_irq_restore(flags);	/* implies memory barrier  */
+#endif
 	}
 
 	return retval;
Index: linux/include/asm-mips/thread_info.h
===================================================================
--- linux.orig/include/asm-mips/thread_info.h
+++ linux/include/asm-mips/thread_info.h
@@ -114,6 +114,7 @@ register struct thread_info *__current_t
 #define TIF_SIGPENDING		2	/* signal pending */
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SYSCALL_AUDIT	4	/* syscall auditing active */
+#define TIF_NEED_RESCHED_DELAYED 6	/* reschedule on return to userspace */
 #define TIF_USEDFPU		16	/* FPU was used by this task this quantum (SMP) */
 #define TIF_POLLING_NRFLAG	17	/* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE		18
@@ -124,6 +125,7 @@ register struct thread_info *__current_t
 #define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_USEDFPU		(1<<TIF_USEDFPU)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 
Index: linux/include/asm-mips/time.h
===================================================================
--- linux.orig/include/asm-mips/time.h
+++ linux/include/asm-mips/time.h
@@ -31,6 +31,7 @@
 extern unsigned long (*rtc_get_time)(void);
 extern int (*rtc_set_time)(unsigned long);
 extern int (*rtc_set_mmss)(unsigned long);
+extern unsigned long cpu_khz;
 
 /*
  * Timer interrupt functions.
Index: linux/include/asm-ppc/hw_irq.h
===================================================================
--- linux.orig/include/asm-ppc/hw_irq.h
+++ linux/include/asm-ppc/hw_irq.h
@@ -7,16 +7,18 @@
 
 #include <asm/ptrace.h>
 #include <asm/reg.h>
+#include <linux/rt_irq.h>
 
 extern void timer_interrupt(struct pt_regs *);
 
 #define INLINE_IRQS
 
-#define irqs_disabled()	((mfmsr() & MSR_EE) == 0)
+#define __raw_irqs_disabled()	((mfmsr() & MSR_EE) == 0)
+#define __raw_irqs_disabled_flags(flags)	(((flags) & MSR_EE) == 0)
 
-#ifdef INLINE_IRQS
+#if defined(INLINE_IRQS) || defined(CONFIG_PREEMPT_RT)
 
-static inline void local_irq_disable(void)
+static inline void __raw_local_irq_disable(void)
 {
 	unsigned long msr;
 	msr = mfmsr();
@@ -24,7 +26,7 @@ static inline void local_irq_disable(voi
 	__asm__ __volatile__("": : :"memory");
 }
 
-static inline void local_irq_enable(void)
+static inline void __raw_local_irq_enable(void)
 {
 	unsigned long msr;
 	__asm__ __volatile__("": : :"memory");
@@ -32,7 +34,7 @@ static inline void local_irq_enable(void
 	mtmsr(msr | MSR_EE);
 }
 
-static inline void local_irq_save_ptr(unsigned long *flags)
+static inline void __raw_local_irq_save_ptr(unsigned long *flags)
 {
 	unsigned long msr;
 	msr = mfmsr();
@@ -41,9 +43,9 @@ static inline void local_irq_save_ptr(un
 	__asm__ __volatile__("": : :"memory");
 }
 
-#define local_save_flags(flags)		((flags) = mfmsr())
-#define local_irq_save(flags)		local_irq_save_ptr(&flags)
-#define local_irq_restore(flags)	mtmsr(flags)
+#define __raw_local_save_flags(flags)	((flags) = mfmsr())
+#define __raw_local_irq_save(flags)	__raw_local_irq_save_ptr(&flags)
+#define __raw_local_irq_restore(flags)	mtmsr(flags)
 
 #else
 
Index: linux/include/asm-ppc/ocp.h
===================================================================
--- linux.orig/include/asm-ppc/ocp.h
+++ linux/include/asm-ppc/ocp.h
@@ -29,10 +29,10 @@
 #include <linux/config.h>
 #include <linux/devfs_fs_kernel.h>
 #include <linux/device.h>
+#include <linux/rwsem.h>
 
 #include <asm/mmu.h>
 #include <asm/ocp_ids.h>
-#include <asm/rwsem.h>
 #include <asm/semaphore.h>
 
 #ifdef CONFIG_PPC_OCP
Index: linux/include/asm-ppc/rwsem.h
===================================================================
--- linux.orig/include/asm-ppc/rwsem.h
+++ linux/include/asm-ppc/rwsem.h
@@ -7,6 +7,10 @@
 #ifndef _PPC_RWSEM_H
 #define _PPC_RWSEM_H
 
+#ifndef _LINUX_RWSEM_H
+#error "please don't include asm/rwsem.h directly, use linux/rwsem.h instead"
+#endif
+
 #ifdef __KERNEL__
 #include <linux/list.h>
 #include <linux/spinlock.h>
@@ -16,7 +20,7 @@
 /*
  * the semaphore definition
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	/* XXX this should be able to be an atomic_t  -- paulus */
 	signed long		count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000
@@ -25,7 +29,7 @@ struct rw_semaphore {
 #define RWSEM_WAITING_BIAS		(-0x00010000)
 #define RWSEM_ACTIVE_READ_BIAS		RWSEM_ACTIVE_BIAS
 #define RWSEM_ACTIVE_WRITE_BIAS		(RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
-	spinlock_t		wait_lock;
+	raw_spinlock_t		wait_lock;
 	struct list_head	wait_list;
 #if RWSEM_DEBUG
 	int			debug;
@@ -41,20 +45,20 @@ struct rw_semaphore {
 #define __RWSEM_DEBUG_INIT	/* */
 #endif
 
-#define __RWSEM_INITIALIZER(name) \
+#define __COMPAT_RWSEM_INITIALIZER(name) \
 	{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, \
 	  LIST_HEAD_INIT((name).wait_list) \
 	  __RWSEM_DEBUG_INIT }
 
-#define DECLARE_RWSEM(name)		\
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name)		\
+	struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name)
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
 
-static inline void init_rwsem(struct rw_semaphore *sem)
+static inline void compat_init_rwsem(struct compat_rw_semaphore *sem)
 {
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	spin_lock_init(&sem->wait_lock);
@@ -67,7 +71,7 @@ static inline void init_rwsem(struct rw_
 /*
  * lock for reading
  */
-static inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct compat_rw_semaphore *sem)
 {
 	if (atomic_inc_return((atomic_t *)(&sem->count)) > 0)
 		smp_wmb();
@@ -75,7 +79,7 @@ static inline void __down_read(struct rw
 		rwsem_down_read_failed(sem);
 }
 
-static inline int __down_read_trylock(struct rw_semaphore *sem)
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -92,7 +96,7 @@ static inline int __down_read_trylock(st
 /*
  * lock for writing
  */
-static inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -104,7 +108,7 @@ static inline void __down_write(struct r
 		rwsem_down_write_failed(sem);
 }
 
-static inline int __down_write_trylock(struct rw_semaphore *sem)
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -117,7 +121,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-static inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -130,7 +134,7 @@ static inline void __up_read(struct rw_s
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct compat_rw_semaphore *sem)
 {
 	smp_wmb();
 	if (atomic_sub_return(RWSEM_ACTIVE_WRITE_BIAS,
@@ -141,7 +145,7 @@ static inline void __up_write(struct rw_
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
 {
 	atomic_add(delta, (atomic_t *)(&sem->count));
 }
@@ -149,7 +153,7 @@ static inline void rwsem_atomic_add(int 
 /*
  * downgrade write lock to read lock
  */
-static inline void __downgrade_write(struct rw_semaphore *sem)
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -162,7 +166,7 @@ static inline void __downgrade_write(str
 /*
  * implement exchange and add functionality
  */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
 {
 	smp_mb();
 	return atomic_add_return(delta, (atomic_t *)(&sem->count));
Index: linux/include/asm-ppc/semaphore.h
===================================================================
--- linux.orig/include/asm-ppc/semaphore.h
+++ linux/include/asm-ppc/semaphore.h
@@ -1,6 +1,9 @@
 #ifndef _PPC_SEMAPHORE_H
 #define _PPC_SEMAPHORE_H
 
+#include <linux/config.h>
+#include <linux/linkage.h>
+
 /*
  * Swiped from asm-sparc/semaphore.h and modified
  * -- Cort (cort@cs.nmt.edu)
@@ -14,59 +17,68 @@
  * -- Paul Mackerras (paulus@samba.org)
  */
 
-#ifdef __KERNEL__
-
 #include <asm/atomic.h>
 #include <asm/system.h>
 #include <linux/wait.h>
 #include <linux/rwsem.h>
 
-struct semaphore {
+/*
+ * On !PREEMPT_RT all semaphores are compat:
+ */
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
+struct compat_semaphore {
 	/*
 	 * Note that any negative value of count is equivalent to 0,
 	 * but additionally indicates that some process(es) might be
 	 * sleeping on `wait'.
 	 */
 	atomic_t count;
+	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __MUTEX_INITIALIZER(name) \
-	__SEMAPHORE_INITIALIZER(name, 1)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name, 1)
+
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name, count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define __DECLARE_SEMAPHORE_GENERIC(name, count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define COMPAT_DECLARE_MUTEX(name)		__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 1)
+#define COMPAT_DECLARE_MUTEX_LOCKED(name)	__COMPAT_DECLARE_SEMAPHORE_GENERIC(name, 0)
 
-#define DECLARE_MUTEX(name)		__DECLARE_SEMAPHORE_GENERIC(name, 1)
-#define DECLARE_MUTEX_LOCKED(name)	__DECLARE_SEMAPHORE_GENERIC(name, 0)
+extern int compat_sem_is_locked(struct compat_semaphore *sem);
 
-static inline void sema_init (struct semaphore *sem, int val)
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 	atomic_set(&sem->count, val);
+	sem->sleepers = 0;
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-extern void __down(struct semaphore * sem);
-extern int  __down_interruptible(struct semaphore * sem);
-extern void __up(struct semaphore * sem);
+extern void __compat_down(struct compat_semaphore * sem);
+extern int  __compat_down_interruptible(struct compat_semaphore * sem);
+extern void __compat_up(struct compat_semaphore * sem);
 
-extern inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 
@@ -74,23 +86,23 @@ extern inline void down(struct semaphore
 	 * Try to get the semaphore, take the slow path if we fail.
 	 */
 	if (atomic_dec_return(&sem->count) < 0)
-		__down(sem);
+		__compat_down(sem);
 	smp_wmb();
 }
 
-extern inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int ret = 0;
 
 	might_sleep();
 
 	if (atomic_dec_return(&sem->count) < 0)
-		ret = __down_interruptible(sem);
+		ret = __compat_down_interruptible(sem);
 	smp_wmb();
 	return ret;
 }
 
-extern inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	int ret;
 
@@ -99,13 +111,15 @@ extern inline int down_trylock(struct se
 	return ret;
 }
 
-extern inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	smp_wmb();
 	if (atomic_inc_return(&sem->count) <= 0)
-		__up(sem);
+		__compat_up(sem);
 }
 
-#endif /* __KERNEL__ */
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
+
+#include <linux/semaphore.h>
 
 #endif /* !(_PPC_SEMAPHORE_H) */
Index: linux/include/asm-ppc/thread_info.h
===================================================================
--- linux.orig/include/asm-ppc/thread_info.h
+++ linux/include/asm-ppc/thread_info.h
@@ -80,6 +80,7 @@ static inline struct thread_info *curren
 #define TIF_MEMDIE		5
 #define TIF_SYSCALL_AUDIT       6       /* syscall auditing active */
 #define TIF_SECCOMP             7      /* secure computing */
+#define TIF_NEED_RESCHED_DELAYED 8	/* reschedule on return to userspace */
 
 /* as above, but as bit values */
 #define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
@@ -89,6 +90,7 @@ static inline struct thread_info *curren
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_SYSCALL_AUDIT      (1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP            (1<<TIF_SECCOMP)
+#define _TIF_NEED_RESCHED_DELAYED (1<<TIF_NEED_RESCHED_DELAYED)
 
 #define _TIF_SYSCALL_T_OR_A     (_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|_TIF_SECCOMP)
 
Index: linux/include/asm-ppc/time.h
===================================================================
--- linux.orig/include/asm-ppc/time.h
+++ linux/include/asm-ppc/time.h
@@ -20,6 +20,7 @@
 extern unsigned tb_ticks_per_jiffy;
 extern unsigned tb_to_us;
 extern unsigned tb_last_stamp;
+extern unsigned long cpu_khz;
 extern unsigned long disarm_decr[NR_CPUS];
 
 extern void to_tm(int tim, struct rtc_time * tm);
Index: linux/include/asm-x86_64/acpi.h
===================================================================
--- linux.orig/include/asm-x86_64/acpi.h
+++ linux/include/asm-x86_64/acpi.h
@@ -50,8 +50,8 @@
 
 #define ACPI_ASM_MACROS
 #define BREAKPOINT3
-#define ACPI_DISABLE_IRQS() local_irq_disable()
-#define ACPI_ENABLE_IRQS()  local_irq_enable()
+#define ACPI_DISABLE_IRQS() local_irq_disable_nort()
+#define ACPI_ENABLE_IRQS()  local_irq_enable_nort()
 #define ACPI_FLUSH_CPU_CACHE()	wbinvd()
 
 
Index: linux/include/asm-x86_64/hpet.h
===================================================================
--- linux.orig/include/asm-x86_64/hpet.h
+++ linux/include/asm-x86_64/hpet.h
@@ -1,6 +1,6 @@
 #ifndef _ASM_X8664_HPET_H
 #define _ASM_X8664_HPET_H 1
-
+#include <asm/fixmap.h>
 /*
  * Documentation on HPET can be found at:
  *      http://www.intel.com/ial/home/sp/pcmmspec.htm
@@ -44,6 +44,7 @@
 #define HPET_TN_SETVAL		0x040
 #define HPET_TN_32BIT		0x100
 
+extern unsigned long hpet_address;	/* hpet memory map physical address */
 extern int is_hpet_enabled(void);
 extern int hpet_rtc_timer_init(void);
 extern int oem_force_hpet_timer(void);
Index: linux/include/asm-x86_64/io_apic.h
===================================================================
--- linux.orig/include/asm-x86_64/io_apic.h
+++ linux/include/asm-x86_64/io_apic.h
@@ -16,11 +16,10 @@
 #ifdef CONFIG_PCI_MSI
 static inline int use_pci_vector(void)	{return 1;}
 static inline void disable_edge_ioapic_vector(unsigned int vector) { }
-static inline void mask_and_ack_level_ioapic_vector(unsigned int vector) { }
 static inline void end_edge_ioapic_vector (unsigned int vector) { }
 #define startup_level_ioapic	startup_level_ioapic_vector
 #define shutdown_level_ioapic	mask_IO_APIC_vector
-#define enable_level_ioapic	unmask_IO_APIC_vector
+#define enable_level_ioapic	enable_level_ioapic_vector
 #define disable_level_ioapic	mask_IO_APIC_vector
 #define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_vector
 #define end_level_ioapic	end_level_ioapic_vector
@@ -35,11 +34,10 @@ static inline void end_edge_ioapic_vecto
 #else
 static inline int use_pci_vector(void)	{return 0;}
 static inline void disable_edge_ioapic_irq(unsigned int irq) { }
-static inline void mask_and_ack_level_ioapic_irq(unsigned int irq) { }
 static inline void end_edge_ioapic_irq (unsigned int irq) { }
 #define startup_level_ioapic	startup_level_ioapic_irq
 #define shutdown_level_ioapic	mask_IO_APIC_irq
-#define enable_level_ioapic	unmask_IO_APIC_irq
+#define enable_level_ioapic	enable_level_ioapic_irq
 #define disable_level_ioapic	mask_IO_APIC_irq
 #define mask_and_ack_level_ioapic mask_and_ack_level_ioapic_irq
 #define end_level_ioapic	end_level_ioapic_irq
@@ -217,6 +215,6 @@ extern int assign_irq_vector(int irq);
 
 void enable_NMI_through_LVT0 (void * dummy);
 
-extern spinlock_t i8259A_lock;
+extern raw_spinlock_t i8259A_lock;
 
 #endif
Index: linux/include/asm-x86_64/ipi.h
===================================================================
--- linux.orig/include/asm-x86_64/ipi.h
+++ linux/include/asm-x86_64/ipi.h
@@ -91,7 +91,7 @@ static inline void send_IPI_mask_sequenc
 	 * to an arbitrary mask, so I do a unicast to each CPU instead.
 	 * - mbligh
 	 */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	for_each_cpu_mask(query_cpu, mask) {
 		/*
@@ -115,7 +115,7 @@ static inline void send_IPI_mask_sequenc
 		 */
 		apic_write(APIC_ICR, cfg);
 	}
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 #endif /* __ASM_IPI_H */
Index: linux/include/asm-x86_64/kprobes.h
===================================================================
--- linux.orig/include/asm-x86_64/kprobes.h
+++ linux/include/asm-x86_64/kprobes.h
@@ -54,7 +54,7 @@ struct arch_specific_insn {
 static inline void restore_interrupts(struct pt_regs *regs)
 {
 	if (regs->eflags & IF_MASK)
-		local_irq_enable();
+		raw_local_irq_enable();
 }
 
 extern int post_kprobe_handler(struct pt_regs *regs);
Index: linux/include/asm-x86_64/page.h
===================================================================
--- linux.orig/include/asm-x86_64/page.h
+++ linux/include/asm-x86_64/page.h
@@ -21,6 +21,8 @@
 #endif
 #define CURRENT_MASK (~(THREAD_SIZE-1))
 
+#define STACK_WARN             (THREAD_SIZE/8)
+
 #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1))
 #define LARGE_PAGE_SIZE (1UL << PMD_SHIFT)
 
Index: linux/include/asm-x86_64/percpu.h
===================================================================
--- linux.orig/include/asm-x86_64/percpu.h
+++ linux/include/asm-x86_64/percpu.h
@@ -17,11 +17,23 @@
 /* Separate out the type, so (int[3], foo) works. */
 #define DEFINE_PER_CPU(type, name) \
     __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+    __attribute__((__section__(".data.percpu"))) spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED(per_cpu_lock__##name##_locked); \
+    __attribute__((__section__(".data.percpu"))) __typeof__(type) per_cpu__##name##_locked
 
 /* var is in discarded region: offset to particular copy we want */
 #define per_cpu(var, cpu) (*RELOC_HIDE(&per_cpu__##var, __per_cpu_offset(cpu)))
 #define __get_cpu_var(var) (*RELOC_HIDE(&per_cpu__##var, __my_cpu_offset()))
 
+#define per_cpu_lock(var, cpu) \
+	(*RELOC_HIDE(&per_cpu_lock__##var##_locked, __per_cpu_offset(cpu)))
+#define per_cpu_var_locked(var, cpu) \
+		(*RELOC_HIDE(&per_cpu__##var##_locked, __per_cpu_offset(cpu)))
+#define __get_cpu_lock(var, cpu) \
+		per_cpu_lock(var, cpu)
+#define __get_cpu_var_locked(var, cpu) \
+		per_cpu_var_locked(var, cpu)
+
 /* A macro to avoid #include hell... */
 #define percpu_modcopy(pcpudst, src, size)			\
 do {								\
@@ -39,14 +51,26 @@ extern void setup_per_cpu_areas(void);
 #define DEFINE_PER_CPU(type, name) \
     __typeof__(type) per_cpu__##name
 
+#define DEFINE_PER_CPU_LOCKED(type, name) \
+	spinlock_t per_cpu_lock__##name##_locked = SPIN_LOCK_UNLOCKED(per_cpu_lock__##name##_locked); \
+	__typeof__(type) per_cpu__##name##_locked
+
 #define per_cpu(var, cpu)			(*((void)(cpu), &per_cpu__##var))
 #define __get_cpu_var(var)			per_cpu__##var
+#define __get_cpu_lock(var, cpu)		per_cpu_lock__##var##_locked
+#define __get_cpu_var_locked(var, cpu)		per_cpu__##var##_locked
 
 #endif	/* SMP */
 
 #define DECLARE_PER_CPU(type, name) extern __typeof__(type) per_cpu__##name
 
+#define DECLARE_PER_CPU_LOCKED(type, name) \
+	extern spinlock_t per_cpu_lock__##name##_locked; \
+	extern __typeof__(type) per_cpu__##name##_locked
+
 #define EXPORT_PER_CPU_SYMBOL(var) EXPORT_SYMBOL(per_cpu__##var)
 #define EXPORT_PER_CPU_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu__##var)
+#define EXPORT_PER_CPU_LOCKED_SYMBOL(var) EXPORT_SYMBOL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL(per_cpu__##var##_locked)
+#define EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(var) EXPORT_SYMBOL_GPL(per_cpu_lock__##var##_locked); EXPORT_SYMBOL_GPL(per_cpu__##var##_locked)
 
 #endif /* _ASM_X8664_PERCPU_H_ */
Index: linux/include/asm-x86_64/proto.h
===================================================================
--- linux.orig/include/asm-x86_64/proto.h
+++ linux/include/asm-x86_64/proto.h
@@ -65,7 +65,7 @@ extern unsigned long end_pfn_map; 
 
 extern cpumask_t cpu_initialized;
 
-extern void show_trace(unsigned long * rsp);
+extern void show_trace(struct task_struct *task, unsigned long * rsp);
 extern void show_registers(struct pt_regs *regs);
 
 extern void exception_table_check(void);
Index: linux/include/asm-x86_64/rwsem.h
===================================================================
--- linux.orig/include/asm-x86_64/rwsem.h
+++ linux/include/asm-x86_64/rwsem.h
@@ -44,15 +44,15 @@
 
 struct rwsem_waiter;
 
-extern struct rw_semaphore *rwsem_down_read_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_down_write_failed(struct rw_semaphore *sem);
-extern struct rw_semaphore *rwsem_wake(struct rw_semaphore *);
-extern struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_read_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_down_write_failed(struct compat_rw_semaphore *sem);
+extern struct compat_rw_semaphore *rwsem_wake(struct compat_rw_semaphore *);
+extern struct compat_rw_semaphore *rwsem_downgrade_wake(struct compat_rw_semaphore *sem);
 
 /*
  * the semaphore definition
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	signed int		count;
 #define RWSEM_UNLOCKED_VALUE		0x00000000
 #define RWSEM_ACTIVE_BIAS		0x00000001
@@ -76,14 +76,16 @@ struct rw_semaphore {
 #define __RWSEM_DEBUG_INIT	/* */
 #endif
 
+#ifndef __RWSEM_INITIALIZER
 #define __RWSEM_INITIALIZER(name) \
-{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) \
+{ RWSEM_UNLOCKED_VALUE, SPIN_LOCK_UNLOCKED((name).wait_lock), LIST_HEAD_INIT((name).wait_list) \
 	__RWSEM_DEBUG_INIT }
+#endif
 
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name) /* define + statically init a compat rwsem */ \
+	struct compat_rw_semaphore name = __RWSEM_INITIALIZER(name)
 
-static inline void init_rwsem(struct rw_semaphore *sem)
+static inline void compat_init_rwsem(struct compat_rw_semaphore *sem)
 {
 	sem->count = RWSEM_UNLOCKED_VALUE;
 	spin_lock_init(&sem->wait_lock);
@@ -96,7 +98,7 @@ static inline void init_rwsem(struct rw_
 /*
  * lock for reading
  */
-static inline void __down_read(struct rw_semaphore *sem)
+static inline void __down_read(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning down_read\n\t"
@@ -109,7 +111,7 @@ LOCK_PREFIX	"  incl      (%%rdi)\n\t" /*
 		"  jmp       1b\n"
 		LOCK_SECTION_END \
 		"# ending down_read\n\t"
-		: "+m"(sem->count)
+		:
 		: "D"(sem)
 		: "memory", "cc");
 }
@@ -118,7 +120,7 @@ LOCK_PREFIX	"  incl      (%%rdi)\n\t" /*
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-static inline int __down_read_trylock(struct rw_semaphore *sem)
+static inline int __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	__s32 result, tmp;
 	__asm__ __volatile__(
@@ -132,7 +134,7 @@ LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
 		"  jnz	     1b\n\t"
 		"2:\n\t"
 		"# ending __down_read_trylock\n\t"
-		: "+m"(sem->count), "=&a"(result), "=&r"(tmp)
+		: "+g"(sem->count), "=&a"(result), "=&r"(tmp)
 		: "i"(RWSEM_ACTIVE_READ_BIAS)
 		: "memory", "cc");
 	return result>=0 ? 1 : 0;
@@ -142,7 +144,7 @@ LOCK_PREFIX	"  cmpxchgl  %2,%0\n\t"
 /*
  * lock for writing
  */
-static inline void __down_write(struct rw_semaphore *sem)
+static inline void __down_write(struct compat_rw_semaphore *sem)
 {
 	int tmp;
 
@@ -167,7 +169,7 @@ LOCK_PREFIX	"  xaddl      %0,(%%rdi)\n\t
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-static inline int __down_write_trylock(struct rw_semaphore *sem)
+static inline int __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	signed long ret = cmpxchg(&sem->count,
 				  RWSEM_UNLOCKED_VALUE, 
@@ -180,7 +182,7 @@ static inline int __down_write_trylock(s
 /*
  * unlock after reading
  */
-static inline void __up_read(struct rw_semaphore *sem)
+static inline void __up_read(struct compat_rw_semaphore *sem)
 {
 	__s32 tmp = -RWSEM_ACTIVE_READ_BIAS;
 	__asm__ __volatile__(
@@ -196,7 +198,7 @@ LOCK_PREFIX	"  xaddl      %[tmp],(%%rdi)
 		"  jmp       1b\n"
 		LOCK_SECTION_END
 		"# ending __up_read\n"
-		: "+m"(sem->count), [tmp] "+r" (tmp)
+		: [tmp] "+r" (tmp)
 		: "D"(sem)
 		: "memory", "cc");
 }
@@ -204,7 +206,7 @@ LOCK_PREFIX	"  xaddl      %[tmp],(%%rdi)
 /*
  * unlock after writing
  */
-static inline void __up_write(struct rw_semaphore *sem)
+static inline void __up_write(struct compat_rw_semaphore *sem)
 {
 	unsigned tmp; 
 	__asm__ __volatile__(
@@ -221,7 +223,7 @@ LOCK_PREFIX	"  xaddl     %[tmp],(%%rdi)\
 		"  jmp       1b\n"
 		LOCK_SECTION_END
 		"# ending __up_write\n"
-		: "+m"(sem->count), [tmp] "=r" (tmp)
+		: [tmp] "=r" (tmp)
 		: "D"(sem), [bias] "i"(-RWSEM_ACTIVE_WRITE_BIAS)
 		: "memory", "cc");
 }
@@ -229,7 +231,7 @@ LOCK_PREFIX	"  xaddl     %[tmp],(%%rdi)\
 /*
  * downgrade write lock to read lock
  */
-static inline void __downgrade_write(struct rw_semaphore *sem)
+static inline void __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 		"# beginning __downgrade_write\n\t"
@@ -250,7 +252,7 @@ LOCK_PREFIX	"  addl      %[bias],(%%rdi)
 /*
  * implement atomic add functionality
  */
-static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem)
+static inline void rwsem_atomic_add(int delta, struct compat_rw_semaphore *sem)
 {
 	__asm__ __volatile__(
 LOCK_PREFIX	"addl %1,%0"
@@ -261,7 +263,7 @@ LOCK_PREFIX	"addl %1,%0"
 /*
  * implement exchange and add functionality
  */
-static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem)
+static inline int rwsem_atomic_update(int delta, struct compat_rw_semaphore *sem)
 {
 	int tmp = delta;
 
Index: linux/include/asm-x86_64/semaphore.h
===================================================================
--- linux.orig/include/asm-x86_64/semaphore.h
+++ linux/include/asm-x86_64/semaphore.h
@@ -1,10 +1,15 @@
 #ifndef _X86_64_SEMAPHORE_H
 #define _X86_64_SEMAPHORE_H
 
+#include <linux/config.h>
 #include <linux/linkage.h>
 
 #ifdef __KERNEL__
 
+#ifndef CONFIG_PREEMPT_RT
+# define compat_semaphore semaphore
+#endif
+
 /*
  * SMP- and interrupt-safe semaphores..
  *
@@ -43,32 +48,34 @@
 #include <linux/rwsem.h>
 #include <linux/stringify.h>
 
-struct semaphore {
+struct compat_semaphore {
 	atomic_t count;
 	int sleepers;
 	wait_queue_head_t wait;
 };
 
-#define __SEMAPHORE_INITIALIZER(name, n)				\
+#define __COMPAT_SEMAPHORE_INITIALIZER(name, n)				\
 {									\
 	.count		= ATOMIC_INIT(n),				\
 	.sleepers	= 0,						\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER((name).wait)	\
 }
 
-#define __MUTEX_INITIALIZER(name) \
-	__SEMAPHORE_INITIALIZER(name,1)
+#define __COMPAT_MUTEX_INITIALIZER(name) \
+	__COMPAT_SEMAPHORE_INITIALIZER(name,1)
+
+#define __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,count) \
+	struct compat_semaphore name = __COMPAT_SEMAPHORE_INITIALIZER(name,count)
 
-#define __DECLARE_SEMAPHORE_GENERIC(name,count) \
-	struct semaphore name = __SEMAPHORE_INITIALIZER(name,count)
+#define COMPAT_DECLARE_MUTEX(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,1)
+#define COMPAT_DECLARE_MUTEX_LOCKED(name) __COMPAT_DECLARE_SEMAPHORE_GENERIC(name,0)
 
-#define DECLARE_MUTEX(name) __DECLARE_SEMAPHORE_GENERIC(name,1)
-#define DECLARE_MUTEX_LOCKED(name) __DECLARE_SEMAPHORE_GENERIC(name,0)
+#define compat_sema_count(sem) atomic_read(&(sem)->count)
 
-static inline void sema_init (struct semaphore *sem, int val)
+static inline void compat_sema_init (struct compat_semaphore *sem, int val)
 {
 /*
- *	*sem = (struct semaphore)__SEMAPHORE_INITIALIZER((*sem),val);
+ *	*sem = (struct compat_semaphore)__COMPAT_SEMAPHORE_INITIALIZER((*sem),val);
  *
  * i'd rather use the more flexible initialization above, but sadly
  * GCC 2.7.2.3 emits a bogus warning. EGCS doesn't. Oh well.
@@ -78,32 +85,32 @@ static inline void sema_init (struct sem
 	init_waitqueue_head(&sem->wait);
 }
 
-static inline void init_MUTEX (struct semaphore *sem)
+static inline void compat_init_MUTEX (struct compat_semaphore *sem)
 {
-	sema_init(sem, 1);
+	compat_sema_init(sem, 1);
 }
 
-static inline void init_MUTEX_LOCKED (struct semaphore *sem)
+static inline void compat_init_MUTEX_LOCKED (struct compat_semaphore *sem)
 {
-	sema_init(sem, 0);
+	compat_sema_init(sem, 0);
 }
 
-asmlinkage void __down_failed(void /* special register calling convention */);
-asmlinkage int  __down_failed_interruptible(void  /* params in registers */);
-asmlinkage int  __down_failed_trylock(void  /* params in registers */);
-asmlinkage void __up_wakeup(void /* special register calling convention */);
+asmlinkage void __compat_down_failed(void /* special register calling convention */);
+asmlinkage int  __compat_down_failed_interruptible(void  /* params in registers */);
+asmlinkage int  __compat_down_failed_trylock(void  /* params in registers */);
+asmlinkage void __compat_up_wakeup(void /* special register calling convention */);
 
-asmlinkage void __down(struct semaphore * sem);
-asmlinkage int  __down_interruptible(struct semaphore * sem);
-asmlinkage int  __down_trylock(struct semaphore * sem);
-asmlinkage void __up(struct semaphore * sem);
+asmlinkage void __compat_down(struct compat_semaphore * sem);
+asmlinkage int  __compat_down_interruptible(struct compat_semaphore * sem);
+asmlinkage int  __compat_down_trylock(struct compat_semaphore * sem);
+asmlinkage void __compat_up(struct compat_semaphore * sem);
 
 /*
  * This is ugly, but we want the default case to fall through.
  * "__down_failed" is a special asm handler that calls the C
  * routine that actually waits. See arch/x86_64/kernel/semaphore.c
  */
-static inline void down(struct semaphore * sem)
+static inline void compat_down(struct compat_semaphore * sem)
 {
 	might_sleep();
 
@@ -113,7 +120,7 @@ static inline void down(struct semaphore
 		"js 2f\n"
 		"1:\n"
 		LOCK_SECTION_START("")
-		"2:\tcall __down_failed\n\t"
+		"2:\tcall __compat_down_failed\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=m" (sem->count)
@@ -125,7 +132,7 @@ static inline void down(struct semaphore
  * Interruptible try to acquire a semaphore.  If we obtained
  * it, return zero.  If we were interrupted, returns -EINTR
  */
-static inline int down_interruptible(struct semaphore * sem)
+static inline int compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -138,7 +145,7 @@ static inline int down_interruptible(str
 		"xorl %0,%0\n"
 		"1:\n"
 		LOCK_SECTION_START("")
-		"2:\tcall __down_failed_interruptible\n\t"
+		"2:\tcall __compat_down_failed_interruptible\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=a" (result), "=m" (sem->count)
@@ -151,7 +158,7 @@ static inline int down_interruptible(str
  * Non-blockingly attempt to down() a semaphore.
  * Returns zero if we acquired it
  */
-static inline int down_trylock(struct semaphore * sem)
+static inline int compat_down_trylock(struct compat_semaphore * sem)
 {
 	int result;
 
@@ -162,7 +169,7 @@ static inline int down_trylock(struct se
 		"xorl %0,%0\n"
 		"1:\n"
 		LOCK_SECTION_START("")
-		"2:\tcall __down_failed_trylock\n\t"
+		"2:\tcall __compat_down_failed_trylock\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=a" (result), "=m" (sem->count)
@@ -177,7 +184,7 @@ static inline int down_trylock(struct se
  * The default case (no contention) will result in NO
  * jumps for both down() and up().
  */
-static inline void up(struct semaphore * sem)
+static inline void compat_up(struct compat_semaphore * sem)
 {
 	__asm__ __volatile__(
 		"# atomic up operation\n\t"
@@ -185,7 +192,7 @@ static inline void up(struct semaphore *
 		"jle 2f\n"
 		"1:\n"
 		LOCK_SECTION_START("")
-		"2:\tcall __up_wakeup\n\t"
+		"2:\tcall __compat_up_wakeup\n\t"
 		"jmp 1b\n"
 		LOCK_SECTION_END
 		:"=m" (sem->count)
Index: linux/include/asm-x86_64/spinlock.h
===================================================================
--- linux.orig/include/asm-x86_64/spinlock.h
+++ linux/include/asm-x86_64/spinlock.h
@@ -36,7 +36,7 @@
 	"movb $1,%0" \
 		:"=m" (lock->slock) : : "memory"
 
-static inline void __raw_spin_lock(raw_spinlock_t *lock)
+static inline void __raw_spin_lock(__raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 		__raw_spin_lock_string
@@ -45,7 +45,7 @@ static inline void __raw_spin_lock(raw_s
 
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
 
-static inline int __raw_spin_trylock(raw_spinlock_t *lock)
+static inline int __raw_spin_trylock(__raw_spinlock_t *lock)
 {
 	char oldval;
 
@@ -57,7 +57,7 @@ static inline int __raw_spin_trylock(raw
 	return oldval > 0;
 }
 
-static inline void __raw_spin_unlock(raw_spinlock_t *lock)
+static inline void __raw_spin_unlock(__raw_spinlock_t *lock)
 {
 	__asm__ __volatile__(
 		__raw_spin_unlock_string
@@ -91,17 +91,17 @@ static inline void __raw_spin_unlock(raw
 #define __raw_read_can_lock(x)		((int)(x)->lock > 0)
 #define __raw_write_can_lock(x)		((x)->lock == RW_LOCK_BIAS)
 
-static inline void __raw_read_lock(raw_rwlock_t *rw)
+static inline void __raw_read_lock(__raw_rwlock_t *rw)
 {
 	__build_read_lock(rw, "__read_lock_failed");
 }
 
-static inline void __raw_write_lock(raw_rwlock_t *rw)
+static inline void __raw_write_lock(__raw_rwlock_t *rw)
 {
 	__build_write_lock(rw, "__write_lock_failed");
 }
 
-static inline int __raw_read_trylock(raw_rwlock_t *lock)
+static inline int __raw_read_trylock(__raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	atomic_dec(count);
@@ -111,7 +111,7 @@ static inline int __raw_read_trylock(raw
 	return 0;
 }
 
-static inline int __raw_write_trylock(raw_rwlock_t *lock)
+static inline int __raw_write_trylock(__raw_rwlock_t *lock)
 {
 	atomic_t *count = (atomic_t *)lock;
 	if (atomic_sub_and_test(RW_LOCK_BIAS, count))
@@ -120,12 +120,12 @@ static inline int __raw_write_trylock(ra
 	return 0;
 }
 
-static inline void __raw_read_unlock(raw_rwlock_t *rw)
+static inline void __raw_read_unlock(__raw_rwlock_t *rw)
 {
 	asm volatile("lock ; incl %0" :"=m" (rw->lock) : : "memory");
 }
 
-static inline void __raw_write_unlock(raw_rwlock_t *rw)
+static inline void __raw_write_unlock(__raw_rwlock_t *rw)
 {
 	asm volatile("lock ; addl $" RW_LOCK_BIAS_STR ",%0"
 				: "=m" (rw->lock) : : "memory");
Index: linux/include/asm-x86_64/spinlock_types.h
===================================================================
--- linux.orig/include/asm-x86_64/spinlock_types.h
+++ linux/include/asm-x86_64/spinlock_types.h
@@ -7,13 +7,13 @@
 
 typedef struct {
 	volatile unsigned int slock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED	{ 1 }
 
 typedef struct {
 	volatile unsigned int lock;
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED		{ RW_LOCK_BIAS }
 
Index: linux/include/asm-x86_64/system.h
===================================================================
--- linux.orig/include/asm-x86_64/system.h
+++ linux/include/asm-x86_64/system.h
@@ -309,22 +309,30 @@ static inline unsigned long __cmpxchg(vo
 #define warn_if_not_ulong(x) do { unsigned long foo; (void) (&(x) == &foo); } while (0)
 
 /* interrupt control.. */
-#define local_save_flags(x)	do { warn_if_not_ulong(x); __asm__ __volatile__("# save_flags \n\t pushfq ; popq %q0":"=g" (x): /* no input */ :"memory"); } while (0)
-#define local_irq_restore(x) 	__asm__ __volatile__("# restore_flags \n\t pushq %0 ; popfq": /* no output */ :"g" (x):"memory", "cc")
-#define local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
-#define local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
+
+#define __raw_local_save_flags(x)	do { warn_if_not_ulong(x); __asm__ __volatile__("# save_flags \n\t pushfq ; popq %q0":"=g" (x): /* no input */ :"memory"); } while (0)
+#define __raw_local_irq_restore(x) 	__asm__ __volatile__("# restore_flags \n\t pushq %0 ; popfq": /* no output */ :"g" (x):"memory", "cc")
+#define __raw_local_irq_disable() 	__asm__ __volatile__("cli": : :"memory")
+#define __raw_local_irq_enable()	__asm__ __volatile__("sti": : :"memory")
 /* used in the idle loop; sti takes one instruction cycle to complete */
-#define safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+#define __raw_safe_halt()		__asm__ __volatile__("sti; hlt": : :"memory")
+
+#define __raw_irqs_disabled_flags(flags)	\
+({						\
+	!((flags) & (1<<9)); /* true when bit 9 of the saved flags word is clear */ \
+})
 
-#define irqs_disabled()			\
-({					\
-	unsigned long flags;		\
-	local_save_flags(flags);	\
-	!(flags & (1<<9));		\
+#define __raw_irqs_disabled()			\
+({						\
+	unsigned long flags;			\
+	__raw_local_save_flags(flags);		\
+	__raw_irqs_disabled_flags(flags);		\
 })
 
 /* For spinlocks etc */
-#define local_irq_save(x) 	do { warn_if_not_ulong(x); __asm__ __volatile__("# local_irq_save \n\t pushfq ; popq %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0)
+#define __raw_local_irq_save(x) 	do { warn_if_not_ulong(x); __asm__ __volatile__("# __raw_local_irq_save \n\t pushfq ; popq %0 ; cli":"=g" (x): /* no input */ :"memory"); } while (0)
+
+#include <linux/rt_irq.h>
 
 void cpu_idle_wait(void);
 
Index: linux/include/asm-x86_64/thread_info.h
===================================================================
--- linux.orig/include/asm-x86_64/thread_info.h
+++ linux/include/asm-x86_64/thread_info.h
@@ -101,6 +101,7 @@ static inline struct thread_info *stack_
 #define TIF_NEED_RESCHED	3	/* rescheduling necessary */
 #define TIF_SINGLESTEP		4	/* reenable singlestep on user return*/
 #define TIF_IRET		5	/* force IRET */
+#define TIF_NEED_RESCHED_DELAYED 6	/* reschedule on return to userspace */
 #define TIF_SYSCALL_AUDIT	7	/* syscall auditing active */
 #define TIF_SECCOMP		8	/* secure computing */
 #define TIF_POLLING_NRFLAG	16	/* true if poll_idle() is polling TIF_NEED_RESCHED */
@@ -117,6 +118,7 @@ static inline struct thread_info *stack_
 #define _TIF_IRET		(1<<TIF_IRET)
 #define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP		(1<<TIF_SECCOMP)
+#define _TIF_NEED_RESCHED_DELAYED	(1<<TIF_NEED_RESCHED_DELAYED)
 #define _TIF_POLLING_NRFLAG	(1<<TIF_POLLING_NRFLAG)
 #define _TIF_IA32		(1<<TIF_IA32)
 #define _TIF_FORK		(1<<TIF_FORK)
Index: linux/include/asm-x86_64/timeofday.h
===================================================================
--- /dev/null
+++ linux/include/asm-x86_64/timeofday.h
@@ -0,0 +1,4 @@
+#ifndef _ASM_X86_64_TIMEOFDAY_H
+#define _ASM_X86_64_TIMEOFDAY_H
+#include <asm-generic/timeofday.h>
+#endif
Index: linux/include/asm-x86_64/timex.h
===================================================================
--- linux.orig/include/asm-x86_64/timex.h
+++ linux/include/asm-x86_64/timex.h
@@ -24,8 +24,15 @@ static inline cycles_t get_cycles (void)
 }
 
 extern unsigned int cpu_khz;
+extern unsigned int tsc_khz;
+extern void tsc_c3_compensate(unsigned long usecs);
 
 extern int read_current_timer(unsigned long *timer_value);
+/*
+ * On an Athlon64 the cycles-based estimator is off by a
+ * factor of 2: udelay(100) takes 200 usecs. With the non-TSC
+ * based estimator the timings are precise. So turn it off.
+ */
 #define ARCH_HAS_READ_CURRENT_TIMER	1
 
 extern struct vxtime_data vxtime;
Index: linux/include/asm-x86_64/tlbflush.h
===================================================================
--- linux.orig/include/asm-x86_64/tlbflush.h
+++ linux/include/asm-x86_64/tlbflush.h
@@ -9,11 +9,13 @@
 	do {								\
 		unsigned long tmpreg;					\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movq %%cr3, %0;  # flush TLB \n"		\
 			"movq %0, %%cr3;              \n"		\
 			: "=r" (tmpreg)					\
 			:: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 /*
@@ -24,6 +26,7 @@
 	do {								\
 		unsigned long tmpreg, cr4, cr4_orig;			\
 									\
+		preempt_disable();					\
 		__asm__ __volatile__(					\
 			"movq %%cr4, %2;  # turn off PGE     \n"	\
 			"movq %2, %1;                        \n"	\
@@ -35,6 +38,7 @@
 			: "=&r" (tmpreg), "=&r" (cr4), "=&r" (cr4_orig)	\
 			: "i" (~X86_CR4_PGE)				\
 			: "memory");					\
+		preempt_enable();					\
 	} while (0)
 
 extern unsigned long pgkern_mask;
Index: linux/include/asm-x86_64/unistd.h
===================================================================
--- linux.orig/include/asm-x86_64/unistd.h
+++ linux/include/asm-x86_64/unistd.h
@@ -573,6 +573,7 @@ __SYSCALL(__NR_inotify_add_watch, sys_in
 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
 
 #define __NR_syscall_max __NR_inotify_rm_watch
+#define NR_syscalls	(__NR_syscall_max+1)	/* one more than the highest syscall number */
 #ifndef __NO_STUBS
 
 /* user-visible error numbers are in the range -1 - -4095 */
Index: linux/include/asm-x86_64/vsyscall.h
===================================================================
--- linux.orig/include/asm-x86_64/vsyscall.h
+++ linux/include/asm-x86_64/vsyscall.h
@@ -14,7 +14,7 @@ enum vsyscall_num {
 #define VSYSCALL_ADDR(vsyscall_nr) (VSYSCALL_START+VSYSCALL_SIZE*(vsyscall_nr))
 
 #ifdef __KERNEL__
-
+/* XXX - All of these are unused w/ CONFIG_GENERIC_TIME and should be removed */
 #define __section_vxtime __attribute__ ((unused, __section__ (".vxtime"), aligned(16)))
 #define __section_wall_jiffies __attribute__ ((unused, __section__ (".wall_jiffies"), aligned(16)))
 #define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
@@ -23,6 +23,12 @@ enum vsyscall_num {
 #define __section_xtime __attribute__ ((unused, __section__ (".xtime"), aligned(16)))
 #define __section_xtime_lock __attribute__ ((unused, __section__ (".xtime_lock"), aligned(16)))
 
+/* Section attributes for CONFIG_GENERIC_TIME */
+#define __section_vsyscall_gtod_data __attribute__ ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
+#define __section_vsyscall_gtod_lock __attribute__ ((unused, __section__ (".vsyscall_gtod_lock"),aligned(16)))
+#define __vsyscall_fn __attribute__ ((unused,__section__(".vsyscall_fn")))
+#define __vsyscall_data __attribute__ ((unused,__section__(".vsyscall_data")))
+
 #define VXTIME_TSC	1
 #define VXTIME_HPET	2
 #define VXTIME_PMTMR	3
@@ -45,14 +51,14 @@ extern struct timespec __xtime;
 extern volatile unsigned long __jiffies;
 extern unsigned long __wall_jiffies;
 extern struct timezone __sys_tz;
-extern seqlock_t __xtime_lock;
+extern raw_seqlock_t __xtime_lock;
 
 /* kernel space (writeable) */
 extern struct vxtime_data vxtime;
 extern unsigned long wall_jiffies;
 extern struct timezone sys_tz;
 extern int sysctl_vsyscall;
-extern seqlock_t xtime_lock;
+extern raw_seqlock_t xtime_lock;
 
 extern int sysctl_vsyscall;
 
Index: linux/include/linux/bit_spinlock.h
===================================================================
--- linux.orig/include/linux/bit_spinlock.h
+++ linux/include/linux/bit_spinlock.h
@@ -1,6 +1,8 @@
 #ifndef __LINUX_BIT_SPINLOCK_H
 #define __LINUX_BIT_SPINLOCK_H
 
+#if 0
+
 /*
  *  bit-based spin_lock()
  *
@@ -73,5 +75,7 @@ static inline int bit_spin_is_locked(int
 #endif
 }
 
+#endif
+
 #endif /* __LINUX_BIT_SPINLOCK_H */
 
Index: linux/include/linux/buffer_head.h
===================================================================
--- linux.orig/include/linux/buffer_head.h
+++ linux/include/linux/buffer_head.h
@@ -19,10 +19,6 @@ enum bh_state_bits {
 	BH_Dirty,	/* Is dirty */
 	BH_Lock,	/* Is locked */
 	BH_Req,		/* Has been submitted for I/O */
-	BH_Uptodate_Lock,/* Used by the first bh in a page, to serialise
-			  * IO completion of other buffers in the page
-			  */
-
 	BH_Mapped,	/* Has a disk mapping */
 	BH_New,		/* Disk mapping was newly created by get_block */
 	BH_Async_Read,	/* Is under end_buffer_async_read I/O */
@@ -65,6 +61,8 @@ struct buffer_head {
 	bh_end_io_t *b_end_io;		/* I/O completion */
  	void *b_private;		/* reserved for b_end_io */
 	struct list_head b_assoc_buffers; /* associated with another mapping */
+	spinlock_t b_uptodate_lock;
+	spinlock_t b_state_lock;
 };
 
 /*
Index: linux/include/linux/calc64.h
===================================================================
--- /dev/null
+++ linux/include/linux/calc64.h
@@ -0,0 +1,49 @@
+#ifndef _linux_CALC64_H
+#define _linux_CALC64_H
+
+#include <linux/types.h>
+#include <asm/div64.h>
+
+/*
+ * div_long_long_rem was moved out of jiffies.h as it is
+ * a general math function useful for other things than
+ * jiffy code.
+ *
+ * This is a generic macro which is used when the architecture
+ * specific div64.h does not provide an optimized one.
+ *
+ * The 64bit dividend is divided by the divisor (data type long), the
+ * result is returned and the remainder stored in the variable
+ * referenced by remainder (data type long *). In contrast to the
+ * do_div macro the dividend is kept intact.
+ */
+#ifndef div_long_long_rem
+#define div_long_long_rem(dividend,divisor,remainder) 	\
+({	/* local copy keeps the caller's dividend intact */	\
+	u64 __dllr_quot = (dividend);			\
+	*(remainder) = do_div(__dllr_quot, (divisor));	\
+	__dllr_quot;	/* value of the expression: the quotient */ \
+})
+#endif
+
+/*
+ * Sign-aware variation of the above. On some architectures a
+ * negative dividend leads to a divide overflow exception, which
+ * is avoided by the sign check.
+ */
+static inline long div_long_long_rem_signed(long long dividend,
+					    long divisor,
+					    long *remainder)
+{
+	long res;
+
+	if (unlikely(dividend < 0)) {
+		res = -div_long_long_rem(-dividend, divisor, remainder);
+		*remainder = -(*remainder);
+	} else {
+		res = div_long_long_rem(dividend, divisor, remainder);
+	}
+	return res;
+}
+
+#endif
Index: linux/include/linux/clockchips.h
===================================================================
--- /dev/null
+++ linux/include/linux/clockchips.h
@@ -0,0 +1,127 @@
+/*  linux/include/linux/clockchips.h
+ *
+ *  This file contains the structure definitions for clockchips.
+ *
+ *  If you are not a clockchip, or the time of day code, you should
+ *  not be including this file!
+ */
+#ifndef _LINUX_CLOCKCHIPS_H
+#define _LINUX_CLOCKCHIPS_H
+
+#include <linux/config.h>
+
+#ifdef CONFIG_GENERIC_TIME
+
+#include <linux/clocksource.h>
+#include <linux/interrupt.h>
+
+/* Clock event modes and commands */
+enum {
+	CLOCK_EVT_NONE,
+	CLOCK_EVT_STARTUP,
+	CLOCK_EVT_PERIODIC,
+	CLOCK_EVT_ONESHOT,
+	CLOCK_EVT_IPI,
+	CLOCK_EVT_STOP,
+	CLOCK_EVT_SHUTDOWN,
+	CLOCK_EVT_RUN_CYCLIC,
+	CLOCK_EVT_SCHEDTICK,
+	CLOCK_EVT_NOTICK,
+};
+
+/* Clock event capability flags */
+#define CLOCK_CAP_TICK		0x000001
+
+#if defined(CONFIG_HIGH_RES_TIMERS) || defined(CONFIG_DYNTICK)
+#define CLOCK_CAP_NEXTEVT	0x000002
+#else
+#define CLOCK_CAP_NEXTEVT	0x000000
+#endif
+
+#define CLOCK_CAP_UPDATE	0x000004
+
+#ifndef CONFIG_PROFILE_NMI
+#define CLOCK_CAP_PROFILE	0x000008
+#else
+#define CLOCK_CAP_PROFILE	0x000000
+#endif
+
+#define CLOCK_CAP_MASK		(CLOCK_CAP_TICK | CLOCK_CAP_NEXTEVT | CLOCK_CAP_PROFILE | CLOCK_CAP_UPDATE)
+
+/* The device has its own interrupt handler */
+#define CLOCK_HAS_IRQHANDLER	0x010000
+
+struct clock_event;
+
+/**
+ * struct clock_event - clock event descriptor
+ *
+ * @name:		ptr to clock event name
+ * @capabilities:	capabilities of the event chip
+ * @max_delta_ns:	maximum delta value in ns
+ * @min_delta_ns:	minimum delta value in ns
+ * @mult:		nanosecond to cycles multiplier
+ * @shift:		nanoseconds to cycles divisor (power of two)
+ * @set_next_event:	set next event
+ * @set_mode:		set mode function
+ * @suspend:		suspend function (optional)
+ * @resume:		resume function (optional)
+ * @event_handler:	Assigned by the framework to be called by the low
+ *			level handler of the event source
+ * @start_event:	called on entry (optional for chip handling...)
+ * @end_event:		called on exit (optional for chip handling...)
+ * @priv:		private device data
+ */
+struct clock_event {
+	const char* name;
+	unsigned int capabilities;
+	unsigned long max_delta_ns;
+	unsigned long min_delta_ns;
+	u32 mult;
+	u32 shift;
+	void (*set_next_event)(unsigned long evt);
+	void (*set_mode)(int mode);
+	int (*suspend)(void);
+	int (*resume)(void);
+	void (*event_handler)(struct pt_regs *regs);
+	void (*start_event)(void *priv);
+	void (*end_event)(void *priv);
+	unsigned int irq;
+	void *priv;
+};
+
+
+
+/*
+ * Calculate a multiplication factor with shift=32
+ */
+static inline unsigned long div_sc32(unsigned long a, unsigned long b)
+{
+	u64 tmp = ((u64)a) << 32;
+	do_div(tmp, b);
+	return (unsigned long) tmp;
+}
+
+static inline unsigned long mpy_sc32(unsigned long a, unsigned long b)
+{
+	u64 res = (u64) a * b;
+
+	return (unsigned long) (res >> 32);
+}
+
+/* Clock event layer functions */
+extern int setup_local_clockevent(struct clock_event *, cpumask_t cpumask);
+extern int setup_global_clockevent(struct clock_event *, cpumask_t cpumask);
+extern unsigned long clockevent_delta2ns(unsigned long latch, struct clock_event *evt);
+extern void init_clockevents(void);
+
+extern int clockevents_init_next_event(void);
+extern int clockevents_set_next_event(ktime_t expires, ktime_t now);
+extern void clockevents_trigger_next_event(void);
+extern int clockevents_next_event_available(void);
+
+#else
+# define init_clockevents() do { } while(0)
+#endif
+
+#endif
Index: linux/include/linux/clocksource.h
===================================================================
--- /dev/null
+++ linux/include/linux/clocksource.h
@@ -0,0 +1,304 @@
+/*  linux/include/linux/clocksource.h
+ *
+ *  This file contains the structure definitions for clocksources.
+ *
+ *  If you are not a clocksource, or the time of day code, you should
+ *  not be including this file!
+ */
+#ifndef _LINUX_CLOCKSOURCE_H
+#define _LINUX_CLOCKSOURCE_H
+
+#include <linux/types.h>
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <linux/list.h>
+#include <asm/div64.h>
+#include <asm/io.h>
+
+/**
+ * struct clocksource - hardware abstraction for a free running counter
+ *	Provides mostly state-free accessors to the underlying hardware.
+ *
+ * @name:		ptr to clocksource name
+ * @list:		list head for registration
+ * @rating:		rating value for selection (higher is better)
+ *			To avoid rating inflation the following
+ *			list should give you a guide as to how
+ *			to assign your clocksource a rating
+ *			1-99: Unfit for real use
+ *				Only available for bootup and testing purposes.
+ *			100-199: Base level usability.
+ *				Functional for real use, but not desired.
+ *			200-299: Good.
+ *				A correct and usable clocksource.
+ *			300-399: Desired.
+ *				A reasonably fast and accurate clocksource.
+ *			400-499: Perfect
+ *				The ideal clocksource. A must-use where
+ *				available.
+ * @read:		returns a cycle value
+ * @mask:		bitmask for two's complement
+ *			subtraction of non 64 bit counters
+ * @mult:		cycle to nanosecond multiplier
+ * @shift:		cycle to nanosecond divisor (power of two)
+ * @update_callback:	called when safe to alter clocksource values
+ * @is_continuous:	defines if clocksource is free-running.
+ * @vread:		vsyscall read function
+ * @vdata:		vsyscall data value passed to read function
+ */
+struct clocksource {
+	char *name;
+	struct list_head list;
+	int rating;
+	cycle_t (*read)(void);
+	cycle_t mask;
+	u32 mult;
+	u32 shift;
+	int (*update_callback)(void);
+	int is_continuous;
+	cycle_t (*vread)(void *);
+	void *vdata;
+};
+
+
+/**
+ * clocksource_khz2mult - calculates mult from khz and shift
+ * @khz:		Clocksource frequency in kHz
+ * @shift_constant:	Clocksource shift factor
+ *
+ * Helper function that converts a counter frequency given in kHz into
+ * the clocksource multiplier that matches the supplied shift value.
+ */
+static inline u32 clocksource_khz2mult(u32 khz, u32 shift_constant)
+{
+	/*
+	 * One kHz is one cycle per million nanoseconds and
+	 * mult / 2^shift is the number of nanoseconds per cycle, so:
+	 *
+	 *	mult = (1000000 / khz) * 2^shift
+	 *	     = (1000000 << shift) / khz
+	 */
+	u64 scaled = (u64)1000000 << shift_constant;
+
+	scaled += khz / 2;	/* bias so the truncating divide rounds */
+	do_div(scaled, khz);
+
+	return (u32)scaled;
+}
+
+/**
+ * clocksource_hz2mult - calculates mult from hz and shift
+ * @hz:			Clocksource frequency in Hz
+ * @shift_constant:	Clocksource shift factor
+ *
+ * Helper function that converts a counter frequency
+ * given in Hz into the clocksource multiplier that
+ * matches the supplied shift value.
+ */
+static inline u32 clocksource_hz2mult(u32 hz, u32 shift_constant)
+{
+	/*
+	 * One Hz is one cycle per billion nanoseconds and
+	 * mult / 2^shift is the number of nanoseconds per cycle, so:
+	 *
+	 *	mult = (1000000000 / hz) * 2^shift
+	 *	     = (1000000000 << shift) / hz
+	 */
+	u64 scaled = (u64)1000000000 << shift_constant;
+
+	scaled += hz / 2;	/* bias so the truncating divide rounds */
+	do_div(scaled, hz);
+
+	return (u32)scaled;
+}
+
+/**
+ * read_clocksource - Access the clocksource's current cycle value
+ * @cs:		pointer to clocksource being read
+ *
+ * Returns the cycle_t value produced by the clocksource's ->read() hook
+ */
+static inline cycle_t read_clocksource(struct clocksource *cs)
+{
+	return cs->read();
+}
+
+/**
+ * ppm_to_mult_adj - Converts shifted ppm values to mult adjustment
+ * @cs:		Pointer to clocksource
+ * @ppm:	Shifted PPM value
+ *
+ * Turns a shifted ppm value into the matching clocksource mult_adj value.
+ *
+ * XXX - this could use some optimization
+ */
+static inline int ppm_to_mult_adj(struct clocksource *cs, int ppm)
+{
+	u64 scaled;
+	int magnitude;
+
+	/*
+	 * The scaled time is
+	 *     cyc * mult/2^shift * (1 + ppm/MILL)
+	 * Folding the ppm factor into the multiplier up front saves
+	 * a multiplication on every conversion:
+	 *     cyc * (mult/2^shift + (mult/2^shift) * (ppm/MILL)) =
+	 *     cyc * (mult/2^shift + (mult*ppm/MILL)/2^shift) =
+	 *     cyc * (mult + (mult*ppm/MILL))/2^shift
+	 * so the value to calculate is mult*ppm/MILL.
+	 */
+	scaled = abs(ppm);
+	scaled *= cs->mult;
+	scaled >>= SHIFT_USEC;
+	scaled += 1000000 / 2;	/* round for the divide */
+	do_div(scaled, 1000000);
+
+	/* the math above ran on |ppm|, so restore the sign here */
+	magnitude = (int)scaled;
+
+	return ppm < 0 ? -magnitude : magnitude;
+}
+
+/**
+ * cyc2ns - converts clocksource cycles to nanoseconds
+ * @cs:		Pointer to clocksource
+ * @ntp_adj:	Multiplier adjustment value
+ * @cycles:	Cycles
+ *
+ * Converts a cycle_t count to nanoseconds using cs and the ntp adjustment.
+ *
+ * XXX - This could use some mult_lxl_ll() asm optimization
+ */
+static inline nsec_t cyc2ns(struct clocksource *cs, int ntp_adj, cycle_t cycles)
+{
+	/* scale the cycle count by the adjusted multiplier ... */
+	u64 scaled = (u64)cycles * (cs->mult + ntp_adj);
+
+	/* ... and drop the fractional bits */
+	scaled >>= cs->shift;
+	return (nsec_t)scaled;
+}
+
+/**
+ * cyc2ns_rem - converts clocksource cycles to nanoseconds w/ remainder
+ * @cs:		Pointer to clocksource
+ * @ntp_adj:	Multiplier adjustment value
+ * @cycles:	Cycles
+ * @rem:	Remainder in (ns<<cs->shift) units; may be NULL
+ *
+ * Uses the clocksource and ntp adjustment to convert cycle_t to nanoseconds.
+ * If @rem is given, the stored sub-nanosecond remainder is added in before
+ * the shift and the new remainder is saved back through the pointer.
+ *
+ * XXX - This could use some mult_lxl_ll() asm optimization.
+ */
+static inline nsec_t cyc2ns_rem(struct clocksource *cs, int ntp_adj,
+				cycle_t cycles, u64 *rem)
+{
+	u64 ret = (u64)cycles * (cs->mult + ntp_adj);
+
+	if (rem) {
+		ret += *rem;
+		/* u64 mask: an int "1 << shift" overflows for shift >= 31 */
+		*rem = ret & (((u64)1 << cs->shift) - 1);
+	}
+	ret >>= cs->shift;
+
+	return (nsec_t)ret;
+}
+
+
+/**
+ * struct clocksource_interval - Fixed interval conversion structure
+ *
+ * @cycles:	A specified number of cycles
+ * @nsecs:	The number of nanoseconds equivalent to the cycles value
+ * @remainder:	Non-integer nanosecond remainder stored in (ns<<cs->shift) units
+ * @remainder_ns_overflow:	Value at which the remainder is equal to
+ *				one nanosecond
+ *
+ * This is an optimization structure used by cyc2ns_fixed_rem() to avoid the
+ * multiply in cyc2ns().
+ *
+ * Unless you're the timeofday_periodic_hook, you should not be using this!
+ */
+struct clocksource_interval {
+	cycle_t cycles;
+	nsec_t nsecs;
+	u64 remainder;
+	u64 remainder_ns_overflow;
+};
+
+/**
+ * calculate_clocksource_interval - Calculates a clocksource interval struct
+ *
+ * @c:		Pointer to clocksource.
+ * @adj:	Multiplier adjustment.
+ * @length_nsec: Desired interval length in nanoseconds.
+ *
+ * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment
+ * pair and interval request. The interval is at least one cycle long.
+ *
+ * Unless you're the timeofday_periodic_hook, you should not be using this!
+ */
+static inline struct clocksource_interval
+calculate_clocksource_interval(struct clocksource *c, long adj,
+			       unsigned long length_nsec)
+{
+	struct clocksource_interval ret;
+	u64 tmp;
+
+	/* XXX - All of this could use a whole lot of optimization */
+	tmp = (u64)length_nsec << c->shift;
+	do_div(tmp, c->mult+adj);
+
+	ret.cycles = (cycle_t)tmp;
+	if (ret.cycles == 0)
+		ret.cycles = 1;
+
+	ret.remainder = 0;
+	/* one ns in (ns<<shift) units; an int "1 << shift" overflows early */
+	ret.remainder_ns_overflow = (u64)1 << c->shift;
+	ret.nsecs = cyc2ns_rem(c, adj, ret.cycles, &ret.remainder);
+
+	return ret;
+}
+
+/**
+ * cyc2ns_fixed_rem -
+ *	converts clocksource cycles to nanoseconds using fixed intervals
+ *
+ * @interval:	precalculated clocksource_interval structure
+ * @cycles:	Number of clocksource cycles
+ * @rem:	Remainder
+ *
+ * Converts cycles to nanoseconds in steps of a precalculated fixed
+ * cycle/nsec interval. The cycles that were not accumulated are left
+ * at @cycles, and the value at @rem is used and updated along the way.
+ *
+ * Unless you're the timeofday_periodic_hook, you should not be using this!
+ */
+static inline nsec_t cyc2ns_fixed_rem(struct clocksource_interval interval,
+				      cycle_t *cycles, u64* rem)
+{
+	nsec_t total_ns = 0;
+
+	/* consume one precalculated interval per iteration */
+	while (*cycles > interval.cycles) {
+		total_ns += interval.nsecs;
+		*cycles -= interval.cycles;
+		*rem += interval.remainder;
+		/* move accumulated whole nanoseconds out of the remainder */
+		for (; *rem > interval.remainder_ns_overflow; total_ns++)
+			*rem -= interval.remainder_ns_overflow;
+	}
+
+	return total_ns;
+}
+
+/* used to install a new clocksource */
+void register_clocksource(struct clocksource*);
+void reselect_clocksource(void);
+struct clocksource* get_next_clocksource(void);
+
+#endif /* _LINUX_CLOCKSOURCE_H */
Index: linux/include/linux/completion.h
===================================================================
--- linux.orig/include/linux/completion.h
+++ linux/include/linux/completion.h
@@ -33,6 +33,7 @@ extern unsigned long FASTCALL(wait_for_c
 						   unsigned long timeout));
 extern unsigned long FASTCALL(wait_for_completion_interruptible_timeout(
 			struct completion *x, unsigned long timeout));
+extern unsigned int FASTCALL(completion_done(struct completion *x));
 
 extern void FASTCALL(complete(struct completion *));
 extern void FASTCALL(complete_all(struct completion *));
Index: linux/include/linux/console.h
===================================================================
--- linux.orig/include/linux/console.h
+++ linux/include/linux/console.h
@@ -54,6 +54,7 @@ struct consw {
 	void	(*con_invert_region)(struct vc_data *, u16 *, int);
 	u16    *(*con_screen_pos)(struct vc_data *, int);
 	unsigned long (*con_getxy)(struct vc_data *, unsigned long, int *, int *);
+	int	con_preemptible; // can it reschedule from within printk?
 };
 
 extern const struct consw *conswitchp;
Index: linux/include/linux/delay.h
===================================================================
--- linux.orig/include/linux/delay.h
+++ linux/include/linux/delay.h
@@ -38,9 +38,9 @@ extern unsigned long loops_per_jiffy;
 #define ndelay(x)	udelay(((x)+999)/1000)
 #endif
 
-void calibrate_delay(void);
-void msleep(unsigned int msecs);
-unsigned long msleep_interruptible(unsigned int msecs);
+extern void calibrate_delay(void);
+extern void msleep(unsigned int msecs);
+extern unsigned long msleep_interruptible(unsigned int msecs);
 
 static inline void ssleep(unsigned int seconds)
 {
Index: linux/include/linux/fs.h
===================================================================
--- linux.orig/include/linux/fs.h
+++ linux/include/linux/fs.h
@@ -345,6 +345,8 @@ struct address_space {
 	spinlock_t		private_lock;	/* for use by the address_space */
 	struct list_head	private_list;	/* ditto */
 	struct address_space	*assoc_mapping;	/* ditto */
+	struct list_head	robust_list;	/* list of robust futexes */
+	struct semaphore	robust_sem;	/* protect list of robust futexes */
 } __attribute__((aligned(sizeof(long))));
 	/*
 	 * On most architectures that alignment is already the case; but
Index: linux/include/linux/fs_struct.h
===================================================================
--- linux.orig/include/linux/fs_struct.h
+++ linux/include/linux/fs_struct.h
@@ -12,9 +12,9 @@ struct fs_struct {
 	struct vfsmount * rootmnt, * pwdmnt, * altrootmnt;
 };
 
-#define INIT_FS {				\
-	.count		= ATOMIC_INIT(1),	\
-	.lock		= RW_LOCK_UNLOCKED,	\
+#define INIT_FS(name) {					\
+	.count		= ATOMIC_INIT(1),		\
+	.lock		= RW_LOCK_UNLOCKED(name.lock),	\
 	.umask		= 0022, \
 }
 
Index: linux/include/linux/futex.h
===================================================================
--- linux.orig/include/linux/futex.h
+++ linux/include/linux/futex.h
@@ -1,8 +1,9 @@
 #ifndef _LINUX_FUTEX_H
 #define _LINUX_FUTEX_H
 
-/* Second argument to futex syscall */
+#include <linux/fs.h>
 
+/* Second argument to futex syscall */
 
 #define FUTEX_WAIT		0
 #define FUTEX_WAKE		1
@@ -10,10 +11,42 @@
 #define FUTEX_REQUEUE		3
 #define FUTEX_CMP_REQUEUE	4
 #define FUTEX_WAKE_OP		5
+#define FUTEX_WAIT_ROBUST	6
+#define FUTEX_WAKE_ROBUST	7
+#define FUTEX_REGISTER		8
+#define FUTEX_DEREGISTER	9
+#define FUTEX_RECOVER		10
+
+#define FUTEX_ATTR_PRIORITY_QUEUING		0x10000000
+#define FUTEX_ATTR_PRIORITY_INHERITANCE		0x20000000
+#define FUTEX_ATTR_PRIORITY_PROTECT		0x40000000
+#define FUTEX_ATTR_ROBUST			0x80000000
+#define FUTEX_ATTR_SHARED			0x01000000
+#define FUTEX_ATTR_MASK				0xff000000
+
+#define FUTEX_WAITERS                         0x80000000
+#define FUTEX_OWNER_DIED                      0x40000000
+#define FUTEX_NOT_RECOVERABLE                 0x20000000
+#define FUTEX_FLAGS (FUTEX_WAITERS | FUTEX_OWNER_DIED | FUTEX_NOT_RECOVERABLE)
+#define FUTEX_PID                             ~(FUTEX_FLAGS)
+
+#ifdef __KERNEL__
 
 long do_futex(unsigned long uaddr, int op, int val,
 		unsigned long timeout, unsigned long uaddr2, int val2,
 		int val3);
+#ifdef CONFIG_FUTEX
+extern void futex_free_robust_list(struct inode *inode);
+extern void exit_futex(struct task_struct *tsk);
+#else
+# define futex_free_robust_list(a)	do { } while (0)
+# define exit_futex(b)			do { } while (0)
+#endif
+static inline void futex_init_inode(struct inode *inode)
+{
+	INIT_LIST_HEAD(&inode->i_data.robust_list);
+	init_MUTEX(&inode->i_data.robust_sem);
+}
 
 #define FUTEX_OP_SET		0	/* *(int *)UADDR2 = OPARG; */
 #define FUTEX_OP_ADD		1	/* *(int *)UADDR2 += OPARG; */
@@ -41,3 +74,4 @@ long do_futex(unsigned long uaddr, int o
    | ((oparg & 0xfff) << 12) | (cmparg & 0xfff))
 
 #endif
+#endif
Index: linux/include/linux/genhd.h
===================================================================
--- linux.orig/include/linux/genhd.h
+++ linux/include/linux/genhd.h
@@ -141,18 +141,26 @@ struct disk_attribute {
  * variants disable/enable preemption.
  */
 #ifdef	CONFIG_SMP
-#define __disk_stat_add(gendiskp, field, addnd) 	\
-	(per_cpu_ptr(gendiskp->dkstats, smp_processor_id())->field += addnd)
+#define __disk_stat_add(gendiskp, field, addnd)			\
+do {								\
+	preempt_disable();					\
+	(per_cpu_ptr(gendiskp->dkstats,				\
+			smp_processor_id())->field += addnd);	\
+	preempt_enable();					\
+} while (0)
+
 
 #define disk_stat_read(gendiskp, field)					\
 ({									\
 	typeof(gendiskp->dkstats->field) res = 0;			\
 	int i;								\
+	preempt_disable();						\
 	for (i=0; i < NR_CPUS; i++) {					\
 		if (!cpu_possible(i))					\
 			continue;					\
 		res += per_cpu_ptr(gendiskp->dkstats, i)->field;	\
 	}								\
+	preempt_enable();						\
 	res;								\
 })
 
Index: linux/include/linux/hardirq.h
===================================================================
--- linux.orig/include/linux/hardirq.h
+++ linux/include/linux/hardirq.h
@@ -17,17 +17,18 @@
  * The hardirq count can be overridden per architecture, the default is:
  *
  * - bits 16-27 are the hardirq count (max # of hardirqs: 4096)
- * - ( bit 28 is the PREEMPT_ACTIVE flag. )
+ * - bit 28 is the PREEMPT_ACTIVE flag
  *
- * PREEMPT_MASK: 0x000000ff
- * SOFTIRQ_MASK: 0x0000ff00
- * HARDIRQ_MASK: 0x0fff0000
+ * PREEMPT_MASK:         0x000000ff
+ * SOFTIRQ_MASK:         0x0000ff00
+ * HARDIRQ_MASK:         0x0fff0000
+ * PREEMPT_ACTIVE_MASK:  0x10000000
  */
-#define PREEMPT_BITS	8
-#define SOFTIRQ_BITS	8
-
+#define PREEMPT_BITS		8
+#define SOFTIRQ_BITS		8
 #ifndef HARDIRQ_BITS
-#define HARDIRQ_BITS	12
+#define HARDIRQ_BITS		12
+
 /*
  * The hardirq mask has to be large enough to have space for potentially
  * all IRQ sources in the system nesting on a single CPU.
@@ -36,38 +37,43 @@
 # error HARDIRQ_BITS is too low!
 #endif
 #endif
+#define PREEMPT_ACTIVE_BITS	1
 
-#define PREEMPT_SHIFT	0
-#define SOFTIRQ_SHIFT	(PREEMPT_SHIFT + PREEMPT_BITS)
-#define HARDIRQ_SHIFT	(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
-
-#define __IRQ_MASK(x)	((1UL << (x))-1)
-
-#define PREEMPT_MASK	(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
-#define SOFTIRQ_MASK	(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
-#define HARDIRQ_MASK	(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
-
-#define PREEMPT_OFFSET	(1UL << PREEMPT_SHIFT)
-#define SOFTIRQ_OFFSET	(1UL << SOFTIRQ_SHIFT)
-#define HARDIRQ_OFFSET	(1UL << HARDIRQ_SHIFT)
+#define PREEMPT_SHIFT		0
+#define SOFTIRQ_SHIFT		(PREEMPT_SHIFT + PREEMPT_BITS)
+#define HARDIRQ_SHIFT		(SOFTIRQ_SHIFT + SOFTIRQ_BITS)
+#define PREEMPT_ACTIVE_SHIFT	(HARDIRQ_SHIFT + HARDIRQ_BITS)
+
+#define __IRQ_MASK(x)		((1UL << (x))-1)
+
+#define PREEMPT_MASK		(__IRQ_MASK(PREEMPT_BITS) << PREEMPT_SHIFT)
+#define SOFTIRQ_MASK		(__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
+#define HARDIRQ_MASK		(__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
+
+#define PREEMPT_OFFSET		(1UL << PREEMPT_SHIFT)
+#define SOFTIRQ_OFFSET		(1UL << SOFTIRQ_SHIFT)
+#define HARDIRQ_OFFSET		(1UL << HARDIRQ_SHIFT)
 
 #if PREEMPT_ACTIVE < (1 << (HARDIRQ_SHIFT + HARDIRQ_BITS))
-#error PREEMPT_ACTIVE is too low!
+# error PREEMPT_ACTIVE is too low!
 #endif
 
 #define hardirq_count()	(preempt_count() & HARDIRQ_MASK)
 #define softirq_count()	(preempt_count() & SOFTIRQ_MASK)
 #define irq_count()	(preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK))
+#define irqs_off()	(current->flags & PF_IRQSOFF)
 
 /*
  * Are we doing bottom half or hardware interrupt processing?
  * Are we in a softirq context? Interrupt context?
  */
-#define in_irq()		(hardirq_count())
-#define in_softirq()		(softirq_count())
-#define in_interrupt()		(irq_count())
-
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+#define in_irq()	(hardirq_count() || (current->flags & PF_HARDIRQ))
+#define in_softirq()	(softirq_count() || (current->flags & PF_SOFTIRQ))
+#define in_interrupt()	(irq_count())
+
+#if defined(CONFIG_PREEMPT) && \
+	!defined(CONFIG_PREEMPT_BKL) && \
+		!defined(CONFIG_PREEMPT_RT)
 # define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != kernel_locked())
 #else
 # define in_atomic()	((preempt_count() & ~PREEMPT_ACTIVE) != 0)
@@ -87,8 +93,8 @@ extern void synchronize_irq(unsigned int
 # define synchronize_irq(irq)	barrier()
 #endif
 
-#define nmi_enter()		irq_enter()
-#define nmi_exit()		sub_preempt_count(HARDIRQ_OFFSET)
+#define nmi_enter()		/* irq_enter() */
+#define nmi_exit()		/* sub_preempt_count(HARDIRQ_OFFSET) */
 
 #ifndef CONFIG_VIRT_CPU_ACCOUNTING
 static inline void account_user_vtime(struct task_struct *tsk)
Index: linux/include/linux/ide.h
===================================================================
--- linux.orig/include/linux/ide.h
+++ linux/include/linux/ide.h
@@ -18,6 +18,7 @@
 #include <linux/bio.h>
 #include <linux/device.h>
 #include <linux/pci.h>
+#include <linux/completion.h>
 #include <asm/byteorder.h>
 #include <asm/system.h>
 #include <asm/io.h>
@@ -754,7 +755,7 @@ typedef struct ide_drive_s {
 	int		crc_count;	/* crc counter to reduce drive speed */
 	struct list_head list;
 	struct device	gendev;
-	struct semaphore gendev_rel_sem;	/* to deal with device release() */
+	struct completion gendev_rel_comp;	/* to deal with device release() */
 } ide_drive_t;
 
 #define to_ide_device(dev)container_of(dev, ide_drive_t, gendev)
@@ -910,7 +911,7 @@ typedef struct hwif_s {
 	unsigned	sg_mapped  : 1;	/* sg_table and sg_nents are ready */
 
 	struct device	gendev;
-	struct semaphore gendev_rel_sem; /* To deal with device release() */
+	struct completion gendev_rel_comp; /* To deal with device release() */
 
 	void		*hwif_data;	/* extra hwif data */
 
@@ -1490,7 +1491,7 @@ extern struct semaphore ide_cfg_sem;
  * ide_drive_t->hwif: constant, no locking
  */
 
-#define local_irq_set(flags)	do { local_save_flags((flags)); local_irq_enable(); } while (0)
+#define local_irq_set(flags)	do { local_save_flags((flags)); local_irq_enable_nort(); } while (0)
 
 extern struct bus_type ide_bus_type;
 
Index: linux/include/linux/idr.h
===================================================================
--- linux.orig/include/linux/idr.h
+++ linux/include/linux/idr.h
@@ -62,7 +62,7 @@ struct idr {
 	.id_free	= NULL,					\
 	.layers 	= 0,					\
 	.id_free_cnt	= 0,					\
-	.lock		= SPIN_LOCK_UNLOCKED,			\
+	.lock		= SPIN_LOCK_UNLOCKED(name.lock),	\
 }
 #define DEFINE_IDR(name)	struct idr name = IDR_INIT(name)
 
Index: linux/include/linux/init_task.h
===================================================================
--- linux.orig/include/linux/init_task.h
+++ linux/include/linux/init_task.h
@@ -2,6 +2,7 @@
 #define _LINUX__INIT_TASK_H
 
 #include <linux/file.h>
+#include <linux/fs_struct.h>
 #include <linux/rcupdate.h>
 
 #define INIT_FDTABLE \
@@ -17,10 +18,10 @@
 	.next		= NULL,		 		\
 }
 
-#define INIT_FILES \
+#define INIT_FILES(name) \
 { 							\
 	.count		= ATOMIC_INIT(1), 		\
-	.file_lock	= SPIN_LOCK_UNLOCKED, 		\
+	.file_lock	= SPIN_LOCK_UNLOCKED(name.file_lock), \
 	.fdt		= &init_files.fdtab, 		\
 	.fdtab		= INIT_FDTABLE,			\
 	.close_on_exec_init = { { 0, } }, 		\
@@ -36,7 +37,7 @@
 	.user_id	= 0,				\
 	.next		= NULL,				\
 	.wait		= __WAIT_QUEUE_HEAD_INITIALIZER(name.wait), \
-	.ctx_lock	= SPIN_LOCK_UNLOCKED,		\
+	.ctx_lock	= SPIN_LOCK_UNLOCKED(name.ctx_lock),	\
 	.reqs_active	= 0U,				\
 	.max_reqs	= ~0U,				\
 }
@@ -48,7 +49,7 @@
 	.mm_users	= ATOMIC_INIT(2), 			\
 	.mm_count	= ATOMIC_INIT(1), 			\
 	.mmap_sem	= __RWSEM_INITIALIZER(name.mmap_sem),	\
-	.page_table_lock =  SPIN_LOCK_UNLOCKED, 		\
+	.page_table_lock = SPIN_LOCK_UNLOCKED(name.page_table_lock), \
 	.mmlist		= LIST_HEAD_INIT(name.mmlist),		\
 	.cpu_vm_mask	= CPU_MASK_ALL,				\
 	.default_kioctx = INIT_KIOCTX(name.default_kioctx, name),	\
@@ -68,7 +69,7 @@
 #define INIT_SIGHAND(sighand) {						\
 	.count		= ATOMIC_INIT(1), 				\
 	.action		= { { { .sa_handler = NULL, } }, },		\
-	.siglock	= SPIN_LOCK_UNLOCKED, 				\
+	.siglock	= SPIN_LOCK_UNLOCKED(sighand.siglock),		\
 }
 
 extern struct group_info init_groups;
@@ -86,6 +87,7 @@ extern struct group_info init_groups;
 	.lock_depth	= -1,						\
 	.prio		= MAX_PRIO-20,					\
 	.static_prio	= MAX_PRIO-20,					\
+	.normal_prio	= MAX_PRIO-20,					\
 	.policy		= SCHED_NORMAL,					\
 	.cpus_allowed	= CPU_MASK_ALL,					\
 	.mm		= NULL,						\
@@ -117,8 +119,11 @@ extern struct group_info init_groups;
 		.list = LIST_HEAD_INIT(tsk.pending.list),		\
 		.signal = {{0}}},					\
 	.blocked	= {{0}},					\
-	.alloc_lock	= SPIN_LOCK_UNLOCKED,				\
-	.proc_lock	= SPIN_LOCK_UNLOCKED,				\
+	.alloc_lock	= SPIN_LOCK_UNLOCKED(tsk.alloc_lock),		\
+	.proc_lock	= SPIN_LOCK_UNLOCKED(tsk.proc_lock),		\
+	.delayed_put	= LIST_HEAD_INIT(tsk.delayed_put),		\
+	.pi_waiters	= PLIST_INIT(tsk.pi_waiters, MAX_PRIO),		\
+	.pi_lock	= RAW_SPIN_LOCK_UNLOCKED,			\
 	.journal_info	= NULL,						\
 	.cpu_timers	= INIT_CPU_TIMERS(tsk.cpu_timers),		\
 	.fs_excl	= ATOMIC_INIT(0),				\
Index: linux/include/linux/interrupt.h
===================================================================
--- linux.orig/include/linux/interrupt.h
+++ linux/include/linux/interrupt.h
@@ -41,7 +41,7 @@ struct irqaction {
 	void *dev_id;
 	struct irqaction *next;
 	int irq;
-	struct proc_dir_entry *dir;
+	struct proc_dir_entry *dir, *threaded;
 };
 
 extern irqreturn_t no_action(int cpl, void *dev_id, struct pt_regs *regs);
@@ -59,6 +59,7 @@ extern void enable_irq(unsigned int irq)
 
 #ifndef __ARCH_SET_SOFTIRQ_PENDING
 #define set_softirq_pending(x) (local_softirq_pending() = (x))
+/* FIXME: PREEMPT_RT: set_bit()? */
 #define or_softirq_pending(x)  (local_softirq_pending() |= (x))
 #endif
 
@@ -91,13 +92,18 @@ static inline void __deprecated save_and
 #define save_and_cli(x)	save_and_cli(&x)
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_PREEMPT_RT
+# define local_bh_disable() do { } while (0)
+# define local_bh_enable() do { } while (0)
+# define __local_bh_enable() do { } while (0)
+#else
 /* SoftIRQ primitives.  */
-#define local_bh_disable() \
+# define local_bh_disable() \
 		do { add_preempt_count(SOFTIRQ_OFFSET); barrier(); } while (0)
-#define __local_bh_enable() \
+# define __local_bh_enable() \
 		do { barrier(); sub_preempt_count(SOFTIRQ_OFFSET); } while (0)
-
-extern void local_bh_enable(void);
+  extern void local_bh_enable(void);
+#endif
 
 /* PLEASE, avoid to allocate new softirqs, if you need not _really_ high
    frequency threaded job scheduling. For almost all the purposes
@@ -112,7 +118,12 @@ enum
 	NET_TX_SOFTIRQ,
 	NET_RX_SOFTIRQ,
 	SCSI_SOFTIRQ,
-	TASKLET_SOFTIRQ
+	TASKLET_SOFTIRQ,
+#ifdef CONFIG_HIGH_RES_TIMERS
+	KTIMER_SOFTIRQ,
+#endif
+	/* Entries after this are ignored in the split softirq mode */
+	MAX_SOFTIRQ,
 };
 
 /* softirq mask and active fields moved to irq_cpustat_t in
@@ -131,6 +142,7 @@ extern void softirq_init(void);
 #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
 extern void FASTCALL(raise_softirq_irqoff(unsigned int nr));
 extern void FASTCALL(raise_softirq(unsigned int nr));
+extern void wakeup_irqd(void);
 
 
 /* Tasklets --- multithreaded analogue of BHs.
@@ -243,6 +255,7 @@ extern void tasklet_kill(struct tasklet_
 extern void tasklet_kill_immediate(struct tasklet_struct *t, unsigned int cpu);
 extern void tasklet_init(struct tasklet_struct *t,
 			 void (*func)(unsigned long), unsigned long data);
+void takeover_tasklets(unsigned int cpu);
 
 /*
  * Autoprobing for irqs:
@@ -291,4 +304,33 @@ extern int probe_irq_off(unsigned long);
 extern unsigned int probe_irq_mask(unsigned long);	/* returns mask of ISA interrupts */
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+# define local_irq_disable_nort()	do { BUG_ON(in_interrupt()); } while (0)
+# define local_irq_enable_nort()	do { BUG_ON(in_interrupt()); } while (0)
+# define local_irq_save_nort(flags)	do { local_save_flags(flags); WARN_ON(in_interrupt()); } while (0)
+# define local_irq_restore_nort(flags)	do { (void)(flags); WARN_ON(in_interrupt()); } while (0)
+# define spin_lock_nort(lock)		do { } while (0)
+# define spin_unlock_nort(lock)		do { } while (0)
+# define spin_lock_bh_nort(lock)	do { } while (0)
+# define spin_unlock_bh_nort(lock)	do { } while (0)
+# define spin_lock_rt(lock)		spin_lock(lock)
+# define spin_unlock_rt(lock)		spin_unlock(lock)
+# define smp_processor_id_rt(cpu)	(cpu)
+# define in_atomic_rt()			(!oops_in_progress && \
+					  (in_atomic() || irqs_disabled()))
+#else
+# define local_irq_disable_nort()	local_irq_disable()
+# define local_irq_enable_nort()	local_irq_enable()
+# define local_irq_save_nort(flags)	local_irq_save(flags)
+# define local_irq_restore_nort(flags)	local_irq_restore(flags)
+# define spin_lock_rt(lock)		do { } while (0)
+# define spin_unlock_rt(lock)		do { } while (0)
+# define spin_lock_nort(lock)		spin_lock(lock)
+# define spin_unlock_nort(lock)		spin_unlock(lock)
+# define spin_lock_bh_nort(lock)	spin_lock_bh(lock)
+# define spin_unlock_bh_nort(lock)	spin_unlock_bh(lock)
+# define smp_processor_id_rt(cpu)	smp_processor_id()
+# define in_atomic_rt()			0
+#endif
+
 #endif
Index: linux/include/linux/irq.h
===================================================================
--- linux.orig/include/linux/irq.h
+++ linux/include/linux/irq.h
@@ -1,14 +1,6 @@
 #ifndef __irq_h
 #define __irq_h
 
-/*
- * Please do not include this file in generic code.  There is currently
- * no requirement for any architecture to implement anything held
- * within this file.
- *
- * Thanks. --rmk
- */
-
 #include <linux/config.h>
 
 #if !defined(CONFIG_ARCH_S390)
@@ -17,9 +9,11 @@
 #include <linux/cache.h>
 #include <linux/spinlock.h>
 #include <linux/cpumask.h>
+#include <linux/wait.h>
 
 #include <asm/irq.h>
 #include <asm/ptrace.h>
+#include <asm/timex.h>
 
 /*
  * IRQ line status.
@@ -39,48 +33,155 @@
 # define CHECK_IRQ_PER_CPU(var) 0
 #endif
 
+#define IRQ_NOPROBE	512	/* IRQ is not valid for probing */
+#define IRQ_NOREQUEST	1024	/* IRQ cannot be requested */
+
+#define IRQ_NODELAY	2048     /* IRQ must run immediately */
+
 /*
- * Interrupt controller descriptor. This is all we need
- * to describe about the low-level hardware. 
+ * Not used on any of the architectures, but feel free to provide
+ * your own per-arch one:
  */
-struct hw_interrupt_type {
-	const char * typename;
-	unsigned int (*startup)(unsigned int irq);
-	void (*shutdown)(unsigned int irq);
-	void (*enable)(unsigned int irq);
-	void (*disable)(unsigned int irq);
-	void (*ack)(unsigned int irq);
-	void (*end)(unsigned int irq);
-	void (*set_affinity)(unsigned int irq, cpumask_t dest);
+#ifndef SA_NODELAY
+# define SA_NODELAY 0x01000000
+#endif
+
+/*
+ * IRQ types
+ */
+#define IRQ_TYPE_NONE		0x0000		/* Default, unspecified type */
+#define IRQ_TYPE_EDGEL		0x0001		/* Edge low/falling type */
+#define IRQ_TYPE_EDGEH		0x0002		/* Edge high/rising type */
+#define IRQ_TYPE_EDGEB \
+	(IRQ_TYPE_EDGEL | IRQ_TYPE_EDGEH)	/* Edge low+high/both type */
+#define IRQ_TYPE_LEVELL		0x0004		/* Level low type */
+#define IRQ_TYPE_LEVELH		0x0008		/* Level high type */
+#define IRQ_TYPE_SIMPLE		0x0010		/* Simple type */
+
+
+/*
+ * IRQ wakeup control modes
+ */
+#define IRQ_WAKE_NORESUME	0x0000	/* Do not resume on this irq */
+#define IRQ_WAKE_RESUME		0x0001	/* Enable resume on this irq */
+
+/**
+ * struct irq_chip - Low level interrupt controller hardware descriptor
+ *
+ * @ack:	acknowledge IRQ
+ * @mask:	mask the IRQ
+ * @mask_ack:	acknowledge and mask the IRQ
+ * @unmask:	unmask the IRQ
+ * @retrigger:	retrigger the IRQ in hardware, if possible. Return 0 on success.
+ * @set_type:	set the IRQ type (level, edge[high,low,both])
+ * @set_wake:	Set the IRQ PM-wakeup function
+ * @options:	option field to store type, wake information
+ * @lock:	locking for SMP
+ * @chip_data:	platform-specific private data for the chip
+ */
+struct irq_chip {
+	spinlock_t	lock;
+	void		(*ack)(unsigned int irq);
+	void		(*mask)(unsigned int irq);
+	void		(*mask_ack)(unsigned int irq);
+	void		(*unmask)(unsigned int irq);
+	int		(*retrigger)(unsigned int irq);
+	int		(*set_type)(unsigned int irq, unsigned int hw_type);
+	int		(*set_wake)(unsigned int irq, unsigned int mode);
+	unsigned long	options;
+	void		*chip_data;
+};
+
+struct irq_desc;
+struct irq_type;
+
+/**
+ * struct irq_type - high level hardware interrupt type descriptor
+ *
+ * @typename:		name for /proc/interrupts
+ * @startup:		start up the interrupt (defaults to ->enable if NULL)
+ * @shutdown:		shut down the interrupt (defaults to ->disable if NULL)
+ * @enable:		enable the interrupt (defaults to chip->unmask if NULL)
+ * @disable:		disable the interrupt (defaults to chip->mask if NULL)
+ * @handle_irq:		irq flow handler called from the arch IRQ glue code
+ * @ack:		start of new interrupt.	(Note: This will be renamed to 'start')
+ * @hold:		same interrupt while the handler is running
+ * @end:		end of interrupt
+ * @set_affinity:	set the CPU affinity on SMP machines
+ * @set_type:		set the interrupt type (level, edge[high,low,both]),
+ *			returns a pointer to the irq_type structure which can
+ *			handle the requested type or NULL, if the type cannot
+ *			be handled.
+ */
+struct irq_type {
+	const char	*typename;
+	unsigned int 	(*startup)(unsigned int irq);
+	void		(*shutdown)(unsigned int irq);
+	void		(*enable)(unsigned int irq);
+	void		(*disable)(unsigned int irq);
+
+	void		(*handle_irq)(unsigned int irq, struct irq_desc *desc,
+				      struct pt_regs *regs);
+
+			/* (*start) Will be renamed */
+	void		(*ack)(unsigned int irq);
+	void		(*hold)(unsigned int irq);
+	void		(*end)(unsigned int irq);
+	void		(*set_affinity)(unsigned int irq, cpumask_t dest);
+	struct irq_type *(*set_type)(unsigned int irq, unsigned int type);
 	/* Currently used only by UML, might disappear one day.*/
 #ifdef CONFIG_IRQ_RELEASE_METHOD
 	void (*release)(unsigned int irq, void *dev_id);
 #endif
 };
 
-typedef struct hw_interrupt_type  hw_irq_controller;
-
-/*
- * This is the "IRQ descriptor", which contains various information
- * about the irq, including what kind of hardware handling it has,
- * whether it is disabled etc etc.
+/**
+ * struct irq_desc - interrupt descriptor
+ *
+ * @handler:		interrupt type dependent handler functions,
+ * 			(this should be renamed to 'type')
+ * @handler_data:	data for the type handlers
+ * @chip:		low level hardware access functions - comes from type
+ * @action:		the irq action chain
+ * @status:		status information
+ * @depth:		disable-depth, for nested irq_disable() calls
+ * @irq_count:		stats field to detect stalled irqs
+ * @irqs_unhandled:	stats field for spurious unhandled interrupts
+ * @thread:		Thread pointer for threaded preemptible irq handling
+ * @wait_for_handler:	Waitqueue to wait for a running preemptible handler
+ * @lock:		locking for SMP
+ * @move_irq:		Flag need to re-target interrupt destination
  *
  * Pad this out to 32 bytes for cache and indexing reasons.
  */
 typedef struct irq_desc {
-	hw_irq_controller *handler;
-	void *handler_data;
-	struct irqaction *action;	/* IRQ action list */
-	unsigned int status;		/* IRQ status */
-	unsigned int depth;		/* nested irq disables */
-	unsigned int irq_count;		/* For detecting broken interrupts */
-	unsigned int irqs_unhandled;
-	spinlock_t lock;
+	struct irq_type		*handler;
+	void			*handler_data;
+	struct irq_chip		*chip;
+	struct irqaction	*action;
+	unsigned int		status;
+				
+	unsigned int		depth;
+	unsigned int		irq_count;
+	unsigned int		irqs_unhandled;
+ 	struct task_struct	*thread;
+ 	wait_queue_head_t	wait_for_handler;
+	raw_spinlock_t		lock;
 #if defined (CONFIG_GENERIC_PENDING_IRQ) || defined (CONFIG_IRQBALANCE)
-	unsigned int move_irq;		/* Flag need to re-target intr dest*/
+	unsigned int		move_irq;
 #endif
 } ____cacheline_aligned irq_desc_t;
 
+
+/*
+ * Migration helpers for obsolete names, they will go away:
+ */
+#define irqdesc			irq_desc
+#define irqchip			irq_chip
+#define hw_interrupt_type	irq_type
+#define set_irq_type		set_hwirq_type
+typedef struct irq_type hw_irq_controller;
+
 extern irq_desc_t irq_desc [NR_IRQS];
 
 /* Return a pointer to the irq descriptor for IRQ.  */
@@ -210,16 +311,87 @@ static inline void set_irq_info(int irq,
 #endif // CONFIG_SMP
 
 extern int no_irq_affinity;
-extern int noirqdebug_setup(char *str);
 
+/* Handle irq action chains */
 extern fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
-					struct irqaction *action);
+				       struct irqaction *action);
+
+/*
+ * Built-in IRQ handlers for various IRQ types,
+ * callable via desc->handler->handle_irq()
+ */
+extern void handle_level_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+extern void handle_edge_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+extern void handle_simple_irq(unsigned int irq, struct irq_desc *desc,  struct pt_regs *regs);
+extern void handle_percpu_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+extern void handle_bad_irq(unsigned int irq, struct irq_desc *desc, struct pt_regs *regs);
+
+#define desc_handle_irq(irq, desc, regs)		\
+do {							\
+	spin_lock(&(desc)->lock);			\
+	(desc)->handler->handle_irq(irq, (desc), regs);	\
+	spin_unlock(&(desc)->lock);			\
+} while(0)
+
+/* Monolithic do_IRQ implementation */
 extern fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs);
+
+/* Handling of unhandled and spurious interrupts */
 extern void note_interrupt(unsigned int irq, irq_desc_t *desc,
 					int action_ret, struct pt_regs *regs);
-extern int can_request_irq(unsigned int irq, unsigned long irqflags);
 
+/* Resending of interrupts */
+void check_irq_resend(irq_desc_t *desc, unsigned int irq);
+
+/* Proc filesystem */
 extern void init_irq_proc(void);
+
+/* Enable/disable irq debugging output */
+extern int noirqdebug_setup(char *str);
+
+/* Set/get irq type */
+extern int set_irq_type(unsigned int irq, unsigned int type);
+extern int get_irq_type(unsigned int irq, unsigned int type);
+
+/* Irq wakeup (PM) control */
+extern int set_irq_wake(unsigned int irq, unsigned int mode);
+#define enable_irq_wake(irq) set_irq_wake(irq, IRQ_WAKE_RESUME)
+#define disable_irq_wake(irq) set_irq_wake(irq, IRQ_WAKE_NORESUME)
+
+/* Checks whether the interrupt can be requested by request_irq() */
+extern int can_request_irq(unsigned int irq, unsigned long irqflags);
+
+/* Set type control/chip/data for an interrupt */
+extern int generic_set_irq_type(unsigned int irq, struct irq_type *type);
+extern int set_irq_data(unsigned int irq, void *data);
+extern int set_irq_chip(unsigned int irq, struct irq_chip *chip);
+extern int set_irq_chip_data(unsigned int irq, void *data);
+
+/* Get chip/data for an interrupt */
+#define get_irq_chip(irq) (irq_desc[irq].chip)
+#define get_irq_chip_data(irq) (irq_desc[irq].chip->chip_data)
+
+/* Interrupt type default implementations */
+extern struct irq_type no_irq_type;
+extern struct irq_type default_edge_type;
+extern struct irq_type default_level_type;
+extern struct irq_type default_simple_type;
+extern struct irq_type default_percpu_type;
+
+/* Early initialization of irqs */
+extern void early_init_hardirqs(void);
+
+#if defined(CONFIG_PREEMPT_HARDIRQS)
+extern void init_hardirqs(void);
+#else
+static inline void init_hardirqs(void) { }
+#endif
+
+#else	/* GENERIC HARDIRQS */
+
+static inline void early_init_hardirqs(void) { }
+static inline void init_hardirqs(void) { }
+
 #endif
 
 extern hw_irq_controller no_irq_type;  /* needed in every arch ? */
Index: linux/include/linux/jbd.h
===================================================================
--- linux.orig/include/linux/jbd.h
+++ linux/include/linux/jbd.h
@@ -272,6 +272,15 @@ void buffer_assertion_failure(struct buf
 #define J_ASSERT(assert)	do { } while (0)
 #endif		/* JBD_ASSERTIONS */
 
+/*
+ * For assertions that are only valid on SMP (e.g. spin_is_locked()):
+ */
+#ifdef CONFIG_SMP
+# define J_ASSERT_JH_SMP(jh, expr)	J_ASSERT_JH(jh, expr)
+#else
+# define J_ASSERT_JH_SMP(jh, assert)	do { } while (0)
+#endif
+
 #if defined(JBD_PARANOID_IOFAIL)
 #define J_EXPECT(expr, why...)		J_ASSERT(expr)
 #define J_EXPECT_BH(bh, expr, why...)	J_ASSERT_BH(bh, expr)
@@ -327,32 +336,32 @@ static inline struct journal_head *bh2jh
 
 static inline void jbd_lock_bh_state(struct buffer_head *bh)
 {
-	bit_spin_lock(BH_State, &bh->b_state);
+	spin_lock(&bh->b_state_lock);
 }
 
 static inline int jbd_trylock_bh_state(struct buffer_head *bh)
 {
-	return bit_spin_trylock(BH_State, &bh->b_state);
+	return spin_trylock(&bh->b_state_lock);
 }
 
 static inline int jbd_is_locked_bh_state(struct buffer_head *bh)
 {
-	return bit_spin_is_locked(BH_State, &bh->b_state);
+	return spin_is_locked(&bh->b_state_lock);
 }
 
 static inline void jbd_unlock_bh_state(struct buffer_head *bh)
 {
-	bit_spin_unlock(BH_State, &bh->b_state);
+	spin_unlock(&bh->b_state_lock);
 }
 
 static inline void jbd_lock_bh_journal_head(struct buffer_head *bh)
 {
-	bit_spin_lock(BH_JournalHead, &bh->b_state);
+	spin_lock(&bh->b_uptodate_lock);
 }
 
 static inline void jbd_unlock_bh_journal_head(struct buffer_head *bh)
 {
-	bit_spin_unlock(BH_JournalHead, &bh->b_state);
+	spin_unlock(&bh->b_uptodate_lock);
 }
 
 struct jbd_revoke_table_s;
Index: linux/include/linux/jffs2_fs_i.h
===================================================================
--- linux.orig/include/linux/jffs2_fs_i.h
+++ linux/include/linux/jffs2_fs_i.h
@@ -14,7 +14,15 @@ struct jffs2_inode_info {
 	   before letting GC proceed. Or we'd have to put ugliness
 	   into the GC code so it didn't attempt to obtain the i_sem
 	   for the inode(s) which are already locked */
-	struct semaphore sem;
+	/*
+	 * (On PREEMPT_RT: while use of ei->sem is mostly mutex-alike, the
+	 * SLAB cache keeps the semaphore locked, which breaks the strict
+	 * "owner must exist" properties of rt_mutexes. Fix it the easy
+	 * way: by going to a compat_semaphore. But the real fix would be
+	 * to cache inodes in an unlocked state and lock them when
+	 * allocating a new inode.)
+	 */
+	struct compat_semaphore sem;
 
 	/* The highest (datanode) version number used for this ino */
 	uint32_t highest_version;
Index: linux/include/linux/jffs2_fs_sb.h
===================================================================
--- linux.orig/include/linux/jffs2_fs_sb.h
+++ linux/include/linux/jffs2_fs_sb.h
@@ -35,7 +35,7 @@ struct jffs2_sb_info {
 	struct completion gc_thread_start; /* GC thread start completion */
 	struct completion gc_thread_exit; /* GC thread exit completion port */
 
-	struct semaphore alloc_sem;	/* Used to protect all the following 
+	struct compat_semaphore alloc_sem; /* Used to protect all the following
 					   fields, and also to protect against
 					   out-of-order writing of nodes. And GC. */
 	uint32_t cleanmarker_size;	/* Size of an _inline_ CLEANMARKER
@@ -93,7 +93,7 @@ struct jffs2_sb_info {
 	/* Sem to allow jffs2_garbage_collect_deletion_dirent to
 	   drop the erase_completion_lock while it's holding a pointer 
 	   to an obsoleted node. I don't like this. Alternatives welcomed. */
-	struct semaphore erase_free_sem;
+	struct compat_semaphore erase_free_sem;
 
 #ifdef CONFIG_JFFS2_FS_WRITEBUFFER
 	/* Write-behind buffer for NAND flash */
@@ -103,7 +103,7 @@ struct jffs2_sb_info {
 	uint32_t wbuf_pagesize;
 	struct jffs2_inodirty *wbuf_inodes;
 
-	struct rw_semaphore wbuf_sem;	/* Protects the write buffer */
+	struct compat_rw_semaphore wbuf_sem;	/* Protects the write buffer */
 
 	/* Information about out-of-band area usage... */
 	struct nand_oobinfo *oobinfo;
Index: linux/include/linux/jiffies.h
===================================================================
--- linux.orig/include/linux/jiffies.h
+++ linux/include/linux/jiffies.h
@@ -1,21 +1,12 @@
 #ifndef _LINUX_JIFFIES_H
 #define _LINUX_JIFFIES_H
 
+#include <linux/calc64.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/time.h>
 #include <linux/timex.h>
 #include <asm/param.h>			/* for HZ */
-#include <asm/div64.h>
-
-#ifndef div_long_long_rem
-#define div_long_long_rem(dividend,divisor,remainder) \
-({							\
-	u64 result = dividend;				\
-	*remainder = do_div(result,divisor);		\
-	result;						\
-})
-#endif
 
 /*
  * The following defines establish the engineering parameters of the PLL
Index: linux/include/linux/kernel.h
===================================================================
--- linux.orig/include/linux/kernel.h
+++ linux/include/linux/kernel.h
@@ -65,7 +65,7 @@ extern int cond_resched(void);
 # define might_resched() do { } while (0)
 #endif
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
   void __might_sleep(char *file, int line);
 # define might_sleep() \
 	do { __might_sleep(__FILE__, __LINE__); might_resched(); } while (0)
@@ -137,6 +137,12 @@ static inline int printk(const char *s, 
 static inline int printk(const char *s, ...) { return 0; }
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+extern void zap_rt_locks(void);
+#else
+# define zap_rt_locks() do { } while (0)
+#endif
+
 unsigned long int_sqrt(unsigned long);
 
 static inline int __attribute_pure__ long_log2(unsigned long x)
@@ -177,6 +183,7 @@ extern void add_taint(unsigned);
 /* Values used for system_state */
 extern enum system_states {
 	SYSTEM_BOOTING,
+	SYSTEM_BOOTING_SCHEDULER_OK,
 	SYSTEM_RUNNING,
 	SYSTEM_HALT,
 	SYSTEM_POWER_OFF,
Index: linux/include/linux/kernel_stat.h
===================================================================
--- linux.orig/include/linux/kernel_stat.h
+++ linux/include/linux/kernel_stat.h
@@ -6,6 +6,7 @@
 #include <linux/smp.h>
 #include <linux/threads.h>
 #include <linux/percpu.h>
+#include <linux/cpumask.h>
 #include <asm/cputime.h>
 
 /*
@@ -43,11 +44,10 @@ extern unsigned long long nr_context_swi
  */
 static inline int kstat_irqs(int irq)
 {
-	int i, sum=0;
+	int i, sum = 0;
 
-	for (i = 0; i < NR_CPUS; i++)
-		if (cpu_possible(i))
-			sum += kstat_cpu(i).irqs[irq];
+	for_each_cpu(i)
+		sum += kstat_cpu(i).irqs[irq];
 
 	return sum;
 }
Index: linux/include/linux/ktime.h
===================================================================
--- /dev/null
+++ linux/include/linux/ktime.h
@@ -0,0 +1,390 @@
+/*
+ *  include/linux/ktime.h
+ *
+ *  ktime_t - nanosecond-resolution time format.
+ *
+ *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *
+ *  data type definitions, declarations, prototypes and macros.
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_KTIME_H
+#define _LINUX_KTIME_H
+
+#include <linux/jiffies.h>
+
+/*
+ * ktime_t:
+ *
+ * On 64-bit CPUs a single 64-bit variable is used to store the ktimers
+ * internal representation of time values in scalar nanoseconds. The
+ * design plays out best on 64-bit CPUs, where most conversions are
+ * NOPs and most arithmetic ktime_t operations are plain arithmetic
+ * operations.
+ *
+ * On 32-bit CPUs an optimized representation of the timespec structure
+ * is used to avoid expensive conversions from and to timespecs. The
+ * endian-aware order of the tv struct members is chosen to allow
+ * mathematical operations on the tv64 member of the union too, which
+ * for certain operations produces better code.
+ *
+ * For architectures with efficient support for 64/32-bit conversions the
+ * plain scalar nanosecond based representation can be selected by the
+ * config switch CONFIG_KTIME_SCALAR.
+ */
+
+#define KTIME_ZERO			0
+#define KTIME_MAX			(~((u64)1 << 63))
+
+/*
+ * ktime_t definitions when using the 64-bit scalar representation:
+ */
+
+#if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+
+typedef s64 ktime_t;
+
+/* Define a ktime_t variable and initialize it to zero: */
+#define DEFINE_KTIME(kt)		ktime_t kt = 0
+
+/*
+ * Compare two ktime_t variables. The comparison operator is
+ * given as a literal in the macro call (e.g. <, >, ==):
+ *
+ * ( E.g. "ktime_cmp(t1, <, t2)" is still more readable to programmers
+ *   than ktime_before()/ktime_after() would be. )
+ */
+#define ktime_cmp(a, op, b)		((a) op (b))
+
+/*
+ * Compare a ktime_t variable and a constant. The comparison operator is
+ * given as a literal in the macro call (e.g. <, >, ==):
+ */
+#define ktime_cmp_val(a, op, b)		((a) op (b))
+
+/**
+ * ktime_set - Set a ktime_t variable from a seconds/nanoseconds value
+ *
+ * @secs:	seconds to set
+ * @nsecs:	nanoseconds to set
+ *
+ * Return the ktime_t representation of the value
+ */
+#define ktime_set(sec, nsec)	(((s64)(sec) * NSEC_PER_SEC) + (s64)(nsec))
+
+/*
+ * Set a ktime_t variable to a value in a scalar nanosecond representation
+ *
+ * NOTE: use only with KTIME_ZERO or KTIME_MAX to maintain compatibility
+ * with the union type version.
+ */
+#define ktime_set_scalar(kt, s)		(kt) = (s)
+
+/*
+ * The following 3 macros are used for the nanosleep restart handling
+ * to store the "low" and "high" part of a 64-bit ktime variable.
+ * (on 32-bit CPUs the restart block has 32-bit fields, so we have to
+ *  split the 64-bit value up into two pieces)
+ *
+ * In the scalar representation we have to split up the 64-bit scalar:
+ */
+
+/* Set the "low" and "high" part of a ktime_t variable: */
+#define ktime_set_low_high(l, h)	((s64)((u64)(l)) | (((s64)(h)) << 32))
+
+/* Get the "low" part of a ktime_t variable: */
+#define ktime_get_low(kt)		((kt) & 0xFFFFFFFF)
+
+/* Get the "high" part of a ktime_t variable: */
+#define ktime_get_high(kt)		((kt) >> 32)
+
+/* Subtract two ktime_t variables. res = lhs - rhs: */
+#define ktime_sub(lhs, rhs)		((lhs) - (rhs))
+
+/* Add two ktime_t variables. res = lhs + rhs: */
+#define ktime_add(lhs, rhs)		((lhs) + (rhs))
+
+/*
+ * Add a ktime_t variable and a scalar nanosecond value.
+ * res = kt + nsval:
+ */
+#define ktime_add_ns(kt, nsval)		((kt) + (nsval))
+
+/* convert a timespec to ktime_t format: */
+#define timespec_to_ktime(ts)		ktime_set((ts).tv_sec, (ts).tv_nsec)
+
+/* convert a timeval to ktime_t format: */
+#define timeval_to_ktime(tv)		ktime_set((tv).tv_sec, (tv).tv_usec * 1000)
+
+/* Map the ktime_t to timespec conversion to ns_to_timespec function */
+#define ktime_to_timespec(ts, kt)	ns_to_timespec(ts, kt)
+
+/* Map the ktime_t to timeval conversion to ns_to_timeval function */
+#define ktime_to_timeval(tv, kt)	ns_to_timeval(tv, kt)
+
+/* Map the ktime_t to clock_t conversion to the inline in jiffies.h: */
+#define ktime_to_clock_t(kt)		nsec_to_clock_t(kt)
+
+/* Convert ktime_t to nanoseconds - NOP in the scalar storage format: */
+#define ktime_to_ns(kt)			(kt)
+
+#if (BITS_PER_LONG == 64)
+/*
+ * Calc ktime_t modulo div (both arguments are parenthesized on expansion).
+ * div is less than NSEC_PER_SEC and (NSEC_PER_SEC % div) = 0 !
+ */
+#define ktime_modulo(kt, div)		((unsigned long)((kt) % (div)))
+#endif
+
+#else
+
+/*
+ * Helper macros/inlines to get the ktime_t math right in the timespec
+ * representation. The macros are sometimes ugly - their actual use is
+ * pretty okay-ish, given the circumstances. We do all this for
+ * performance reasons. The pure scalar nsec_t based code was nice and
+ * simple, but created too many 64-bit / 32-bit conversions and divisions.
+ *
+ * Be especially aware that negative values are represented in a way
+ * that the tv.sec field is negative and the tv.nsec field is greater
+ * or equal to zero but less than nanoseconds per second. This is the
+ * same representation which is used by timespecs.
+ *
+ *   tv.sec < 0 and 0 <= tv.nsec < NSEC_PER_SEC
+ */
+
+typedef union {
+	s64	tv64;
+	struct {
+#ifdef __BIG_ENDIAN
+	s32	sec, nsec;
+#else
+	s32	nsec, sec;
+#endif
+	} tv;
+} ktime_t;
+
+/* Define a ktime_t variable and initialize it to zero: */
+#define DEFINE_KTIME(kt)		ktime_t kt = { .tv64 = 0 }
+
+/*
+ * Compare two ktime_t variables. The comparison operator is
+ * given as a literal in the macro call (e.g. <, >, ==):
+ */
+#define ktime_cmp(a, op, b)		((a).tv64 op (b).tv64)
+
+/*
+ * Compare a ktime_t variable and a constant. The comparison operator is
+ * given as a literal in the macro call (e.g. <, >, ==):
+ */
+#define ktime_cmp_val(a, op, b)		((a).tv64 op (b))
+
+/* Set a ktime_t variable to a value in sec/nsec representation: */
+static inline ktime_t ktime_set(long secs, unsigned long nsecs)
+{
+	return (ktime_t) { .tv = { .sec = secs, .nsec = nsecs } };
+}
+
+/*
+ * Set the scalar value of a ktime variable (union type)
+ * NOTE: use only with KTIME_ZERO or KTIME_MAX!
+ */
+#define ktime_set_scalar(kt, s)		(kt).tv64 = (s)
+
+/*
+ * The following 3 macros are used for the nanosleep restart handling
+ * to store the "low" and "high" part of a 64-bit ktime variable.
+ * (on 32-bit CPUs the restart block has 32-bit fields, so we have to
+ *  split the 64-bit value up into two pieces)
+ *
+ * In the union type representation this is just storing and restoring
+ * the sec and nsec members of the tv structure:
+ */
+
+/* Set the "low" and "high" part of a ktime_t variable: */
+#define ktime_set_low_high(l, h)	ktime_set(h, l)
+
+/* Get the "low" part of a ktime_t variable: */
+#define ktime_get_low(kt)		(kt).tv.nsec
+
+/* Get the "high" part of a ktime_t variable: */
+#define ktime_get_high(kt)		(kt).tv.sec
+
+/**
+ * ktime_sub - subtract two ktime_t variables
+ *
+ * @lhs:	minuend
+ * @rhs:	subtrahend
+ *
+ * Returns the result of the subtraction (lhs - rhs)
+ */
+static inline ktime_t ktime_sub(ktime_t lhs, ktime_t rhs)
+{
+	ktime_t res;
+
+	res.tv64 = lhs.tv64 - rhs.tv64;
+	if (res.tv.nsec < 0)
+		res.tv.nsec += NSEC_PER_SEC;
+
+	return res;
+}
+
+/**
+ * ktime_add - add two ktime_t variables
+ *
+ * @add1:	addend1
+ * @add2:	addend2
+ *
+ * Returns the sum of addend1 and addend2
+ */
+static inline ktime_t ktime_add(ktime_t add1, ktime_t add2)
+{
+	ktime_t res;
+
+	res.tv64 = add1.tv64 + add2.tv64;
+	/*
+	 * performance trick: (u32)-NSEC_PER_SEC is 2^32 - NSEC_PER_SEC, so
+	 * adding it to tv64 subtracts NSEC_PER_SEC from the lower 32 bits
+	 * (nsec) and carries 1 into the upper 32 bits (sec).
+	 * it's equivalent to:
+	 *   tv.nsec -= NSEC_PER_SEC
+	 *   tv.sec ++;
+	 */
+	if (res.tv.nsec >= NSEC_PER_SEC)
+		res.tv64 += (u32)-NSEC_PER_SEC;
+
+	return res;
+}
+
+/**
+ * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
+ *
+ * @kt:		addend
+ * @nsec:	the scalar nsec value to add
+ *
+ * Returns the sum of kt and nsec in ktime_t format
+ */
+extern ktime_t ktime_add_ns(ktime_t kt, u64 nsec);
+
+/**
+ * timespec_to_ktime - convert a timespec to ktime_t format
+ *
+ * @ts:		the timespec variable to convert
+ *
+ * Returns a ktime_t variable with the converted timespec value
+ */
+static inline ktime_t timespec_to_ktime(struct timespec ts)
+{
+	return (ktime_t) { .tv = { .sec = (s32)ts.tv_sec,
+			   	   .nsec = (s32)ts.tv_nsec } };
+}
+
+/**
+ * timeval_to_ktime - convert a timeval to ktime_t format
+ *
+ * @tv:		the timeval variable to convert
+ *
+ * Returns a ktime_t variable with the converted timeval value
+ */
+static inline ktime_t timeval_to_ktime(struct timeval tv)
+{
+	return (ktime_t) { .tv = { .sec = (s32)tv.tv_sec,
+				   .nsec = (s32)tv.tv_usec * 1000 } };
+}
+
+/**
+ * ktime_to_timespec - convert a ktime_t variable to timespec format
+ *
+ * @ts:		pointer to timespec variable to store result
+ * @kt:		the ktime_t variable to convert
+ *
+ * Stores the timespec representation of the ktime value in
+ * the timespec variable pointed to by @ts
+ */
+static inline void ktime_to_timespec(struct timespec *ts, ktime_t kt)
+{
+	ts->tv_sec = (time_t) kt.tv.sec;
+	ts->tv_nsec = (long) kt.tv.nsec;
+}
+
+/**
+ * ktime_to_timeval - convert a ktime_t variable to timeval format
+ *
+ * @tv:		pointer to timeval variable to store result
+ * @kt:		the ktime_t variable to convert
+ *
+ * Stores the timeval representation of the ktime value in
+ * the timeval variable pointed to by @tv
+ */
+static inline void ktime_to_timeval(struct timeval *tv, ktime_t kt)
+{
+	tv->tv_sec = (time_t) kt.tv.sec;
+	tv->tv_usec = (suseconds_t) (kt.tv.nsec / NSEC_PER_USEC);
+}
+
+/**
+ * ktime_to_clock_t - convert a ktime_t variable to clock_t format
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns a clock_t variable with the converted value
+ */
+static inline clock_t ktime_to_clock_t(ktime_t kt)
+{
+	return nsec_to_clock_t( (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec);
+}
+
+/**
+ * ktime_to_ns - convert a ktime_t variable to scalar nanoseconds
+ * @kt:		the ktime_t variable to convert
+ *
+ * Returns the scalar nanoseconds representation of kt
+ */
+static inline u64 ktime_to_ns(ktime_t kt)
+{
+	return (u64) kt.tv.sec * NSEC_PER_SEC + kt.tv.nsec;
+}
+
+/*
+ * Calc ktime_t modulo div (both arguments are parenthesized on expansion).
+ * div is less than NSEC_PER_SEC and (NSEC_PER_SEC % div) = 0 !
+ */
+#define ktime_modulo(kt, div)		((unsigned long)(kt).tv.nsec % (div))
+
+#endif
+
+/*
+ * The resolution of the clocks. The resolution value is returned in
+ * the clock_getres() system call to give application programmers an
+ * idea of the (in)accuracy of timers. Timer values are rounded up to
+ * this resolution value.
+ */
+#define KTIME_LOW_RES		(NSEC_PER_SEC/HZ)
+
+#ifdef CONFIG_GENERIC_TIME
+
+#define ktime_get get_monotonic_clock
+#define ktime_get_real get_realtime_clock
+#define ktime_get_ts(ts) get_monotonic_clock_ts(ts)
+#define ktime_get_real_ts(ts) get_realtime_clock_ts(ts)
+
+#else /* CONFIG_GENERIC_TIME */
+
+/* Get the monotonic time in ktime_t format: */
+extern ktime_t ktime_get(void);
+
+/* Get the real (wall-) time in ktime_t format: */
+extern ktime_t ktime_get_real(void);
+
+/* Get the monotonic time in timespec format: */
+extern void ktime_get_ts(struct timespec *ts);
+
+/* Get the real (wall-) time in timespec format: */
+#define ktime_get_real_ts(ts)	getnstimeofday(ts)
+
+#endif /* !CONFIG_GENERIC_TIME */
+
+#endif
Index: linux/include/linux/ktimer.h
===================================================================
--- /dev/null
+++ linux/include/linux/ktimer.h
@@ -0,0 +1,222 @@
+/*
+ *  include/linux/ktimer.h
+ *
+ *  ktimers - high-precision kernel timers
+ *
+ *   Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ *   Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *
+ *  data type definitions, declarations, prototypes
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+#ifndef _LINUX_KTIMER_H
+#define _LINUX_KTIMER_H
+
+#include <linux/rbtree.h>
+#include <linux/ktime.h>
+#include <linux/init.h>
+#include <linux/list.h>
+#include <linux/wait.h>
+
+/*
+ * Mode arguments of xxx_ktimer functions:
+ */
+enum ktimer_rearm {
+	KTIMER_ABS = 1,	/* Time value is absolute */
+	KTIMER_REL,	/* Time value is relative to now */
+	KTIMER_INCR,	/* Time value is relative to previous expiry time */
+	KTIMER_FORWARD,	/* Timer is rearmed with value. Overruns accounted */
+	KTIMER_REARM,	/* Timer is rearmed with interval. Overruns accounted */
+	KTIMER_RESTART,	/* Timer is restarted with the stored expiry value */
+
+	/*
+	 * Expiry must not be checked when the timer is started:
+	 * (can be OR-ed with another above mode flag)
+	 */
+	KTIMER_NOCHECK = 0x10000,
+	/*
+	 * Rounding is required when the time is set up. That's an
+	 * optimization for relative timers as we read current time
+	 * in the enqueueing code so we do not need to read it twice.
+	 */
+	KTIMER_ROUND = 0x20000,
+
+	/* (used internally: no rearming) */
+	KTIMER_NOREARM = 0
+};
+
+/*
+ * Timer states:
+ */
+enum ktimer_state {
+	KTIMER_INACTIVE,	/* Timer is inactive */
+	KTIMER_PENDING,		/* Timer is pending */
+	KTIMER_EXPIRED,		/* Timer is expired and queued in the rbtree */
+	KTIMER_EXPIRED_NOQUEUE, /* Timer is expired and not queued in the rbtree */
+};
+
+struct ktimer_base;
+
+/**
+ * struct ktimer - the basic ktimer structure
+ *
+ * @node:	red black tree node for time ordered insertion
+ * @list:	list head for easier access to the time ordered list,
+ *		without walking the red black tree.
+ * @expires:	the absolute expiry time in the ktimers internal
+ *		representation. The time is related to the clock on
+ *		which the timer is based.
+ * @expired:	the absolute time when the timer expired. Used for
+ *		simplifying return path calculations and for debugging
+ *		purposes.
+ * @interval:	the timer interval for automatic rearming
+ * @overrun:	the number of intervals missed when rearming a timer
+ * @state:	state of the timer
+ * @function:	timer expiry callback function
+ * @data:	argument for the callback function
+ * @base:	pointer to the timer base (per cpu and per clock)
+ *
+ * Initialize the ktimer structure with ktimer_init() or ktimer_init_real()
+ */
+struct ktimer {
+	struct rb_node		node;
+	struct list_head	list;
+	ktime_t			expires;
+	ktime_t			expired;
+	int			expiry_mode;
+	ktime_t			interval;
+	int			overrun;
+	enum ktimer_state	state;
+	void			(*function)(void *);
+	void			*data;
+	struct ktimer_base	*base;
+	int			prio;
+};
+
+/**
+ * struct ktimer_base - the timer base for a specific clock
+ *
+ * @index:	clock type index for per_cpu support when moving a timer
+ *		to a base on another cpu.
+ * @lock:	lock protecting the base and associated timers
+ * @active:	red black tree root node for the active timers
+ * @pending:	list of pending timers for simple time ordered access
+ * @count:	the number of active timers
+ * @resolution:	the resolution of the clock, in nanoseconds
+ * @get_time:	function to retrieve the current time of the clock
+ * @curr_timer:	the timer which is executing a callback right now
+ * @wait:	waitqueue to wait for a currently running timer
+ * @name:	string identifier of the clock
+ */
+struct ktimer_base {
+	int			index;
+	raw_spinlock_t		lock;
+	struct rb_root		active;
+	struct list_head	pending;
+	int			count;
+	unsigned long		resolution;
+	ktime_t			(*get_time)(void);
+	struct ktimer		*curr_timer;
+	wait_queue_head_t	wait;
+#ifdef CONFIG_HIGH_RES_TIMERS
+	struct list_head	expired;
+	ktime_t			(*getoffset)(void);
+	int			(*reprogram)(struct ktimer *t,
+					     struct ktimer_base *b, ktime_t n);
+#endif
+	char			*name;
+};
+
+#define KTIMER_POISON		((void *) 0x00100101)
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+extern void ktimer_clock_notify(void);
+extern void clock_was_set(void);
+extern int ktimer_interrupt(void);
+
+/*
+ * The resolution of the clocks. The resolution value is returned in
+ * the clock_getres() system call to give application programmers an
+ * idea of the (in)accuracy of timers. Timer values are rounded up to
+ * this resolution value.
+ */
+#define KTIME_REALTIME_RES		CONFIG_HIGH_RES_RESOLUTION
+#define KTIME_MONOTONIC_RES		CONFIG_HIGH_RES_RESOLUTION
+
+#define ktimer_trace(a,b)		trace_special(ktime_get_high(a),ktime_get_low(a),b)
+
+#else
+
+#define KTIME_REALTIME_RES		KTIME_LOW_RES
+#define KTIME_MONOTONIC_RES		KTIME_LOW_RES
+
+/*
+ * clock_was_set() is a NOP for non- high-resolution systems. The
+ * time-sorted order guarantees that a timer does not expire early and
+ * is expired in the next softirq when the clock was advanced.
+ */
+#define clock_was_set()			do { } while (0)
+#define ktimer_clock_notify()		do { } while (0)
+
+static inline int ktimer_interrupt(void)
+{
+	return 0;
+}
+
+# if (BITS_PER_LONG == 64) || defined(CONFIG_KTIME_SCALAR)
+#  define ktimer_trace(a,b)		trace_special_u64(a,b)
+# else
+#  define ktimer_trace(a,b)		trace_special(ktime_get_high(a),ktime_get_low(a),b)
+# endif
+#endif
+
+/* Exported timer functions: */
+
+/* Initialize timers: */
+extern void ktimer_init(struct ktimer *timer);
+extern void ktimer_init_real(struct ktimer *timer);
+
+/* Basic timer operations: */
+extern int ktimer_start(struct ktimer *timer, ktime_t *tim, int mode);
+extern int ktimer_restart(struct ktimer *timer, ktime_t *tim, int mode);
+extern int ktimer_cancel(struct ktimer *timer);
+extern int ktimer_try_to_cancel(struct ktimer *timer);
+
+/* Query timers: */
+extern ktime_t ktimer_get_remtime(struct ktimer *timer);
+extern ktime_t ktimer_get_expiry(struct ktimer *timer, ktime_t *now);
+extern int ktimer_get_res(clockid_t which_clock, struct timespec *tp);
+extern int ktimer_get_res_real(clockid_t which_clock, struct timespec *tp);
+
+static inline int ktimer_active(struct ktimer *timer)
+{
+	return timer->state != KTIMER_INACTIVE;
+}
+
+/* Convert with rounding based on resolution of timer's clock: */
+extern ktime_t ktimer_round_timeval(struct ktimer *timer, struct timeval *tv);
+extern ktime_t ktimer_round_timespec(struct ktimer *timer, struct timespec *ts);
+
+/* Precise sleep: */
+extern long ktimer_nanosleep(struct timespec *rqtp,
+			     struct timespec __user *rmtp, int mode);
+extern long ktimer_nanosleep_real(struct timespec *rqtp,
+				  struct timespec __user *rmtp, int mode);
+
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+extern void wait_for_ktimer(struct ktimer *timer);
+#else
+# define wait_for_ktimer(t)	do { } while (0)
+#endif
+
+/* Soft interrupt function to run the ktimer queues: */
+extern void ktimer_run_queues(void);
+
+/* Bootup initialization: */
+extern void __init ktimers_init(void);
+
+#endif
Index: linux/include/linux/latency_hist.h
===================================================================
--- /dev/null
+++ linux/include/linux/latency_hist.h
@@ -0,0 +1,32 @@
+/*
+ * include/linux/latency_hist.h
+ *
+ * Add support for histograms of preemption-off latency and
+ * interrupt-off latency and wakeup latency, it depends on
+ * Real-Time Preemption Support.
+ *
+ *  Copyright (C) 2005 MontaVista Software, Inc.
+ *  Yi Yang <yyang@ch.mvista.com>
+ *
+ */
+#ifndef _LINUX_LATENCY_HIST_H_
+#define _LINUX_LATENCY_HIST_H_
+
+enum {
+        INTERRUPT_LATENCY = 0,
+        PREEMPT_LATENCY,
+        WAKEUP_LATENCY
+};
+
+#define MAX_ENTRY_NUM 10240
+#define LATENCY_TYPE_NUM 3
+
+#ifdef CONFIG_LATENCY_HIST
+extern void latency_hist(int latency_type, int cpu, unsigned long latency);
+# define latency_hist_flag 1
+#else
+# define latency_hist(a,b,c) do { (void)(b); } while (0) /* stub: only touches its cpu argument */
+# define latency_hist_flag 0
+#endif /* CONFIG_LATENCY_HIST */
+
+#endif /* ifndef _LINUX_LATENCY_HIST_H_ */
Index: linux/include/linux/linkage.h
===================================================================
--- linux.orig/include/linux/linkage.h
+++ linux/include/linux/linkage.h
@@ -4,6 +4,8 @@
 #include <linux/config.h>
 #include <asm/linkage.h>
 
+#define notrace __attribute ((no_instrument_function))
+
 #ifdef __cplusplus
 #define CPP_ASMLINKAGE extern "C"
 #else
@@ -42,7 +44,7 @@
 
 #endif
 
-#define NORET_TYPE    /**/
+#define NORET_TYPE    /* */
 #define ATTRIB_NORET  __attribute__((noreturn))
 #define NORET_AND     noreturn,
 
Index: linux/include/linux/list.h
===================================================================
--- linux.orig/include/linux/list.h
+++ linux/include/linux/list.h
@@ -208,6 +208,7 @@ static inline void list_replace_rcu(stru
 	smp_wmb();
 	new->next->prev = new;
 	new->prev->next = new;
+	old->prev = LIST_POISON2;
 }
 
 /**
@@ -578,6 +579,25 @@ static inline void hlist_del_init(struct
 	}
 }
 
+/**
+ * hlist_replace_rcu - replace old entry by new one
+ * @old: the element to be replaced
+ * @new: the new element to insert
+ *
+ * The old entry will be replaced with the new entry atomically.
+ */
+static inline void hlist_replace_rcu(struct hlist_node *old, struct hlist_node *new){
+	struct hlist_node *next = old->next;
+
+	new->next = next;
+	new->pprev = old->pprev;
+	smp_wmb();	/* write barrier between filling in @new and linking it in */
+	if (next)
+		new->next->pprev = &new->next;
+	*new->pprev = new;
+	old->pprev = LIST_POISON2;	/* poison the replaced node's back-pointer */
+}
+
 static inline void hlist_add_head(struct hlist_node *n, struct hlist_head *h)
 {
 	struct hlist_node *first = h->first;
Index: linux/include/linux/loop.h
===================================================================
--- linux.orig/include/linux/loop.h
+++ linux/include/linux/loop.h
@@ -58,9 +58,9 @@ struct loop_device {
 	struct bio 		*lo_bio;
 	struct bio		*lo_biotail;
 	int			lo_state;
-	struct semaphore	lo_sem;
+	struct completion	lo_done;
+	struct completion	lo_bh_done;
 	struct semaphore	lo_ctl_mutex;
-	struct semaphore	lo_bh_mutex;
 	int			lo_pending;
 
 	request_queue_t		*lo_queue;
Index: linux/include/linux/mc146818rtc.h
===================================================================
--- linux.orig/include/linux/mc146818rtc.h
+++ linux/include/linux/mc146818rtc.h
@@ -17,7 +17,7 @@
 
 #ifdef __KERNEL__
 #include <linux/spinlock.h>		/* spinlock_t */
-extern spinlock_t rtc_lock;		/* serialize CMOS RAM access */
+extern raw_spinlock_t rtc_lock;		/* serialize CMOS RAM access */
 #endif
 
 /**********************************************************************
Index: linux/include/linux/mca.h
===================================================================
--- linux.orig/include/linux/mca.h
+++ linux/include/linux/mca.h
@@ -12,8 +12,10 @@
 #include <asm/mca.h>
 
 extern int MCA_bus;
+extern void mca_timer_ack(void *);
 #else
 #define MCA_bus 0
+#define mca_timer_ack NULL
 #endif
 
 /* This sets up an information callback for /proc/mca/slot?.  The
Index: linux/include/linux/mm.h
===================================================================
--- linux.orig/include/linux/mm.h
+++ linux/include/linux/mm.h
@@ -951,10 +951,21 @@ static inline void vm_stat_unaccount(str
 /* update per process rss and vm hiwater data */
 extern void update_mem_hiwater(struct task_struct *tsk);
 
+#ifdef CONFIG_DEBUG_DEADLOCKS
+ extern int check_no_locks_freed(const void *from, const void *to);
+#else
+ static inline int check_no_locks_freed(const void *from, const void *to)
+ {
+	return 0;
+ }
+#endif
+
 #ifndef CONFIG_DEBUG_PAGEALLOC
 static inline void
 kernel_map_pages(struct page *page, int numpages, int enable)
 {
+	if (!PageHighMem(page) && !enable)
+		check_no_locks_freed(page_address(page), page_address(page+numpages));
 }
 #endif
 
Index: linux/include/linux/netdevice.h
===================================================================
--- linux.orig/include/linux/netdevice.h
+++ linux/include/linux/netdevice.h
@@ -599,12 +599,12 @@ static inline void __netif_schedule(stru
 		unsigned long flags;
 		struct softnet_data *sd;
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		sd = &__get_cpu_var(softnet_data);
 		dev->next_sched = sd->output_queue;
 		sd->output_queue = dev;
 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -658,12 +658,12 @@ static inline void dev_kfree_skb_irq(str
 		struct softnet_data *sd;
 		unsigned long flags;
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		sd = &__get_cpu_var(softnet_data);
 		skb->next = sd->completion_queue;
 		sd->completion_queue = skb;
 		raise_softirq_irqoff(NET_TX_SOFTIRQ);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 }
 
@@ -816,15 +816,15 @@ static inline void __netif_rx_schedule(s
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	dev_hold(dev);
 	list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
 	if (dev->quota < 0)
 		dev->quota += dev->weight;
 	else
 		dev->quota = dev->weight;
-	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-	local_irq_restore(flags);
+	raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	raw_local_irq_restore(flags);
 }
 
 /* Try to reschedule poll. Called by irq handler. */
@@ -845,10 +845,10 @@ static inline int netif_rx_reschedule(st
 
 		dev->quota += undo;
 
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		list_add_tail(&dev->poll_list, &__get_cpu_var(softnet_data).poll_list);
-		__raise_softirq_irqoff(NET_RX_SOFTIRQ);
-		local_irq_restore(flags);
+		raise_softirq_irqoff(NET_RX_SOFTIRQ);
+		raw_local_irq_restore(flags);
 		return 1;
 	}
 	return 0;
@@ -863,12 +863,12 @@ static inline void netif_rx_complete(str
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	BUG_ON(!test_bit(__LINK_STATE_RX_SCHED, &dev->state));
 	list_del(&dev->poll_list);
 	smp_mb__before_clear_bit();
 	clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 static inline void netif_poll_disable(struct net_device *dev)
@@ -885,7 +885,7 @@ static inline void netif_poll_enable(str
 	clear_bit(__LINK_STATE_RX_SCHED, &dev->state);
 }
 
-/* same as netif_rx_complete, except that local_irq_save(flags)
+/* same as netif_rx_complete, except that raw_local_irq_save(flags)
  * has already been issued
  */
 static inline void __netif_rx_complete(struct net_device *dev)
Index: linux/include/linux/netfilter_ipv4/ip_conntrack.h
===================================================================
--- linux.orig/include/linux/netfilter_ipv4/ip_conntrack.h
+++ linux/include/linux/netfilter_ipv4/ip_conntrack.h
@@ -445,7 +445,12 @@ struct ip_conntrack_stat
 	unsigned int expect_delete;
 };
 
-#define CONNTRACK_STAT_INC(count) (__get_cpu_var(ip_conntrack_stat).count++)
+#define CONNTRACK_STAT_INC(count) \
+do { \
+	preempt_disable(); \
+	__get_cpu_var(ip_conntrack_stat).count++; \
+	preempt_enable(); \
+} while (0)
 
 #ifdef CONFIG_IP_NF_CONNTRACK_EVENTS
 #include <linux/notifier.h>
@@ -455,10 +460,8 @@ struct ip_conntrack_ecache {
 	struct ip_conntrack *ct;
 	unsigned int events;
 };
-DECLARE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+DECLARE_PER_CPU_LOCKED(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
-#define CONNTRACK_ECACHE(x)	(__get_cpu_var(ip_conntrack_ecache).x)
- 
 extern struct notifier_block *ip_conntrack_chain;
 extern struct notifier_block *ip_conntrack_expect_chain;
 
@@ -493,12 +496,14 @@ ip_conntrack_event_cache(enum ip_conntra
 {
 	struct ip_conntrack *ct = (struct ip_conntrack *)skb->nfct;
 	struct ip_conntrack_ecache *ecache;
-	
+	int cpu;
+
 	local_bh_disable();
-	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	ecache = &get_cpu_var_locked(ip_conntrack_ecache, &cpu);
 	if (ct != ecache->ct)
 		__ip_ct_event_cache_init(ct);
 	ecache->events |= event;
+	put_cpu_var_locked(ip_conntrack_ecache, cpu);
 	local_bh_enable();
 }
 
Index: linux/include/linux/oprofile.h
===================================================================
--- linux.orig/include/linux/oprofile.h
+++ linux/include/linux/oprofile.h
@@ -114,6 +114,6 @@ ssize_t oprofilefs_ulong_to_user(unsigne
 int oprofilefs_ulong_from_user(unsigned long * val, char const __user * buf, size_t count);
 
 /** lock for read/write safety */
-extern spinlock_t oprofilefs_lock;
+extern raw_spinlock_t oprofilefs_lock;
  
 #endif /* OPROFILE_H */
Index: linux/include/linux/pagemap.h
===================================================================
--- linux.orig/include/linux/pagemap.h
+++ linux/include/linux/pagemap.h
@@ -111,20 +111,19 @@ DECLARE_PER_CPU(long, nr_pagecache_local
  * an offset in their per-cpu arena and will spill that into the
  * global count whenever the absolute value of the local count
  * exceeds the counter's threshold.
- *
- * MUST be protected from preemption.
- * current protection is mapping->page_lock.
  */
 static inline void pagecache_acct(int count)
 {
 	long *local;
 
+	preempt_disable();
 	local = &__get_cpu_var(nr_pagecache_local);
 	*local += count;
 	if (*local > PAGECACHE_ACCT_THRESHOLD || *local < -PAGECACHE_ACCT_THRESHOLD) {
 		atomic_add(*local, &nr_pagecache);
 		*local = 0;
 	}
+	preempt_enable();
 }
 
 #else
Index: linux/include/linux/pagevec.h
===================================================================
--- linux.orig/include/linux/pagevec.h
+++ linux/include/linux/pagevec.h
@@ -6,7 +6,7 @@
  */
 
 /* 14 pointers + two long's align the pagevec structure to a power of two */
-#define PAGEVEC_SIZE	14
+#define PAGEVEC_SIZE	8
 
 struct page;
 struct address_space;
Index: linux/include/linux/parport.h
===================================================================
--- linux.orig/include/linux/parport.h
+++ linux/include/linux/parport.h
@@ -254,7 +254,7 @@ enum ieee1284_phase {
 struct ieee1284_info {
 	int mode;
 	volatile enum ieee1284_phase phase;
-	struct semaphore irq;
+	struct compat_semaphore irq;
 };
 
 /* A parallel port */
Index: linux/include/linux/percpu.h
===================================================================
--- linux.orig/include/linux/percpu.h
+++ linux/include/linux/percpu.h
@@ -8,13 +8,36 @@
 
 /* Enough to cover all DEFINE_PER_CPUs in kernel, including modules. */
 #ifndef PERCPU_ENOUGH_ROOM
-#define PERCPU_ENOUGH_ROOM 32768
+#define PERCPU_ENOUGH_ROOM 65536
 #endif
 
 /* Must be an lvalue. */
 #define get_cpu_var(var) (*({ preempt_disable(); &__get_cpu_var(var); }))
 #define put_cpu_var(var) preempt_enable()
 
+/*
+ * Per-CPU data structures with an additional lock - useful for
+ * PREEMPT_RT code that wants to reschedule but also wants
+ * per-CPU data structures.
+ *
+ * 'cpu' gets updated with the CPU the task is currently executing on.
+ *
+ * NOTE: on normal !PREEMPT_RT kernels these per-CPU variables
+ * are the same as the normal per-CPU variables, so there is no
+ * runtime overhead.
+ */
+#define get_cpu_var_locked(var, cpuptr)			\
+(*({							\
+	int __cpu = raw_smp_processor_id();		\
+							\
+	*(cpuptr) = __cpu;				\
+	spin_lock(&__get_cpu_lock(var, __cpu));		\
+	&__get_cpu_var_locked(var, __cpu);		\
+}))
+
+#define put_cpu_var_locked(var, cpu) \
+		 do { (void)(cpu); spin_unlock(&__get_cpu_lock(var, cpu)); } while (0)
+
 #ifdef CONFIG_SMP
 
 struct percpu_data {
Index: linux/include/linux/percpu_counter.h
===================================================================
--- linux.orig/include/linux/percpu_counter.h
+++ linux/include/linux/percpu_counter.h
@@ -15,7 +15,7 @@
 #ifdef CONFIG_SMP
 
 struct percpu_counter {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 	long count;
 	long *counters;
 };
Index: linux/include/linux/plist.h
===================================================================
--- /dev/null
+++ linux/include/linux/plist.h
@@ -0,0 +1,189 @@
+/*
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ * Tested and made it functional.
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ *
+ *
+ * This is a priority-sorted list of nodes; each node has a >= 0
+ * priority from 0 (highest) to INT_MAX (lowest). The list itself has
+ * a priority too (the highest of all the nodes), stored in the head
+ * of the list (that is a node itself).
+ *
+ * Addition is O(K), removal is O(1), change of priority of a node is
+ * O(K) and K is the number of RT priority levels used in the system.
+ * (1 <= K <= 99)
+ *
+ * This list is really a list of lists:
+ *
+ *  - The tier 1 list is the dp list (Different Priority)
+ *
+ *  - The tier 2 list is the sp list (Serialized Priority)
+ *
+ * Simple ASCII art explanation:
+ *
+ * |HEAD   |
+ * |       |
+ * |dp.prev|<------------------------------------|
+ * |dp.next|<->|dp|<->|dp|<--------------->|dp|<-|
+ * |10     |   |10|   |21|   |21|   |21|   |40|   (prio)
+ * |       |   |  |   |  |   |  |   |  |   |  |
+ * |       |   |  |   |  |   |  |   |  |   |  |
+ * |sp.next|<->|sp|<->|sp|<->|sp|<->|sp|<->|sp|<-|
+ * |sp.prev|<------------------------------------|
+ *
+ * The nodes on the dp list are sorted by priority to simplify
+ * the insertion of new nodes. There are no nodes with duplicate
+ * priorities on the list.
+ *
+ * The nodes on the sp list are ordered by priority and can contain
+ * entries which have the same priority. Those entries are ordered
+ * FIFO
+ *
+ * Addition means: look for the dp node in the dp list for the
+ * priority of the node and insert it before the sp entry of the next
+ * dp node. If it is the first node of that priority, add it to the
+ * dp list in the right position and insert it into the serialized
+ * sp list
+ *
+ * Removal means remove it from the sp list and remove it from the dp
+ * list if the dp list_head is non empty. In case of removal from the
+ * dp list it must be checked whether other entries of the same
+ * priority are on the list or not. If there is another entry of
+ * the same priority then this entry has to replace the
+ * removed entry on the dp list. If the entry which is removed is
+ * the only entry of this priority then a simple remove from both
+ * lists is sufficient.
+ *
+ * INT_MIN is the highest priority, 0 is the medium highest, INT_MAX
+ * is lowest priority.
+ *
+ * No locking is done, up to the caller.
+ *
+ * NOTE: This implementation does not offer as many interfaces as
+ *       linux/list.h does -- it is lazily minimal. You are welcome to
+ *       add them.
+ */
+
+#ifndef _LINUX_PLIST_H_
+#define _LINUX_PLIST_H_
+
+#include <linux/kernel.h>
+#include <linux/list.h>
+
+/* Priority-sorted list */
+struct plist {
+	int prio;
+	struct list_head dp_node;
+	struct list_head sp_node;
+};
+
+#define PLIST_INIT(p,__prio)				\
+{							\
+	.prio = __prio,					\
+	.dp_node = LIST_HEAD_INIT((p).dp_node),	\
+	.sp_node = LIST_HEAD_INIT((p).sp_node),	\
+}
+
+/**
+ * plist_entry - get the struct for this entry
+ * @ptr:        the &struct plist pointer.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define plist_entry(ptr, type, member) \
+        container_of(ptr, type, member)
+
+/**
+ * plist_first_entry - get the struct for the first entry
+ * @ptr:        the &struct plist pointer.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ */
+#define plist_first_entry(ptr, type, member) \
+        container_of(plist_first(ptr), type, member)
+
+/**
+ * plist_for_each  -       iterate over the plist
+ * @pos1:        the type * to use as a loop counter.
+ * @head:       the head for your list.
+ */
+#define plist_for_each(pos1, head)	\
+	list_for_each_entry(pos1, &((head)->sp_node), sp_node)
+/**
+ * plist_for_each_safe - iterate over a plist of given type safe against removal of list entry
+ * @pos1:        the type * to use as a loop counter.
+ * @n1:          another type * to use as temporary storage
+ * @head:       the head for your list.
+ */
+#define plist_for_each_safe(pos1, n1, head)			\
+	list_for_each_entry_safe(pos1, n1, &((head)->sp_node), sp_node)
+
+/* Initialize a pl */
+extern void plist_init(struct plist *pl, int prio);
+
+/* Return the first node (and thus, highest priority)
+ *
+ * Assumes the plist is _not_ empty.
+ */
+static inline
+struct plist * plist_first(struct plist *plist)
+{
+	return list_entry(plist->dp_node.next, struct plist, dp_node);
+}
+
+/* Return if the plist is empty. */
+static inline
+unsigned plist_empty(struct plist *plist)
+{
+	return list_empty(&plist->sp_node);
+}
+
+/* Update the maximum priority of the whole list
+ *
+ * @returns !0 if the plist prio changed, 0 otherwise.
+ */
+extern unsigned plist_update_prio(struct plist *plist);
+
+/**
+ * Add node @pl to @plist @returns !0 if the plist prio changed, 0
+ * otherwise.
+ */
+extern unsigned plist_add(struct plist *pl, struct plist *plist);
+
+/**
+ * Remove a node @pl from @plist. @returns !0 if the plist prio
+ * changed, 0 otherwise.
+ */
+extern unsigned plist_del(struct plist *pl, struct plist *plist);
+
+/**
+ * plist_del_init - deletes entry from list and reinitializes it.
+ * @pl: the element to delete from @plist.
+ */
+extern void plist_del_init(struct plist *pl, struct plist *plist);
+
+/* Return the priority of a pl node */
+static inline int plist_prio(struct plist *pl)
+{
+	return pl->prio;
+}
+
+/**
+ * Change the priority of node @pl in @plist (updating the list's max
+ * priority).  @returns !0 if the plist's maximum priority changes
+ */
+extern unsigned plist_chprio(struct plist *plist, struct plist *pl, int new_prio);
+
+#endif /* #ifndef _LINUX_PLIST_H_ */
+
Index: linux/include/linux/pm.h
===================================================================
--- linux.orig/include/linux/pm.h
+++ linux/include/linux/pm.h
@@ -25,6 +25,7 @@
 
 #include <linux/config.h>
 #include <linux/list.h>
+#include <linux/spinlock.h>
 #include <asm/atomic.h>
 
 /*
@@ -151,6 +152,8 @@ static inline int pm_send_all(pm_request
  */
 extern void (*pm_idle)(void);
 extern void (*pm_power_off)(void);
+extern spinlock_t pm_idle_switch_lock;
+extern int pm_idle_locked;
 
 typedef int __bitwise suspend_state_t;
 
Index: linux/include/linux/posix-timers.h
===================================================================
--- linux.orig/include/linux/posix-timers.h
+++ linux/include/linux/posix-timers.h
@@ -51,10 +51,9 @@ struct k_itimer {
 	struct sigqueue *sigq;		/* signal queue entry. */
 	union {
 		struct {
-			struct timer_list timer;
-			struct list_head abs_timer_entry; /* clock abs_timer_list */
-			struct timespec wall_to_prev;   /* wall_to_monotonic used when set */
-			unsigned long incr; /* interval in jiffies */
+			struct ktimer timer;
+			ktime_t incr;
+			int overrun;
 		} real;
 		struct cpu_timer_list cpu;
 		struct {
@@ -66,10 +65,6 @@ struct k_itimer {
 	} it;
 };
 
-struct k_clock_abs {
-	struct list_head list;
-	spinlock_t lock;
-};
 struct k_clock {
 	int res;		/* in nano seconds */
 	int (*clock_getres) (clockid_t which_clock, struct timespec *tp);
@@ -77,7 +72,7 @@ struct k_clock {
 	int (*clock_set) (clockid_t which_clock, struct timespec * tp);
 	int (*clock_get) (clockid_t which_clock, struct timespec * tp);
 	int (*timer_create) (struct k_itimer *timer);
-	int (*nsleep) (clockid_t which_clock, int flags, struct timespec *);
+	int (*nsleep) (clockid_t which_clock, int flags, struct timespec *, struct timespec __user *);
 	int (*timer_set) (struct k_itimer * timr, int flags,
 			  struct itimerspec * new_setting,
 			  struct itimerspec * old_setting);
@@ -91,37 +86,104 @@ void register_posix_clock(clockid_t cloc
 
 /* Error handlers for timer_create, nanosleep and settime */
 int do_posix_clock_notimer_create(struct k_itimer *timer);
-int do_posix_clock_nonanosleep(clockid_t, int flags, struct timespec *);
+int do_posix_clock_nonanosleep(clockid_t, int flags, struct timespec *, struct timespec __user *);
 int do_posix_clock_nosettime(clockid_t, struct timespec *tp);
 
 /* function to call to trigger timer event */
 int posix_timer_event(struct k_itimer *timr, int si_private);
 
-struct now_struct {
-	unsigned long jiffies;
-};
-
-#define posix_get_now(now) (now)->jiffies = jiffies;
-#define posix_time_before(timer, now) \
-                      time_before((timer)->expires, (now)->jiffies)
-
-#define posix_bump_timer(timr, now)					\
-         do {								\
-              long delta, orun;						\
-	      delta = now.jiffies - (timr)->it.real.timer.expires;	\
-              if (delta >= 0) {						\
-	           orun = 1 + (delta / (timr)->it.real.incr);		\
-	          (timr)->it.real.timer.expires +=			\
-			 orun * (timr)->it.real.incr;			\
-                  (timr)->it_overrun += orun;				\
-              }								\
-            }while (0)
+#if (BITS_PER_LONG < 64)
+/* Advance an expired timer by whole intervals; returns expires - now. */
+static inline ktime_t forward_posix_timer(struct k_itimer *t, ktime_t now)
+{
+	ktime_t delta = ktime_sub(now, t->it.real.timer.expires);
+	unsigned long orun = 1;
+
+	if (ktime_cmp_val(delta, <, KTIME_ZERO))
+		goto out;
+
+	if (unlikely(ktime_cmp(delta, >, t->it.real.incr))) {
+		int sft = 0;
+		u64 div, dclc, inc;
+
+		dclc = ktime_to_ns(delta);
+		div = inc = ktime_to_ns(t->it.real.incr);
+		/* Scale the divisor below 2^32 so do_div() can take it */
+		while (div >> 32) {
+			sft++;
+			div >>= 1;
+		}
+		dclc >>= sft;
+		do_div(dclc, (unsigned long) div);
+		orun = (unsigned long) dclc;
+		if (likely(!(inc >> 32)))
+			dclc *= (unsigned long) inc;
+		else
+			dclc *= inc;
+		t->it.real.timer.expires = ktime_add_ns(t->it.real.timer.expires,
+							dclc);
+	} else {
+		t->it.real.timer.expires = ktime_add(t->it.real.timer.expires,
+						     t->it.real.incr);
+	}
+	/*
+	 * Here is the correction for exact.  Also covers delta == incr
+	 * which is the else clause above.
+	 */
+	if (ktime_cmp(t->it.real.timer.expires, <=, now)) {
+		t->it.real.timer.expires = ktime_add(t->it.real.timer.expires,
+						     t->it.real.incr);
+		orun++;
+	}
+	t->it_overrun += orun;
+
+ out:
+	return ktime_sub(t->it.real.timer.expires, now);
+}
+#else
+static inline ktime_t forward_posix_timer(struct k_itimer *t, ktime_t now)
+{
+	ktime_t delta = ktime_sub(now, t->it.real.timer.expires);
+	unsigned long orun = 1;
+
+	if (ktime_cmp_val(delta, <, KTIME_ZERO))
+		goto out;
+
+	if (unlikely(ktime_cmp(delta, >, t->it.real.incr))) {
+
+		u64 dns, inc;
+
+		dns = ktime_to_ns(delta);
+		inc = ktime_to_ns(t->it.real.incr);
+
+		orun = dns / inc;
+		t->it.real.timer.expires = ktime_add_ns(t->it.real.timer.expires,
+							orun * inc);
+	} else {
+		t->it.real.timer.expires = ktime_add(t->it.real.timer.expires,
+						     t->it.real.incr);
+	}
+	/*
+	 * Here is the correction for exact.  Also covers delta == incr
+	 * which is the else clause above.
+	 */
+	if (ktime_cmp(t->it.real.timer.expires, <=, now)) {
+		t->it.real.timer.expires = ktime_add(t->it.real.timer.expires,
+						     t->it.real.incr);
+		orun++;
+	}
+	t->it_overrun += orun;
+ out:
+	return ktime_sub(t->it.real.timer.expires, now);
+}
+#endif
 
 int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *);
 int posix_cpu_clock_get(clockid_t which_clock, struct timespec *);
 int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp);
 int posix_cpu_timer_create(struct k_itimer *);
-int posix_cpu_nsleep(clockid_t, int, struct timespec *);
+int posix_cpu_nsleep(clockid_t, int, struct timespec *,
+		     struct timespec __user *);
 int posix_cpu_timer_set(struct k_itimer *, int,
 			struct itimerspec *, struct itimerspec *);
 int posix_cpu_timer_del(struct k_itimer *);
Index: linux/include/linux/preempt.h
===================================================================
--- linux.orig/include/linux/preempt.h
+++ linux/include/linux/preempt.h
@@ -8,23 +8,53 @@
 
 #include <linux/config.h>
 #include <linux/linkage.h>
+#include <linux/thread_info.h>
 
-#ifdef CONFIG_DEBUG_PREEMPT
-  extern void fastcall add_preempt_count(int val);
-  extern void fastcall sub_preempt_count(int val);
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+  extern void notrace add_preempt_count(unsigned int val);
+  extern void notrace sub_preempt_count(unsigned int val);
+  extern void notrace add_preempt_count_ti(struct thread_info *ti, unsigned int val);
+  extern void notrace sub_preempt_count_ti(struct thread_info *ti, unsigned int val);
+  extern void notrace mask_preempt_count(unsigned int mask);
+  extern void notrace unmask_preempt_count(unsigned int mask);
 #else
 # define add_preempt_count(val)	do { preempt_count() += (val); } while (0)
 # define sub_preempt_count(val)	do { preempt_count() -= (val); } while (0)
+# define add_preempt_count_ti(ti, val)	do { preempt_count_ti(ti) += (val); } while (0)
+# define sub_preempt_count_ti(ti, val)	do { preempt_count_ti(ti) -= (val); } while (0)
+# define mask_preempt_count(mask) \
+		do { preempt_count() |= (mask); } while (0)
+# define unmask_preempt_count(mask) \
+		do { preempt_count() &= ~(mask); } while (0)
+#endif
+
+#ifdef CONFIG_CRITICAL_TIMING
+  extern void touch_critical_timing(void);
+  extern void stop_critical_timing(void);
+#else
+# define touch_critical_timing()	do { } while (0)
+# define stop_critical_timing()	do { } while (0)
 #endif
 
 #define inc_preempt_count() add_preempt_count(1)
 #define dec_preempt_count() sub_preempt_count(1)
 
-#define preempt_count()	(current_thread_info()->preempt_count)
+#define inc_preempt_count_ti(ti) add_preempt_count_ti(ti, 1)
+#define dec_preempt_count_ti(ti) sub_preempt_count_ti(ti, 1)
+
+#define preempt_count()		(current_thread_info()->preempt_count)
+#define preempt_count_ti(ti)	((ti)->preempt_count)
 
 #ifdef CONFIG_PREEMPT
 
 asmlinkage void preempt_schedule(void);
+asmlinkage void preempt_schedule_irq(void);
+
+#define preempt_disable_ti(ti) \
+do { \
+	inc_preempt_count_ti(ti); \
+	barrier(); \
+} while (0)
 
 #define preempt_disable() \
 do { \
@@ -32,30 +62,63 @@ do { \
 	barrier(); \
 } while (0)
 
-#define preempt_enable_no_resched() \
+
+#define __preempt_enable_no_resched() \
 do { \
 	barrier(); \
 	dec_preempt_count(); \
 } while (0)
 
+#define __preempt_enable_no_resched_ti(ti) \
+do { \
+	barrier(); \
+	dec_preempt_count_ti(ti); \
+} while (0)
+
+
+#ifdef CONFIG_DEBUG_PREEMPT
+extern void notrace preempt_enable_no_resched(void);
+#else
+# define preempt_enable_no_resched() __preempt_enable_no_resched()
+#endif
+
 #define preempt_check_resched() \
 do { \
 	if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \
 		preempt_schedule(); \
 } while (0)
 
+#define preempt_check_resched_delayed() \
+do { \
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED))) \
+		preempt_schedule(); \
+} while (0)
+
+
 #define preempt_enable() \
 do { \
-	preempt_enable_no_resched(); \
+	__preempt_enable_no_resched(); \
 	preempt_check_resched(); \
 } while (0)
 
+#define preempt_enable_ti(ti) \
+do { \
+	__preempt_enable_no_resched_ti(ti); \
+	if (unlikely(test_ti_thread_flag(ti, TIF_NEED_RESCHED))) \
+		preempt_schedule(); \
+} while (0)
+
+
 #else
 
 #define preempt_disable()		do { } while (0)
 #define preempt_enable_no_resched()	do { } while (0)
+#define __preempt_enable_no_resched()	do { } while (0)
 #define preempt_enable()		do { } while (0)
 #define preempt_check_resched()		do { } while (0)
+#define preempt_check_resched_delayed()	do { } while (0)
+
+#define preempt_schedule_irq()		do { } while (0)
 
 #endif
 
Index: linux/include/linux/profile.h
===================================================================
--- linux.orig/include/linux/profile.h
+++ linux/include/linux/profile.h
@@ -7,10 +7,12 @@
 #include <linux/config.h>
 #include <linux/init.h>
 #include <linux/cpumask.h>
+#include <linux/kernel_stat.h>
 #include <asm/errno.h>
 
-#define CPU_PROFILING	1
-#define SCHED_PROFILING	2
+#define CPU_PROFILING		1
+#define SCHED_PROFILING		2
+#define PREEMPT_PROFILING	3
 
 struct proc_dir_entry;
 struct pt_regs;
@@ -30,6 +32,8 @@ enum profile_type {
 	PROFILE_MUNMAP
 };
 
+extern int prof_pid;
+
 #ifdef CONFIG_PROFILING
 
 struct notifier_block;
Index: linux/include/linux/quota.h
===================================================================
--- linux.orig/include/linux/quota.h
+++ linux/include/linux/quota.h
@@ -37,6 +37,7 @@
 
 #include <linux/errno.h>
 #include <linux/types.h>
+#include <linux/wait.h>
 #include <linux/spinlock.h>
 
 #define __DQUOT_VERSION__	"dquot_6.5.1"
Index: linux/include/linux/radix-tree.h
===================================================================
--- linux.orig/include/linux/radix-tree.h
+++ linux/include/linux/radix-tree.h
@@ -19,6 +19,7 @@
 #ifndef _LINUX_RADIX_TREE_H
 #define _LINUX_RADIX_TREE_H
 
+#include <linux/config.h>
 #include <linux/preempt.h>
 #include <linux/types.h>
 
@@ -50,7 +51,18 @@ void *radix_tree_delete(struct radix_tre
 unsigned int
 radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
 			unsigned long first_index, unsigned int max_items);
+/*
+ * On a mutex based kernel we can freely schedule within the radix code:
+ */
+#ifdef CONFIG_PREEMPT_RT
+static inline int radix_tree_preload(gfp_t gfp_mask)
+{
+	return 0;
+}
+#else
 int radix_tree_preload(gfp_t gfp_mask);
+#endif
+
 void radix_tree_init(void);
 void *radix_tree_tag_set(struct radix_tree_root *root,
 			unsigned long index, int tag);
@@ -65,7 +77,9 @@ int radix_tree_tagged(struct radix_tree_
 
 static inline void radix_tree_preload_end(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	preempt_enable();
+#endif
 }
 
 #endif /* _LINUX_RADIX_TREE_H */
Index: linux/include/linux/rcupdate.h
===================================================================
--- linux.orig/include/linux/rcupdate.h
+++ linux/include/linux/rcupdate.h
@@ -59,6 +59,7 @@ struct rcu_head {
 } while (0)
 
 
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Global control variables for rcupdate callback mechanism. */
 struct rcu_ctrlblk {
@@ -184,14 +185,26 @@ static inline int rcu_pending(int cpu)
  *
  * It is illegal to block while in an RCU read-side critical section.
  */
-#define rcu_read_lock()		preempt_disable()
+#define rcu_read_lock preempt_disable
 
 /**
  * rcu_read_unlock - marks the end of an RCU read-side critical section.
  *
  * See rcu_read_lock() for more information.
  */
-#define rcu_read_unlock()	preempt_enable()
+#define rcu_read_unlock preempt_enable
+
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+#define rcu_qsctr_inc(cpu)
+#define rcu_bh_qsctr_inc(cpu)
+#define call_rcu_bh(head, rcu) call_rcu(head, rcu)
+
+extern void rcu_read_lock(void);
+extern void rcu_read_unlock(void);
+extern int rcu_pending(int cpu);
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
@@ -214,14 +227,22 @@ static inline int rcu_pending(int cpu)
  * can use just rcu_read_lock().
  *
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_lock_bh()	local_bh_disable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_lock_bh()	do { rcu_read_lock(); local_bh_disable(); } while (0)
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /*
  * rcu_read_unlock_bh - marks the end of a softirq-only RCU critical section
  *
  * See rcu_read_lock_bh() for more information.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define rcu_read_unlock_bh()	local_bh_enable()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+#define rcu_read_unlock_bh()	do { local_bh_enable(); rcu_read_unlock(); } while (0)
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 /**
  * rcu_dereference - fetch an RCU-protected pointer in an
@@ -270,11 +291,16 @@ static inline int rcu_pending(int cpu)
  * synchronize_kernel() API.  In contrast, synchronize_rcu() only
  * guarantees that rcu_read_lock() sections will have completed.
  */
+#ifndef CONFIG_PREEMPT_RCU
 #define synchronize_sched() synchronize_rcu()
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+extern void synchronize_sched(void);
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
 
 extern void rcu_init(void);
 extern void rcu_check_callbacks(int cpu, int user);
 extern void rcu_restart_cpu(int cpu);
+extern long rcu_batches_completed(void);
 
 /* Exported interfaces */
 extern void FASTCALL(call_rcu(struct rcu_head *head, 
Index: linux/include/linux/rt_irq.h
===================================================================
--- /dev/null
+++ linux/include/linux/rt_irq.h
@@ -0,0 +1,68 @@
+#ifndef __LINUX_RT_IRQ_H
+#define __LINUX_RT_IRQ_H
+
+/*
+ * Soft IRQ flag support on PREEMPT_RT kernels:
+ */
+#ifdef CONFIG_PREEMPT_RT
+
+extern void local_irq_enable(void);
+extern void local_irq_disable(void);
+extern void local_irq_restore(unsigned long flags);
+extern void __local_save_flags(unsigned long *flags);
+extern void __local_irq_save(unsigned long *flags);
+extern int irqs_disabled(void);
+extern int irqs_disabled_flags(unsigned long flags);
+
+# define local_save_flags(flags)	__local_save_flags(&(flags))
+# define local_irq_save(flags)		__local_irq_save(&(flags))
+
+# define RAW_LOCAL_ILLEGAL_MASK		0x20000000UL
+# ifdef CONFIG_DEBUG_IRQ_FLAGS
+#  define LOCAL_ILLEGAL_MASK		0x40000000UL
+   void check_raw_flags(unsigned long flags);
+# else
+#  define check_raw_flags(flags)	do { } while (0)
+# endif
+
+/* soft state does not follow the hard state */
+# define raw_local_irq_enable()		do { trace_irqs_on(); __raw_local_irq_enable(); } while (0)
+# define raw_local_irq_disable()	do { __raw_local_irq_disable(); trace_irqs_off(); } while (0)
+# define raw_local_irq_save(flags)	do { __raw_local_irq_save(flags); trace_irqs_off(); } while (0)
+# define raw_local_irq_restore(flags) \
+	do { check_raw_flags(flags); if (!__raw_irqs_disabled_flags(flags)) { trace_irqs_on(); } \
+			__raw_local_irq_restore(flags); } while (0)
+# define raw_safe_halt()		__raw_safe_halt()
+#else
+# define RAW_LOCAL_ILLEGAL_MASK		0UL
+# define LOCAL_ILLEGAL_MASK		0UL
+# define raw_local_irq_enable		__raw_local_irq_enable
+# define raw_local_irq_disable		__raw_local_irq_disable
+# define raw_local_irq_save		__raw_local_irq_save
+# define raw_local_irq_restore		__raw_local_irq_restore
+# define raw_safe_halt			__raw_safe_halt
+# define safe_halt			raw_safe_halt
+# define local_save_flags		__raw_local_save_flags
+# define local_irq_enable		__raw_local_irq_enable
+# define local_irq_disable		__raw_local_irq_disable
+# define local_irq_save			__raw_local_irq_save
+# define local_irq_restore		__raw_local_irq_restore
+# define irqs_disabled			__raw_irqs_disabled
+# define irqs_disabled_flags		__raw_irqs_disabled_flags
+#endif
+
+#define raw_local_save_flags		__raw_local_save_flags
+#define raw_irqs_disabled		__raw_irqs_disabled
+#define raw_irqs_disabled_flags		__raw_irqs_disabled_flags
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+  extern void notrace trace_irqs_off_lowlevel(void);
+  extern void notrace trace_irqs_off(void);
+  extern void notrace trace_irqs_on(void);
+#else
+# define trace_irqs_off_lowlevel()	do { } while (0)
+# define trace_irqs_off()		do { } while (0)
+# define trace_irqs_on()		do { } while (0)
+#endif
+
+#endif /* __LINUX_RT_IRQ_H */
Index: linux/include/linux/rt_lock.h
===================================================================
--- /dev/null
+++ linux/include/linux/rt_lock.h
@@ -0,0 +1,396 @@
+#ifndef __LINUX_RT_LOCK_H
+#define __LINUX_RT_LOCK_H
+
+/*
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * This file contains the main data structure definitions.
+ */
+#include <linux/config.h>
+#include <linux/list.h>
+#include <linux/plist.h>
+#include <asm/atomic.h>
+#include <linux/spinlock_types.h>
+
+/*
+ * This is the core locking object used by PREEMPT_RT.
+ * It handles all the necessary logic; the other locking
+ * objects (spinlocks, rwlocks, semaphores and rw-semaphores)
+ * all embed this synchronization object:
+ */
+struct rt_mutex {
+	raw_spinlock_t		wait_lock;	/* presumably guards the fields below - confirm */
+	struct plist		wait_list;	/* statically initialized on !SMP only */
+	struct thread_info	*owner;
+# ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	raw_spinlock_t		debug_slock;	/* static initializers leave it unlocked */
+	raw_rwlock_t		debug_rwlock;	/* static initializers leave it unlocked */
+# endif
+# ifdef CONFIG_DEBUG_DEADLOCKS
+	int			save_state;	/* 1 in the spinlock_t/rwlock_t initializers */
+	struct list_head	held_list;
+	unsigned long		acquire_eip;
+	char 			*name, *file;	/* #lockname / __FILE__ from the static initializers */
+	int			line;		/* __LINE__ from the static initializers */
+# endif
+# ifdef CONFIG_DEBUG_PREEMPT
+	int			was_preempt_off;	/* seeded via __WAS_PREEMPT_OFF() */
+# endif
+	unsigned int		mutex_attr;
+};
+
+/*
+ * Control structure for a task blocked on an RT mutex
+ * (task_struct refers to one through its ->blocked_on pointer):
+ */
+struct rt_mutex_waiter {
+	struct rt_mutex		*lock;		/* presumably the mutex waited for - confirm */
+	struct plist		list;		/* presumably queued on lock->wait_list - confirm */
+	struct plist		pi_list;	/* presumably queued on a task's pi_waiters - confirm */
+	struct thread_info	*ti;		/* presumably the blocked task - confirm */
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	unsigned long eip;
+#endif
+};
+
+#ifdef CONFIG_PREEMPT_RT
+
+#ifdef CONFIG_DEBUG_PREEMPT
+# define __WAS_PREEMPT_OFF(x)	, .was_preempt_off = x
+#else
+# define __WAS_PREEMPT_OFF(x)
+#endif
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+# define __RT_MUTEX_DEADLOCK_DETECT_INITIALIZER(lockname) \
+	, .name = #lockname, .file = __FILE__, .line = __LINE__
+#else
+# define __RT_MUTEX_DEADLOCK_DETECT_INITIALIZER(lockname)
+#endif
+
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+# define __RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER \
+	, .debug_slock = _RAW_SPIN_LOCK_UNLOCKED \
+	, .debug_rwlock = _RAW_RW_LOCK_UNLOCKED
+#else
+# define __RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER
+#endif
+
+/* FIXME: on SMP it's hard to initialize plists in the percpu.data area,
+ * so on SMP the static initializers leave .wait_list out (zero-filled).
+ * The expansion starts with a comma so it can also expand to nothing. */
+#ifdef CONFIG_SMP
+# define __PLIST_INIT(lockname)
+#else
+# define __PLIST_INIT(lockname) \
+	, .wait_list = PLIST_INIT((lockname).wait_list, 140 /*MAX_PRIO*/)
+#endif
+
+#define __RT_MUTEX_INITIALIZER(lockname) \
+	{ .wait_lock = _RAW_SPIN_LOCK_UNLOCKED \
+	__PLIST_INIT(lockname) \
+	__WAS_PREEMPT_OFF(0) \
+	__RT_MUTEX_DEADLOCK_DETECT_INITIALIZER(lockname) \
+	__RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER }
+
+/*
+ * RW-semaphores are an RT mutex plus a reader-depth count.
+ *
+ * Note that the semantics are different from the usual
+ * Linux rw-sems, in PREEMPT_RT mode we do not allow
+ * multiple readers to hold the lock at once, we only allow
+ * a read-lock owner to read-lock recursively. This is
+ * better for latency, makes the implementation inherently
+ * fair and makes it simpler as well:
+ */
+struct rw_semaphore {
+	struct rt_mutex		lock;
+	int			read_depth;
+};
+
+/*
+ * rwlocks - an RW semaphore plus lock-break field:
+ */
+typedef struct {
+	struct rw_semaphore	lock;
+	unsigned int		break_lock;
+} rwlock_t;
+
+# ifdef CONFIG_DEBUG_DEADLOCKS
+#  define __RW_LOCK_UNLOCKED(lockname) \
+	.wait_lock = _RAW_SPIN_LOCK_UNLOCKED, .save_state = 1 \
+	__PLIST_INIT((lockname).lock.lock) \
+	, .file = __FILE__, .line = __LINE__ \
+	__WAS_PREEMPT_OFF(1) \
+	__RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER
+#  define _RW_LOCK_UNLOCKED(lockname) \
+	(rwlock_t) { { { __RW_LOCK_UNLOCKED(lockname), .name = #lockname } } }
+#  define RW_LOCK_UNLOCKED(lockname) \
+	(rwlock_t) { { { __RW_LOCK_UNLOCKED(lockname) } } }
+# else
+#  define RW_LOCK_UNLOCKED(lockname) (rwlock_t) \
+	{ { { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED \
+	__PLIST_INIT(((lockname).lock.lock)) \
+	__RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER } } }
+#  define _RW_LOCK_UNLOCKED(lockname) RW_LOCK_UNLOCKED(lockname)
+# endif
+#else /* !PREEMPT_RT */
+  typedef raw_rwlock_t rwlock_t;
+# ifdef CONFIG_DEBUG_SPINLOCK
+# define _RW_LOCK_UNLOCKED(lockname)					\
+	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
+				.magic = RWLOCK_MAGIC,			\
+				.owner = SPINLOCK_OWNER_INIT,		\
+				.owner_cpu = -1 }
+# else
+#  define _RW_LOCK_UNLOCKED(lockname)					\
+	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED }
+# endif
+# define RW_LOCK_UNLOCKED(lockname)	_RW_LOCK_UNLOCKED(lockname)
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * spinlocks - an RT mutex plus lock-break field:
+ */
+typedef struct {
+	struct rt_mutex lock;
+	unsigned int break_lock;
+} spinlock_t;
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+# define __SPIN_LOCK_UNLOCKED(lockname) \
+	.wait_lock = _RAW_SPIN_LOCK_UNLOCKED \
+	__PLIST_INIT(((lockname).lock)) \
+	, .save_state = 1, .file = __FILE__, .line = __LINE__ \
+	__WAS_PREEMPT_OFF(1) \
+	__RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER
+# define _SPIN_LOCK_UNLOCKED(lockname) \
+	(spinlock_t) { { __SPIN_LOCK_UNLOCKED(lockname), .name = #lockname } }
+# define SPIN_LOCK_UNLOCKED(lockname) \
+	(spinlock_t) { { __SPIN_LOCK_UNLOCKED(lockname) } }
+#else
+# define SPIN_LOCK_UNLOCKED(lockname) \
+	(spinlock_t) { { .wait_lock = _RAW_SPIN_LOCK_UNLOCKED \
+	__PLIST_INIT(((lockname).lock)) \
+	__RT_MUTEX_DEBUG_RT_LOCKING_MODE_INITIALIZER } }
+# define _SPIN_LOCK_UNLOCKED(lockname) SPIN_LOCK_UNLOCKED(lockname)
+#endif
+#else /* !PREEMPT_RT */
+  typedef raw_spinlock_t spinlock_t;
+# ifdef CONFIG_DEBUG_SPINLOCK
+#  define _SPIN_LOCK_UNLOCKED(lockname)					\
+	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
+				.magic = SPINLOCK_MAGIC,		\
+				.owner = SPINLOCK_OWNER_INIT,		\
+				.owner_cpu = -1 }
+# else
+#  define _SPIN_LOCK_UNLOCKED(lockname) \
+	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED }
+# endif
+# define SPIN_LOCK_UNLOCKED(lockname) _SPIN_LOCK_UNLOCKED(lockname)
+#endif
+
+#define DEFINE_SPINLOCK(name) \
+	spinlock_t name __cacheline_aligned_in_smp = _SPIN_LOCK_UNLOCKED(name)
+
+#define DEFINE_RWLOCK(name) \
+	rwlock_t name __cacheline_aligned_in_smp = _RW_LOCK_UNLOCKED(name)
+
+#ifdef CONFIG_PREEMPT_RT
+
+/*
+ * Semaphores - an RT-mutex plus the semaphore count:
+ */
+struct semaphore {
+	atomic_t count;
+	struct rt_mutex lock;
+};
+
+#define DECLARE_MUTEX(name) \
+struct semaphore name = \
+	{ .count = { 1 }, .lock = __RT_MUTEX_INITIALIZER(name.lock) }
+
+/*
+ * DECLARE_MUTEX_LOCKED() is deprecated: very hard to initialize properly
+ * and it also often signals abuse of semaphores. So we redirect it to
+ * compat semaphores:
+ */
+#define DECLARE_MUTEX_LOCKED COMPAT_DECLARE_MUTEX_LOCKED
+
+extern void FASTCALL(__sema_init(struct semaphore *sem, int val, char *name, char *file, int line));
+
+#define rt_sema_init(sem, val) \
+		__sema_init(sem, val, #sem, __FILE__, __LINE__)
+
+extern void FASTCALL(__init_MUTEX(struct semaphore *sem, char *name, char *file, int line));
+#define rt_init_MUTEX(sem) \
+		__init_MUTEX(sem, #sem, __FILE__, __LINE__)
+
+extern void there_is_no_init_MUTEX_LOCKED_for_RT_semaphores(void);
+
+/* No locked initialization for RT semaphores: the macro below drops
+ * 'sem' and calls the marker function declared above. NOTE(review):
+ * presumably it is never defined, so any use fails at link time. */
+#define rt_init_MUTEX_LOCKED(sem) \
+		there_is_no_init_MUTEX_LOCKED_for_RT_semaphores()
+extern void FASTCALL(rt_down(struct semaphore *sem));
+extern int FASTCALL(rt_down_interruptible(struct semaphore *sem));
+extern int FASTCALL(rt_down_trylock(struct semaphore *sem));
+extern void FASTCALL(rt_up(struct semaphore *sem));
+extern int FASTCALL(rt_sem_is_locked(struct semaphore *sem));
+extern int FASTCALL(rt_sema_count(struct semaphore *sem));
+
+
+extern int __bad_func_type(void);
+/* TYPE_EQUAL(var, type): true when typeof(var) is compatible with 'type *'. */
+#undef TYPE_EQUAL
+#define TYPE_EQUAL(var, type) \
+		__builtin_types_compatible_p(typeof(var), type *)
+/* Call func1 or func2 on 'arg', selected by the static type of 'arg'. */
+#define PICK_FUNC_1ARG(type1, type2, func1, func2, arg)			\
+do {									\
+	if (TYPE_EQUAL((arg), type1))					\
+		func1((type1 *)(arg));					\
+	else if (TYPE_EQUAL((arg), type2))				\
+		func2((type2 *)(arg));					\
+	else __bad_func_type();	/* 'arg' matches neither type */	\
+} while (0)
+
+#define PICK_FUNC_1ARG_RET(type1, type2, func1, func2, arg)		\
+({									\
+	int __ret;	/* int like the wrapped calls: keeps -Exxx negative */ \
+	/* dispatch on the static type of 'arg'; value of ({ }) is __ret */ \
+	if (TYPE_EQUAL((arg), type1))					\
+		__ret = func1((type1 *)(arg));				\
+	else if (TYPE_EQUAL((arg), type2))				\
+		__ret = func2((type2 *)(arg));				\
+	else __ret = __bad_func_type();					\
+									\
+	__ret;								\
+})
+
+#define PICK_FUNC_2ARG(type1, type2, func1, func2, arg0, arg1)		\
+do {	/* two-argument form: only arg0's type selects the callee */	\
+	if (TYPE_EQUAL((arg0), type1))					\
+		func1((type1 *)(arg0), arg1);				\
+	else if (TYPE_EQUAL((arg0), type2))				\
+		func2((type2 *)(arg0), arg1);				\
+	else __bad_func_type();	/* arg0 matches neither type */		\
+} while (0)
+
+#define sema_init(sem, val) \
+	PICK_FUNC_2ARG(struct compat_semaphore, struct semaphore, \
+		compat_sema_init, rt_sema_init, sem, val)
+
+#define init_MUTEX(sem) \
+	PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \
+		compat_init_MUTEX, rt_init_MUTEX, sem)
+
+#define init_MUTEX_LOCKED(sem) \
+	PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \
+		compat_init_MUTEX_LOCKED, rt_init_MUTEX_LOCKED, sem)
+
+#define down(sem) \
+	PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \
+		compat_down, rt_down, sem)
+
+#define down_interruptible(sem) \
+	PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \
+		compat_down_interruptible, rt_down_interruptible, sem)
+
+#define down_trylock(sem) \
+	PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \
+		compat_down_trylock, rt_down_trylock, sem)
+
+#define up(sem) \
+	PICK_FUNC_1ARG(struct compat_semaphore, struct semaphore, \
+		compat_up, rt_up, sem)
+
+#define sem_is_locked(sem) \
+	PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \
+		compat_sem_is_locked, rt_sem_is_locked, sem)
+
+#define sema_count(sem) \
+	PICK_FUNC_1ARG_RET(struct compat_semaphore, struct semaphore, \
+		compat_sema_count, rt_sema_count, sem)
+
+/*
+ * rwsems:
+ */
+
+#define __RWSEM_INITIALIZER(lockname) \
+	{ .lock = __RT_MUTEX_INITIALIZER(lockname.lock) }
+
+#define DECLARE_RWSEM(lockname) \
+	struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname)
+
+extern void FASTCALL(__init_rwsem(struct rw_semaphore *rwsem, int mutex,
+				char *name, char *file, int line));
+
+#define rt_init_rwsem(sem) __init_rwsem(sem, 0, #sem, __FILE__, __LINE__)
+
+extern void FASTCALL(rt_down_read(struct rw_semaphore *rwsem));
+extern int FASTCALL(rt_down_read_trylock(struct rw_semaphore *rwsem));
+extern void FASTCALL(rt_down_write(struct rw_semaphore *rwsem));
+extern int FASTCALL(rt_down_write_trylock(struct rw_semaphore *rwsem));
+extern void FASTCALL(rt_up_read(struct rw_semaphore *rwsem));
+extern void FASTCALL(rt_up_write(struct rw_semaphore *rwsem));
+extern void FASTCALL(rt_downgrade_write(struct rw_semaphore *rwsem));
+extern int FASTCALL(rt_rwsem_is_locked(struct rw_semaphore *rwsem));
+
+#define init_rwsem(rwsem) \
+	PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_init_rwsem, rt_init_rwsem, rwsem)
+
+#define down_read(rwsem) \
+	PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_down_read, rt_down_read, rwsem)
+
+#define down_read_trylock(rwsem) \
+	PICK_FUNC_1ARG_RET(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_down_read_trylock, rt_down_read_trylock, rwsem)
+
+#define down_write(rwsem) \
+	PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_down_write, rt_down_write, rwsem)
+
+#define down_write_trylock(rwsem) \
+	PICK_FUNC_1ARG_RET(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_down_write_trylock, rt_down_write_trylock, rwsem)
+
+#define up_read(rwsem) \
+	PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_up_read, rt_up_read, rwsem)
+
+#define up_write(rwsem) \
+	PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_up_write, rt_up_write, rwsem)
+
+#define downgrade_write(rwsem) \
+	PICK_FUNC_1ARG(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_downgrade_write, rt_downgrade_write, rwsem)
+
+#define rwsem_is_locked(rwsem) \
+	PICK_FUNC_1ARG_RET(struct compat_rw_semaphore, struct rw_semaphore, \
+		compat_rwsem_is_locked, rt_rwsem_is_locked, rwsem)
+
+#endif /* CONFIG_PREEMPT_RT */
+
+struct semaphore;	/* this header defines it only under CONFIG_PREEMPT_RT; down_futex() needs the tag */
+
+extern void FASTCALL(up_futex(struct rt_mutex *lock));
+extern int FASTCALL(down_futex(struct rt_mutex *lock, unsigned long time, pid_t owner_pid, struct semaphore *sem));
+extern int FASTCALL(rt_mutex_owned_by(struct rt_mutex *lock, struct thread_info *t));
+extern int FASTCALL(rt_mutex_has_waiters(struct rt_mutex *lock));
+extern struct thread_info *FASTCALL(rt_mutex_owner(struct rt_mutex *lock));
+extern void FASTCALL(init_rt_mutex(struct rt_mutex *lock, int save_state,
+				   char *name, char *file, int line));
+
+#endif /* __LINUX_RT_LOCK_H */
+
Index: linux/include/linux/rwsem-spinlock.h
===================================================================
--- linux.orig/include/linux/rwsem-spinlock.h
+++ linux/include/linux/rwsem-spinlock.h
@@ -28,7 +28,7 @@ struct rwsem_waiter;
  * - if activity is -1 then there is one active writer
  * - if wait_list is not empty, then there are processes waiting for the semaphore
  */
-struct rw_semaphore {
+struct compat_rw_semaphore {
 	__s32			activity;
 	spinlock_t		wait_lock;
 	struct list_head	wait_list;
@@ -46,20 +46,20 @@ struct rw_semaphore {
 #define __RWSEM_DEBUG_INIT	/* */
 #endif
 
-#define __RWSEM_INITIALIZER(name) \
-{ 0, SPIN_LOCK_UNLOCKED, LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
+#define __COMPAT_RWSEM_INITIALIZER(name) \
+{ 0, SPIN_LOCK_UNLOCKED((name).wait_lock), LIST_HEAD_INIT((name).wait_list) __RWSEM_DEBUG_INIT }
 
-#define DECLARE_RWSEM(name) \
-	struct rw_semaphore name = __RWSEM_INITIALIZER(name)
+#define COMPAT_DECLARE_RWSEM(name) \
+	struct compat_rw_semaphore name = __COMPAT_RWSEM_INITIALIZER(name)
 
-extern void FASTCALL(init_rwsem(struct rw_semaphore *sem));
-extern void FASTCALL(__down_read(struct rw_semaphore *sem));
-extern int FASTCALL(__down_read_trylock(struct rw_semaphore *sem));
-extern void FASTCALL(__down_write(struct rw_semaphore *sem));
-extern int FASTCALL(__down_write_trylock(struct rw_semaphore *sem));
-extern void FASTCALL(__up_read(struct rw_semaphore *sem));
-extern void FASTCALL(__up_write(struct rw_semaphore *sem));
-extern void FASTCALL(__downgrade_write(struct rw_semaphore *sem));
+extern void FASTCALL(compat_init_rwsem(struct compat_rw_semaphore *sem));
+extern void FASTCALL(__down_read(struct compat_rw_semaphore *sem));
+extern int FASTCALL(__down_read_trylock(struct compat_rw_semaphore *sem));
+extern void FASTCALL(__down_write(struct compat_rw_semaphore *sem));
+extern int FASTCALL(__down_write_trylock(struct compat_rw_semaphore *sem));
+extern void FASTCALL(__up_read(struct compat_rw_semaphore *sem));
+extern void FASTCALL(__up_write(struct compat_rw_semaphore *sem));
+extern void FASTCALL(__downgrade_write(struct compat_rw_semaphore *sem));
 
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_SPINLOCK_H */
Index: linux/include/linux/rwsem.h
===================================================================
--- linux.orig/include/linux/rwsem.h
+++ linux/include/linux/rwsem.h
@@ -9,6 +9,10 @@
 
 #include <linux/linkage.h>
 
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#endif
+
 #define RWSEM_DEBUG 0
 
 #ifdef __KERNEL__
@@ -19,17 +23,29 @@
 #include <asm/system.h>
 #include <asm/atomic.h>
 
-struct rw_semaphore;
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * On !PREEMPT_RT all rw-semaphores are compat:
+ */
+#define compat_rw_semaphore rw_semaphore
+#endif
+
+struct compat_rw_semaphore;
+
 
 #ifdef CONFIG_RWSEM_GENERIC_SPINLOCK
-#include <linux/rwsem-spinlock.h> /* use a generic implementation */
+# include <linux/rwsem-spinlock.h> /* use a generic implementation */
+# ifndef CONFIG_PREEMPT_RT	/* !RT: the plain rwsem names alias the compat ones */
+#  define __RWSEM_INITIALIZER __COMPAT_RWSEM_INITIALIZER
+#  define DECLARE_RWSEM COMPAT_DECLARE_RWSEM
+# endif /* !CONFIG_PREEMPT_RT */
 #else
-#include <asm/rwsem.h> /* use an arch-specific implementation */
+# include <asm/rwsem.h> /* use an arch-specific implementation */
 #endif
 
 #ifndef rwsemtrace
 #if RWSEM_DEBUG
-extern void FASTCALL(rwsemtrace(struct rw_semaphore *sem, const char *str));
+extern void FASTCALL(rwsemtrace(struct compat_rw_semaphore *sem, const char *str));
 #else
 #define rwsemtrace(SEM,FMT)
 #endif
@@ -38,7 +54,7 @@ extern void FASTCALL(rwsemtrace(struct r
 /*
  * lock for reading
  */
-static inline void down_read(struct rw_semaphore *sem)
+static inline void compat_down_read(struct compat_rw_semaphore *sem)
 {
 	might_sleep();
 	rwsemtrace(sem,"Entering down_read");
@@ -49,7 +65,7 @@ static inline void down_read(struct rw_s
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-static inline int down_read_trylock(struct rw_semaphore *sem)
+static inline int compat_down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	int ret;
 	rwsemtrace(sem,"Entering down_read_trylock");
@@ -61,7 +77,7 @@ static inline int down_read_trylock(stru
 /*
  * lock for writing
  */
-static inline void down_write(struct rw_semaphore *sem)
+static inline void compat_down_write(struct compat_rw_semaphore *sem)
 {
 	might_sleep();
 	rwsemtrace(sem,"Entering down_write");
@@ -72,7 +88,7 @@ static inline void down_write(struct rw_
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-static inline int down_write_trylock(struct rw_semaphore *sem)
+static inline int compat_down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	int ret;
 	rwsemtrace(sem,"Entering down_write_trylock");
@@ -84,7 +100,7 @@ static inline int down_write_trylock(str
 /*
  * release a read lock
  */
-static inline void up_read(struct rw_semaphore *sem)
+static inline void compat_up_read(struct compat_rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering up_read");
 	__up_read(sem);
@@ -94,7 +110,7 @@ static inline void up_read(struct rw_sem
 /*
  * release a write lock
  */
-static inline void up_write(struct rw_semaphore *sem)
+static inline void compat_up_write(struct compat_rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering up_write");
 	__up_write(sem);
@@ -104,12 +120,50 @@ static inline void up_write(struct rw_se
 /*
  * downgrade write lock to read lock
  */
-static inline void downgrade_write(struct rw_semaphore *sem)
+static inline void compat_downgrade_write(struct compat_rw_semaphore *sem)
 {
 	rwsemtrace(sem,"Entering downgrade_write");
 	__downgrade_write(sem);
 	rwsemtrace(sem,"Leaving downgrade_write");
 }
 
+#ifndef CONFIG_PREEMPT_RT
+/* !PREEMPT_RT: compat_rw_semaphore is #defined to rw_semaphore, so the generic API just forwards to compat_*(). */
+#define DECLARE_RWSEM COMPAT_DECLARE_RWSEM
+
+static inline void init_rwsem(struct compat_rw_semaphore *rwsem)
+{
+	compat_init_rwsem(rwsem);
+}
+static inline void down_read(struct compat_rw_semaphore *rwsem)
+{
+	compat_down_read(rwsem);
+}
+static inline int down_read_trylock(struct compat_rw_semaphore *rwsem)
+{
+	return compat_down_read_trylock(rwsem);
+}
+static inline void down_write(struct compat_rw_semaphore *rwsem)
+{
+	compat_down_write(rwsem);
+}
+static inline int down_write_trylock(struct compat_rw_semaphore *rwsem)
+{
+	return compat_down_write_trylock(rwsem);
+}
+static inline void up_read(struct compat_rw_semaphore *rwsem)
+{
+	compat_up_read(rwsem);
+}
+static inline void up_write(struct compat_rw_semaphore *rwsem)
+{
+	compat_up_write(rwsem);
+}
+static inline void downgrade_write(struct compat_rw_semaphore *rwsem)
+{
+	compat_downgrade_write(rwsem);
+}
+#endif /* !CONFIG_PREEMPT_RT */
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_RWSEM_H */
Index: linux/include/linux/sched.h
===================================================================
--- linux.orig/include/linux/sched.h
+++ linux/include/linux/sched.h
@@ -34,9 +34,166 @@
 #include <linux/percpu.h>
 #include <linux/topology.h>
 #include <linux/seccomp.h>
+#include <linux/rcupdate.h>
 
 #include <linux/auxvec.h>	/* For AT_VECTOR_SIZE */
 
+#ifdef CONFIG_PREEMPT
+extern int kernel_preemption;
+#else
+# define kernel_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+extern int voluntary_preemption;
+#else
+# define voluntary_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+extern int softirq_preemption;
+#else
+# define softirq_preemption 0
+#endif
+#ifdef CONFIG_PREEMPT_HARDIRQS
+extern int hardirq_preemption;
+#else
+# define hardirq_preemption 0
+#endif
+
+#ifdef CONFIG_PREEMPT_BKL
+extern struct semaphore kernel_sem;
+#endif
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+extern int debug_direct_keyboard;
+#else
+# define debug_direct_keyboard 0
+#endif
+
+#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPT_RT)
+extern int check_locking_preempt_off(struct task_struct *p);
+extern void check_preempt_wakeup(struct task_struct * p);
+#else
+#define check_locking_preempt_off(x)		0
+#define check_preempt_wakeup(p)			do { } while (0)
+#endif
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+  extern void deadlock_trace_off(void);
+  extern void show_held_locks(struct task_struct *filter);
+  extern void check_no_held_locks(struct task_struct *task);
+  extern void show_all_locks(void);
+#else
+# define deadlock_trace_off()			do { } while (0)
+# define show_held_locks(p)			do { } while (0)
+# define check_no_held_locks(task)		do { } while (0)
+# define show_all_locks()			do { } while (0)
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE)
+  extern void print_traces(struct task_struct *task);
+#else
+# define print_traces(task)			do { } while (0)
+#endif
+
+#ifdef CONFIG_FRAME_POINTER
+# ifndef CONFIG_ARM
+#  define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+#  define CALLER_ADDR1 ((unsigned long)__builtin_return_address(1))
+#  define CALLER_ADDR2 ((unsigned long)__builtin_return_address(2))
+#  define CALLER_ADDR3 ((unsigned long)__builtin_return_address(3))
+# else
+   extern unsigned long arm_return_addr(int level);
+#  define CALLER_ADDR0 arm_return_addr(0)
+#  define CALLER_ADDR1 arm_return_addr(1)
+#  define CALLER_ADDR2 arm_return_addr(2)
+#  define CALLER_ADDR3 arm_return_addr(3)
+#endif /* closes the inner "# ifndef CONFIG_ARM" */
+#else /* !CONFIG_FRAME_POINTER: levels 1-3 are stubbed to 0UL */
+# define CALLER_ADDR0 ((unsigned long)__builtin_return_address(0))
+# define CALLER_ADDR1 0UL
+# define CALLER_ADDR2 0UL
+# define CALLER_ADDR3 0UL
+#endif /* CONFIG_FRAME_POINTER */
+
+#ifdef CONFIG_MCOUNT
+  extern void notrace mcount(void);
+#else
+# define mcount() do { } while (0)
+#endif
+
+#ifdef CONFIG_LATENCY_TRACE
+  extern int mcount_enabled, trace_enabled, trace_user_triggered,
+		trace_user_trigger_irq, trace_freerunning, trace_verbose,
+		trace_print_at_crash, trace_all_cpus;
+  extern void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3);
+  extern void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2);
+  extern void notrace trace_special_u64(unsigned long long v1, unsigned long v2);
+  extern void stop_trace(void);
+  extern void print_last_trace(void);
+  extern void nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags);
+  extern long user_trace_start(void);
+  extern long user_trace_stop(void);
+  extern void trace_cmdline(void);
+#else /* !CONFIG_LATENCY_TRACE; NOTE(review): no stubs for trace_user_trigger_irq / trace_print_at_crash */
+# define mcount_enabled				0
+# define trace_enabled				0
+# define trace_user_triggered			0
+# define trace_freerunning			0
+# define trace_all_cpus				0
+# define trace_verbose				0
+# define trace_special(v1,v2,v3)		do { } while (0)
+# define trace_special_pid(pid,v1,v2)		do { } while (0)
+# define trace_special_u64(v1,v2)		do { } while (0)
+# define stop_trace()				do { } while (0)
+# define print_last_trace()			do { } while (0)
+# define nmi_trace(eip, parent_eip, flags)	do { } while (0)
+# define user_trace_start()			do { } while (0)	/* NOTE(review): real version returns long; stub has no value */
+# define user_trace_stop()			do { } while (0)	/* NOTE(review): real version returns long; stub has no value */
+# define trace_cmdline()			do { } while (0)
+#endif /* CONFIG_LATENCY_TRACE */
+
+extern int timeofday_API_hacks(void *tv, void *tz);
+
+#ifdef CONFIG_WAKEUP_TIMING
+  extern int wakeup_timing;
+  extern void __trace_start_sched_wakeup(struct task_struct *p);
+  extern void trace_stop_sched_switched(struct task_struct *p);
+  extern void trace_change_sched_cpu(struct task_struct *p, int new_cpu);
+#else
+# define wakeup_timing 0
+# define __trace_start_sched_wakeup(p)		do { } while (0)
+# define trace_stop_sched_switched(p)		do { } while (0)
+# define trace_change_sched_cpu(p, cpu)		do { } while (0)
+#endif
+
+// #define PREEMPT_DIRECT
+
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+extern int preempt_locks_user;
+extern void propagate_preempt_locks_value(void);
+#else
+# define propagate_preempt_locks_value() do { } while (0)
+#endif
+
+#ifdef CONFIG_X86_LOCAL_APIC
+extern void nmi_show_all_regs(void);
+#else
+# define nmi_show_all_regs() do { } while (0)
+#endif
+
+#include <linux/smp.h>
+#include <linux/sem.h>
+#include <linux/signal.h>
+#include <linux/securebits.h>
+#include <linux/fs_struct.h>
+#include <linux/compiler.h>
+#include <linux/completion.h>
+#include <linux/pid.h>
+#include <linux/percpu.h>
+#include <linux/topology.h>
+#include <linux/seccomp.h>
+
 struct exec_domain;
 
 /*
@@ -104,6 +261,7 @@ extern unsigned long nr_iowait(void);
 #include <linux/param.h>
 #include <linux/resource.h>
 #include <linux/timer.h>
+#include <linux/ktimer.h>
 
 #include <asm/processor.h>
 
@@ -118,15 +276,16 @@ extern unsigned long nr_iowait(void);
  * mistake.
  */
 #define TASK_RUNNING		0
-#define TASK_INTERRUPTIBLE	1
-#define TASK_UNINTERRUPTIBLE	2
-#define TASK_STOPPED		4
-#define TASK_TRACED		8
+#define TASK_RUNNING_MUTEX	1
+#define TASK_INTERRUPTIBLE	2
+#define TASK_UNINTERRUPTIBLE	4
+#define TASK_STOPPED		8
+#define TASK_TRACED		16
 /* in tsk->exit_state */
-#define EXIT_ZOMBIE		16
-#define EXIT_DEAD		32
+#define EXIT_ZOMBIE		32
+#define EXIT_DEAD		64
 /* in tsk->state again */
-#define TASK_NONINTERACTIVE	64
+#define TASK_NONINTERACTIVE	128
 
 #define __set_task_state(tsk, state_value)		\
 	do { (tsk)->state = (state_value); } while (0)
@@ -203,11 +362,12 @@ extern void update_process_times(int use
 extern void scheduler_tick(void);
 
 #ifdef CONFIG_DETECT_SOFTLOCKUP
-extern void softlockup_tick(struct pt_regs *regs);
+extern void softlockup_tick(void);
 extern void spawn_softlockup_task(void);
 extern void touch_softlockup_watchdog(void);
+extern void touch_light_softlockup_watchdog(void);
 #else
-static inline void softlockup_tick(struct pt_regs *regs)
+static inline void softlockup_tick(void)
 {
 }
 static inline void spawn_softlockup_task(void)
@@ -216,9 +376,11 @@ static inline void spawn_softlockup_task
 static inline void touch_softlockup_watchdog(void)
 {
 }
+static inline void touch_light_softlockup_watchdog(void)
+{
+}
 #endif
 
-
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
 /* Is this address in the __sched functions? */
@@ -229,6 +391,11 @@ extern signed long FASTCALL(schedule_tim
 extern signed long schedule_timeout_interruptible(signed long timeout);
 extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
+/*
+ * This one can be called with interrupts disabled, only
+ * to be used by lowlevel arch code!
+ */
+extern void __sched __schedule(void);
 
 struct namespace;
 
@@ -297,6 +464,9 @@ struct mm_struct {
 	/* Architecture-specific MM context */
 	mm_context_t context;
 
+	/* realtime bits */
+	struct list_head	delayed_drop;
+
 	/* Token based thrashing protection. */
 	unsigned long swap_token_time;
 	char recent_pagein;
@@ -319,8 +489,16 @@ struct sighand_struct {
 	atomic_t		count;
 	struct k_sigaction	action[_NSIG];
 	spinlock_t		siglock;
+	struct rcu_head		rcu;
 };
 
+static inline void sighand_free(struct sighand_struct *sp)
+{
+	extern void sighand_free_cb(struct rcu_head *rhp);
+	/* Hand sp's embedded rcu_head to call_rcu() with sighand_free_cb as the callback. */
+	call_rcu(&sp->rcu, sighand_free_cb);
+}
+
 /*
  * NOTE! "signal_struct" does not have it's own
  * locking, because a shared signal_struct always
@@ -358,8 +536,7 @@ struct signal_struct {
 	struct list_head posix_timers;
 
 	/* ITIMER_REAL timer for the process */
-	struct timer_list real_timer;
-	unsigned long it_real_value, it_real_incr;
+	struct ktimer real_timer;
 
 	/* ITIMER_PROF and ITIMER_VIRTUAL timers for the process */
 	cputime_t it_prof_expires, it_virt_expires;
@@ -445,7 +622,8 @@ struct signal_struct {
 
 #define MAX_PRIO		(MAX_RT_PRIO + 40)
 
-#define rt_task(p)		(unlikely((p)->prio < MAX_RT_PRIO))
+#define rt_prio(prio)		((prio) < MAX_RT_PRIO)
+#define rt_task(p)		(unlikely(rt_prio((p)->prio)))
 
 /*
  * Some day this will be a full-fledged user tracking system..
@@ -651,7 +829,7 @@ struct task_struct {
 #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
 	int oncpu;
 #endif
-	int prio, static_prio;
+	int prio, static_prio, normal_prio;
 	struct list_head run_list;
 	prio_array_t *array;
 
@@ -666,6 +844,11 @@ struct task_struct {
 	cpumask_t cpus_allowed;
 	unsigned int time_slice, first_time_slice;
 
+#ifdef CONFIG_PREEMPT_RCU
+	int rcu_read_lock_nesting;
+	atomic_t *rcu_flipctr1;
+	atomic_t *rcu_flipctr2;
+#endif
 #ifdef CONFIG_SCHEDSTATS
 	struct sched_info sched_info;
 #endif
@@ -776,6 +959,41 @@ struct task_struct {
 /* Protection of proc_dentry: nesting proc_lock, dcache_lock, write_lock_irq(&tasklist_lock); */
 	spinlock_t proc_lock;
 
+#define MAX_PREEMPT_TRACE 25
+
+#ifdef CONFIG_PREEMPT_TRACE
+	unsigned long preempt_trace_eip[MAX_PREEMPT_TRACE];
+	unsigned long preempt_trace_parent_eip[MAX_PREEMPT_TRACE];
+#endif
+
+#define MAX_LOCK_STACK	MAX_PREEMPT_TRACE
+#ifdef CONFIG_DEBUG_PREEMPT
+	int lock_count;
+# ifdef CONFIG_PREEMPT_RT
+	struct rt_mutex *owned_lock[MAX_LOCK_STACK];
+# endif
+#endif
+#ifdef CONFIG_DETECT_SOFTLOCKUP
+	unsigned long	softlockup_count; /* Count to keep track how long the
+					   *  thread is in the kernel without
+					   *  sleeping.
+					   */
+#endif
+	/* realtime bits */
+	struct list_head delayed_put;
+	struct plist pi_waiters;
+
+	/* RT deadlock detection and priority inheritance handling */
+	struct rt_mutex_waiter *blocked_on;
+	struct rt_mutex *pending_owner;
+	raw_spinlock_t pi_lock;
+	unsigned long rt_flags;
+
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	void *last_kernel_lock;
+#endif
+
 /* journalling filesystem info */
 	void *journal_info;
 
@@ -813,6 +1031,7 @@ struct task_struct {
 	int cpuset_mems_generation;
 #endif
 	atomic_t fs_excl;	/* holding fs exclusive resources */
+	struct rcu_head rcu;
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
@@ -836,8 +1055,29 @@ static inline int pid_alive(struct task_
 extern void free_task(struct task_struct *tsk);
 extern void __put_task_struct(struct task_struct *tsk);
 #define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
-#define put_task_struct(tsk) \
-do { if (atomic_dec_and_test(&(tsk)->usage)) __put_task_struct(tsk); } while(0)
+
+static inline int get_task_struct_rcu(struct task_struct *t)
+{
+	/* Take a reference unless usage is already zero; 1 on success, 0 otherwise. */
+	for (;;) {
+		int seen = atomic_read(&t->usage);
+
+		if (!seen)
+			return 0;
+		/* go around again if the counter changed since it was read */
+		if (cmpxchg(&t->usage.counter, seen, seen + 1) == seen)
+			return 1;
+	}
+}
+
+extern void __put_task_struct_cb(struct rcu_head *rhp);
+
+static inline void put_task_struct(struct task_struct *t)
+{
+	if (!atomic_dec_and_test(&t->usage))
+		return;		/* usage has not reached zero yet */
+	call_rcu(&t->rcu, __put_task_struct_cb);	/* final put is handed to call_rcu() */
+}
 
 /*
  * Per process flags
@@ -864,6 +1104,10 @@ do { if (atomic_dec_and_test(&(tsk)->usa
 #define PF_SYNCWRITE	0x00200000	/* I am doing a sync write */
 #define PF_BORROWED_MM	0x00400000	/* I am a kthread doing use_mm */
 #define PF_RANDOMIZE	0x00800000	/* randomize virtual address space */
+#define PF_SOFTIRQ	0x01000000	/* softirq context */
+#define PF_HARDIRQ	0x02000000	/* hardirq context */
+#define PF_NOSCHED	0x04000000	/* no voluntary scheduling */
+#define PF_IRQSOFF	0x08000000	/* soft IRQs-off flag */
 
 /*
  * Only the _current_ task can read/write to tsk->flags, but other
@@ -929,7 +1173,12 @@ extern task_t *idle_task(int cpu);
 extern task_t *curr_task(int cpu);
 extern void set_curr_task(int cpu, task_t *p);
 
+extern void mutex_setprio(task_t *p, int prio);
+extern void pi_changeprio(task_t *p, int prio);
+extern int normal_prio(task_t *p);
+
 void yield(void);
+void __yield(void);
 
 /*
  * The default (Linux) execution domain.
@@ -977,6 +1226,9 @@ extern void do_timer(struct pt_regs *);
 
 extern int FASTCALL(wake_up_state(struct task_struct * tsk, unsigned int state));
 extern int FASTCALL(wake_up_process(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process_mutex(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process_sync(struct task_struct * tsk));
+extern int FASTCALL(wake_up_process_mutex_sync(struct task_struct * tsk));
 extern void FASTCALL(wake_up_new_task(struct task_struct * tsk,
 						unsigned long clone_flags));
 #ifdef CONFIG_SMP
@@ -1074,12 +1326,20 @@ extern struct mm_struct * mm_alloc(void)
 
 /* mmdrop drops the mm and the page tables */
 extern void FASTCALL(__mmdrop(struct mm_struct *));
+extern void FASTCALL(__mmdrop_delayed(struct mm_struct *));
+
 static inline void mmdrop(struct mm_struct * mm)
 {
 	if (atomic_dec_and_test(&mm->mm_count))
 		__mmdrop(mm);
 }
 
+static inline void mmdrop_delayed(struct mm_struct * mm)
+{
+	if (atomic_dec_and_test(&mm->mm_count))
+		__mmdrop_delayed(mm);
+}
+
 /* mmput gets rid of the mappings and all user-space */
 extern void mmput(struct mm_struct *);
 /* Grab a reference to a task's mm, if it is not already going away */
@@ -1210,6 +1470,11 @@ static inline int test_tsk_thread_flag(s
 	return test_ti_thread_flag(tsk->thread_info,flag);
 }
 
+static inline int signal_pending(struct task_struct *p)
+{
+	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
+}
+
 static inline void set_tsk_need_resched(struct task_struct *tsk)
 {
 	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
@@ -1220,48 +1485,97 @@ static inline void clear_tsk_need_resche
 	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
 }
 
-static inline int signal_pending(struct task_struct *p)
+static inline int _need_resched(void)
 {
-	return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
+	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
 }
-  
+
 static inline int need_resched(void)
 {
-	return unlikely(test_thread_flag(TIF_NEED_RESCHED));
+	touch_critical_timing();
+	return _need_resched();
 }
 
-/*
- * cond_resched() and cond_resched_lock(): latency reduction via
- * explicit rescheduling in places that are safe. The return
- * value indicates whether a reschedule was done in fact.
- * cond_resched_lock() will drop the spinlock before scheduling,
- * cond_resched_softirq() will enable bhs before scheduling.
- */
-extern int cond_resched(void);
-extern int cond_resched_lock(spinlock_t * lock);
-extern int cond_resched_softirq(void);
+static inline void set_tsk_need_resched_delayed(struct task_struct *tsk)
+{
+	set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED);
+}
+
+static inline void clear_tsk_need_resched_delayed(struct task_struct *tsk)
+{
+	clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_DELAYED);
+}
+
+static inline int need_resched_delayed(void)
+{
+	return unlikely(test_thread_flag(TIF_NEED_RESCHED_DELAYED));
+}
 
 /*
  * Does a critical section need to be broken due to another
  * task waiting?:
  */
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
-# define need_lockbreak(lock) ((lock)->break_lock)
+#if (defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)) || defined(CONFIG_PREEMPT_RT)
+# define need_lockbreak(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; })
 #else
 # define need_lockbreak(lock) 0
 #endif
 
+#if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
+# define need_lockbreak_raw(lock) ({ int __need = ((lock)->break_lock); if (__need) (lock)->break_lock = 0; __need; })
+#else
+# define need_lockbreak_raw(lock) 0
+#endif
+
 /*
  * Does a critical section need to be broken due to another
  * task waiting or preemption being signalled:
  */
-static inline int lock_need_resched(spinlock_t *lock)
+#define lock_need_resched(lock) \
+	unlikely(need_lockbreak(lock) || need_resched())
+
+static inline int softirq_need_resched(void)
 {
-	if (need_lockbreak(lock) || need_resched())
-		return 1;
+	if (softirq_preemption)
+		return need_resched();
 	return 0;
 }
 
+static inline int hardirq_need_resched(void)
+{
+	if (current->flags & PF_HARDIRQ)
+		return need_resched();
+	return 0;
+}
+
+/*
+ * cond_resched() and cond_resched_lock(): latency reduction via
+ * explicit rescheduling in places that are safe. The return
+ * value indicates whether a reschedule was done in fact.
+ * cond_resched_lock() will drop the spinlock before scheduling,
+ * cond_resched_softirq() will enable bhs before scheduling.
+ */
+extern int cond_resched(void);
+extern int __cond_resched_raw_spinlock(raw_spinlock_t *lock);
+extern int __cond_resched_spinlock(spinlock_t *spinlock);
+
+#define cond_resched_lock(lock) \
+({								\
+	int __ret;						\
+	/* dispatch on the pointer type of 'lock' */		\
+	if (TYPE_EQUAL((lock), raw_spinlock_t))	 		\
+		__ret = __cond_resched_raw_spinlock((raw_spinlock_t *)(lock));\
+	else if (TYPE_EQUAL((lock), spinlock_t))		\
+		__ret = __cond_resched_spinlock((spinlock_t *)(lock)); \
+	else __ret = __bad_spinlock_type();			\
+								\
+	__ret;							\
+})
+
+extern int cond_resched_softirq(void);
+extern int cond_resched_hardirq(void);
+extern int cond_resched_all(void);
+
 /* Reevaluate whether the task has signals pending delivery.
    This is required every time the blocked sigset_t changes.
    callers must hold sighand->siglock.  */
@@ -1283,6 +1597,7 @@ static inline unsigned int task_cpu(cons
 
 static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
 {
+	trace_change_sched_cpu(p, cpu);
 	p->thread_info->cpu = cpu;
 }
 
Index: linux/include/linux/semaphore.h
===================================================================
--- /dev/null
+++ linux/include/linux/semaphore.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_SEMAPHORE_H
+#define _LINUX_SEMAPHORE_H
+
+#include <linux/config.h>
+
+#ifdef CONFIG_PREEMPT_RT
+# include <linux/rt_lock.h>
+#else
+
+#define DECLARE_MUTEX COMPAT_DECLARE_MUTEX
+#define DECLARE_MUTEX_LOCKED COMPAT_DECLARE_MUTEX_LOCKED
+
+static inline void sema_init(struct compat_semaphore *sem, int val)
+{
+	compat_sema_init(sem, val);
+}
+static inline void init_MUTEX(struct compat_semaphore *sem)
+{
+	compat_init_MUTEX(sem);
+}
+static inline void init_MUTEX_LOCKED(struct compat_semaphore *sem)
+{
+	compat_init_MUTEX_LOCKED(sem);
+}
+static inline void down(struct compat_semaphore *sem)
+{
+	compat_down(sem);
+}
+static inline int down_interruptible(struct compat_semaphore *sem)
+{
+	return compat_down_interruptible(sem);
+}
+static inline int down_trylock(struct compat_semaphore *sem)
+{
+	return compat_down_trylock(sem);
+}
+static inline void up(struct compat_semaphore *sem)
+{
+	compat_up(sem);
+}
+static inline int sem_is_locked(struct compat_semaphore *sem)
+{
+	return compat_sem_is_locked(sem);
+}
+static inline int sema_count(struct compat_semaphore *sem)
+{
+	return compat_sema_count(sem);
+}
+
+#endif /* CONFIG_PREEMPT_RT */
+
+#endif /* _LINUX_SEMAPHORE_H */
Index: linux/include/linux/seqlock.h
===================================================================
--- linux.orig/include/linux/seqlock.h
+++ linux/include/linux/seqlock.h
@@ -29,39 +29,63 @@
 #include <linux/config.h>
 #include <linux/spinlock.h>
 #include <linux/preempt.h>
+#include <linux/rt_irq.h>
 
 typedef struct {
 	unsigned sequence;
 	spinlock_t lock;
-} seqlock_t;
+} __seqlock_t;
+
+typedef struct {
+	unsigned sequence;
+	raw_spinlock_t lock;
+} __raw_seqlock_t;
+
+#define seqlock_need_resched(seq) lock_need_resched(&(seq)->lock)
+
+#ifdef CONFIG_PREEMPT_RT
+typedef __seqlock_t seqlock_t;
+#else
+typedef __raw_seqlock_t seqlock_t;
+#endif
+
+typedef __raw_seqlock_t raw_seqlock_t;
 
 /*
  * These macros triggered gcc-3.x compile-time problems.  We think these are
  * OK now.  Be cautious.
  */
-#define SEQLOCK_UNLOCKED { 0, SPIN_LOCK_UNLOCKED }
-#define seqlock_init(x)	do { *(x) = (seqlock_t) SEQLOCK_UNLOCKED; } while (0)
-
+#ifdef CONFIG_PREEMPT_RT
+#define SEQLOCK_UNLOCKED(name) { 0, SPIN_LOCK_UNLOCKED((name).lock) }
+#else
+#define SEQLOCK_UNLOCKED(name) { 0, RAW_SPIN_LOCK_UNLOCKED }
+#endif
+
+#define seqlock_init(x)	do { (x)->sequence = 0; spin_lock_init(&(x)->lock); } while (0)
+
+#define RAW_SEQLOCK_UNLOCKED { 0, RAW_SPIN_LOCK_UNLOCKED }
+#define raw_seqlock_init(x) \
+		do { *(x) = (raw_seqlock_t) RAW_SEQLOCK_UNLOCKED; } while (0)
 
 /* Lock out other writers and update the count.
  * Acts like a normal spin_lock/unlock.
  * Don't need preempt_disable() because that is in the spin_lock already.
  */
-static inline void write_seqlock(seqlock_t *sl)
+static inline void __write_seqlock(seqlock_t *sl)
 {
 	spin_lock(&sl->lock);
 	++sl->sequence;
 	smp_wmb();			
 }	
 
-static inline void write_sequnlock(seqlock_t *sl) 
+static inline void __write_sequnlock(seqlock_t *sl)
 {
 	smp_wmb();
 	sl->sequence++;
 	spin_unlock(&sl->lock);
 }
 
-static inline int write_tryseqlock(seqlock_t *sl)
+static inline int __write_tryseqlock(seqlock_t *sl)
 {
 	int ret = spin_trylock(&sl->lock);
 
@@ -73,7 +97,7 @@ static inline int write_tryseqlock(seqlo
 }
 
 /* Start of read calculation -- fetch last complete writer token */
-static inline unsigned read_seqbegin(const seqlock_t *sl)
+static inline unsigned __read_seqbegin(const seqlock_t *sl)
 {
 	unsigned ret = sl->sequence;
 	smp_rmb();
@@ -88,13 +112,126 @@ static inline unsigned read_seqbegin(con
  *    
  * Using xor saves one conditional branch.
  */
-static inline int read_seqretry(const seqlock_t *sl, unsigned iv)
+static inline int __read_seqretry(seqlock_t *sl, unsigned iv)
+{
+	int ret;
+
+	smp_rmb();
+	ret = (iv & 1) | (sl->sequence ^ iv);
+	/*
+	 * If invalid then serialize with the writer, to make sure we
+	 * are not livelocking it:
+	 */
+	if (unlikely(ret)) {
+		unsigned long flags;
+		spin_lock_irqsave(&sl->lock, flags);
+		spin_unlock_irqrestore(&sl->lock, flags);
+	}
+	return ret;
+}
+
+static inline void __write_seqlock_raw(raw_seqlock_t *sl)
+{
+	spin_lock(&sl->lock);
+	++sl->sequence;
+	smp_wmb();
+}
+
+static inline void __write_sequnlock_raw(raw_seqlock_t *sl)
+{
+	smp_wmb();
+	sl->sequence++;
+	spin_unlock(&sl->lock);
+}
+
+static inline int __write_tryseqlock_raw(raw_seqlock_t *sl)
+{
+	int ret = spin_trylock(&sl->lock);
+
+	if (ret) {
+		++sl->sequence;
+		smp_wmb();
+	}
+	return ret;
+}
+
+static inline unsigned __read_seqbegin_raw(const raw_seqlock_t *sl)
+{
+	unsigned ret = sl->sequence;
+	smp_rmb();
+	return ret;
+}
+
+static inline int __read_seqretry_raw(const raw_seqlock_t *sl, unsigned iv)
 {
 	smp_rmb();
 	return (iv & 1) | (sl->sequence ^ iv);
 }
 
 
+extern int __bad_seqlock_type(void);
+
+#define PICK_SEQOP(op, lock)					\
+do {								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		op##_raw((raw_seqlock_t *)(lock));		\
+	else if (TYPE_EQUAL(lock, seqlock_t))			\
+		op((seqlock_t *)(lock));			\
+	else __bad_seqlock_type();				\
+} while (0)
+
+#define PICK_SEQOP_RET(op, lock)				\
+({								\
+	unsigned long __ret;					\
+								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		__ret = op##_raw((raw_seqlock_t *)(lock));	\
+	else if (TYPE_EQUAL(lock, seqlock_t))			\
+		__ret = op((seqlock_t *)(lock));		\
+	else __ret = __bad_seqlock_type();			\
+								\
+	__ret;							\
+})
+
+#define PICK_SEQOP_CONST_RET(op, lock)				\
+({								\
+	unsigned long __ret;					\
+								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		__ret = op##_raw((const raw_seqlock_t *)(lock));\
+	else if (TYPE_EQUAL(lock, seqlock_t))			\
+		__ret = op((const seqlock_t *)(lock));		\
+	else __ret = __bad_seqlock_type();			\
+								\
+	__ret;							\
+})
+
+#define PICK_SEQOP2_CONST_RET(op, lock, arg)				\
+({									\
+	unsigned long __ret;						\
+									\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))				\
+		__ret = op##_raw((const raw_seqlock_t *)(lock), (arg));	\
+	else if (TYPE_EQUAL(lock, seqlock_t))				\
+		__ret = op((seqlock_t *)(lock), (arg));			\
+	else __ret = __bad_seqlock_type();				\
+									\
+	__ret;								\
+})
+
+
+#define write_seqlock(sl)	PICK_SEQOP(__write_seqlock, sl)
+#define write_sequnlock(sl)	PICK_SEQOP(__write_sequnlock, sl)
+#define write_tryseqlock(sl)	PICK_SEQOP_RET(__write_tryseqlock, sl)
+#define read_seqbegin(sl)	PICK_SEQOP_CONST_RET(__read_seqbegin, sl)
+#define read_seqretry(sl, iv)	PICK_SEQOP2_CONST_RET(__read_seqretry, sl, iv)
+
+#define DECLARE_SEQLOCK(name) \
+	seqlock_t name __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED(name)
+
+#define DECLARE_RAW_SEQLOCK(name) \
+	raw_seqlock_t name __cacheline_aligned_in_smp = RAW_SEQLOCK_UNLOCKED
+
 /*
  * Version using sequence counter only.
  * This can be used when code has its own mutex protecting the
@@ -145,30 +282,51 @@ static inline void write_seqcount_end(se
 	s->sequence++;
 }
 
+#define PICK_IRQOP(op, lock)					\
+do {								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		op();						\
+	else if (TYPE_EQUAL((lock), seqlock_t))			\
+		{ /* nothing */ }				\
+	else __bad_seqlock_type();				\
+} while (0)
+
+#define PICK_IRQOP2(op, arg, lock)				\
+do {								\
+	if (TYPE_EQUAL((lock), raw_seqlock_t))			\
+		op(arg);					\
+	else if (TYPE_EQUAL(lock, seqlock_t))			\
+		{ /* nothing */ }				\
+	else __bad_seqlock_type();				\
+} while (0)
+
+
+
 /*
  * Possible sw/hw IRQ protected versions of the interfaces.
  */
 #define write_seqlock_irqsave(lock, flags)				\
-	do { local_irq_save(flags); write_seqlock(lock); } while (0)
+	do { PICK_IRQOP2(raw_local_irq_save, flags, lock); write_seqlock(lock); } while (0)
 #define write_seqlock_irq(lock)						\
-	do { local_irq_disable();   write_seqlock(lock); } while (0)
+	do { PICK_IRQOP(raw_local_irq_disable, lock); write_seqlock(lock); } while (0)
 #define write_seqlock_bh(lock)						\
-        do { local_bh_disable();    write_seqlock(lock); } while (0)
+        do { PICK_IRQOP(local_bh_disable, lock); write_seqlock(lock); } while (0)
 
 #define write_sequnlock_irqrestore(lock, flags)				\
-	do { write_sequnlock(lock); local_irq_restore(flags); } while(0)
+	do { write_sequnlock(lock); PICK_IRQOP2(raw_local_irq_restore, flags, lock); preempt_check_resched(); } while(0)
 #define write_sequnlock_irq(lock)					\
-	do { write_sequnlock(lock); local_irq_enable(); } while(0)
+	do { write_sequnlock(lock); PICK_IRQOP(raw_local_irq_enable, lock); preempt_check_resched(); } while(0)
 #define write_sequnlock_bh(lock)					\
-	do { write_sequnlock(lock); local_bh_enable(); } while(0)
+	do { write_sequnlock(lock); PICK_IRQOP(local_bh_enable, lock); } while(0)
 
 #define read_seqbegin_irqsave(lock, flags)				\
-	({ local_irq_save(flags);   read_seqbegin(lock); })
+	({ PICK_IRQOP2(raw_local_irq_save, flags, lock); read_seqbegin(lock); })
 
 #define read_seqretry_irqrestore(lock, iv, flags)			\
 	({								\
 		int ret = read_seqretry(lock, iv);			\
-		local_irq_restore(flags);				\
+		PICK_IRQOP2(raw_local_irq_restore, flags, lock);		\
+		preempt_check_resched(); 				\
 		ret;							\
 	})
 
Index: linux/include/linux/smp.h
===================================================================
--- linux.orig/include/linux/smp.h
+++ linux/include/linux/smp.h
@@ -34,6 +34,11 @@ extern void smp_send_stop(void);
  */
 extern void smp_send_reschedule(int cpu);
 
+/*
+ * trigger a reschedule on all other CPUs:
+ */
+extern void smp_send_reschedule_allbutself(void);
+
 
 /*
  * Prepare machine for booting other CPUs.
@@ -97,6 +102,7 @@ void smp_prepare_boot_cpu(void);
 #define smp_call_function(func,info,retry,wait)	({ 0; })
 #define on_each_cpu(func,info,retry,wait)	({ func(info); 0; })
 static inline void smp_send_reschedule(int cpu) { }
+static inline void smp_send_reschedule_allbutself(void) { }
 #define num_booting_cpus()			1
 #define smp_prepare_boot_cpu()			do {} while (0)
 
@@ -126,6 +132,6 @@ static inline void smp_send_reschedule(i
 
 #define get_cpu()		({ preempt_disable(); smp_processor_id(); })
 #define put_cpu()		preempt_enable()
-#define put_cpu_no_resched()	preempt_enable_no_resched()
+#define put_cpu_no_resched()	__preempt_enable_no_resched()
 
 #endif /* __LINUX_SMP_H */
Index: linux/include/linux/smp_lock.h
===================================================================
--- linux.orig/include/linux/smp_lock.h
+++ linux/include/linux/smp_lock.h
@@ -20,6 +20,7 @@ extern void __lockfunc __release_kernel_
 		__release_kernel_lock();	\
 } while (0)
 
+
 /*
  * Non-SMP kernels will never block on the kernel lock,
  * so we are better off returning a constant zero from
@@ -47,7 +48,7 @@ extern void __lockfunc unlock_kernel(voi
 #define lock_kernel()				do { } while(0)
 #define unlock_kernel()				do { } while(0)
 #define release_kernel_lock(task)		do { } while(0)
-#define reacquire_kernel_lock(task)		0
+#define reacquire_kernel_lock(task)		do { } while(0)
 #define kernel_locked()				1
 
 #endif /* CONFIG_LOCK_KERNEL */
Index: linux/include/linux/spinlock.h
===================================================================
--- linux.orig/include/linux/spinlock.h
+++ linux/include/linux/spinlock.h
@@ -52,6 +52,7 @@
 #include <linux/compiler.h>
 #include <linux/thread_info.h>
 #include <linux/kernel.h>
+#include <linux/cache.h>
 #include <linux/stringify.h>
 
 #include <asm/system.h>
@@ -90,16 +91,10 @@ extern int __lockfunc generic__raw_read_
 # include <linux/spinlock_up.h>
 #endif
 
-#define spin_lock_init(lock)	do { *(lock) = SPIN_LOCK_UNLOCKED; } while (0)
-#define rwlock_init(lock)	do { *(lock) = RW_LOCK_UNLOCKED; } while (0)
-
-#define spin_is_locked(lock)	__raw_spin_is_locked(&(lock)->raw_lock)
-
-/**
- * spin_unlock_wait - wait until the spinlock gets unlocked
- * @lock: the spinlock in question.
+/*
+ * Pull the RT types:
  */
-#define spin_unlock_wait(lock)	__raw_spin_unlock_wait(&(lock)->raw_lock)
+#include <linux/rt_lock.h>
 
 /*
  * Pull the _spin_*()/_read_*()/_write_*() functions/declarations:
@@ -110,18 +105,19 @@ extern int __lockfunc generic__raw_read_
 # include <linux/spinlock_api_up.h>
 #endif
 
+#if 0
 #ifdef CONFIG_DEBUG_SPINLOCK
- extern void _raw_spin_lock(spinlock_t *lock);
-#define _raw_spin_lock_flags(lock, flags) _raw_spin_lock(lock)
- extern int _raw_spin_trylock(spinlock_t *lock);
- extern void _raw_spin_unlock(spinlock_t *lock);
-
- extern void _raw_read_lock(rwlock_t *lock);
- extern int _raw_read_trylock(rwlock_t *lock);
- extern void _raw_read_unlock(rwlock_t *lock);
- extern void _raw_write_lock(rwlock_t *lock);
- extern int _raw_write_trylock(rwlock_t *lock);
- extern void _raw_write_unlock(rwlock_t *lock);
+ extern void _raw_spin_lock(raw_spinlock_t *lock);
+#define _raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+ extern int _raw_spin_trylock(raw_spinlock_t *lock);
+ extern void _raw_spin_unlock(raw_spinlock_t *lock);
+
+ extern void _raw_read_lock(raw_rwlock_t *lock);
+ extern int _raw_read_trylock(raw_rwlock_t *lock);
+ extern void _raw_read_unlock(raw_rwlock_t *lock);
+ extern void _raw_write_lock(raw_rwlock_t *lock);
+ extern int _raw_write_trylock(raw_rwlock_t *lock);
+ extern void _raw_write_unlock(raw_rwlock_t *lock);
 #else
 # define _raw_spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
 # define _raw_spin_trylock(lock)	__raw_spin_trylock(&(lock)->raw_lock)
@@ -135,95 +131,452 @@ extern int __lockfunc generic__raw_read_
 # define _raw_read_trylock(rwlock)	__raw_read_trylock(&(rwlock)->raw_lock)
 # define _raw_write_trylock(rwlock)	__raw_write_trylock(&(rwlock)->raw_lock)
 #endif
+#endif
+
+extern int __bad_spinlock_type(void);
+
+/*
+ * The following ones are only implemented on PREEMPT_RT, but
+ * the type selection macros need the prototypes even though the
+ * functions never get called (and hence never get linked):
+ */
+#if !defined(CONFIG_PREEMPT_RT) || \
+	defined(CONFIG_DEBUG_RT_LOCKING_MODE) || \
+	defined(CONFIG_DEBUG_DEADLOCKS) || \
+	defined(CONFIG_DEBUG_IRQ_FLAGS)
+# undef DEBUG_RT_DONT_INLINE
+# define DEBUG_RT_DONT_INLINE
+#endif
+
+#ifdef DEBUG_RT_DONT_INLINE
+extern void __lockfunc _spin_lock(spinlock_t *lock);
+extern void __lockfunc _spin_lock_bh(spinlock_t *lock);
+extern void __lockfunc _spin_lock_irq(spinlock_t *lock);
+extern void __lockfunc _spin_unlock(spinlock_t *lock);
+extern void __lockfunc _spin_unlock_no_resched(spinlock_t *lock);
+extern void __lockfunc _spin_unlock_bh(spinlock_t *lock);
+extern void __lockfunc _spin_unlock_irq(spinlock_t *lock);
+extern unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock);
+extern void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags);
+#else
+/*
+ * Inlined shortcuts for the most common APIs:
+ */
+extern void __down_mutex(struct rt_mutex *lock);
+extern void __up_mutex_nosavestate(struct rt_mutex *lock);
+extern void __up_mutex_savestate(struct rt_mutex *lock);
+
+static inline void _spin_lock(spinlock_t *lock)
+{
+	__down_mutex(&lock->lock);
+}
+static inline void _spin_lock_bh(spinlock_t *lock)
+{
+	__down_mutex(&lock->lock);
+}
+static inline void _spin_lock_irq(spinlock_t *lock)
+{
+	__down_mutex(&lock->lock);
+}
+static inline unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+{
+	__down_mutex(&lock->lock);
+	return 0;
+}
+static inline void _spin_unlock(spinlock_t *lock)
+{
+	__up_mutex_savestate(&lock->lock);
+}
+static inline void _spin_unlock_no_resched(spinlock_t *lock)
+{
+	__up_mutex_savestate(&lock->lock);
+}
+static inline void _spin_unlock_bh(spinlock_t *lock)
+{
+	__up_mutex_savestate(&lock->lock);
+}
+static inline void _spin_unlock_irq(spinlock_t *lock)
+{
+	__up_mutex_savestate(&lock->lock);
+}
+static inline void _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+{
+	__up_mutex_savestate(&lock->lock);
+}
+#endif
+extern void __lockfunc _spin_unlock_wait(spinlock_t *lock);
+extern int __lockfunc _spin_trylock(spinlock_t *lock);
+extern int __lockfunc _spin_trylock_bh(spinlock_t *lock);
+extern int __lockfunc _spin_trylock_irq(spinlock_t *lock);
+extern int __lockfunc _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags);
+extern int _spin_can_lock(spinlock_t *lock);
+extern int _spin_is_locked(spinlock_t *lock);
+extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock);
+extern void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line);
+
+#undef TYPE_EQUAL
+#define TYPE_EQUAL(lock, type) \
+		__builtin_types_compatible_p(typeof(lock), type *)
+
+#define PICK_OP(type, optype, op, lock)				\
+do {								\
+	if (TYPE_EQUAL((lock), type))				\
+		_raw_##optype##op((type *)(lock));		\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		_spin##op((spinlock_t *)(lock));		\
+	else __bad_spinlock_type();				\
+} while (0)
+
+#define PICK_OP_RET(type, optype, op, lock...)			\
+({								\
+	unsigned long __ret;					\
+								\
+	if (TYPE_EQUAL((lock), type))	  			\
+		__ret = _raw_##optype##op((type *)(lock));	\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		__ret = _spin##op((spinlock_t *)(lock));	\
+	else __ret = __bad_spinlock_type();			\
+								\
+	__ret;							\
+})
+
+#define PICK_OP2(type, optype, op, lock, flags)			\
+do {								\
+	if (TYPE_EQUAL((lock), type))				\
+		_raw_##optype##op((type *)(lock), flags);	\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		_spin##op((spinlock_t *)(lock), flags);		\
+	else __bad_spinlock_type();				\
+} while (0)
+
+#define PICK_OP2_RET(type, optype, op, lock, flags)		\
+({								\
+	unsigned long __ret;					\
+	/* every branch must assign __ret: it is the result */	\
+	if (TYPE_EQUAL((lock), type))				\
+		__ret = _raw_##optype##op((type *)(lock), flags);\
+	else if (TYPE_EQUAL(lock, spinlock_t))			\
+		__ret = _spin##op((spinlock_t *)(lock), flags);	\
+	else __ret = __bad_spinlock_type();			\
+								\
+	__ret;							\
+})
 
-#define read_can_lock(rwlock)		__raw_read_can_lock(&(rwlock)->raw_lock)
-#define write_can_lock(rwlock)		__raw_write_can_lock(&(rwlock)->raw_lock)
 
+extern int __lockfunc _read_trylock(rwlock_t *rwlock);
+extern int __lockfunc _write_trylock(rwlock_t *rwlock);
+extern int _read_can_lock(rwlock_t *rwlock);
+extern int _write_can_lock(rwlock_t *rwlock);
+extern void __lockfunc _write_lock(rwlock_t *rwlock);
+extern void __lockfunc _read_lock(rwlock_t *rwlock);
+extern void __lockfunc _write_unlock(rwlock_t *rwlock);
+extern void __lockfunc _read_unlock(rwlock_t *rwlock);
+extern unsigned long __lockfunc _write_lock_irqsave(rwlock_t *rwlock);
+extern unsigned long __lockfunc _read_lock_irqsave(rwlock_t *rwlock);
+extern void __lockfunc _write_lock_irq(rwlock_t *rwlock);
+extern void __lockfunc _read_lock_irq(rwlock_t *rwlock);
+extern void __lockfunc _write_lock_bh(rwlock_t *rwlock);
+extern void __lockfunc _read_lock_bh(rwlock_t *rwlock);
+extern void __lockfunc _write_unlock_irq(rwlock_t *rwlock);
+extern void __lockfunc _read_unlock_irq(rwlock_t *rwlock);
+extern void __lockfunc _write_unlock_bh(rwlock_t *rwlock);
+extern void __lockfunc _read_unlock_bh(rwlock_t *rwlock);
+extern void __lockfunc _write_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags);
+extern void __lockfunc _read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags);
+extern void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line);
+
+#define __PICK_RW_OP(type, optype, op, lock)				\
+do {	/* NOTE(review): now mirrors PICK_RW_OP; confirm intent */	\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock));			\
+	else __bad_spinlock_type();					\
+} while (0)
+
+#define PICK_RW_OP(type, optype, op, lock)				\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock));			\
+	else __bad_spinlock_type();					\
+} while (0)
+
+#define __PICK_RW_OP_RET(type, optype, op, lock...)			\
+({									\
+	unsigned long __ret;						\
+									\
+	if (TYPE_EQUAL((lock), type))	  				\
+		__ret = _raw_##optype##op((type *)(lock));		\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		__ret = _##optype##op((rwlock_t *)(lock));		\
+	else __ret = __bad_spinlock_type();				\
+									\
+	__ret;								\
+})
+
+#define PICK_RW_OP_RET(type, optype, op, lock...)			\
+({									\
+	unsigned long __ret;						\
+									\
+	if (TYPE_EQUAL((lock), type))	  				\
+		__ret = _raw_##optype##op((type *)(lock));		\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		__ret = _##optype##op((rwlock_t *)(lock));		\
+	else __ret = __bad_spinlock_type();				\
+									\
+	__ret;								\
+})
+
+#define PICK_RW_OP2(type, optype, op, lock, flags)			\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock), flags);		\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock), flags);		\
+	else __bad_spinlock_type();					\
+} while (0)
+
+#define _raw_spin_lock_init __raw_spin_lock_init
+
+#define PICK_OP_INIT(type, optype, op, lock)				\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, spinlock_t))				\
+		_spin##op((spinlock_t *)(lock), #lock, __FILE__, __LINE__); \
+	else __bad_spinlock_type();					\
+} while (0)
+
+
+#define spin_lock_init(lock) \
+		PICK_OP_INIT(raw_spinlock_t, spin, _lock_init, lock)
+
+#define _raw_rwlock_init __raw_rwlock_init
+
+#define __PICK_RW_OP_INIT(type, optype, op, lock)			\
+do {									\
+	if (TYPE_EQUAL((lock), type))					\
+		_raw_##optype##op((type *)(lock));			\
+	else if (TYPE_EQUAL(lock, rwlock_t))				\
+		_##optype##op((rwlock_t *)(lock), #lock, __FILE__, __LINE__);\
+	else __bad_spinlock_type();					\
+} while (0)
+
+
+#define rwlock_init(lock) \
+		__PICK_RW_OP_INIT(raw_rwlock_t, rwlock, _init, lock)
+
+#define _raw_spin_is_locked(lock) __raw_spin_is_locked(&(lock)->raw_lock)
+
+#define spin_is_locked(lock) \
+		PICK_OP_RET(raw_spinlock_t, spin, _is_locked, lock)
+
+#define _raw_spin_unlock_wait(lock) __raw_spin_unlock_wait(&(lock)->raw_lock)
+
+#define spin_unlock_wait(lock) \
+		PICK_OP(raw_spinlock_t, spin, _unlock_wait, lock)
 /*
  * Define the various spin_lock and rw_lock methods.  Note we define these
  * regardless of whether CONFIG_SMP or CONFIG_PREEMPT are set. The various
  * methods are defined as nops in the case they are not required.
  */
-#define spin_trylock(lock)		__cond_lock(_spin_trylock(lock))
-#define read_trylock(lock)		__cond_lock(_read_trylock(lock))
-#define write_trylock(lock)		__cond_lock(_write_trylock(lock))
-
-#define spin_lock(lock)			_spin_lock(lock)
-#define write_lock(lock)		_write_lock(lock)
-#define read_lock(lock)			_read_lock(lock)
+// #define spin_trylock(lock)	_spin_trylock(lock)
+#define spin_trylock(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _trylock, lock))
 
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
-#define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
-#define read_lock_irqsave(lock, flags)	flags = _read_lock_irqsave(lock)
-#define write_lock_irqsave(lock, flags)	flags = _write_lock_irqsave(lock)
+//#define read_trylock(lock)	_read_trylock(lock)
+#define read_trylock(lock)	__cond_lock(PICK_RW_OP_RET(raw_rwlock_t, read, _trylock, lock))
+
+//#define write_trylock(lock)	_write_trylock(lock)
+#define write_trylock(lock)	__cond_lock(PICK_RW_OP_RET(raw_rwlock_t, write, _trylock, lock))
+
+#define _raw_spin_can_lock(lock) __raw_spin_can_lock(&(lock)->raw_lock)
+#define _raw_read_can_lock(lock) __raw_read_can_lock(&(lock)->raw_lock)
+#define _raw_write_can_lock(lock) __raw_write_can_lock(&(lock)->raw_lock)
+
+#define spin_can_lock(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _can_lock, lock))
+#define read_can_lock(lock)	__cond_lock(PICK_RW_OP_RET(raw_rwlock_t, read, _can_lock, lock))
+#define write_can_lock(lock)	__cond_lock(PICK_RW_OP_RET(raw_rwlock_t, write, _can_lock, lock))
+
+// #define spin_lock(lock)	_spin_lock(lock)
+#define spin_lock(lock)		PICK_OP(raw_spinlock_t, spin, _lock, lock)
+
+//#define write_lock(lock)	_write_lock(lock)
+#define write_lock(lock)	PICK_RW_OP(raw_rwlock_t, write, _lock, lock)
+
+// #define read_lock(lock)		_read_lock(lock)
+#define read_lock(lock)		PICK_RW_OP(raw_rwlock_t, read, _lock, lock)
+
+#ifdef CONFIG_SMP
+// #define spin_lock_irqsave(lock, flags)	flags = _spin_lock_irqsave(lock)
+// #define read_lock_irqsave(lock, flags)	flags = _read_lock_irqsave(lock)
+// #define write_lock_irqsave(lock, flags)	flags = _write_lock_irqsave(lock)
 #else
-#define spin_lock_irqsave(lock, flags)	_spin_lock_irqsave(lock, flags)
-#define read_lock_irqsave(lock, flags)	_read_lock_irqsave(lock, flags)
-#define write_lock_irqsave(lock, flags)	_write_lock_irqsave(lock, flags)
+// #define spin_lock_irqsave(lock, flags)	_spin_lock_irqsave(lock, flags)
+// #define read_lock_irqsave(lock, flags)	_read_lock_irqsave(lock, flags)
+// #define write_lock_irqsave(lock, flags)	_write_lock_irqsave(lock, flags)
 #endif
 
-#define spin_lock_irq(lock)		_spin_lock_irq(lock)
-#define spin_lock_bh(lock)		_spin_lock_bh(lock)
+# define spin_lock_irqsave(lock, flags) \
+	flags = PICK_OP_RET(raw_spinlock_t, spin, _lock_irqsave, lock)
+# define read_lock_irqsave(lock, flags) \
+	flags = PICK_RW_OP_RET(raw_rwlock_t, read, _lock_irqsave, lock)
+# define write_lock_irqsave(lock, flags) \
+	flags = PICK_RW_OP_RET(raw_rwlock_t, write, _lock_irqsave, lock)
+
+// #define spin_lock_irq(lock)	_spin_lock_irq(lock)
+// #define spin_lock_bh(lock)	_spin_lock_bh(lock)
+#define spin_lock_irq(lock)	PICK_OP(raw_spinlock_t, spin, _lock_irq, lock)
+#define spin_lock_bh(lock)	PICK_OP(raw_spinlock_t, spin, _lock_bh, lock)
+
+// #define read_lock_irq(lock)	_read_lock_irq(lock)
+// #define read_lock_bh(lock)	_read_lock_bh(lock)
+#define read_lock_irq(lock)	PICK_RW_OP(raw_rwlock_t, read, _lock_irq, lock)
+#define read_lock_bh(lock)	PICK_RW_OP(raw_rwlock_t, read, _lock_bh, lock)
+
+// #define write_lock_irq(lock)		_write_lock_irq(lock)
+// #define write_lock_bh(lock)		_write_lock_bh(lock)
+#define write_lock_irq(lock)	PICK_RW_OP(raw_rwlock_t, write, _lock_irq, lock)
+#define write_lock_bh(lock)	PICK_RW_OP(raw_rwlock_t, write, _lock_bh, lock)
+
+// #define spin_unlock(lock)	_spin_unlock(lock)
+// #define write_unlock(lock)	_write_unlock(lock)
+// #define read_unlock(lock)	_read_unlock(lock)
+#define spin_unlock(lock)	PICK_OP(raw_spinlock_t, spin, _unlock, lock)
+#define read_unlock(lock)	PICK_RW_OP(raw_rwlock_t, read, _unlock, lock)
+#define write_unlock(lock)	PICK_RW_OP(raw_rwlock_t, write, _unlock, lock)
+
+// #define spin_unlock(lock)	_spin_unlock_no_resched(lock)
+#define spin_unlock_no_resched(lock) \
+			PICK_OP(raw_spinlock_t, spin, _unlock_no_resched, lock)
+
+//#define spin_unlock_irqrestore(lock, flags)
+//		_spin_unlock_irqrestore(lock, flags)
+//#define spin_unlock_irq(lock)	_spin_unlock_irq(lock)
+//#define spin_unlock_bh(lock)	_spin_unlock_bh(lock)
+#define spin_unlock_irqrestore(lock, flags) \
+	PICK_OP2(raw_spinlock_t, spin, _unlock_irqrestore, lock, flags)
+#define spin_unlock_irq(lock)	PICK_OP(raw_spinlock_t, spin, _unlock_irq, lock)
+#define spin_unlock_bh(lock)	PICK_OP(raw_spinlock_t, spin, _unlock_bh, lock)
+
+// #define read_unlock_irqrestore(lock, flags)
+// 		_read_unlock_irqrestore(lock, flags)
+// #define read_unlock_irq(lock)	_read_unlock_irq(lock)
+// #define read_unlock_bh(lock)	_read_unlock_bh(lock)
+#define read_unlock_irqrestore(lock, flags) \
+		PICK_RW_OP2(raw_rwlock_t, read, _unlock_irqrestore, lock, flags)
+#define read_unlock_irq(lock) PICK_RW_OP(raw_rwlock_t, read, _unlock_irq, lock)
+#define read_unlock_bh(lock) PICK_RW_OP(raw_rwlock_t, read, _unlock_bh, lock)
+
+// #define write_unlock_irqrestore(lock, flags)
+// 	_write_unlock_irqrestore(lock, flags)
+// #define write_unlock_irq(lock)			_write_unlock_irq(lock)
+// #define write_unlock_bh(lock)			_write_unlock_bh(lock)
+#define write_unlock_irqrestore(lock, flags) \
+	PICK_RW_OP2(raw_rwlock_t, write, _unlock_irqrestore, lock, flags)
+#define write_unlock_irq(lock) PICK_RW_OP(raw_rwlock_t, write, _unlock_irq, lock)
+#define write_unlock_bh(lock) PICK_RW_OP(raw_rwlock_t, write, _unlock_bh, lock)
 
-#define read_lock_irq(lock)		_read_lock_irq(lock)
-#define read_lock_bh(lock)		_read_lock_bh(lock)
+// #define spin_trylock_bh(lock)	_spin_trylock_bh(lock)
+#define spin_trylock_bh(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _trylock_bh, lock))
 
-#define write_lock_irq(lock)		_write_lock_irq(lock)
-#define write_lock_bh(lock)		_write_lock_bh(lock)
+// #define spin_trylock_irq(lock)
 
-#define spin_unlock(lock)		_spin_unlock(lock)
-#define write_unlock(lock)		_write_unlock(lock)
-#define read_unlock(lock)		_read_unlock(lock)
+#define spin_trylock_irq(lock)	__cond_lock(PICK_OP_RET(raw_spinlock_t, spin, _trylock_irq, lock))
 
-#define spin_unlock_irqrestore(lock, flags) \
-					_spin_unlock_irqrestore(lock, flags)
-#define spin_unlock_irq(lock)		_spin_unlock_irq(lock)
-#define spin_unlock_bh(lock)		_spin_unlock_bh(lock)
+// #define spin_trylock_irqsave(lock, flags)
 
-#define read_unlock_irqrestore(lock, flags) \
-					_read_unlock_irqrestore(lock, flags)
-#define read_unlock_irq(lock)		_read_unlock_irq(lock)
-#define read_unlock_bh(lock)		_read_unlock_bh(lock)
+#define spin_trylock_irqsave(lock, flags)	__cond_lock(PICK_OP2_RET(raw_spinlock_t, spin, _trylock_irqsave, lock, &flags))
 
-#define write_unlock_irqrestore(lock, flags) \
-					_write_unlock_irqrestore(lock, flags)
-#define write_unlock_irq(lock)		_write_unlock_irq(lock)
-#define write_unlock_bh(lock)		_write_unlock_bh(lock)
-
-#define spin_trylock_bh(lock)		__cond_lock(_spin_trylock_bh(lock))
-
-#define spin_trylock_irq(lock) \
-({ \
-	local_irq_disable(); \
-	_spin_trylock(lock) ? \
-	1 : ({ local_irq_enable(); 0;  }); \
-})
+/* "lock on reference count zero" */
+#ifndef ATOMIC_DEC_AND_LOCK
+# include <asm/atomic.h>
+  extern int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock);
+#endif
+
+#define atomic_dec_and_lock(atomic, lock)				\
+__cond_lock(({								\
+	unsigned long __ret;						\
+									\
+	if (TYPE_EQUAL(lock, raw_spinlock_t))				\
+		__ret = _atomic_dec_and_raw_spin_lock(atomic,		\
+					(raw_spinlock_t *)(lock));	\
+	else if (TYPE_EQUAL(lock, spinlock_t))				\
+		__ret = atomic_dec_and_spin_lock(atomic,		\
+					(spinlock_t *)(lock));		\
+	else __ret = __bad_spinlock_type();				\
+									\
+	__ret;								\
+}))
 
-#define spin_trylock_irqsave(lock, flags) \
-({ \
-	local_irq_save(flags); \
-	_spin_trylock(lock) ? \
-	1 : ({ local_irq_restore(flags); 0; }); \
-})
 
 /*
- * Pull the atomic_t declaration:
- * (asm-mips/atomic.h needs above definitions)
+ *  bit-based spin_lock()
+ *
+ * Don't use this unless you really need to: spin_lock() and spin_unlock()
+ * are significantly faster.
  */
-#include <asm/atomic.h>
-/**
- * atomic_dec_and_lock - lock on reaching reference count zero
- * @atomic: the atomic counter
- * @lock: the spinlock in question
- */
-extern int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock);
-#define atomic_dec_and_lock(atomic, lock) \
-		__cond_lock(_atomic_dec_and_lock(atomic, lock))
+static inline void bit_spin_lock(int bitnum, unsigned long *addr)
+{
+	/*
+	 * Assuming the lock is uncontended, this never enters
+	 * the body of the outer loop. If it is contended, then
+	 * within the inner loop a non-atomic test is used to
+	 * busywait with less bus contention for a good time to
+	 * attempt to acquire the lock bit.
+	 */
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	while (test_and_set_bit(bitnum, addr))
+		while (test_bit(bitnum, addr))
+			cpu_relax();
+#endif
+	__acquire(bitlock);
+}
+
+/*
+ * Return true if it was acquired
+ */
+static inline int bit_spin_trylock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	if (test_and_set_bit(bitnum, addr))
+		return 0;
+#endif
+	__acquire(bitlock);
+	return 1;
+}
+
+/*
+ *  bit-based spin_unlock()
+ */
+static inline void bit_spin_unlock(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	BUG_ON(!test_bit(bitnum, addr));
+	smp_mb__before_clear_bit();
+	clear_bit(bitnum, addr);
+#endif
+	__release(bitlock);
+}
+
+/*
+ * Return true if the lock is held.
+ */
+static inline int bit_spin_is_locked(int bitnum, unsigned long *addr)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT)
+	return test_bit(bitnum, addr);
+#else
+	return 1;
+#endif
+}
 
 /**
- * spin_can_lock - would spin_trylock() succeed?
+ * __raw_spin_can_lock - would __raw_spin_trylock() succeed?
  * @lock: the spinlock in question.
  */
-#define spin_can_lock(lock)	(!spin_is_locked(lock))
+#define __raw_spin_can_lock(lock)            (!__raw_spin_is_locked(lock))
 
 #endif /* __LINUX_SPINLOCK_H */
+
Index: linux/include/linux/spinlock_api_smp.h
===================================================================
--- linux.orig/include/linux/spinlock_api_smp.h
+++ linux/include/linux/spinlock_api_smp.h
@@ -19,39 +19,42 @@ int in_lock_functions(unsigned long addr
 
 #define assert_spin_locked(x)	BUG_ON(!spin_is_locked(x))
 
-void __lockfunc _spin_lock(spinlock_t *lock)		__acquires(spinlock_t);
-void __lockfunc _read_lock(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _spin_lock_bh(spinlock_t *lock)		__acquires(spinlock_t);
-void __lockfunc _read_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock_bh(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _spin_lock_irq(spinlock_t *lock)	__acquires(spinlock_t);
-void __lockfunc _read_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
-void __lockfunc _write_lock_irq(rwlock_t *lock)		__acquires(rwlock_t);
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
-							__acquires(spinlock_t);
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
-							__acquires(rwlock_t);
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
-							__acquires(rwlock_t);
-int __lockfunc _spin_trylock(spinlock_t *lock);
-int __lockfunc _read_trylock(rwlock_t *lock);
-int __lockfunc _write_trylock(rwlock_t *lock);
-int __lockfunc _spin_trylock_bh(spinlock_t *lock);
-void __lockfunc _spin_unlock(spinlock_t *lock)		__releases(spinlock_t);
-void __lockfunc _read_unlock(rwlock_t *lock)		__releases(rwlock_t);
-void __lockfunc _write_unlock(rwlock_t *lock)		__releases(rwlock_t);
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)	__releases(spinlock_t);
-void __lockfunc _read_unlock_bh(rwlock_t *lock)		__releases(rwlock_t);
-void __lockfunc _write_unlock_bh(rwlock_t *lock)	__releases(rwlock_t);
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)	__releases(spinlock_t);
-void __lockfunc _read_unlock_irq(rwlock_t *lock)	__releases(rwlock_t);
-void __lockfunc _write_unlock_irq(rwlock_t *lock)	__releases(rwlock_t);
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
-							__releases(spinlock_t);
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
-							__releases(rwlock_t);
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
-							__releases(rwlock_t);
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)		__acquires(raw_spinlock_t);
+void __lockfunc _raw_read_lock(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_write_lock(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)		__acquires(raw_spinlock_t);
+void __lockfunc _raw_read_lock_bh(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_write_lock_bh(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)	__acquires(raw_spinlock_t);
+void __lockfunc _raw_read_lock_irq(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+void __lockfunc _raw_write_lock_irq(raw_rwlock_t *lock)		__acquires(raw_rwlock_t);
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
+							__acquires(raw_spinlock_t);
+unsigned long __lockfunc _raw_read_lock_irqsave(raw_rwlock_t *lock)
+							__acquires(raw_rwlock_t);
+unsigned long __lockfunc _raw_write_lock_irqsave(raw_rwlock_t *lock)
+							__acquires(raw_rwlock_t);
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock);
+int __lockfunc _raw_read_trylock(raw_rwlock_t *lock);
+int __lockfunc _raw_write_trylock(raw_rwlock_t *lock);
+int __lockfunc _raw_spin_trylock_irqsave(raw_spinlock_t *lock,
+					 unsigned long *flags);
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock);
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)		__releases(raw_spinlock_t);
+void __lockfunc _raw_spin_unlock_no_resched(raw_spinlock_t *lock) __releases(raw_spinlock_t);
+void __lockfunc _raw_read_unlock(raw_rwlock_t *lock)		__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock(raw_rwlock_t *lock)		__releases(raw_rwlock_t);
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)	__releases(raw_spinlock_t);
+void __lockfunc _raw_read_unlock_bh(raw_rwlock_t *lock)		__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock_bh(raw_rwlock_t *lock)	__releases(raw_rwlock_t);
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)	__releases(raw_spinlock_t);
+void __lockfunc _raw_read_unlock_irq(raw_rwlock_t *lock)	__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock_irq(raw_rwlock_t *lock)	__releases(raw_rwlock_t);
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
+							__releases(raw_spinlock_t);
+void __lockfunc _raw_read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
+							__releases(raw_rwlock_t);
+void __lockfunc _raw_write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
+							__releases(raw_rwlock_t);
 
 #endif /* __LINUX_SPINLOCK_API_SMP_H */
Index: linux/include/linux/spinlock_api_up.h
===================================================================
--- linux.orig/include/linux/spinlock_api_up.h
+++ linux/include/linux/spinlock_api_up.h
@@ -31,50 +31,67 @@
   do { local_bh_disable(); __LOCK(lock); } while (0)
 
 #define __LOCK_IRQ(lock) \
-  do { local_irq_disable(); __LOCK(lock); } while (0)
+  do { raw_local_irq_disable(); __LOCK(lock); } while (0)
 
-#define __LOCK_IRQSAVE(lock, flags) \
-  do { local_irq_save(flags); __LOCK(lock); } while (0)
+#define __LOCK_IRQSAVE(lock) \
+  ({ unsigned long __flags; raw_local_irq_save(__flags); __LOCK(lock); __flags; })
+
+#define __TRYLOCK_IRQSAVE(lock, flags) \
+	({ raw_local_irq_save(*(flags)); __LOCK(lock); 1; })
+
+#define _raw_spin_trylock_irqsave(lock, flags)	__TRYLOCK_IRQSAVE(lock, flags)
 
 #define __UNLOCK(lock) \
   do { preempt_enable(); __release(lock); (void)(lock); } while (0)
 
+#define __UNLOCK_NO_RESCHED(lock) \
+  do { __preempt_enable_no_resched(); __release(lock); (void)(lock); } while (0)
+
 #define __UNLOCK_BH(lock) \
   do { preempt_enable_no_resched(); local_bh_enable(); __release(lock); (void)(lock); } while (0)
 
 #define __UNLOCK_IRQ(lock) \
-  do { local_irq_enable(); __UNLOCK(lock); } while (0)
+  do { raw_local_irq_enable(); __UNLOCK(lock); } while (0)
 
 #define __UNLOCK_IRQRESTORE(lock, flags) \
-  do { local_irq_restore(flags); __UNLOCK(lock); } while (0)
+  do { raw_local_irq_restore(flags); __UNLOCK(lock); } while (0)
 
-#define _spin_lock(lock)			__LOCK(lock)
-#define _read_lock(lock)			__LOCK(lock)
-#define _write_lock(lock)			__LOCK(lock)
-#define _spin_lock_bh(lock)			__LOCK_BH(lock)
-#define _read_lock_bh(lock)			__LOCK_BH(lock)
-#define _write_lock_bh(lock)			__LOCK_BH(lock)
-#define _spin_lock_irq(lock)			__LOCK_IRQ(lock)
-#define _read_lock_irq(lock)			__LOCK_IRQ(lock)
-#define _write_lock_irq(lock)			__LOCK_IRQ(lock)
-#define _spin_lock_irqsave(lock, flags)		__LOCK_IRQSAVE(lock, flags)
-#define _read_lock_irqsave(lock, flags)		__LOCK_IRQSAVE(lock, flags)
-#define _write_lock_irqsave(lock, flags)	__LOCK_IRQSAVE(lock, flags)
-#define _spin_trylock(lock)			({ __LOCK(lock); 1; })
-#define _read_trylock(lock)			({ __LOCK(lock); 1; })
-#define _write_trylock(lock)			({ __LOCK(lock); 1; })
-#define _spin_trylock_bh(lock)			({ __LOCK_BH(lock); 1; })
-#define _spin_unlock(lock)			__UNLOCK(lock)
-#define _read_unlock(lock)			__UNLOCK(lock)
-#define _write_unlock(lock)			__UNLOCK(lock)
-#define _spin_unlock_bh(lock)			__UNLOCK_BH(lock)
-#define _write_unlock_bh(lock)			__UNLOCK_BH(lock)
-#define _read_unlock_bh(lock)			__UNLOCK_BH(lock)
-#define _spin_unlock_irq(lock)			__UNLOCK_IRQ(lock)
-#define _read_unlock_irq(lock)			__UNLOCK_IRQ(lock)
-#define _write_unlock_irq(lock)			__UNLOCK_IRQ(lock)
-#define _spin_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
-#define _read_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
-#define _write_unlock_irqrestore(lock, flags)	__UNLOCK_IRQRESTORE(lock, flags)
+#define _raw_spin_lock(lock)			__LOCK(lock)
+#define _raw_read_lock(lock)			__LOCK(lock)
+#define _raw_write_lock(lock)			__LOCK(lock)
+#define _raw_spin_lock_bh(lock)			__LOCK_BH(lock)
+#define _raw_read_lock_bh(lock)			__LOCK_BH(lock)
+#define _raw_write_lock_bh(lock)		__LOCK_BH(lock)
+#define _raw_spin_lock_irq(lock)		__LOCK_IRQ(lock)
+#define _raw_read_lock_irq(lock)		__LOCK_IRQ(lock)
+#define _raw_write_lock_irq(lock)		__LOCK_IRQ(lock)
+#define _raw_spin_lock_irqsave(lock)		__LOCK_IRQSAVE(lock)
+#define _raw_read_lock_irqsave(lock)		__LOCK_IRQSAVE(lock)
+#define _raw_write_lock_irqsave(lock)		__LOCK_IRQSAVE(lock)
+#define _raw_spin_trylock(lock)			({ __LOCK(lock); 1; })
+#define _raw_read_trylock(lock)			({ __LOCK(lock); 1; })
+#define _raw_write_trylock(lock)		({ __LOCK(lock); 1; })
+#define _raw_spin_trylock_bh(lock)		({ __LOCK_BH(lock); 1; })
+#define _raw_read_trylock_bh(lock)		({ __LOCK_BH(lock); 1; })
+#define _raw_write_trylock_bh(lock)		({ __LOCK_BH(lock); 1; })
+#define _raw_spin_trylock_irqsave(lock, flags)	__TRYLOCK_IRQSAVE(lock, flags)
+#define _raw_read_trylock_irqsave(lock, flags)	__TRYLOCK_IRQSAVE(lock, flags)
+#define _raw_write_trylock_irqsave(lock, flags)	__TRYLOCK_IRQSAVE(lock, flags)
+#define _raw_spin_unlock(lock)			__UNLOCK(lock)
+#define _raw_spin_unlock_no_resched(lock)	__UNLOCK_NO_RESCHED(lock)
+#define _raw_read_unlock(lock)			__UNLOCK(lock)
+#define _raw_write_unlock(lock)			__UNLOCK(lock)
+#define _raw_spin_unlock_bh(lock)		__UNLOCK_BH(lock)
+#define _raw_write_unlock_bh(lock)		__UNLOCK_BH(lock)
+#define _raw_read_unlock_bh(lock)		__UNLOCK_BH(lock)
+#define _raw_spin_unlock_irq(lock)		__UNLOCK_IRQ(lock)
+#define _raw_read_unlock_irq(lock)		__UNLOCK_IRQ(lock)
+#define _raw_write_unlock_irq(lock)		__UNLOCK_IRQ(lock)
+#define _raw_spin_unlock_irqrestore(lock, flags) \
+						__UNLOCK_IRQRESTORE(lock, flags)
+#define _raw_read_unlock_irqrestore(lock, flags) \
+						__UNLOCK_IRQRESTORE(lock, flags)
+#define _raw_write_unlock_irqrestore(lock, flags) \
+						__UNLOCK_IRQRESTORE(lock, flags)
 
 #endif /* __LINUX_SPINLOCK_API_UP_H */
Index: linux/include/linux/spinlock_types.h
===================================================================
--- linux.orig/include/linux/spinlock_types.h
+++ linux/include/linux/spinlock_types.h
@@ -16,7 +16,7 @@
 #endif
 
 typedef struct {
-	raw_spinlock_t raw_lock;
+	__raw_spinlock_t raw_lock;
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
 	unsigned int break_lock;
 #endif
@@ -24,12 +24,12 @@ typedef struct {
 	unsigned int magic, owner_cpu;
 	void *owner;
 #endif
-} spinlock_t;
+} raw_spinlock_t;
 
-#define SPINLOCK_MAGIC		0xdead4ead
+#define RAW_SPINLOCK_MAGIC	0xdead4ead
 
 typedef struct {
-	raw_rwlock_t raw_lock;
+	__raw_rwlock_t raw_lock;
 #if defined(CONFIG_PREEMPT) && defined(CONFIG_SMP)
 	unsigned int break_lock;
 #endif
@@ -37,31 +37,46 @@ typedef struct {
 	unsigned int magic, owner_cpu;
 	void *owner;
 #endif
-} rwlock_t;
+} raw_rwlock_t;
 
 #define RWLOCK_MAGIC		0xdeaf1eed
 
 #define SPINLOCK_OWNER_INIT	((void *)-1L)
 
 #ifdef CONFIG_DEBUG_SPINLOCK
-# define SPIN_LOCK_UNLOCKED						\
-	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
-				.magic = SPINLOCK_MAGIC,		\
+# define RAW_SPIN_LOCK_UNLOCKED						\
+	(raw_spinlock_t) {	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED,	\
+				.magic = RAW_SPINLOCK_MAGIC,		\
 				.owner = SPINLOCK_OWNER_INIT,		\
 				.owner_cpu = -1 }
-#define RW_LOCK_UNLOCKED						\
-	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
+# define RAW_RW_LOCK_UNLOCKED						\
+	(raw_rwlock_t) {	.raw_lock = __RAW_RW_LOCK_UNLOCKED,	\
 				.magic = RWLOCK_MAGIC,			\
 				.owner = SPINLOCK_OWNER_INIT,		\
 				.owner_cpu = -1 }
 #else
-# define SPIN_LOCK_UNLOCKED \
-	(spinlock_t)	{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED }
-#define RW_LOCK_UNLOCKED \
-	(rwlock_t)	{	.raw_lock = __RAW_RW_LOCK_UNLOCKED }
-#endif
-
-#define DEFINE_SPINLOCK(x)	spinlock_t x = SPIN_LOCK_UNLOCKED
-#define DEFINE_RWLOCK(x)	rwlock_t x = RW_LOCK_UNLOCKED
+# define _RAW_SPIN_LOCK_UNLOCKED \
+		{	.raw_lock = __RAW_SPIN_LOCK_UNLOCKED }
+# define _RAW_RW_LOCK_UNLOCKED \
+		{	.raw_lock = __RAW_RW_LOCK_UNLOCKED }
+# define RAW_SPIN_LOCK_UNLOCKED \
+	(raw_spinlock_t) _RAW_SPIN_LOCK_UNLOCKED
+# define RAW_RW_LOCK_UNLOCKED \
+	(raw_rwlock_t) _RAW_RW_LOCK_UNLOCKED
+#endif
+
+#define DEFINE_RAW_SPINLOCK(name) \
+	raw_spinlock_t name __cacheline_aligned_in_smp = RAW_SPIN_LOCK_UNLOCKED
+
+#define __DEFINE_RAW_SPINLOCK(name) \
+	raw_spinlock_t name = RAW_SPIN_LOCK_UNLOCKED
+
+#define DEFINE_RAW_RWLOCK(name) \
+	raw_rwlock_t name __cacheline_aligned_in_smp = RAW_RW_LOCK_UNLOCKED
+
+#define __raw_spin_lock_init(lock) \
+	do { *(lock) = RAW_SPIN_LOCK_UNLOCKED; } while (0)
+#define __raw_rwlock_init(lock) \
+	do { *(lock) = RAW_RW_LOCK_UNLOCKED; } while (0)
 
 #endif /* __LINUX_SPINLOCK_TYPES_H */
Index: linux/include/linux/spinlock_types_up.h
===================================================================
--- linux.orig/include/linux/spinlock_types_up.h
+++ linux/include/linux/spinlock_types_up.h
@@ -16,7 +16,7 @@
 
 typedef struct {
 	volatile unsigned int slock;
-} raw_spinlock_t;
+} __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { 1 }
 
@@ -27,11 +27,11 @@ typedef struct {
  * with empty initializers.
  */
 #if (__GNUC__ > 2)
-typedef struct { } raw_spinlock_t;
+typedef struct { } __raw_spinlock_t;
 
 #define __RAW_SPIN_LOCK_UNLOCKED { }
 #else
-typedef struct { int gcc_is_buggy; } raw_spinlock_t;
+typedef struct { int gcc_is_buggy; } __raw_spinlock_t;
 #define __RAW_SPIN_LOCK_UNLOCKED (raw_spinlock_t) { 0 }
 #endif
 
@@ -40,7 +40,7 @@ typedef struct { int gcc_is_buggy; } raw
 #if (__GNUC__ > 2)
 typedef struct {
 	/* no debug version on UP */
-} raw_rwlock_t;
+} __raw_rwlock_t;
 
 #define __RAW_RW_LOCK_UNLOCKED { }
 #else
Index: linux/include/linux/spinlock_up.h
===================================================================
--- linux.orig/include/linux/spinlock_up.h
+++ linux/include/linux/spinlock_up.h
@@ -29,7 +29,7 @@ static inline void __raw_spin_lock(raw_s
 static inline void
 __raw_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags)
 {
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	lock->slock = 0;
 }
 
Index: linux/include/linux/stop_machine.h
===================================================================
--- linux.orig/include/linux/stop_machine.h
+++ linux/include/linux/stop_machine.h
@@ -43,9 +43,9 @@ static inline int stop_machine_run(int (
 				   unsigned int cpu)
 {
 	int ret;
-	local_irq_disable();
+	raw_local_irq_disable();
 	ret = fn(data);
-	local_irq_enable();
+	raw_local_irq_enable();
 	return ret;
 }
 #endif /* CONFIG_SMP */
Index: linux/include/linux/sunrpc/sched.h
===================================================================
--- linux.orig/include/linux/sunrpc/sched.h
+++ linux/include/linux/sunrpc/sched.h
@@ -203,7 +203,7 @@ struct rpc_wait_queue {
 
 #ifndef RPC_DEBUG
 # define RPC_WAITQ_INIT(var,qname) { \
-		.lock = SPIN_LOCK_UNLOCKED, \
+		.lock = SPIN_LOCK_UNLOCKED(var.lock), \
 		.tasks = { \
 			[0] = LIST_HEAD_INIT(var.tasks[0]), \
 			[1] = LIST_HEAD_INIT(var.tasks[1]), \
@@ -212,7 +212,7 @@ struct rpc_wait_queue {
 	}
 #else
 # define RPC_WAITQ_INIT(var,qname) { \
-		.lock = SPIN_LOCK_UNLOCKED, \
+		.lock = SPIN_LOCK_UNLOCKED(var.lock), \
 		.tasks = { \
 			[0] = LIST_HEAD_INIT(var.tasks[0]), \
 			[1] = LIST_HEAD_INIT(var.tasks[1]), \
Index: linux/include/linux/time.h
===================================================================
--- linux.orig/include/linux/time.h
+++ linux/include/linux/time.h
@@ -4,6 +4,7 @@
 #include <linux/types.h>
 
 #ifdef __KERNEL__
+#include <linux/calc64.h>
 #include <linux/seqlock.h>
 #endif
 
@@ -27,6 +28,10 @@ struct timezone {
 
 #ifdef __KERNEL__
 
+/* timeofday base types */
+typedef s64 nsec_t;
+typedef u64 cycle_t;
+
 /* Parameters used to convert the timespec values */
 #define MSEC_PER_SEC (1000L)
 #define USEC_PER_SEC (1000000L)
@@ -38,48 +43,25 @@ static __inline__ int timespec_equal(str
 	return (a->tv_sec == b->tv_sec) && (a->tv_nsec == b->tv_nsec);
 } 
 
-/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
- * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
- * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
- *
- * [For the Julian calendar (which was used in Russia before 1917,
- * Britain & colonies before 1752, anywhere else before 1582,
- * and is still in use by some communities) leave out the
- * -year/100+year/400 terms, and add 10.]
- *
- * This algorithm was first published by Gauss (I think).
- *
- * WARNING: this function will overflow on 2106-02-07 06:28:16 on
- * machines were long is 32-bit! (However, as time_t is signed, we
- * will already get problems at other places on 2038-01-19 03:14:08)
- */
-static inline unsigned long
+#define timespec_valid(ts) /* full-width tv_nsec compare: no truncation on 64-bit */ \
+(((ts)->tv_sec >= 0) && (((unsigned long) (ts)->tv_nsec) < NSEC_PER_SEC))
+
+extern unsigned long
 mktime (unsigned int year, unsigned int mon,
 	unsigned int day, unsigned int hour,
-	unsigned int min, unsigned int sec)
-{
-	if (0 >= (int) (mon -= 2)) {	/* 1..12 -> 11,12,1..10 */
-		mon += 12;		/* Puts Feb last since it has leap day */
-		year -= 1;
-	}
-
-	return (((
-		(unsigned long) (year/4 - year/100 + year/400 + 367*mon/12 + day) +
-			year*365 - 719499
-	    )*24 + hour /* now have hours */
-	  )*60 + min /* now have minutes */
-	)*60 + sec; /* finally seconds */
-}
+	unsigned int min, unsigned int sec);
 
 extern struct timespec xtime;
 extern struct timespec wall_to_monotonic;
-extern seqlock_t xtime_lock;
+extern raw_seqlock_t xtime_lock;
 
 static inline unsigned long get_seconds(void)
 { 
 	return xtime.tv_sec;
 }
 
+extern void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec);
+
 struct timespec current_kernel_time(void);
 
 #define CURRENT_TIME (current_kernel_time())
@@ -88,31 +70,66 @@ struct timespec current_kernel_time(void
 extern void do_gettimeofday(struct timeval *tv);
 extern int do_settimeofday(struct timespec *tv);
 extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz);
-extern void clock_was_set(void); // call when ever the clock is set
-extern int do_posix_clock_monotonic_gettime(struct timespec *tp);
+extern void do_posix_clock_monotonic_gettime(struct timespec *ts);
 extern long do_utimes(char __user * filename, struct timeval * times);
 struct itimerval;
 extern int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue);
 extern int do_getitimer(int which, struct itimerval *value);
-extern void getnstimeofday (struct timespec *tv);
 
 extern struct timespec timespec_trunc(struct timespec t, unsigned gran);
 
-static inline void
-set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
+/**
+ * timespec_to_ns - Convert timespec to nanoseconds
+ * @ts:		pointer to the timespec variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timespec
+ * variable
+ */
+static inline nsec_t timespec_to_ns(struct timespec *ts)
+{
+	nsec_t res = (nsec_t) ts->tv_sec * NSEC_PER_SEC;
+
+	return res + (nsec_t) ts->tv_nsec;
+}
+
+/**
+ * timeval_to_ns - Convert timeval to nanoseconds
+ * @tv:		pointer to the timeval variable to be converted
+ *
+ * Returns the scalar nanosecond representation of the timeval
+ * variable
+ */
+static inline nsec_t timeval_to_ns(struct timeval *tv)
 {
-	while (nsec > NSEC_PER_SEC) {
-		nsec -= NSEC_PER_SEC;
-		++sec;
+	nsec_t res = (nsec_t) tv->tv_sec * NSEC_PER_SEC;
+
+	return res + (nsec_t) tv->tv_usec * NSEC_PER_USEC;
+}
+
+extern void ns_to_timespec(struct timespec *ts, nsec_t nsec);
+extern void ns_to_timeval(struct timeval *tv, nsec_t nsec);
+
+static inline void normalize_timespec(struct timespec *ts)
+{
+	while (unlikely((unsigned long)ts->tv_nsec >= NSEC_PER_SEC)) {
+		ts->tv_nsec -= NSEC_PER_SEC;
+		ts->tv_sec++;
 	}
-	while (nsec < 0) {
-		nsec += NSEC_PER_SEC;
-		--sec;
+}
+
+static inline void timespec_add_ns(struct timespec *a, nsec_t ns)
+{
+	while(unlikely(ns >= NSEC_PER_SEC)) {
+		ns -= NSEC_PER_SEC;
+		a->tv_sec++;
 	}
-	ts->tv_sec = sec;
-	ts->tv_nsec = nsec;
+	a->tv_nsec += ns;
+	normalize_timespec(a);
 }
 
+extern nsec_t nsleep(nsec_t nsecs);
+extern nsec_t nsleep_interruptible(nsec_t nsecs);
+
 #endif /* __KERNEL__ */
 
 #define NFDBITS			__NFDBITS
@@ -145,23 +162,18 @@ struct	itimerval {
 /*
  * The IDs of the various system clocks (for POSIX.1b interval timers).
  */
-#define CLOCK_REALTIME		  0
-#define CLOCK_MONOTONIC	  1
+#define CLOCK_REALTIME		 0
+#define CLOCK_MONOTONIC	  	 1
 #define CLOCK_PROCESS_CPUTIME_ID 2
 #define CLOCK_THREAD_CPUTIME_ID	 3
-#define CLOCK_REALTIME_HR	 4
-#define CLOCK_MONOTONIC_HR	  5
 
 /*
  * The IDs of various hardware clocks
  */
-
-
 #define CLOCK_SGI_CYCLE 10
 #define MAX_CLOCKS 16
-#define CLOCKS_MASK  (CLOCK_REALTIME | CLOCK_MONOTONIC | \
-                     CLOCK_REALTIME_HR | CLOCK_MONOTONIC_HR)
-#define CLOCKS_MONO (CLOCK_MONOTONIC & CLOCK_MONOTONIC_HR)
+#define CLOCKS_MASK  (CLOCK_REALTIME | CLOCK_MONOTONIC)
+#define CLOCKS_MONO (CLOCK_MONOTONIC)
 
 /*
  * The various flags for setting POSIX.1b interval timers.
Index: linux/include/linux/timeofday.h
===================================================================
--- /dev/null
+++ linux/include/linux/timeofday.h
@@ -0,0 +1,44 @@
+/*  linux/include/linux/timeofday.h
+ *
+ *  This file contains the interface to the time of day subsystem
+ */
+#ifndef _LINUX_TIMEOFDAY_H
+#define _LINUX_TIMEOFDAY_H
+#include <linux/calc64.h>
+#include <linux/types.h>
+#include <linux/ktime.h>
+#include <linux/time.h>
+#include <linux/timex.h>
+
+#ifdef CONFIG_GENERIC_TIME
+
+/* Kernel internal interfaces */
+extern ktime_t get_monotonic_clock(void);
+extern ktime_t get_realtime_clock(void);
+extern ktime_t get_realtime_offset(void);
+
+/* Timespec-based interfaces for user space functionality */
+extern void get_realtime_clock_ts(struct timespec *ts);
+extern void get_monotonic_clock_ts(struct timespec *ts);
+
+/* legacy timeofday interfaces*/
+#define getnstimeofday(ts) get_realtime_clock_ts(ts)
+extern void getnstimeofday(struct timespec *ts);
+extern void do_gettimeofday(struct timeval *tv);
+extern int do_settimeofday(struct timespec *ts);
+
+/* Internal functions */
+extern int timeofday_is_continuous(void);
+extern void timeofday_init(void);
+
+#ifndef CONFIG_IS_TICK_BASED
+#define arch_getoffset() (0)
+#else
+extern unsigned long arch_getoffset(void);
+#endif
+
+#else /* CONFIG_GENERIC_TIME */
+#define timeofday_init()
+extern void getnstimeofday(struct timespec *ts);
+#endif /* CONFIG_GENERIC_TIME */
+#endif /* _LINUX_TIMEOFDAY_H */
Index: linux/include/linux/timer.h
===================================================================
--- linux.orig/include/linux/timer.h
+++ linux/include/linux/timer.h
@@ -79,10 +79,12 @@ static inline void add_timer(struct time
 	__mod_timer(timer, timer->expires);
 }
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+  extern int timer_pending_sync(struct timer_list *timer);
   extern int try_to_del_timer_sync(struct timer_list *timer);
   extern int del_timer_sync(struct timer_list *timer);
 #else
+# define timer_pending_sync(t)		timer_pending(t)
 # define try_to_del_timer_sync(t)	del_timer(t)
 # define del_timer_sync(t)		del_timer(t)
 #endif
@@ -91,6 +93,6 @@ static inline void add_timer(struct time
 
 extern void init_timers(void);
 extern void run_local_timers(void);
-extern void it_real_fn(unsigned long);
+extern void it_real_fn(void *);
 
 #endif
Index: linux/include/linux/timex.h
===================================================================
--- linux.orig/include/linux/timex.h
+++ linux/include/linux/timex.h
@@ -260,6 +260,8 @@ extern long pps_calcnt;		/* calibration 
 extern long pps_errcnt;		/* calibration errors */
 extern long pps_stbcnt;		/* stability limit exceeded */
 
+extern raw_seqlock_t ntp_lock;
+
 /**
  * ntp_clear - Clears the NTP state variables
  *
@@ -267,10 +269,14 @@ extern long pps_stbcnt;		/* stability li
  */
 static inline void ntp_clear(void)
 {
+	unsigned long flags;
+
+	write_seqlock_irqsave(&ntp_lock, flags);
 	time_adjust = 0;		/* stop active adjtime() */
 	time_status |= STA_UNSYNC;
 	time_maxerror = NTP_PHASE_LIMIT;
 	time_esterror = NTP_PHASE_LIMIT;
+	write_sequnlock_irqrestore(&ntp_lock, flags);
 }
 
 /**
@@ -282,6 +288,30 @@ static inline int ntp_synced(void)
 	return !(time_status & STA_UNSYNC);
 }
 
+/**
+ * ntp_get_ppm_adjustment - Returns Shifted PPM adjustment
+ */
+extern long ntp_get_ppm_adjustment(void);
+
+/**
+ * ntp_advance - Advances the NTP state machine by interval_ns
+ */
+extern void ntp_advance(unsigned long interval_ns);
+
+/**
+ * ntp_leapsecond - NTP leapsecond processing code.
+ */
+extern int ntp_leapsecond(struct timespec now);
+
+
+/* Required to safely shift negative values */
+#define shift_right(x, s) ({	\
+	__typeof__(x) __x = (x);	\
+	__typeof__(s) __s = (s);	\
+	__x < 0 ? -(-__x >> __s) : __x >> __s;	\
+})
+
+#ifndef CONFIG_GENERIC_TIME
 
 #ifdef CONFIG_TIME_INTERPOLATION
 
@@ -337,6 +367,7 @@ time_interpolator_reset(void)
 }
 
 #endif /* !CONFIG_TIME_INTERPOLATION */
+#endif /* !CONFIG_GENERIC_TIME */
 
 #endif /* KERNEL */
 
Index: linux/include/linux/wait.h
===================================================================
--- linux.orig/include/linux/wait.h
+++ linux/include/linux/wait.h
@@ -48,11 +48,13 @@ struct wait_bit_queue {
 	wait_queue_t wait;
 };
 
+#if 1
 struct __wait_queue_head {
 	spinlock_t lock;
 	struct list_head task_list;
 };
 typedef struct __wait_queue_head wait_queue_head_t;
+#endif
 
 
 /*
@@ -68,7 +70,7 @@ typedef struct __wait_queue_head wait_qu
 	wait_queue_t name = __WAITQUEUE_INITIALIZER(name, tsk)
 
 #define __WAIT_QUEUE_HEAD_INITIALIZER(name) {				\
-	.lock		= SPIN_LOCK_UNLOCKED,				\
+	.lock		= SPIN_LOCK_UNLOCKED(name.lock),		\
 	.task_list	= { &(name).task_list, &(name).task_list } }
 
 #define DECLARE_WAIT_QUEUE_HEAD(name) \
Index: linux/include/linux/workqueue.h
===================================================================
--- linux.orig/include/linux/workqueue.h
+++ linux/include/linux/workqueue.h
@@ -54,6 +54,8 @@ extern struct workqueue_struct *__create
 						    int singlethread);
 #define create_workqueue(name) __create_workqueue((name), 0)
 #define create_singlethread_workqueue(name) __create_workqueue((name), 1)
+extern void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+				int rt_priority, int nice);
 
 extern void destroy_workqueue(struct workqueue_struct *wq);
 
Index: linux/include/net/dn_dev.h
===================================================================
--- linux.orig/include/net/dn_dev.h
+++ linux/include/net/dn_dev.h
@@ -76,9 +76,9 @@ struct dn_dev_parms {
 	int priority;             /* Priority to be a router            */
 	char *name;               /* Name for sysctl                    */
 	int ctl_name;             /* Index for sysctl                   */
-	int  (*up)(struct net_device *);
-	void (*down)(struct net_device *);
-	void (*timer3)(struct net_device *, struct dn_ifaddr *ifa);
+	int  (*dn_up)(struct net_device *);
+	void (*dn_down)(struct net_device *);
+	void (*dn_timer3)(struct net_device *, struct dn_ifaddr *ifa);
 	void *sysctl;
 };
 
Index: linux/include/net/sock.h
===================================================================
--- linux.orig/include/net/sock.h
+++ linux/include/net/sock.h
@@ -608,12 +608,12 @@ static inline void sk_refcnt_debug_relea
 /* Called with local bh disabled */
 static __inline__ void sock_prot_inc_use(struct proto *prot)
 {
-	prot->stats[smp_processor_id()].inuse++;
+	prot->stats[raw_smp_processor_id()].inuse++;
 }
 
 static __inline__ void sock_prot_dec_use(struct proto *prot)
 {
-	prot->stats[smp_processor_id()].inuse--;
+	prot->stats[raw_smp_processor_id()].inuse--;
 }
 
 /* With per-bucket locks this operation is not-atomic, so that
@@ -735,8 +735,8 @@ extern void FASTCALL(lock_sock(struct so
 extern void FASTCALL(release_sock(struct sock *sk));
 
 /* BH context may only use the following locking interface. */
-#define bh_lock_sock(__sk)	spin_lock(&((__sk)->sk_lock.slock))
-#define bh_unlock_sock(__sk)	spin_unlock(&((__sk)->sk_lock.slock))
+#define bh_lock_sock(__sk)	do { spin_lock(&((__sk)->sk_lock.slock)); } while (0)
+#define bh_unlock_sock(__sk)	do { spin_unlock(&((__sk)->sk_lock.slock)); } while (0)
 
 extern struct sock		*sk_alloc(int family,
 					  gfp_t priority,
Index: linux/include/pcmcia/ss.h
===================================================================
--- linux.orig/include/pcmcia/ss.h
+++ linux/include/pcmcia/ss.h
@@ -242,7 +242,7 @@ struct pcmcia_socket {
 #endif
 
 	/* state thread */
-	struct semaphore		skt_sem;	/* protects socket h/w state */
+	struct compat_semaphore		skt_sem;	/* protects socket h/w state */
 
 	struct task_struct		*thread;
 	struct completion		thread_done;
Index: linux/include/scsi/scsi_host.h
===================================================================
--- linux.orig/include/scsi/scsi_host.h
+++ linux/include/scsi/scsi_host.h
@@ -467,8 +467,8 @@ struct Scsi_Host {
 
 	struct list_head	eh_cmd_q;
 	struct task_struct    * ehandler;  /* Error recovery thread. */
-	struct semaphore      * eh_action; /* Wait for specific actions on the
-                                          host. */
+	struct compat_semaphore * eh_action; /* Wait for specific actions on the
+                                                host. */
 	unsigned int            eh_active:1; /* Indicates the eh thread is awake and active if
                                           this is true. */
 	wait_queue_head_t       host_wait;
Index: linux/include/scsi/scsi_transport_spi.h
===================================================================
--- linux.orig/include/scsi/scsi_transport_spi.h
+++ linux/include/scsi/scsi_transport_spi.h
@@ -51,7 +51,7 @@ struct spi_transport_attrs {
 	unsigned int support_qas; /* supports quick arbitration and selection */
 	/* Private Fields */
 	unsigned int dv_pending:1; /* Internal flag */
-	struct semaphore dv_sem; /* semaphore to serialise dv */
+	struct compat_semaphore dv_sem; /* semaphore to serialise dv */
 };
 
 enum spi_signal_type {
Index: linux/include/sound/timer.h
===================================================================
--- linux.orig/include/sound/timer.h
+++ linux/include/sound/timer.h
@@ -25,6 +25,7 @@
 
 #include <sound/asound.h>
 #include <linux/interrupt.h>
+#include <linux/timeofday.h>
 
 typedef enum sndrv_timer_class snd_timer_class_t;
 typedef enum sndrv_timer_slave_class snd_timer_slave_class_t;
Index: linux/init/main.c
===================================================================
--- linux.orig/init/main.c
+++ linux/init/main.c
@@ -45,8 +45,11 @@
 #include <linux/efi.h>
 #include <linux/unistd.h>
 #include <linux/rmap.h>
+#include <linux/irq.h>
 #include <linux/mempolicy.h>
 #include <linux/key.h>
+#include <linux/timeofday.h>
+#include <linux/clockchips.h>
 #include <net/sock.h>
 
 #include <asm/io.h>
@@ -391,14 +394,16 @@ static void __init smp_init(void)
 static void noinline rest_init(void)
 	__releases(kernel_lock)
 {
+	system_state = SYSTEM_BOOTING_SCHEDULER_OK;
+
 	kernel_thread(init, NULL, CLONE_FS | CLONE_SIGHAND);
 	numa_default_policy();
 	unlock_kernel();
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 
 	/*
 	 * The boot idle thread must execute schedule()
-	 * at least one to get things moving:
+	 * at least once to get things moving:
 	 */
 	schedule();
 
@@ -444,6 +449,7 @@ asmlinkage void __init start_kernel(void
 {
 	char * command_line;
 	extern struct kernel_param __start___param[], __stop___param[];
+
 /*
  * Interrupts are still disabled. Do necessary setups, then
  * enable them
@@ -472,8 +478,10 @@ asmlinkage void __init start_kernel(void
 	 * fragile until we cpu_idle() for the first time.
 	 */
 	preempt_disable();
+
 	build_all_zonelists();
 	page_alloc_init();
+	early_init_hardirqs();
 	printk(KERN_NOTICE "Kernel command line: %s\n", saved_command_line);
 	parse_early_param();
 	parse_args("Booting kernel", command_line, __start___param,
@@ -484,8 +492,11 @@ asmlinkage void __init start_kernel(void
 	rcu_init();
 	init_IRQ();
 	pidhash_init();
+	init_clockevents();
 	init_timers();
+	ktimers_init();
 	softirq_init();
+	timeofday_init();
 	time_init();
 
 	/*
@@ -497,7 +508,12 @@ asmlinkage void __init start_kernel(void
 	if (panic_later)
 		panic(panic_later, panic_param);
 	profile_init();
-	local_irq_enable();
+
+	/*
+	 * Soft IRQ state will be enabled with the hard state.
+	 */
+	raw_local_irq_enable();
+
 #ifdef CONFIG_BLK_DEV_INITRD
 	if (initrd_start && !initrd_below_start_ok &&
 			initrd_start < min_low_pfn << PAGE_SHIFT) {
@@ -542,6 +558,9 @@ asmlinkage void __init start_kernel(void
 
 	acpi_early_init(); /* before LAPIC and SMP init */
 
+#ifdef CONFIG_PREEMPT_RT
+	WARN_ON(raw_irqs_disabled());
+#endif
 	/* Do the rest non-__init'ed, we're now alive */
 	rest_init();
 }
@@ -584,6 +603,12 @@ static void __init do_initcalls(void)
 			msg = "disabled interrupts";
 			local_irq_enable();
 		}
+#ifdef CONFIG_PREEMPT_RT
+		if (raw_irqs_disabled()) {
+			msg = "disabled hard interrupts";
+			raw_local_irq_enable();
+		}
+#endif
 		if (msg) {
 			printk(KERN_WARNING "error in initcall at 0x%p: "
 				"returned with %s\n", *call, msg);
@@ -621,6 +646,7 @@ static void __init do_basic_setup(void)
 static void do_pre_smp_initcalls(void)
 {
 	extern int spawn_ksoftirqd(void);
+	extern int spawn_desched_task(void);
 #ifdef CONFIG_SMP
 	extern int migration_init(void);
 
@@ -628,6 +654,7 @@ static void do_pre_smp_initcalls(void)
 #endif
 	spawn_ksoftirqd();
 	spawn_softlockup_task();
+	spawn_desched_task();
 }
 
 static void run_init_process(char *init_filename)
@@ -674,6 +701,8 @@ static int init(void * unused)
 	/* Sets up cpus_possible() */
 	smp_prepare_cpus(max_cpus);
 
+	init_hardirqs();
+
 	do_pre_smp_initcalls();
 
 	fixup_cpu_present_map();
@@ -703,6 +732,50 @@ static int init(void * unused)
 		prepare_namespace();
 	}
 
+#define DEBUG_COUNT (defined(CONFIG_DEBUG_RT_LOCKING_MODE) + defined(CONFIG_DEBUG_DEADLOCKS) + defined(CONFIG_DEBUG_PREEMPT) + defined(CONFIG_CRITICAL_PREEMPT_TIMING) + defined(CONFIG_CRITICAL_IRQSOFF_TIMING) + defined(CONFIG_LATENCY_TRACE) + defined(CONFIG_DEBUG_SLAB) + defined(CONFIG_DEBUG_PAGEALLOC))
+
+#if DEBUG_COUNT > 0
+	printk(KERN_ERR "*****************************************************************************\n");
+	printk(KERN_ERR "*                                                                           *\n");
+#if DEBUG_COUNT == 1
+	printk(KERN_ERR "*  REMINDER, the following debugging option is turned on in your .config:   *\n");
+#else
+	printk(KERN_ERR "*  REMINDER, the following debugging options are turned on in your .config: *\n");
+#endif
+	printk(KERN_ERR "*                                                                           *\n");
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	printk(KERN_ERR "*        CONFIG_DEBUG_RT_LOCKING_MODE                                       *\n");
+#endif
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	printk(KERN_ERR "*        CONFIG_DEBUG_DEADLOCKS                                             *\n");
+#endif
+#ifdef CONFIG_DEBUG_PREEMPT
+	printk(KERN_ERR "*        CONFIG_DEBUG_PREEMPT                                               *\n");
+#endif
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	printk(KERN_ERR "*        CONFIG_CRITICAL_PREEMPT_TIMING                                     *\n");
+#endif
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+	printk(KERN_ERR "*        CONFIG_CRITICAL_IRQSOFF_TIMING                                     *\n");
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	printk(KERN_ERR "*        CONFIG_LATENCY_TRACE                                               *\n");
+#endif
+#ifdef CONFIG_DEBUG_SLAB
+	printk(KERN_ERR "*        CONFIG_DEBUG_SLAB                                                  *\n");
+#endif
+#ifdef CONFIG_DEBUG_PAGEALLOC
+	printk(KERN_ERR "*        CONFIG_DEBUG_PAGEALLOC                                             *\n");
+#endif
+	printk(KERN_ERR "*                                                                           *\n");
+#if DEBUG_COUNT == 1
+	printk(KERN_ERR "*  it may increase runtime overhead and latencies.                          *\n");
+#else
+	printk(KERN_ERR "*  they may increase runtime overhead and latencies.                        *\n");
+#endif
+	printk(KERN_ERR "*                                                                           *\n");
+	printk(KERN_ERR "*****************************************************************************\n");
+#endif
 	/*
 	 * Ok, we have completed the initial bootup, and
 	 * we're essentially up and running. Get rid of the
@@ -724,6 +797,9 @@ static int init(void * unused)
 		printk(KERN_WARNING "Failed to execute %s\n",
 				ramdisk_execute_command);
 	}
+#ifdef CONFIG_PREEMPT_RT
+	WARN_ON(raw_irqs_disabled() || irqs_disabled());
+#endif
 
 	/*
 	 * We try each of these until one succeeds.
Index: linux/ipc/mqueue.c
===================================================================
--- linux.orig/ipc/mqueue.c
+++ linux/ipc/mqueue.c
@@ -765,12 +765,17 @@ static inline void pipelined_send(struct
 				  struct msg_msg *message,
 				  struct ext_wait_queue *receiver)
 {
+	/*
+	 * Keep them in one critical section for PREEMPT_RT:
+	 */
+	preempt_disable();
 	receiver->msg = message;
 	list_del(&receiver->list);
 	receiver->state = STATE_PENDING;
 	wake_up_process(receiver->task);
 	smp_wmb();
 	receiver->state = STATE_READY;
+	preempt_enable();
 }
 
 /* pipelined_receive() - if there is task waiting in sys_mq_timedsend()
Index: linux/ipc/msg.c
===================================================================
--- linux.orig/ipc/msg.c
+++ linux/ipc/msg.c
@@ -164,6 +164,11 @@ static void expunge_all(struct msg_queue
 	tmp = msq->q_receivers.next;
 	while (tmp != &msq->q_receivers) {
 		struct msg_receiver* msr;
+		/*
+		 * Make sure that the wakeup doesn't preempt
+		 * _this_ CPU prematurely. (on PREEMPT_RT)
+		 */
+		preempt_disable();
 		
 		msr = list_entry(tmp,struct msg_receiver,r_list);
 		tmp = tmp->next;
@@ -171,6 +176,8 @@ static void expunge_all(struct msg_queue
 		wake_up_process(msr->r_tsk);
 		smp_mb();
 		msr->r_msg = ERR_PTR(res);
+
+		preempt_enable();
 	}
 }
 /* 
@@ -532,7 +539,13 @@ static inline int pipelined_send(struct 
 		if(testmsg(msg,msr->r_msgtype,msr->r_mode) &&
 		   !security_msg_queue_msgrcv(msq, msg, msr->r_tsk, msr->r_msgtype, msr->r_mode)) {
 			list_del(&msr->r_list);
+			/*
+			 * Make sure that the wakeup doesn't preempt
+			 * _this_ CPU prematurely. (on PREEMPT_RT)
+			 */
+			preempt_disable();
 			if(msr->r_maxsize < msg->m_ts) {
+
 				msr->r_msg = NULL;
 				wake_up_process(msr->r_tsk);
 				smp_mb();
@@ -544,8 +557,10 @@ static inline int pipelined_send(struct 
 				wake_up_process(msr->r_tsk);
 				smp_mb();
 				msr->r_msg = msg;
+				preempt_enable();
 				return 1;
 			}
+			preempt_enable();
 		}
 	}
 	return 0;
Index: linux/ipc/sem.c
===================================================================
--- linux.orig/ipc/sem.c
+++ linux/ipc/sem.c
@@ -361,6 +361,11 @@ static void update_queue (struct sem_arr
 		if (error <= 0) {
 			struct sem_queue *n;
 			remove_from_queue(sma,q);
+			/*
+			 * Make sure that the wakeup doesn't preempt
+			 * _this_ CPU prematurely. (on PREEMPT_RT)
+			 */
+			preempt_disable();
 			q->status = IN_WAKEUP;
 			/*
 			 * Continue scanning. The next operation
@@ -382,6 +387,7 @@ static void update_queue (struct sem_arr
 			 * writing q->status.
 			 */
 			q->status = error;
+			preempt_enable();
 			q = n;
 		} else {
 			q = q->next;
Index: linux/kernel/Kconfig.preempt
===================================================================
--- linux.orig/kernel/Kconfig.preempt
+++ linux/kernel/Kconfig.preempt
@@ -1,14 +1,13 @@
-
 choice
-	prompt "Preemption Model"
-	default PREEMPT_NONE
+	prompt "Preemption Mode"
+	default PREEMPT_RT
 
 config PREEMPT_NONE
 	bool "No Forced Preemption (Server)"
 	help
-	  This is the traditional Linux preemption model, geared towards
+	  This is the traditional Linux preemption model geared towards
 	  throughput. It will still provide good latencies most of the
-	  time, but there are no guarantees and occasional longer delays
+	  time but there are no guarantees and occasional long delays
 	  are possible.
 
 	  Select this option if you are building a kernel for a server or
@@ -21,7 +20,7 @@ config PREEMPT_VOLUNTARY
 	help
 	  This option reduces the latency of the kernel by adding more
 	  "explicit preemption points" to the kernel code. These new
-	  preemption points have been selected to reduce the maximum
+	  preemption points have been selected to minimize the maximum
 	  latency of rescheduling, providing faster application reactions,
 	  at the cost of slighly lower throughput.
 
@@ -33,33 +32,133 @@ config PREEMPT_VOLUNTARY
 
 	  Select this if you are building a kernel for a desktop system.
 
-config PREEMPT
+config PREEMPT_DESKTOP
 	bool "Preemptible Kernel (Low-Latency Desktop)"
 	help
 	  This option reduces the latency of the kernel by making
-	  all kernel code (that is not executing in a critical section)
+	  all kernel code that is not executing in a critical section
 	  preemptible.  This allows reaction to interactive events by
 	  permitting a low priority process to be preempted involuntarily
 	  even if it is in kernel mode executing a system call and would
-	  otherwise not be about to reach a natural preemption point.
-	  This allows applications to run more 'smoothly' even when the
-	  system is under load, at the cost of slighly lower throughput
-	  and a slight runtime overhead to kernel code.
+	  otherwise not be about to reach a preemption point.  This allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of slightly lower throughput and a
+	  slight runtime overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 50% of the time.)
 
 	  Select this if you are building a kernel for a desktop or
 	  embedded system with latency requirements in the milliseconds
 	  range.
 
+config PREEMPT_RT
+	bool "Complete Preemption (Real-Time)"
+	select PREEMPT_SOFTIRQS
+	select PREEMPT_HARDIRQS
+	select PREEMPT_RCU
+	help
+	  This option further reduces the scheduling latency of the
+	  kernel by replacing almost every spinlock used by the kernel
+	  with preemptible mutexes and thus making all but the most
+	  critical kernel code involuntarily preemptible. The remaining
+	  handful of lowlevel non-preemptible codepaths are short and
+	  have a deterministic latency of a couple of tens of
+	  microseconds (depending on the hardware).  This also allows
+	  applications to run more 'smoothly' even when the system is
+	  under load, at the cost of lower throughput and runtime
+	  overhead to kernel code.
+
+	  (According to profiles, when this mode is selected then even
+	  during kernel-intense workloads the system is in an immediately
+	  preemptible state more than 95% of the time.)
+
+	  Select this if you are building a kernel for a desktop,
+	  embedded or real-time system with guaranteed latency
+	  requirements of 100 usecs or lower.
+
 endchoice
 
-config PREEMPT_BKL
-	bool "Preempt The Big Kernel Lock"
-	depends on SMP || PREEMPT
+config PREEMPT
+	bool
 	default y
+	depends on PREEMPT_DESKTOP || PREEMPT_RT
+
+config PREEMPT_SOFTIRQS
+	bool "Thread Softirqs"
+	default n
+#	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by 'threading'
+          soft interrupts. This means that all softirqs will execute
+          in softirqd's context. While this helps latency, it can also
+          reduce performance.
+
+          The threading of softirqs can also be controlled via
+          /proc/sys/kernel/softirq_preemption runtime flag and the
+          softirq-preempt=0/1 boot-time option.
+
+	  Say N if you are unsure.
+
+config PREEMPT_HARDIRQS
+	bool "Thread Hardirqs"
+	default n
+#	depends on PREEMPT
 	help
-	  This option reduces the latency of the kernel by making the
-	  big kernel lock preemptible.
+	  This option reduces the latency of the kernel by 'threading'
+          hardirqs. This means that all (or selected) hardirqs will run
+          in their own kernel thread context. While this helps latency,
+          this feature can also reduce performance.
+
+          The threading of hardirqs can also be controlled via the
+          /proc/sys/kernel/hardirq_preemption runtime flag and the
+          hardirq-preempt=0/1 boot-time option. Per-irq threading can
+          be enabled/disabled via the /proc/irq/<IRQ>/<handler>/threaded
+          runtime flags.
+
+	  Say N if you are unsure.
+
+config SPINLOCK_BKL
+	bool "Old-Style Big Kernel Lock"
+	depends on (PREEMPT || SMP) && !PREEMPT_RT
+	default n
+	help
+	  This option increases the latency of the kernel by making the
+	  big kernel lock spinlock-based (which is bad for latency).
+	  However, if you see any problems, enable this option to revert
+	  to the traditional spinlock BKL design.
 
 	  Say Y here if you are building a kernel for a desktop system.
 	  Say N if you are unsure.
 
+config PREEMPT_BKL
+	bool
+	depends on PREEMPT_RT || !SPINLOCK_BKL
+	default n if !PREEMPT
+	default y
+
+config PREEMPT_RCU
+	bool "Preemptible RCU"
+	default n
+	depends on PREEMPT
+	help
+	  This option reduces the latency of the kernel by making certain
+	  RCU sections preemptible. Normally RCU code is non-preemptible; if
+	  this option is selected then read-only RCU sections become
+	  preemptible. This helps latency, but may expose bugs due to
+	  now-naive assumptions about each RCU read-side critical section
+	  remaining on a given CPU through its execution.
+
+	  Say N if you are unsure.
+
+config RCU_STATS
+	bool "/proc stats for preemptible RCU read-side critical sections"
+	depends on PREEMPT_RCU
+	default y
+	help
+	  This option provides /proc stats to provide debugging info for
+	  the preemptible realtime RCU implementation.
+
+	  Say Y here if you want to see RCU stats in /proc
+	  Say N if you are unsure.
Index: linux/kernel/Makefile
===================================================================
--- linux.orig/kernel/Makefile
+++ linux/kernel/Makefile
@@ -7,7 +7,13 @@ obj-y     = sched.o fork.o exec_domain.o
 	    sysctl.o capability.o ptrace.o timer.o user.o \
 	    signal.o sys.o kmod.o workqueue.o pid.o \
 	    rcupdate.o intermodule.o extable.o params.o posix-timers.o \
-	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o
+	    kthread.o wait.o kfifo.o sys_ni.o posix-cpu-timers.o \
+	    ktimers.o rt.o
+
+obj-$(CONFIG_GENERIC_TIME) += time/
+obj-$(CONFIG_DEBUG_PREEMPT) += latency.o
+obj-$(CONFIG_LATENCY_TIMING) += latency.o
+obj-$(CONFIG_LATENCY_HIST) += latency_hist.o
 
 obj-$(CONFIG_FUTEX) += futex.o
 obj-$(CONFIG_GENERIC_ISA_DMA) += dma.o
@@ -32,6 +38,7 @@ obj-$(CONFIG_DETECT_SOFTLOCKUP) += softl
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_CRASH_DUMP) += crash_dump.o
 obj-$(CONFIG_SECCOMP) += seccomp.o
+obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
 
 ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
Index: linux/kernel/acct.c
===================================================================
--- linux.orig/kernel/acct.c
+++ linux/kernel/acct.c
@@ -88,7 +88,7 @@ struct acct_glbs {
 	struct timer_list	timer;
 };
 
-static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED};
+static struct acct_glbs acct_globals __cacheline_aligned = {SPIN_LOCK_UNLOCKED(acct_globals.lock)};
 
 /*
  * Called whenever the timer says to check the free space.
Index: linux/kernel/audit.c
===================================================================
--- linux.orig/kernel/audit.c
+++ linux/kernel/audit.c
@@ -614,7 +614,7 @@ err:
 
 unsigned int audit_serial(void)
 {
-	static spinlock_t serial_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(serial_lock);
 	static unsigned int serial = 0;
 
 	unsigned long flags;
Index: linux/kernel/exit.c
===================================================================
--- linux.orig/kernel/exit.c
+++ linux/kernel/exit.c
@@ -28,6 +28,7 @@
 #include <linux/cpuset.h>
 #include <linux/syscalls.h>
 #include <linux/signal.h>
+#include <linux/futex.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -49,8 +50,11 @@ static void __unhash_process(struct task
 	if (thread_group_leader(p)) {
 		detach_pid(p, PIDTYPE_PGID);
 		detach_pid(p, PIDTYPE_SID);
-		if (p->pid)
+		if (p->pid) {
+			preempt_disable();
 			__get_cpu_var(process_counts)--;
+			preempt_enable();
+		}
 	}
 
 	REMOVE_LINKS(p);
@@ -71,7 +75,6 @@ repeat: 
 		__ptrace_unlink(p);
 	BUG_ON(!list_empty(&p->ptrace_list) || !list_empty(&p->ptrace_children));
 	__exit_signal(p);
-	__exit_sighand(p);
 	/*
 	 * Note that the fastpath in sys_times depends on __exit_signal having
 	 * updated the counters before a task is removed from the tasklist of
@@ -387,8 +390,10 @@ static inline void close_files(struct fi
 		while (set) {
 			if (set & 1) {
 				struct file * file = xchg(&fdt->fd[i], NULL);
-				if (file)
+				if (file) {
 					filp_close(file, files);
+					cond_resched();
+				}
 			}
 			i++;
 			set >>= 1;
@@ -522,9 +527,11 @@ static void exit_mm(struct task_struct *
 	if (mm != tsk->active_mm) BUG();
 	/* more a memory barrier than a real lock */
 	task_lock(tsk);
+	preempt_disable(); // FIXME
 	tsk->mm = NULL;
 	up_read(&mm->mmap_sem);
 	enter_lazy_tlb(mm, current);
+	preempt_enable();
 	task_unlock(tsk);
 	mmput(mm);
 }
@@ -783,10 +790,6 @@ static void exit_notify(struct task_stru
 	/* If the process is dead, release it - nobody will wait for it */
 	if (state == EXIT_DEAD)
 		release_task(tsk);
-
-	/* PF_DEAD causes final put_task_struct after we schedule. */
-	preempt_disable();
-	tsk->flags |= PF_DEAD;
 }
 
 fastcall NORET_TYPE void do_exit(long code)
@@ -842,10 +845,11 @@ fastcall NORET_TYPE void do_exit(long co
 	update_mem_hiwater(tsk);
 	group_dead = atomic_dec_and_test(&tsk->signal->live);
 	if (group_dead) {
- 		del_timer_sync(&tsk->signal->real_timer);
+ 		ktimer_cancel(&tsk->signal->real_timer);
 		exit_itimers(tsk->signal);
 		acct_process(code);
 	}
+	exit_futex(tsk);
 	exit_mm(tsk);
 
 	exit_sem(tsk);
@@ -869,12 +873,18 @@ fastcall NORET_TYPE void do_exit(long co
 	mpol_free(tsk->mempolicy);
 	tsk->mempolicy = NULL;
 #endif
-
-	BUG_ON(!(current->flags & PF_DEAD));
-	schedule();
-	BUG();
-	/* Avoid "noreturn function does return".  */
-	for (;;) ;
+	check_no_held_locks(tsk);
+	/* PF_DEAD causes final put_task_struct after we schedule. */
+again:
+	raw_local_irq_disable();
+	tsk->flags |= PF_DEAD;
+	__schedule();
+	printk(KERN_ERR "BUG: dead task %s:%d back from the grave!\n",
+		current->comm, current->pid);
+	printk(KERN_ERR ".... flags: %08lx, count: %d, state: %08lx\n",
+		current->flags, atomic_read(&current->usage), current->state);
+	printk(KERN_ERR ".... trying again ...\n");
+	goto again;
 }
 
 EXPORT_SYMBOL_GPL(do_exit);
@@ -1374,6 +1384,7 @@ repeat:
 		list_for_each(_p,&tsk->children) {
 			p = list_entry(_p,struct task_struct,sibling);
 
+			BUG_ON(!atomic_read(&p->usage));
 			ret = eligible_child(pid, options, p);
 			if (!ret)
 				continue;
Index: linux/kernel/fork.c
===================================================================
--- linux.orig/kernel/fork.c
+++ linux/kernel/fork.c
@@ -42,6 +42,8 @@
 #include <linux/profile.h>
 #include <linux/rmap.h>
 #include <linux/acct.h>
+#include <linux/kthread.h>
+#include <linux/notifier.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -64,6 +66,16 @@ DEFINE_PER_CPU(unsigned long, process_co
 
 EXPORT_SYMBOL(tasklist_lock);
 
+/*
+ * Delayed mmdrop/put_task_struct. In the PREEMPT_RT case we
+ * don't want to do this from the scheduling context.
+ */
+static DEFINE_PER_CPU(struct task_struct *, desched_task);
+
+static DEFINE_PER_CPU(struct list_head, delayed_put_list);
+static DEFINE_PER_CPU(struct list_head, delayed_drop_list);
+
+
 int nr_processes(void)
 {
 	int cpu;
@@ -108,6 +120,8 @@ EXPORT_SYMBOL(free_task);
 
 void __put_task_struct(struct task_struct *tsk)
 {
+	BUG_ON(atomic_read(&tsk->usage));
+	WARN_ON(!(tsk->flags & PF_DEAD));
 	WARN_ON(!(tsk->exit_state & (EXIT_DEAD | EXIT_ZOMBIE)));
 	WARN_ON(atomic_read(&tsk->usage));
 	WARN_ON(tsk == current);
@@ -122,8 +136,33 @@ void __put_task_struct(struct task_struc
 		free_task(tsk);
 }
 
+#if 0
+
+void put_task_struct(struct task_struct *tsk)
+{
+	BUG_ON(!atomic_read(&tsk->usage));
+
+	if (!atomic_dec_and_test(&tsk->usage))
+		return;
+	__put_task_struct(tsk);
+}
+
+EXPORT_SYMBOL(put_task_struct);
+
+void get_task_struct(struct task_struct *tsk)
+{
+	BUG_ON(!atomic_read(&tsk->usage));
+	atomic_inc(&tsk->usage);
+}
+
+EXPORT_SYMBOL(get_task_struct);
+
+#endif
+
 void __init fork_init(unsigned long mempages)
 {
+	int i;
+
 #ifndef __HAVE_ARCH_TASK_STRUCT_ALLOCATOR
 #ifndef ARCH_MIN_TASKALIGN
 #define ARCH_MIN_TASKALIGN	L1_CACHE_BYTES
@@ -151,6 +190,11 @@ void __init fork_init(unsigned long memp
 	init_task.signal->rlim[RLIMIT_NPROC].rlim_max = max_threads/2;
 	init_task.signal->rlim[RLIMIT_SIGPENDING] =
 		init_task.signal->rlim[RLIMIT_NPROC];
+
+	for (i = 0; i < NR_CPUS; i++) {
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, i));
+		INIT_LIST_HEAD(&per_cpu(delayed_put_list, i));
+	}
 }
 
 static struct task_struct *dup_task_struct(struct task_struct *orig)
@@ -327,6 +371,7 @@ static struct mm_struct * mm_init(struct
 	rwlock_init(&mm->ioctx_list_lock);
 	mm->ioctx_list = NULL;
 	mm->default_kioctx = (struct kioctx)INIT_KIOCTX(mm->default_kioctx, *mm);
+	INIT_LIST_HEAD(&mm->delayed_drop);
 	mm->free_area_cache = TASK_UNMAPPED_BASE;
 	mm->cached_hole_size = ~0UL;
 
@@ -754,6 +799,14 @@ int unshare_files(void)
 
 EXPORT_SYMBOL(unshare_files);
 
+void sighand_free_cb(struct rcu_head *rhp)
+{
+	struct sighand_struct *sp =
+		container_of(rhp, struct sighand_struct, rcu);
+
+	kmem_cache_free(sighand_cachep, sp);
+}
+
 static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk)
 {
 	struct sighand_struct *sig;
@@ -804,10 +857,9 @@ static inline int copy_signal(unsigned l
 	init_sigpending(&sig->shared_pending);
 	INIT_LIST_HEAD(&sig->posix_timers);
 
-	sig->it_real_value = sig->it_real_incr = 0;
+	ktimer_init(&sig->real_timer);
 	sig->real_timer.function = it_real_fn;
-	sig->real_timer.data = (unsigned long) tsk;
-	init_timer(&sig->real_timer);
+	sig->real_timer.data = tsk;
 
 	sig->it_virt_expires = cputime_zero;
 	sig->it_virt_incr = cputime_zero;
@@ -983,6 +1035,12 @@ static task_t *copy_process(unsigned lon
  		goto bad_fork_cleanup;
  	}
 #endif
+	INIT_LIST_HEAD(&p->delayed_put);
+	preempt_disable();
+	plist_init(&p->pi_waiters, MAX_PRIO);
+	preempt_enable();
+	p->blocked_on = NULL; /* not blocked yet */
+	spin_lock_init(&p->pi_lock);
 
 	p->tgid = p->pid;
 	if (clone_flags & CLONE_THREAD)
@@ -1012,6 +1070,9 @@ static task_t *copy_process(unsigned lon
 	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
 		goto bad_fork_cleanup_namespace;
+#ifdef CONFIG_DEBUG_PREEMPT
+	p->lock_count = 0;
+#endif
 
 	p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
 	/*
@@ -1061,10 +1122,12 @@ static task_t *copy_process(unsigned lon
 	 * to ensure it is on a valid CPU (and if not, just force it back to
 	 * parent's CPU). This avoids alot of nasty races.
 	 */
+	preempt_disable();
 	p->cpus_allowed = current->cpus_allowed;
 	if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) ||
 			!cpu_online(task_cpu(p))))
 		set_task_cpu(p, smp_processor_id());
+	preempt_enable();
 
 	/*
 	 * Check for pending SIGKILL! The new thread should not be allowed
@@ -1142,8 +1205,11 @@ static task_t *copy_process(unsigned lon
 	if (thread_group_leader(p)) {
 		attach_pid(p, PIDTYPE_PGID, process_group(p));
 		attach_pid(p, PIDTYPE_SID, p->signal->session);
-		if (p->pid)
+		if (p->pid) {
+			preempt_disable();
 			__get_cpu_var(process_counts)++;
+			preempt_enable();
+		}
 	}
 
 	if (!current->signal->tty && p->signal->tty)
@@ -1322,3 +1388,168 @@ void __init proc_caches_init(void)
 			sizeof(struct mm_struct), 0,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL, NULL);
 }
+
+static int put_task_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+	/* Drain this CPU's delayed_put_list; ret becomes 1 once a task was put. */
+	head = &get_cpu_var(delayed_put_list);
+	while (!list_empty(head)) {
+		struct task_struct *task = list_entry(head->next,
+					struct task_struct, delayed_put);
+		list_del(&task->delayed_put);
+		put_cpu_var(delayed_put_list);
+		/* the per-CPU variable is released around __put_task_struct() */
+		__put_task_struct(task);
+		ret = 1;
+		/* re-acquire the list head and re-test it from scratch */
+		head = &get_cpu_var(delayed_put_list);
+	}
+	put_cpu_var(delayed_put_list);
+
+	return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus we queue
+ * the task on this CPU's delayed_put_list and wake the desched thread:
+ */
+void fastcall __put_task_struct_delayed(struct task_struct *task)
+{
+	struct task_struct *desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_put_list);
+	list_add_tail(&task->delayed_put, head);
+	desched_task = __get_cpu_var(desched_task);
+	if (desched_task)	/* NULL when no desched thread is set for this CPU */
+		wake_up_process(desched_task);
+	put_cpu_var(delayed_put_list);
+}
+
+void put_task_struct_delayed(struct task_struct *tsk)
+{
+	/* dropping a reference when the count is already zero is a bug */
+	BUG_ON(!atomic_read(&tsk->usage));
+	/* only the final reference queues the task for delayed release */
+	if (atomic_dec_and_test(&tsk->usage))
+		__put_task_struct_delayed(tsk);
+}
+
+static int mmdrop_complete(void)
+{
+	struct list_head *head;
+	int ret = 0;
+	/* Drain this CPU's delayed_drop_list; ret becomes 1 once an mm was dropped. */
+	head = &get_cpu_var(delayed_drop_list);
+	while (!list_empty(head)) {
+		struct mm_struct *mm = list_entry(head->next,
+					struct mm_struct, delayed_drop);
+		list_del(&mm->delayed_drop);
+		put_cpu_var(delayed_drop_list);
+		/* the per-CPU variable is released around __mmdrop() */
+		__mmdrop(mm);
+		ret = 1;
+		/* re-acquire the list head and re-test it from scratch */
+		head = &get_cpu_var(delayed_drop_list);
+	}
+	put_cpu_var(delayed_drop_list);
+
+	return ret;
+}
+
+/*
+ * We don't want to do complex work from the scheduler, thus we queue
+ * the mm on this CPU's delayed_drop_list and wake the desched thread:
+ */
+void fastcall __mmdrop_delayed(struct mm_struct *mm)
+{
+	struct task_struct *desched_task;
+	struct list_head *head;
+
+	head = &get_cpu_var(delayed_drop_list);
+	list_add_tail(&mm->delayed_drop, head);
+	desched_task = __get_cpu_var(desched_task);
+	if (desched_task)	/* NULL when no desched thread is set for this CPU */
+		wake_up_process(desched_task);
+	put_cpu_var(delayed_drop_list);
+}
+
+static int desched_thread(void * __bind_cpu)
+{
+	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
+	/* Worker loop (__bind_cpu is unused): drain both lists, sleep if both were empty. */
+	set_current_state(TASK_INTERRUPTIBLE);
+
+	while (!kthread_should_stop()) {
+		int ret;
+
+		ret = put_task_complete();
+		ret |= mmdrop_complete();
+		if (ret)	/* something was released: look again before sleeping */
+			continue;
+		schedule();
+		set_current_state(TASK_INTERRUPTIBLE);
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int __devinit cpu_callback(struct notifier_block *nfb,
+				  unsigned long action,
+				  void *hcpu)
+{
+	int hotcpu = (unsigned long)hcpu;
+	struct task_struct *p;
+	/* CPU notifier: manages the per-CPU "desched/N" thread across CPU events. */
+	switch (action) {
+	case CPU_UP_PREPARE:
+		/* init both per-CPU lists, then create the thread and bind it to hotcpu */
+		BUG_ON(per_cpu(desched_task, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_drop_list, hotcpu));
+		INIT_LIST_HEAD(&per_cpu(delayed_put_list, hotcpu));
+		p = kthread_create(desched_thread, hcpu, "desched/%d", hotcpu);
+		if (IS_ERR(p)) {
+			printk("desched_thread for %i failed\n", hotcpu);
+			return NOTIFY_BAD;
+		}
+  		per_cpu(desched_task, hotcpu) = p;
+		kthread_bind(p, hotcpu);
+ 		break;
+	case CPU_ONLINE:
+		/* wake the thread that CPU_UP_PREPARE created */
+		wake_up_process(per_cpu(desched_task, hotcpu));
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_UP_CANCELED:
+
+		/* Unbind so it can run.  Fall thru. */
+		kthread_bind(per_cpu(desched_task, hotcpu), smp_processor_id());
+	case CPU_DEAD:
+		/* NOTE(review): takeover_tasklets() is not defined in the visible code - confirm it is reachable here */
+		p = per_cpu(desched_task, hotcpu);
+		per_cpu(desched_task, hotcpu) = NULL;
+		kthread_stop(p);
+		takeover_tasklets(hotcpu);
+		break;
+#endif /* CONFIG_HOTPLUG_CPU */
+ 	}
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __devinitdata cpu_nfb = {
+	.notifier_call = cpu_callback	/* registered by spawn_desched_task() */
+};
+
+__init int spawn_desched_task(void)
+{
+	void *cpu = (void *)(long)smp_processor_id();
+	/* run the notifier by hand for the current CPU; its return values are ignored */
+	cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
+	cpu_callback(&cpu_nfb, CPU_ONLINE, cpu);
+	register_cpu_notifier(&cpu_nfb);
+	return 0;
+}
+
Index: linux/kernel/futex.c
===================================================================
--- linux.orig/kernel/futex.c
+++ linux/kernel/futex.c
@@ -8,6 +8,9 @@
  *  Removed page pinning, fix privately mapped COW pages and other cleanups
  *  (C) Copyright 2003, 2004 Jamie Lokier
  *
+ *  Robust futexes added by Todd Kneisel
+ *  (C) Copyright 2005, Bull HN.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -92,14 +95,16 @@ struct futex_q {
 	/* For fd, sigio sent using these. */
 	int fd;
 	struct file *filp;
+
+	struct futex_robust *robust;
 };
 
 /*
  * Split the global futex_lock into every hash list lock.
  */
 struct futex_hash_bucket {
-       spinlock_t              lock;
-       struct list_head       chain;
+	spinlock_t		lock;
+	struct list_head	chain;
 };
 
 static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
@@ -140,7 +145,8 @@ static inline int match_futex(union fute
  *
  * Should be called with &current->mm->mmap_sem but NOT any spinlocks.
  */
-static int get_futex_key(unsigned long uaddr, union futex_key *key)
+static int get_futex_key(unsigned long uaddr, union futex_key *key,
+			struct list_head **list, struct semaphore **sem)
 {
 	struct mm_struct *mm = current->mm;
 	struct vm_area_struct *vma;
@@ -163,6 +169,14 @@ static int get_futex_key(unsigned long u
 	if (unlikely(!vma))
 		return -EFAULT;
 
+	if (vma->vm_file && vma->vm_file->f_mapping) {
+		*list = &vma->vm_file->f_mapping->robust_list;
+		*sem = &vma->vm_file->f_mapping->robust_sem;
+	} else {
+		*list = NULL;
+		*sem = NULL;
+	}
+
 	/*
 	 * Permissions.
 	 */
@@ -301,11 +315,12 @@ static int futex_wake(unsigned long uadd
 	struct futex_hash_bucket *bh;
 	struct list_head *head;
 	struct futex_q *this, *next;
+	struct semaphore *sem;
 	int ret;
 
 	down_read(&current->mm->mmap_sem);
 
-	ret = get_futex_key(uaddr, &key);
+	ret = get_futex_key(uaddr, &key, &head, &sem);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -336,16 +351,17 @@ static int futex_wake_op(unsigned long u
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head;
+	struct semaphore *sem;
 	struct futex_q *this, *next;
 	int ret, op_ret, attempt = 0;
 
 retryfull:
 	down_read(&current->mm->mmap_sem);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, &key1, &head, &sem);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, &key2, &head, &sem);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -449,16 +465,17 @@ static int futex_requeue(unsigned long u
 	union futex_key key1, key2;
 	struct futex_hash_bucket *bh1, *bh2;
 	struct list_head *head1;
+	struct semaphore *sem;
 	struct futex_q *this, *next;
 	int ret, drop_count = 0;
 
  retry:
 	down_read(&current->mm->mmap_sem);
 
-	ret = get_futex_key(uaddr1, &key1);
+	ret = get_futex_key(uaddr1, &key1, &head1, &sem);
 	if (unlikely(ret != 0))
 		goto out;
-	ret = get_futex_key(uaddr2, &key2);
+	ret = get_futex_key(uaddr2, &key2, &head1, &sem);
 	if (unlikely(ret != 0))
 		goto out;
 
@@ -623,11 +640,13 @@ static int futex_wait(unsigned long uadd
 	int ret, curval;
 	struct futex_q q;
 	struct futex_hash_bucket *bh;
+	struct list_head *head;
+	struct semaphore *sem;
 
  retry:
 	down_read(&current->mm->mmap_sem);
 
-	ret = get_futex_key(uaddr, &q.key);
+	ret = get_futex_key(uaddr, &q.key, &head, &sem);
 	if (unlikely(ret != 0))
 		goto out_release_sem;
 
@@ -701,8 +720,13 @@ static int futex_wait(unsigned long uadd
 	 * !list_empty() is safe here without any lock.
 	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
 	 */
-	if (likely(!list_empty(&q.list)))
+	if (likely(!list_empty(&q.list))) {
+		unsigned long nosched_flag = current->flags & PF_NOSCHED;
+
+		current->flags &= ~PF_NOSCHED;
 		time = schedule_timeout(time);
+		current->flags |= nosched_flag;
+	}
 	__set_current_state(TASK_RUNNING);
 
 	/*
@@ -765,6 +789,8 @@ static int futex_fd(unsigned long uaddr,
 {
 	struct futex_q *q;
 	struct file *filp;
+	struct list_head *head;
+	struct semaphore *sem;
 	int ret, err;
 
 	ret = -EINVAL;
@@ -800,7 +826,7 @@ static int futex_fd(unsigned long uaddr,
 	}
 
 	down_read(&current->mm->mmap_sem);
-	err = get_futex_key(uaddr, &q->key);
+	err = get_futex_key(uaddr, &q->key, &head, &sem);
 
 	if (unlikely(err != 0)) {
 		up_read(&current->mm->mmap_sem);
@@ -828,8 +854,629 @@ error:
 	goto out;
 }
 
+/*
+ * Robust futexes provide a locking mechanism that can be shared between
+ * user mode processes. The major difference between robust futexes and
+ * regular futexes is that when the owner of a robust futex dies, the
+ * next task waiting on the futex will be awakened, will get ownership
+ * of the futex lock, and will receive the error status EOWNERDEAD.
+ *
+ * A robust futex is a 32 bit integer stored in user mode shared memory.
+ * Bit 31 indicates that there are tasks waiting on the futex.
+ * Bit 30 indicates that the task that owned the futex has died.
+ * Bit 29 indicates that the futex is not recoverable and cannot be used.
+ * Bits 0-28 are the pid of the task that owns the futex lock, or zero if
+ * the futex is not locked.
+ */
+
+/*
+ * Used to track registered robust futexes. Linked into a mapping's robust_list.
+ */
+struct futex_robust {
+	struct list_head list;		/* entry in the mapping's robust_list */
+	union futex_key key;		/* compared with match_futex() on lookup */
+	struct rt_mutex futex_mutex;	/* taken/released via down_futex()/up_futex() */
+};
+
+/*
+ * There really isn't an atomic page fault, so we put the burden on the
+ * user: each helper tries the access twice, and if futex_get_user() or
+ * futex_put_user() still returns -EFAULT it is avoiding a race condition
+ * and the user will have to try again.
+ */
+static int futex_put_user(int value, unsigned long uaddr)
+{
+	int __user *uptr = (int __user *)uaddr;
+
+	/* one retry: only report -EFAULT if the second store fails too */
+	if (put_user(value, uptr) == 0 || put_user(value, uptr) == 0)
+		return 0;
+	return -EFAULT;
+}
+
+static int futex_get_user(unsigned long uaddr)
+{
+	int __user *uptr = (int __user *)uaddr;
+	int value = 0;
+	/* one retry; -EFAULT is returned in-band when both loads fail */
+	if (get_user(value, uptr) && get_user(value, uptr))
+		value = -EFAULT;
+	return value;
+}
+
+/**
+ * futex_wake_robust - wake a task that is waiting on a robust futex
+ * @uaddr: user space address of the robust futex
+ *
+ * Called from user space (through sys_futex syscall) when unlocking a
+ * robust futex with %FUTEX_WAITERS set; uncontended unlock stays in user
+ * space. Returns 1 on a normal unlock, 0 on the owner-died path, or -errno.
+ */
+static int futex_wake_robust(unsigned long uaddr)
+{
+	struct thread_info *ti = current_thread_info();
+	union futex_key key;
+	struct list_head *head = NULL;
+	struct semaphore *sem = NULL;
+	struct futex_robust *this, *next;
+	int ret;
+	int value;
+	int found = 0;
+
+retry:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key, &head, &sem);
+	if (ret != 0)
+		goto out;
+	if (head == NULL) {	/* get_futex_key() found no file mapping for uaddr */
+		ret = -EINVAL;
+		goto out;
+	}
+	down(sem);
+
+	ret = get_futex_value_locked(&value, (int __user *)uaddr);
+
+	if (unlikely(ret)) {
+		up(sem);
+		/* If we would have faulted, release mmap_sem, fault it in and
+		 * start all over again.
+		 */
+		up_read(&current->mm->mmap_sem);
+
+		ret = get_user(value, (int __user *)uaddr);
+
+		if (!ret)
+			goto retry;
+		return ret;
+	}
+
+	/*
+	 * if the owner died, mark the futex as not recoverable
+	 * and wake up all waiting tasks.
+	 */
+	if (value & FUTEX_OWNER_DIED) {
+		if (put_user(FUTEX_OWNER_DIED | FUTEX_NOT_RECOVERABLE,
+				(int __user *) uaddr)) {
+			ret = -EFAULT;
+			goto out_unlock;
+		}
+		list_for_each_entry_safe(this, next, head, list) {
+			if (match_futex (&this->key, &key)) {
+				up_futex(&this->futex_mutex);
+			}
+		}
+		goto out_unlock;	/* ret is still 0 from get_futex_value_locked() */
+	}
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key)) {
+			found++;
+			if ((rt_mutex_owner(&this->futex_mutex)) != ti) {	/* caller is not the owner */
+				ret = -EINVAL;
+				goto out_unlock;
+			}
+			if (!rt_mutex_has_waiters(&this->futex_mutex)) {
+				value = 0;	/* no waiters: the whole futex word is cleared */
+				break;
+			} else {
+				value |= FUTEX_WAITERS;
+				break;
+			}
+
+		}
+	}
+	if (!found) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+
+	if (futex_put_user(value, uaddr)) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+	up_futex(&this->futex_mutex);	/* 'this' is the entry the loop stopped on */
+	ret = 1;
+
+out_unlock:
+	up(sem);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/**
+ * futex_wait_robust - add current task to wait queue of a robust futex
+ * @uaddr: user space address of the robust futex
+ * @time:  timeout in jiffies, passed through to down_futex()
+ *
+ * Called from user space (through sys_futex syscall) when locking a
+ * robust futex. Only called if the futex is already locked by another
+ * task. Uncontended locking is done entirely in user space.
+ */
+static int futex_wait_robust(unsigned long uaddr, unsigned long time)
+{
+	int ret, curval;
+	struct futex_robust *this, *next;
+	struct list_head *head = NULL;
+	struct semaphore *sem = NULL;
+	union futex_key key;
+	pid_t owner_pid;
+	int found = 0;
+
+ retry:
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key, &head, &sem);
+	if (ret != 0)
+		goto out;
+	if (head == NULL) {	/* get_futex_key() found no file mapping for uaddr */
+		ret = -EINVAL;
+		goto out;
+	}
+	down(sem);
+
+	ret = get_futex_value_locked(&curval, (int __user *)uaddr);
+
+	if (unlikely(ret)) {
+		up(sem);
+
+		/* If we would have faulted, release mmap_sem, fault it in and
+		 * start all over again.
+		 */
+		up_read(&current->mm->mmap_sem);
+
+		ret = get_user(curval, (int __user *)uaddr);
+
+		if (!ret)
+			goto retry;
+		return ret;
+	}
+
+	owner_pid = curval & FUTEX_PID;
+	/*
+	 * user mode called us because futex was owned by a task,
+	 * but now it's not. Let user mode try again.
+	 */
+	if (curval == 0) {
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+	if ((curval & FUTEX_PID) == current->pid) {	/* we already own it */
+		ret = -EAGAIN;
+		goto out_unlock;
+	}
+
+	/* if owner has died, we don't want to wait */
+	if ((curval & FUTEX_OWNER_DIED)) {
+		ret = -EOWNERDEAD;
+		goto out_unlock;
+	}
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex(&this->key, &key)) {
+			found++;
+			break;
+		}
+	}
+	if (!found) {
+		ret = -EINVAL;
+		goto out_unlock;
+	}
+	get_key_refs(&key);
+	/* NOTE(review): sem goes to down_futex(); the return below does no up()/up_read() - confirm where they are dropped */
+	ret = down_futex(&this->futex_mutex, time, owner_pid, sem);
+	if (ret >= 0) {
+		curval = futex_get_user(uaddr);	/* NOTE(review): -EFAULT is not checked before masking */
+		curval &= ~FUTEX_PID;
+		curval |= current->pid;
+		if (rt_mutex_has_waiters(&this->futex_mutex))
+			curval |= FUTEX_WAITERS;
+		ret = futex_put_user(curval, uaddr);
+		if (curval & FUTEX_OWNER_DIED) {	/* overrides the futex_put_user() result */
+			ret = -EOWNERDEAD;
+		}
+	}
+	return ret;
+
+out_unlock:
+	up(sem);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/**
+ * futex_free_robust_list - release the list of registered futexes.
+ * @inode: inode that may be a memory mapped file
+ *
+ * Called from dput() when a dentry reference count reaches zero.
+ * If the dentry is associated with a memory mapped file, then
+ * unlink and free every robust futex registration that hangs off
+ * that file's mapping. A NULL @inode or mapping is a no-op.
+ */
+void futex_free_robust_list(struct inode *inode)
+{
+	struct address_space *mapping;
+	struct list_head *head;
+	struct futex_robust *entry, *tmp;
+
+	if (!inode)
+		return;
+
+	mapping = inode->i_mapping;
+	if (!mapping)
+		return;
+
+	/* unlocked peek: mappings without robust futexes need no work */
+	head = &mapping->robust_list;
+	if (list_empty(head))
+		return;
+
+	down(&mapping->robust_sem);
+
+	/* every entry is unlinked and freed, hence the _safe iterator */
+	list_for_each_entry_safe(entry, tmp, head, list) {
+		list_del(&entry->list);
+		kfree(entry);
+	}
+
+	up(&mapping->robust_sem);
+}
+
+/**
+ * get_private_uaddr - convert a private futex_key to a user addr
+ * @key: the futex_key that identifies a futex.
+ *
+ * Private futex_keys identify a futex that is in non-shared memory.
+ * Robust futexes should never result in private futex_keys, but keep
+ * this code for completeness.
+ * Returns zero if futex is not contained in current task's mm
+ */
+static unsigned long get_private_uaddr(union futex_key *key)
+{
+	/* a private key only resolves inside the mm it was recorded for */
+	if (key->private.mm != current->mm)
+		return 0;
+
+	return key->private.uaddr;
+}
+
+/**
+ * get_shared_uaddr - convert a shared futex_key to a user addr.
+ * @key: a futex_key that identifies a futex.
+ * @vma: a vma that may contain the futex; vma->vm_file is used unchecked
+ *
+ * Shared futex_keys identify a futex that is contained in a vma,
+ * and so may be shared.
+ * Returns zero if futex is not contained in @vma
+ */
+static unsigned long get_shared_uaddr(union futex_key *key,
+				      struct vm_area_struct *vma)
+{
+	struct address_space *mapping = vma->vm_file->f_mapping;
+	unsigned long addr;
+
+	/* the key must refer to the inode backing this vma */
+	if (key->shared.inode != mapping->host)
+		return 0;
+
+	/* page delta from the vma start, plus the offset without its low tag bit */
+	addr = ((key->shared.pgoff - vma->vm_pgoff) << PAGE_SHIFT)
+			+ (key->shared.offset & ~0x1)
+			+ vma->vm_start;
+	if (addr < vma->vm_start || addr >= vma->vm_end)
+		return 0;
+	return addr;
+}
+
+/**
+ * get_futex_uaddr - convert a futex_key to a user addr.
+ * @key: futex_key that identifies a futex
+ * @vma: vma that may contain the futex
+ *
+ * Converts both shared and private futex_keys.
+ * Returns zero if futex is not contained in @vma or in the current
+ * task's mm.
+ */
+static unsigned long get_futex_uaddr(union futex_key *key,
+				     struct vm_area_struct *vma)
+{
+	/*
+	 * Bit 0 of the offset tells the two key flavours apart:
+	 * clear means a private key, set means a shared one.
+	 */
+	if (key->both.offset & 0x1)
+		return get_shared_uaddr(key, vma);
+
+	return get_private_uaddr(key);
+}
+
+/**
+ * find_owned_futex - find futexes owned by the current task
+ * @vma: the vma to search for futexes
+ * @head: list head for list of robust futexes
+ * @sem: semaphore that protects the list
+ *
+ * Walk the list of registered robust futexes for this @vma, setting
+ * %FUTEX_OWNER_DIED (for %FUTEX_ATTR_ROBUST entries) on those owned by the
+ * current, exiting task. Entered and left with current->mm->mmap_sem read-held.
+ */
+static void find_owned_futex(struct vm_area_struct *vma, struct list_head *head,
+				struct semaphore *sem)
+{
+	struct thread_info *ti = current_thread_info();
+	struct futex_robust *this, *next;
+ 	unsigned long uaddr;
+	int value;
+
+	down(sem);
+
+	list_for_each_entry_safe(this, next, head, list) {
+		/* skip entries whose futex does not resolve to an address here */
+		uaddr = get_futex_uaddr(&this->key, vma);
+		if (uaddr == 0)
+			continue;
+		/* both locks are dropped around the user-space accesses below */
+		up(sem);
+		up_read(&current->mm->mmap_sem);
+		value = futex_get_user(uaddr);	/* NOTE(review): -EFAULT is not checked */
+		if (this->futex_mutex.mutex_attr & FUTEX_ATTR_ROBUST)
+			value |= FUTEX_OWNER_DIED;
+		if (rt_mutex_owned_by(&this->futex_mutex, ti)) {
+			futex_put_user(value, uaddr);
+			up_futex(&this->futex_mutex);
+		} else if ((value & FUTEX_PID) == current->pid) {
+			/*
+			 * this bit is for the fast path.  If the lock is only
+			 * locked in user space we need to unlock it
+			 * for the exiting thread.
+			 */
+			value &= ~FUTEX_PID;
+			futex_put_user(value, uaddr);
+		}
+		down_read(&current->mm->mmap_sem);
+		down(sem);	/* NOTE(review): list was unprotected meanwhile - confirm 'next' is still valid */
+	}
+
+	up(sem);
+}
+
+/**
+ * exit_futex - futex processing when a task exits.
+ * @tsk: exiting task; only NULL-checked here, the walk uses current->mm
+ *
+ * Called from do_exit() when a task exits. Mark all robust futexes
+ * that are owned by the current terminating task as %FUTEX_OWNER_DIED.
+ */
+void exit_futex(struct task_struct *tsk)
+{
+	struct mm_struct *mm;
+	struct vm_area_struct *vma;
+	struct list_head *list;
+	struct semaphore *sem;
+
+	if (tsk==NULL)
+		return;
+
+	mm = current->mm;
+	if (mm==NULL)	/* no mm to scan */
+		return;
+
+	down_read(&mm->mmap_sem);
+
+	for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+		if (vma->vm_file == NULL)
+			continue;
+		if (vma->vm_file->f_mapping == NULL)
+			continue;
+		/* only file-backed vmas carry a robust_list (via their mapping) */
+		list = &vma->vm_file->f_mapping->robust_list;
+		sem = &vma->vm_file->f_mapping->robust_sem;
+		if (list_empty(list))
+			continue;
+
+		find_owned_futex(vma, list, sem);
+	}
+
+	up_read(&mm->mmap_sem);
+}
+
+/**
+ * futex_register - Record the existence of a robust futex in a mapping.
+ * @uaddr: user space address of the robust futex
+ * @attr: extra attribute bits, OR-ed into the rt_mutex's mutex_attr
+ *
+ * Called via sys_futex when a robust futex is created. Adds an entry to
+ * the robust_list of the file mapping backing @uaddr; 0 or -errno.
+ */
+static int futex_register(unsigned long uaddr, unsigned int attr)
+{
+	int ret;
+	struct futex_robust *robust;
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	struct list_head *head = NULL;
+	struct semaphore *sem = NULL;
+
+	robust = kmalloc(sizeof(*robust), GFP_KERNEL);
+	down_read(&current->mm->mmap_sem);	/* taken first so 'out' can always up_read() */
+	if (!robust) {
+		ret = -ENOMEM;
+		goto out;
+	}
+	preempt_disable();
+	init_rt_mutex(&robust->futex_mutex, 1, "futex", __FILE__, __LINE__);
+	preempt_enable();
+	robust->futex_mutex.mutex_attr = FUTEX_ATTR_PRIORITY_QUEUING;
+	robust->futex_mutex.mutex_attr |= attr;
+	/*
+	 * priority queueing is default on robust mutexes (set above).
+	 */
+
+	ret = get_futex_key(uaddr, &robust->key, &head, &sem);
+	if (unlikely(ret != 0))	{
+		kfree(robust);
+		goto out;
+	}
+	/* head/sem from get_futex_key() are recomputed from the vma below */
+	vma = find_extend_vma(mm, uaddr);
+	if (unlikely(!vma)) {
+		ret = -EFAULT;
+		kfree(robust);
+		goto out;
+	}
+
+	if (vma->vm_file && vma->vm_file->f_mapping) {
+		head = &vma->vm_file->f_mapping->robust_list;
+		sem = &vma->vm_file->f_mapping->robust_sem;
+	} else {
+		ret = -EINVAL;
+		kfree(robust);
+		goto out;
+	}
+
+	down(sem);
+	list_add_tail(&robust->list, head);
+	up(sem);
+
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/**
+ * futex_deregister - Delete robust futex registration from a mapping
+ * @uaddr: user space address of the robust futex
+ *
+ * Called from user space (through sys_futex syscall) when a robust
+ * futex is destroyed. Removes the matching entry from the mapping's
+ * robust_list. Returns -EBUSY if another task owns the lock; note that
+ * 0 is also returned when no matching entry was found.
+ */
+static int futex_deregister(unsigned long uaddr)
+{
+	struct thread_info *ti = current_thread_info();
+	union futex_key key;
+	struct list_head *head = NULL;
+	struct semaphore *sem = NULL;
+	struct futex_robust *this, *next;
+	int ret;
+
+	down_read(&current->mm->mmap_sem);
+
+	ret = get_futex_key(uaddr, &key, &head, &sem);
+	if (unlikely(ret != 0))
+		goto out;
+	if (head == NULL) {	/* get_futex_key() found no file mapping for uaddr */
+		ret = -EINVAL;
+		goto out;
+	}
+	down(sem);
+
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex (&this->key, &key)) {
+			/*
+			 * unlock it if we own it,  don't allow
+			 * deregister if someone else owns the lock
+			 */
+			if (rt_mutex_owned_by(&this->futex_mutex, ti)) {
+				up_futex(&this->futex_mutex);
+			} else if (rt_mutex_owner(&this->futex_mutex) != NULL) {
+				ret = -EBUSY;
+				break;
+			}
+			list_del(&this->list);	/* reached when we owned it or nobody did */
+			kfree(this);
+			break;
+		}
+	}
+
+	up(sem);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
+/**
+ * futex_recover - Recover a futex after its owner died
+ * @uaddr: user space address of the robust futex
+ *
+ * Called from user space (through sys_futex syscall). When a task dies
+ * while owning a robust futex, the futex is marked %FUTEX_OWNER_DIED and
+ * ownership passes to the next waiter, which may call this to clear the
+ * flag. Returns 0 on success, -EINVAL if @uaddr is not a registered
+ * robust futex owned by the caller, or -EFAULT on a user access fault.
+ */
+static int futex_recover(unsigned long uaddr)
+{
+	struct thread_info *ti = current_thread_info();
+	int ret = 0;
+	int value = 0;
+	union futex_key key;
+	struct futex_robust *this, *next;
+	struct list_head *head = NULL;
+	struct semaphore *sem = NULL;
+
+	down_read(&current->mm->mmap_sem);
+	ret = get_futex_key(uaddr, &key, &head, &sem);
+	if (ret != 0)
+		goto out;
+	if (head == NULL) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	down(sem);
+	/* -EINVAL unless a registered futex owned by us is found below */
+	ret = -EINVAL;
+	list_for_each_entry_safe(this, next, head, list) {
+		if (match_futex(&this->key, &key)) {
+			/* can't recover a futex we don't own */
+			if (rt_mutex_owned_by(&this->futex_mutex, ti))
+				ret = 0;
+			break;
+		}
+	}
+	if (ret != 0)
+		goto out_unlock;
+
+	if ((value = futex_get_user(uaddr)) == -EFAULT) {
+		ret = -EFAULT;
+		goto out_unlock;
+	}
+
+	value &= ~FUTEX_OWNER_DIED;
+	ret = futex_put_user(value, uaddr);
+out_unlock:
+	up(sem);
+out:
+	up_read(&current->mm->mmap_sem);
+	return ret;
+}
+
 long do_futex(unsigned long uaddr, int op, int val, unsigned long timeout,
-		unsigned long uaddr2, int val2, int val3)
+	      unsigned long uaddr2, int val2, int val3)
 {
 	int ret;
 
@@ -853,6 +1500,21 @@ long do_futex(unsigned long uaddr, int o
 	case FUTEX_WAKE_OP:
 		ret = futex_wake_op(uaddr, uaddr2, val, val2, val3);
 		break;
+	case FUTEX_WAIT_ROBUST:
+		ret = futex_wait_robust(uaddr, timeout);
+		break;
+	case FUTEX_WAKE_ROBUST:
+		ret = futex_wake_robust(uaddr);
+		break;
+	case FUTEX_REGISTER:
+		ret = futex_register(uaddr, val);
+		break;
+	case FUTEX_DEREGISTER:
+		ret = futex_deregister(uaddr);
+		break;
+	case FUTEX_RECOVER:
+		ret = futex_recover(uaddr);
+		break;
 	default:
 		ret = -ENOSYS;
 	}
@@ -868,7 +1530,7 @@ asmlinkage long sys_futex(u32 __user *ua
 	unsigned long timeout = MAX_SCHEDULE_TIMEOUT;
 	int val2 = 0;
 
-	if ((op == FUTEX_WAIT) && utime) {
+	if ((op == FUTEX_WAIT || op == FUTEX_WAIT_ROBUST) && utime) {
 		if (copy_from_user(&t, utime, sizeof(t)) != 0)
 			return -EFAULT;
 		timeout = timespec_to_jiffies(&t) + 1;
@@ -876,7 +1538,7 @@ asmlinkage long sys_futex(u32 __user *ua
 	/*
 	 * requeue parameter in 'utime' if op == FUTEX_REQUEUE.
 	 */
-	if (op >= FUTEX_REQUEUE)
+	if (op == FUTEX_REQUEUE || op == FUTEX_CMP_REQUEUE)
 		val2 = (int) (unsigned long) utime;
 
 	return do_futex((unsigned long)uaddr, op, val, timeout,
Index: linux/kernel/irq/Makefile
===================================================================
--- linux.orig/kernel/irq/Makefile
+++ linux/kernel/irq/Makefile
@@ -1,5 +1,5 @@
 
-obj-y := handle.o manage.o spurious.o
+obj-y := handle.o manage.o spurious.o resend.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 
Index: linux/kernel/irq/autoprobe.c
===================================================================
--- linux.orig/kernel/irq/autoprobe.c
+++ linux/kernel/irq/autoprobe.c
@@ -7,6 +7,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
Index: linux/kernel/irq/handle.c
===================================================================
--- linux.orig/kernel/irq/handle.c
+++ linux/kernel/irq/handle.c
@@ -1,68 +1,75 @@
 /*
  * linux/kernel/irq/handle.c
  *
- * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 1992, 1998-2005 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005, Thomas Gleixner, Russell King
  *
  * This file contains the core interrupt handling code.
+ *
+ * Detailed information is available in Documentation/DocBook/genericirq
+ *
  */
 
 #include <linux/irq.h>
 #include <linux/module.h>
 #include <linux/random.h>
+#include <linux/kallsyms.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 
+#if defined(CONFIG_NO_IDLE_HZ)
+#include <asm/dyntick.h>
+#endif
+
 #include "internals.h"
 
 /*
- * Linux has a controller-independent interrupt architecture.
- * Every controller has a 'controller-template', that is used
- * by the main code to do the right thing. Each driver-visible
- * interrupt source is transparently wired to the apropriate
- * controller. Thus drivers need not be aware of the
- * interrupt-controller.
- *
- * The code is designed to be easily extended with new/different
- * interrupt controllers, without having to do assembly magic or
- * having to touch the generic code.
- *
- * Controller mappings for all interrupt sources:
+ * Default initialization for all interrupt sources
  */
 irq_desc_t irq_desc[NR_IRQS] __cacheline_aligned = {
 	[0 ... NR_IRQS-1] = {
 		.status = IRQ_DISABLED,
 		.handler = &no_irq_type,
-		.lock = SPIN_LOCK_UNLOCKED
+ 		.lock = RAW_SPIN_LOCK_UNLOCKED,
+		.depth = 1,
 	}
 };
 
+EXPORT_SYMBOL_GPL(irq_desc);
+
 /*
- * Generic 'no controller' code
+ * What should we do if we get a hw irq event on an illegal vector?
+ * Each architecture has to answer this for itself.
  */
-static void end_none(unsigned int irq) { }
-static void enable_none(unsigned int irq) { }
-static void disable_none(unsigned int irq) { }
-static void shutdown_none(unsigned int irq) { }
-static unsigned int startup_none(unsigned int irq) { return 0; }
-
-static void ack_none(unsigned int irq)
+static void ack_bad(unsigned int irq)
 {
-	/*
-	 * 'what should we do if we get a hw irq event on an illegal vector'.
-	 * each architecture has to answer this themself.
-	 */
 	ack_bad_irq(irq);
 }
 
-struct hw_interrupt_type no_irq_type = {
+/*
+ * NOP functions: placeholders for irq_type callbacks that have nothing to do
+ */
+static void noop(unsigned int irq)
+{
+}
+
+static unsigned int noop_ret(unsigned int irq)
+{
+	return 0;
+}
+
+/*
+ * Generic no controller implementation
+ */
+struct irq_type no_irq_type = {
 	.typename = 	"none",
-	.startup = 	startup_none,
-	.shutdown = 	shutdown_none,
-	.enable = 	enable_none,
-	.disable = 	disable_none,
-	.ack = 		ack_none,
-	.end = 		end_none,
-	.set_affinity = NULL
+	.startup = 	noop_ret,
+	.shutdown = 	noop,
+	.enable = 	noop,
+	.disable = 	noop,
+	.ack = 		ack_bad,
+	.end = 		noop,
+	.handle_irq =	handle_bad_irq,
 };
 
 /*
@@ -74,42 +81,459 @@ irqreturn_t no_action(int cpl, void *dev
 }
 
 /*
- * Have got an event to handle:
+ * default ack function
+ */
+static void default_ack(unsigned int irq)
+{
+	irq_desc[irq].chip->ack(irq);
+}
+
+/*
+ * Mask and ack an irq; use the chip's combined mask_ack if it has one
+ */
+static void default_mask_ack(unsigned int irq)
+{
+	struct irq_chip *chip = irq_desc[irq].chip;
+
+	if (!chip->mask_ack) {
+		chip->mask(irq);
+		chip->ack(irq);
+		return;
+	}
+	chip->mask_ack(irq);
+}
+
+/*
+ * default enable function: unmask at the chip, then clear IRQ_MASKED
+ */
+static void default_enable(unsigned int irq)
+{
+	irq_desc_t *desc = irq_desc + irq;
+
+	desc->chip->unmask(irq);
+	desc->status &= ~IRQ_MASKED;
+}
+
+/*
+ * default end function: unmask again, unless desc->depth is non-zero
+ */
+static void default_end(unsigned int irq)
+{
+	irq_desc_t *desc = irq_desc + irq;
+
+	if (!desc->depth)
+		desc->chip->unmask(irq);
+}
+
+/*
+ * default disable function: mask the irq at the chip
+ */
+static void default_disable(unsigned int irq)
+{
+	irq_desc[irq].chip->mask(irq);
+}
+
+/*
+ * Default set type function: let the chip program @type (if it can) and
+ * return the matching default irq_type, or NULL on failure/unknown type
+ */
+static struct irq_type *default_set_type(unsigned int irq, unsigned int type)
+{
+	struct irq_chip *chip = irq_desc[irq].chip;
+
+	if (chip->set_type && chip->set_type(irq, type))
+		return NULL;
+
+	switch (type) {
+	case IRQ_TYPE_NONE:
+		return &no_irq_type;
+	case IRQ_TYPE_EDGEL:
+	case IRQ_TYPE_EDGEH:
+	case IRQ_TYPE_EDGEB:
+		return &default_edge_type;
+	case IRQ_TYPE_LEVELL:
+	case IRQ_TYPE_LEVELH:
+		return &default_level_type;
+	case IRQ_TYPE_SIMPLE:
+		return &default_simple_type;
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * Generic edge type interrupt: ->ack only acks, while ->hold masks and
+ * acks an edge that cannot be handled right now (see handle_edge_irq)
+ */
+struct irq_type default_edge_type = {
+	.typename	= "default_edge",
+	.enable		= default_enable,
+	.disable	= default_disable,
+	.ack		= default_ack,
+	.hold		= default_mask_ack,
+	.end		= noop,
+	.handle_irq	= handle_edge_irq,
+	.set_type	= default_set_type,
+};
+
+/*
+ * Generic level type interrupt
+ */
+struct irq_type default_level_type = {
+	.typename	= "default_level",
+	.enable		= default_enable,
+	.disable	= default_disable,
+	.ack		= default_mask_ack,
+	.end		= default_end,
+	.handle_irq	= handle_level_irq,
+	.set_type	= default_set_type,
+};
+
+/*
+ * Generic simple type interrupt
+ *
+ * No hardware handling necessary
+ */
+struct irq_type default_simple_type = {
+	.typename	= "default_simple",
+	.enable		= default_enable,
+	.disable	= default_disable,
+	.set_type	= default_set_type,
+	.handle_irq	= handle_simple_irq,
+};
+
+#ifdef CONFIG_SMP
+/*
+ * Generic per cpu type interrupt
+ */
+struct irq_type default_percpu_type = {
+	.typename	= "default_percpu",
+	.enable		= default_enable,
+	.disable	= default_disable,
+	.ack		= default_ack,
+	.end		= default_end,
+	.handle_irq	= handle_percpu_irq,
+};
+#endif
+
+/*
+ * Hack - used for development only: on i386, never redirect irq 1.
+ */
+int debug_direct_keyboard = 0;
+
+int redirect_hardirq(struct irq_desc *desc)
+{
+	/*
+	 * Direct execution (return 0) - the caller runs the handlers itself:
+	 */
+	if (!hardirq_preemption || (desc->status & IRQ_NODELAY) ||
+							!desc->thread)
+		return 0;
+
+#ifdef __i386__
+	if (debug_direct_keyboard && (desc - irq_desc == 1))
+		return 0;
+#endif
+
+	BUG_ON(!raw_irqs_disabled());
+	if (desc->thread && desc->thread->state != TASK_RUNNING)
+		wake_up_process(desc->thread);
+
+	return 1;
+}
+
+/**
+ * handle_IRQ_event - irq action chain handler
+ * @irq:	the interrupt number
+ * @regs:	pointer to a register structure
+ * @action:	the interrupt action chain for this irq
+ *
+ * Handles the action chain of an irq event
  */
 fastcall int handle_IRQ_event(unsigned int irq, struct pt_regs *regs,
 				struct irqaction *action)
 {
 	int ret, retval = 0, status = 0;
 
-	if (!(action->flags & SA_INTERRUPT))
-		local_irq_enable();
+	/*
+	 * Unconditionally enable interrupts for threaded
+	 * IRQ handlers:
+	 */
+	if (!hardirq_count() || !(action->flags & SA_INTERRUPT))
+		raw_local_irq_enable();
+
+#if defined(CONFIG_NO_IDLE_HZ)
+	if (!(action->flags & SA_TIMER) && system_timer->dyn_tick != NULL) {
+		write_seqlock(&xtime_lock);
+		if (system_timer->dyn_tick->state & DYN_TICK_ENABLED)
+			system_timer->dyn_tick->handler(irq, 0, regs);
+		write_sequnlock(&xtime_lock);
+	}
+#endif
 
 	do {
+		unsigned int preempt_count = preempt_count();
+
 		ret = action->handler(irq, action->dev_id, regs);
+		if (preempt_count() != preempt_count) {
+			stop_trace();
+			print_symbol("BUG: unbalanced irq-handler preempt count in %s!\n", (unsigned long) action->handler);
+			printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+			dump_stack();
+			preempt_count() = preempt_count;
+		}
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
 		action = action->next;
 	} while (action);
 
-	if (status & SA_SAMPLE_RANDOM)
+	if (status & SA_SAMPLE_RANDOM) {
+		raw_local_irq_enable();
 		add_interrupt_randomness(irq);
-	local_irq_disable();
+	}
+	raw_local_irq_disable();
 
 	return retval;
 }
 
-/*
- * do_IRQ handles all normal device IRQ's (the special
+/**
+ * handle_bad_irq - handle spurious and unhandled irqs
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ * @regs:	pointer to a register structure
+ */
+void notrace handle_bad_irq(unsigned int irq, irq_desc_t *desc, struct pt_regs *regs)
+{
+}
+
+/**
+ * handle_simple_irq - Simple and software-decoded IRQs.
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ * @regs:	pointer to a register structure
+ *
+ * Simple interrupts are either sent from a demultiplexing interrupt
+ * handler or come from hardware, where no interrupt hardware control
+ * is necessary.
+ *
+ * Note: The caller is expected to handle the ack, clear, mask and
+ * unmask issues if necessary.
+ *
+ * Must be called with the irq_desc->lock held
+ */
+void handle_simple_irq(unsigned int irq, irq_desc_t *desc, struct pt_regs *regs)
+{
+	struct irqaction *action;
+	irqreturn_t action_ret;
+	const unsigned int cpu = smp_processor_id();
+
+	kstat_cpu(cpu).irqs[irq]++;
+
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+	action = desc->action;
+	if (unlikely(!action || desc->depth))
+		return;
+
+	desc->status |= IRQ_INPROGRESS;
+
+	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		return;
+
+	spin_unlock(&desc->lock);
+	action_ret = handle_IRQ_event(irq, regs, action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+	spin_lock(&desc->lock);
+	desc->status &= ~IRQ_INPROGRESS;
+}
+
+/**
+ * handle_level_irq - Level type irq handler
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ * @regs:	pointer to a register structure
+ *
+ * Level type interrupts are active as long as the hardware line has
+ * the active level. This may require masking the interrupt and unmasking
+ * it after the associated handler has acknowledged the device, so that
+ * the interrupt line is back to inactive.
+ *
+ * Must be called with the irq_desc->lock held
+ */
+void notrace handle_level_irq(unsigned int irq, irq_desc_t *desc, struct pt_regs *regs)
+{
+	struct irqaction *action;
+	irqreturn_t action_ret;
+	const unsigned int cpu = smp_processor_id();
+
+	kstat_cpu(cpu).irqs[irq]++;
+
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+	desc->handler->ack(irq);
+
+	/*
+	 * If it's disabled or no action is available,
+	 * keep it masked and get out of here
+	 */
+	action = desc->action;
+	if (unlikely(!action || desc->depth))
+		goto out;
+
+	desc->status |= IRQ_INPROGRESS;
+
+	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		return;
+
+	spin_unlock(&desc->lock);
+	action_ret = handle_IRQ_event(irq, regs, action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+	spin_lock(&desc->lock);
+
+	desc->status &= ~IRQ_INPROGRESS;
+out:
+	end_irq(desc, irq);
+}
+
+/**
+ * handle_edge_irq - edge type IRQ handler
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ * @regs:	pointer to a register structure
+ *
+ * Interrupt occurs on the falling and/or rising edge of a hardware
+ * signal. The occurrence is latched into the irq controller hardware
+ * and must be acked in order to be reenabled. After the ack another
+ * interrupt can happen on the same source even before the first one
+ * is handled by the associated event handler. If this happens it
+ * might be necessary to disable (mask) the interrupt depending on the
+ * controller hardware. This requires reenabling the interrupt inside
+ * of the loop which handles the interrupts which have arrived while
+ * the handler was running. If all pending interrupts are handled, the
+ * loop is left and depending on the hardware controller some final
+ * ack might be necessary.
+ *
+ * Must be called with the irq_desc->lock held
+ */
+void notrace handle_edge_irq(unsigned int irq, irq_desc_t *desc, struct pt_regs *regs)
+{
+	const unsigned int cpu = smp_processor_id();
+
+	kstat_cpu(cpu).irqs[irq]++;
+
+	desc->status &= ~(IRQ_REPLAY | IRQ_WAITING);
+
+	/*
+	 * If we're currently running this IRQ, or it's disabled,
+	 * we shouldn't process the IRQ. Mark it pending, handle
+	 * the necessary masking and go out
+	 */
+	if (unlikely((desc->status & IRQ_INPROGRESS) || desc->depth ||
+		    !desc->action)) {
+		desc->status |= (IRQ_PENDING | IRQ_MASKED);
+		desc->handler->hold(irq);
+		return;
+	}
+
+	/* Start handling the irq */
+	desc->handler->ack(irq);
+
+	/* Mark the IRQ currently in progress.*/
+	desc->status |= IRQ_INPROGRESS;
+
+	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		return;
+
+	do {
+		struct irqaction *action = desc->action;
+		irqreturn_t action_ret;
+
+		if (unlikely(!action)) {
+			desc->handler->disable(irq);
+			return;
+		}
+
+		/*
+		 * When another irq arrived while we were handling
+		 * one, we could have masked the irq.
+		 * Re-enable it, if it was not disabled in the meantime.
+		 */
+		if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) ==
+			    (IRQ_PENDING | IRQ_MASKED)) && !desc->depth))
+			desc->handler->enable(irq);
+
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, regs, action);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret, regs);
+		spin_lock(&desc->lock);
+
+	} while ((desc->status & IRQ_PENDING) && !desc->depth);
+
+	desc->status &= ~IRQ_INPROGRESS;
+	end_irq(desc, irq);
+}
+
+#ifdef CONFIG_SMP
+/**
+ * handle_percpu_irq - Per CPU local irq handler
+ * @irq:	the interrupt number
+ * @desc:	the interrupt description structure for this irq
+ * @regs:	pointer to a register structure
+ *
+ * Per CPU interrupts on SMP machines without locking requirements
+ */
+void notrace handle_percpu_irq(unsigned int irq, irq_desc_t *desc, struct pt_regs *regs)
+{
+	irqreturn_t action_ret;
+
+	kstat_this_cpu.irqs[irq]++;
+	desc->handler->ack(irq);
+	action_ret = handle_IRQ_event(irq, regs, desc->action);
+	if (!noirqdebug)
+		note_interrupt(irq, desc, action_ret, regs);
+	desc->handler->end(irq);
+}
+#endif /* CONFIG_SMP */
+
+/**
+ * __do_IRQ - original all in one handler
+ * @irq:	the interrupt number
+ * @regs:	pointer to a register structure
+ *
+ * __do_IRQ handles all normal device IRQ's (the special
  * SMP cross-CPU interrupts have their own specific
- * handlers).
+ * handlers). This is the original x86 implementation, which is used
+ * for every type of interrupt.
+ *
  */
-fastcall unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
+fastcall notrace unsigned int __do_IRQ(unsigned int irq, struct pt_regs *regs)
 {
 	irq_desc_t *desc = irq_desc + irq;
 	struct irqaction * action;
 	unsigned int status;
 
+	/*
+	 * If the task is currently running in user mode, don't
+	 * detect soft lockups.  If CONFIG_DETECT_SOFTLOCKUP is not
+	 * configured, this should be optimized out.
+	 */
+	if (user_mode(regs))
+		touch_light_softlockup_watchdog();
+
 	kstat_this_cpu.irqs[irq]++;
 	if (CHECK_IRQ_PER_CPU(desc->status)) {
 		irqreturn_t action_ret;
@@ -119,7 +543,7 @@ fastcall unsigned int __do_IRQ(unsigned 
 		 */
 		desc->handler->ack(irq);
 		action_ret = handle_IRQ_event(irq, regs, desc->action);
-		desc->handler->end(irq);
+		end_irq(desc, irq);
 		return 1;
 	}
 
@@ -154,6 +578,12 @@ fastcall unsigned int __do_IRQ(unsigned 
 		goto out;
 
 	/*
+	 * hardirq redirection to the irqd process context:
+	 */
+	if (redirect_hardirq(desc))
+		goto out_no_end;
+
+	/*
 	 * Edge triggered interrupts need to remember
 	 * pending events.
 	 * This applies to any hw interrupts that allow a second
@@ -178,13 +608,13 @@ fastcall unsigned int __do_IRQ(unsigned 
 		desc->status &= ~IRQ_PENDING;
 	}
 	desc->status &= ~IRQ_INPROGRESS;
-
 out:
 	/*
-	 * The ->end() handler has to deal with interrupts which got
-	 * disabled while the handler was running.
+	 * The end-handler has to deal with interrupts which got
+	 * disabled while the handler was running:
 	 */
-	desc->handler->end(irq);
+	end_irq(desc, irq);
+out_no_end:
 	spin_unlock(&desc->lock);
 
 	return 1;
Index: linux/kernel/irq/internals.h
===================================================================
--- linux.orig/kernel/irq/internals.h
+++ linux/kernel/irq/internals.h
@@ -4,6 +4,23 @@
 
 extern int noirqdebug;
 
+void recalculate_desc_flags(struct irq_desc *desc);
+
+/*
+ * On PREEMPT_HARDIRQS, the ->ack handler masks interrupts, so that
+ * they can be redirected to an IRQ thread, if needed. So here we
+ * have to unmask the interrupt line, if needed:
+ */
+static inline void end_irq(irq_desc_t *desc, unsigned int irq)
+{
+#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_ARM) && !defined(CONFIG_PPC)
+	if (!(desc->status & (IRQ_DISABLED|IRQ_INPROGRESS)))
+		desc->handler->enable(irq);
+#else
+	desc->handler->end(irq);
+#endif
+}
+
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq);
 extern void register_handler_proc(unsigned int irq, struct irqaction *action);
Index: linux/kernel/irq/manage.c
===================================================================
--- linux.orig/kernel/irq/manage.c
+++ linux/kernel/irq/manage.c
@@ -1,15 +1,18 @@
 /*
  * linux/kernel/irq/manage.c
  *
- * Copyright (C) 1992, 1998-2004 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 1992, 1998-2005 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005, Thomas Gleixner
  *
  * This file contains driver APIs to the irq subsystem.
  */
 
 #include <linux/config.h>
 #include <linux/irq.h>
-#include <linux/module.h>
 #include <linux/random.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/syscalls.h>
 #include <linux/interrupt.h>
 
 #include "internals.h"
@@ -24,6 +27,7 @@ cpumask_t __cacheline_aligned pending_ir
 
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
+ *	@irq: Interrupt to synchronize
  *
  *	This function waits for any pending IRQ handlers for this interrupt
  *	to complete before returning. If you use this function while
@@ -35,8 +39,12 @@ void synchronize_irq(unsigned int irq)
 {
 	struct irq_desc *desc = irq_desc + irq;
 
-	while (desc->status & IRQ_INPROGRESS)
-		cpu_relax();
+	if (hardirq_preemption && !(desc->status & IRQ_NODELAY))
+		wait_event(desc->wait_for_handler,
+			!(desc->status & IRQ_INPROGRESS));
+	else
+		while (desc->status & IRQ_INPROGRESS)
+			cpu_relax();
 }
 
 EXPORT_SYMBOL(synchronize_irq);
@@ -115,11 +123,9 @@ void enable_irq(unsigned int irq)
 	case 1: {
 		unsigned int status = desc->status & ~IRQ_DISABLED;
 
-		desc->status = status;
-		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
-			desc->status = status | IRQ_REPLAY;
-			hw_resend_irq(desc->handler,irq);
-		}
+		/* Prevent probing on this irq */
+		desc->status = status | IRQ_NOPROBE;
+		check_irq_resend(desc, irq);
 		desc->handler->enable(irq);
 		/* fall-through */
 	}
@@ -131,6 +137,43 @@ void enable_irq(unsigned int irq)
 
 EXPORT_SYMBOL(enable_irq);
 
+/**
+ * 	set_irq_wake - control irq power management wakeup
+ *	@irq: 	Interrupt to control
+ *	@mode:	power management wakeup mode
+ *
+ *	Enable/disable power management wakeup mode
+ */
+int set_irq_wake(unsigned int irq, unsigned int mode)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+	int ret = -ENXIO;
+
+	spin_lock_irqsave(&desc->lock, flags);
+	if (desc->chip && desc->chip->set_wake)
+		ret = desc->chip->set_wake(irq, mode);
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return ret;
+}
+
+EXPORT_SYMBOL(set_irq_wake);
+
+/*
+ * Recompute IRQ_NODELAY: set iff some action on this irq has SA_NODELAY
+ */
+void recalculate_desc_flags(struct irq_desc *desc)
+{
+	struct irqaction *act = desc->action;
+
+	desc->status &= ~IRQ_NODELAY;
+	for (; act; act = act->next)
+		if (act->flags & SA_NODELAY)
+			desc->status |= IRQ_NODELAY;
+}
+
+static int start_irq_thread(int irq, struct irq_desc *desc);
+
 /*
  * Internal function that tells the architecture code whether a
  * particular irq has been exclusively allocated or is available
@@ -140,7 +183,7 @@ int can_request_irq(unsigned int irq, un
 {
 	struct irqaction *action;
 
-	if (irq >= NR_IRQS)
+	if (irq >= NR_IRQS || irq_desc[irq].status & IRQ_NOREQUEST)
 		return 0;
 
 	action = irq_desc[irq].action;
@@ -181,15 +224,18 @@ int setup_irq(unsigned int irq, struct i
 		rand_initialize_irq(irq);
 	}
 
+	if (!(new->flags & SA_NODELAY))
+		if (start_irq_thread(irq, desc))
+			return -ENOMEM;
 	/*
 	 * The following block of code has to be executed atomically
 	 */
-	spin_lock_irqsave(&desc->lock,flags);
+	spin_lock_irqsave(&desc->lock, flags);
 	p = &desc->action;
 	if ((old = *p) != NULL) {
 		/* Can't share interrupts unless both agree to */
 		if (!(old->flags & new->flags & SA_SHIRQ)) {
-			spin_unlock_irqrestore(&desc->lock,flags);
+			spin_unlock_irqrestore(&desc->lock, flags);
 			return -EBUSY;
 		}
 
@@ -203,6 +249,11 @@ int setup_irq(unsigned int irq, struct i
 
 	*p = new;
 
+	/*
+	 * Propagate any possible SA_NODELAY flag into IRQ_NODELAY:
+	 */
+	recalculate_desc_flags(desc);
+
 	if (!shared) {
 		desc->depth = 0;
 		desc->status &= ~(IRQ_DISABLED | IRQ_AUTODETECT |
@@ -212,11 +263,11 @@ int setup_irq(unsigned int irq, struct i
 		else
 			desc->handler->enable(irq);
 	}
-	spin_unlock_irqrestore(&desc->lock,flags);
+	spin_unlock_irqrestore(&desc->lock, flags);
 
 	new->irq = irq;
 	register_irq_proc(irq);
-	new->dir = NULL;
+	new->dir = new->threaded = NULL;
 	register_handler_proc(irq, new);
 
 	return 0;
@@ -246,7 +297,7 @@ void free_irq(unsigned int irq, void *de
 		return;
 
 	desc = irq_desc + irq;
-	spin_lock_irqsave(&desc->lock,flags);
+	spin_lock_irqsave(&desc->lock, flags);
 	p = &desc->action;
 	for (;;) {
 		struct irqaction * action = *p;
@@ -274,7 +325,8 @@ void free_irq(unsigned int irq, void *de
 				else
 					desc->handler->disable(irq);
 			}
-			spin_unlock_irqrestore(&desc->lock,flags);
+			recalculate_desc_flags(desc);
+			spin_unlock_irqrestore(&desc->lock, flags);
 			unregister_handler_proc(irq, action);
 
 			/* Make sure it's not being used on another CPU */
@@ -282,8 +334,8 @@ void free_irq(unsigned int irq, void *de
 			kfree(action);
 			return;
 		}
-		printk(KERN_ERR "Trying to free free IRQ%d\n",irq);
-		spin_unlock_irqrestore(&desc->lock,flags);
+		printk(KERN_ERR "Trying to free free IRQ%d\n", irq);
+		spin_unlock_irqrestore(&desc->lock, flags);
 		return;
 	}
 }
@@ -336,6 +388,8 @@ int request_irq(unsigned int irq,
 		return -EINVAL;
 	if (irq >= NR_IRQS)
 		return -EINVAL;
+	if (irq_desc[irq].status & IRQ_NOREQUEST)
+		return -EINVAL;
 	if (!handler)
 		return -EINVAL;
 
@@ -359,3 +413,371 @@ int request_irq(unsigned int irq,
 
 EXPORT_SYMBOL(request_irq);
 
+/**
+ *	generic_set_irq_type - set the hardware irq type structure for an irq
+ *	@irq: 	Interrupt number
+ *	@type: 	Pointer to irq_type structure
+ */
+int generic_set_irq_type(unsigned int irq, struct irq_type *type)
+{
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to install type for IRQ%d\n", irq);
+		return -EINVAL;
+	}
+
+	if (!type)
+		type = &no_irq_type;
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->handler = type;
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(generic_set_irq_type);
+
+/**
+ *	set_irq_data - set irq type data for an irq
+ *	@irq: 	Interrupt number
+ *	@data: 	Pointer to interrupt specific data
+ *
+ * 	Set the hardware irq controller data for an irq
+ */
+int set_irq_data(unsigned int irq, void *data)
+{
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to install controller data for IRQ%d\n", irq);
+		return -EINVAL;
+	}
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->handler_data = data;
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return 0;
+}
+
+EXPORT_SYMBOL(set_irq_data);
+
+/**
+ *	set_irq_chip - set irq chip for an IRQ
+ *	@irq: 	Interrupt number
+ *	@chip: 	Pointer to irq_chip structure
+ *
+ * 	Set the hardware chip structure for an IRQ
+ */
+int set_irq_chip(unsigned int irq, struct irq_chip *chip)
+{
+	irq_desc_t *desc;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to install chip for IRQ%d\n", irq);
+		return -EINVAL;
+	}
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->chip = chip;
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return 0;
+}
+
+EXPORT_SYMBOL(set_irq_chip);
+
+/**
+ *	set_irq_chip_data - set irq chip data for an irq
+ *	@irq: 	Interrupt number
+ *	@data: 	Pointer to chip specific data
+ *
+ * 	Set the hardware irq chip data for an irq
+ */
+int set_irq_chip_data(unsigned int irq, void *data)
+{
+	irq_desc_t *desc = irq_desc + irq;
+	unsigned long flags;
+
+	if (irq >= NR_IRQS || !desc->handler || !desc->chip) {
+		printk(KERN_ERR "BUG: bad set_irq_chip_data(IRQ#%d)\n", irq);
+		return -EINVAL;
+	}
+
+	spin_lock_irqsave(&desc->lock, flags);
+	desc->chip->chip_data = data;
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	return 0;
+}
+
+EXPORT_SYMBOL(set_irq_chip_data);
+
+/**
+ * 	set_hwirq_type - Set the irq type (level/edge/simple/percpu)
+ *	@irq: 		Interrupt number
+ *	@hw_type: 	interrupt type (see constants in include/linux/irq.h)
+ *
+ * 	Called from device drivers to configure GPIO interrupts
+ * 	according to their requirements. The set_type function of the
+ * 	handler returns a pointer to an irq_type structure which is
+ * 	able to handle this interrupt type. The handler in the irq
+ * 	descriptor structure is set to the new handler type.
+ *	Returns 0 on success, -ENODEV for a bad @irq, -ENXIO otherwise.
+ */
+int set_hwirq_type(unsigned int irq, unsigned int hw_type)
+{
+	struct irq_type *type = NULL;
+	unsigned long flags;
+	irq_desc_t *desc;
+
+	if (irq >= NR_IRQS) {
+		printk(KERN_ERR "Trying to set irq type for IRQ%d\n", irq);
+		return -ENODEV;
+	}
+
+	desc = irq_desc + irq;
+	spin_lock_irqsave(&desc->lock, flags);
+	if (desc->handler->set_type) {
+		type = desc->handler->set_type(irq, hw_type);
+		if (type)
+			desc->handler = type;
+	}
+	spin_unlock_irqrestore(&desc->lock, flags);
+	return type ? 0 : -ENXIO;
+}
+
+EXPORT_SYMBOL(set_hwirq_type);
+
+#ifdef CONFIG_PREEMPT_HARDIRQS
+
+int hardirq_preemption = 1;
+
+EXPORT_SYMBOL(hardirq_preemption);
+
+/*
+ * Real-Time Preemption depends on hardirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init hardirq_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		hardirq_preemption = 0;
+	else
+		get_option(&str, &hardirq_preemption);
+	if (!hardirq_preemption)
+		printk("turning off hardirq preemption!\n");
+
+	return 1;
+}
+
+__setup("hardirq-preempt=", hardirq_preempt_setup);
+
+#endif
+
+/*
+ * threaded simple handler
+ */
+static void thread_simple_irq(irq_desc_t *desc)
+{
+	struct irqaction *action = desc->action;
+	unsigned int irq = desc - irq_desc;
+	irqreturn_t action_ret;
+
+	if (action && !desc->depth) {
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, NULL, action);
+		raw_local_irq_enable();
+ 		cond_resched_all();
+		spin_lock_irq(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret, NULL);
+	}
+	desc->status &= ~IRQ_INPROGRESS;
+}
+
+/*
+ * threaded level type irq handler
+ */
+static void thread_level_irq(irq_desc_t *desc)
+{
+	thread_simple_irq(desc);
+	end_irq(desc, desc - irq_desc);
+}
+
+/*
+ * threaded edge type IRQ handler
+ */
+static void thread_edge_irq(irq_desc_t *desc)
+{
+	unsigned int irq = desc - irq_desc;
+
+	do {
+		struct irqaction *action = desc->action;
+		irqreturn_t action_ret;
+
+		if (unlikely(!action)) {
+			desc->status &= ~IRQ_INPROGRESS;
+			desc->handler->disable(irq);
+			return;
+		}
+
+		/*
+		 * When another irq arrived while we were handling
+		 * one, we could have masked the irq.
+		 * Re-enable it, if it was not disabled in the meantime.
+		 */
+		if (unlikely(((desc->status & (IRQ_PENDING | IRQ_MASKED)) ==
+			    (IRQ_PENDING | IRQ_MASKED)) && !desc->depth))
+			desc->handler->enable(irq);
+
+		desc->status &= ~IRQ_PENDING;
+		spin_unlock(&desc->lock);
+		action_ret = handle_IRQ_event(irq, NULL, action);
+		raw_local_irq_enable();
+		cond_resched_all();
+		spin_lock_irq(&desc->lock);
+		if (!noirqdebug)
+			note_interrupt(irq, desc, action_ret, NULL);
+	} while ((desc->status & IRQ_PENDING) && !desc->depth);
+
+	desc->status &= ~IRQ_INPROGRESS;
+	/*
+	 * The end-handler has to deal with interrupts which got
+	 * disabled while the handler was running:
+	 */
+	end_irq(desc, irq);
+}
+
+static void do_hardirq(struct irq_desc *desc)
+{
+	spin_lock_irq(&desc->lock);
+
+	if (!(desc->status & IRQ_INPROGRESS))
+		goto out;
+
+	if (desc->handler->handle_irq == handle_simple_irq)
+		thread_simple_irq(desc);
+	else if (desc->handler->handle_irq == handle_level_irq)
+		thread_level_irq(desc);
+	else
+		thread_edge_irq(desc);
+ out:
+	spin_unlock_irq(&desc->lock);
+
+	if (waitqueue_active(&desc->wait_for_handler))
+		wake_up(&desc->wait_for_handler);
+}
+
+extern asmlinkage void __do_softirq(void);
+
+static int curr_irq_prio = 49;
+
+static int do_irqd(void * __desc)
+{
+	struct sched_param param = { 0, };
+	struct irq_desc *desc = __desc;
+#ifdef CONFIG_SMP
+	int irq = desc - irq_desc;
+	cpumask_t mask;
+
+	mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq]));
+	set_cpus_allowed(current, mask);
+#endif
+	current->flags |= PF_NOFREEZE | PF_HARDIRQ;
+
+	/*
+	 * Irq threads get SCHED_FIFO prios from curr_irq_prio down to 25
+	 */
+	param.sched_priority = curr_irq_prio;
+	if (param.sched_priority > 25)
+		curr_irq_prio = param.sched_priority - 1;
+
+//	param.sched_priority = 1;
+	sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		do_hardirq(desc);
+		cond_resched_all();
+		__do_softirq();
+//		do_softirq_from_hardirq();
+		raw_local_irq_enable();
+#ifdef CONFIG_SMP
+		/*
+		 * Did IRQ affinities change? If so, move to an allowed CPU:
+		 */
+		if (!cpu_isset(smp_processor_id(), irq_affinity[irq])) {
+			mask = cpumask_of_cpu(any_online_cpu(irq_affinity[irq]));
+			set_cpus_allowed(current, mask);
+		}
+#endif
+		schedule();
+	}
+	__set_current_state(TASK_RUNNING);
+	return 0;
+}
+
+static int ok_to_create_irq_threads;
+
+/* Create the irq thread for @desc, unless one exists or it is too early */
+static int start_irq_thread(int irq, struct irq_desc *desc)
+{
+	struct task_struct *thread;
+
+	if (desc->thread || !ok_to_create_irq_threads)
+		return 0;
+
+	/* kthread_create() signals failure with ERR_PTR(), never NULL: */
+	thread = kthread_create(do_irqd, desc, "IRQ %d", irq);
+	if (IS_ERR(thread)) {
+		printk(KERN_ERR "irqd: could not create IRQ thread %d!\n", irq);
+		return -ENOMEM;
+	}
+
+	desc->thread = thread;
+	/* An irq may have come in before desc->thread was set; wake up: */
+	smp_mb();
+	wake_up_process(thread);
+	return 0;
+}
+
+void __init init_hardirqs(void)
+{
+	int i;
+	ok_to_create_irq_threads = 1;
+
+	for (i = 0; i < NR_IRQS; i++) {
+		irq_desc_t *desc = irq_desc + i;
+
+		if (desc->action && !(desc->status & IRQ_NODELAY))
+			start_irq_thread(i, desc);
+	}
+}
+
+#else
+
+static int start_irq_thread(int irq, struct irq_desc *desc)
+{
+	return 0;
+}
+
+#endif
+
+void __init early_init_hardirqs(void)
+{
+	int i;
+
+	for (i = 0; i < NR_IRQS; i++)
+		init_waitqueue_head(&irq_desc[i].wait_for_handler);
+}
+
+
+
Index: linux/kernel/irq/proc.c
===================================================================
--- linux.orig/kernel/irq/proc.c
+++ linux/kernel/irq/proc.c
@@ -7,9 +7,13 @@
  */
 
 #include <linux/irq.h>
+#include <asm/uaccess.h>
+#include <linux/profile.h>
 #include <linux/proc_fs.h>
 #include <linux/interrupt.h>
 
+#include "internals.h"
+
 static struct proc_dir_entry *root_irq_dir, *irq_dir[NR_IRQS];
 
 #ifdef CONFIG_SMP
@@ -77,37 +81,6 @@ static int irq_affinity_write_proc(struc
 
 #endif
 
-#define MAX_NAMELEN 128
-
-static int name_unique(unsigned int irq, struct irqaction *new_action)
-{
-	struct irq_desc *desc = irq_desc + irq;
-	struct irqaction *action;
-
-	for (action = desc->action ; action; action = action->next)
-		if ((action != new_action) && action->name &&
-				!strcmp(new_action->name, action->name))
-			return 0;
-	return 1;
-}
-
-void register_handler_proc(unsigned int irq, struct irqaction *action)
-{
-	char name [MAX_NAMELEN];
-
-	if (!irq_dir[irq] || action->dir || !action->name ||
-					!name_unique(irq, action))
-		return;
-
-	memset(name, 0, MAX_NAMELEN);
-	snprintf(name, MAX_NAMELEN, "%s", action->name);
-
-	/* create /proc/irq/1234/handler/ */
-	action->dir = proc_mkdir(name, irq_dir[irq]);
-}
-
-#undef MAX_NAMELEN
-
 #define MAX_NAMELEN 10
 
 void register_irq_proc(unsigned int irq)
@@ -147,10 +120,96 @@ void register_irq_proc(unsigned int irq)
 
 void unregister_handler_proc(unsigned int irq, struct irqaction *action)
 {
+	if (action->threaded)
+		remove_proc_entry(action->threaded->name, action->dir);
 	if (action->dir)
 		remove_proc_entry(action->dir->name, irq_dir[irq]);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
+static int threaded_read_proc(char *page, char **start, off_t off,
+			      int count, int *eof, void *data)
+{
+	return sprintf(page, "%c\n",
+		((struct irqaction *)data)->flags & SA_NODELAY ? '0' : '1');
+}
+
+static int threaded_write_proc(struct file *file, const char __user *buffer,
+			       unsigned long count, void *data)
+{
+	int c;
+	struct irqaction *action = data;
+	irq_desc_t *desc = irq_desc + action->irq;
+
+	/* Only the first byte matters: '1' = threaded, '0' = SA_NODELAY */
+	if (get_user(c, buffer))
+		return -EFAULT;
+	if (c != '0' && c != '1')
+		return -EINVAL;
+
+	spin_lock_irq(&desc->lock);
+	if (c == '0')
+		action->flags |= SA_NODELAY;
+	else
+		action->flags &= ~SA_NODELAY;
+	recalculate_desc_flags(desc);
+	spin_unlock_irq(&desc->lock);
+
+	/* Consume the whole write so that "echo 0" is not retried with "\n" */
+	return count;
+}
+
+#endif
+
+#define MAX_NAMELEN 128
+
+static int name_unique(unsigned int irq, struct irqaction *new_action)
+{
+	struct irq_desc *desc = irq_desc + irq;
+	struct irqaction *action;
+
+	for (action = desc->action ; action; action = action->next)
+		if ((action != new_action) && action->name &&
+				!strcmp(new_action->name, action->name))
+			return 0;
+	return 1;
+}
+
+void register_handler_proc(unsigned int irq, struct irqaction *action)
+{
+	char name [MAX_NAMELEN];
+
+	if (!irq_dir[irq] || action->dir || !action->name ||
+					!name_unique(irq, action))
+		return;
+
+	memset(name, 0, MAX_NAMELEN);
+	snprintf(name, MAX_NAMELEN, "%s", action->name);
+
+	/* create /proc/irq/1234/handler/ */
+	action->dir = proc_mkdir(name, irq_dir[irq]);
+	if (!action->dir)
+		return;
+#ifndef CONFIG_PREEMPT_RT
+	{
+		struct proc_dir_entry *entry;
+		/* create /proc/irq/1234/handler/threaded */
+		entry = create_proc_entry("threaded", 0600, action->dir);
+		if (!entry)
+			return;
+		entry->nlink = 1;
+		entry->data = (void *)action;
+		entry->read_proc = threaded_read_proc;
+		entry->write_proc = threaded_write_proc;
+		action->threaded = entry;
+	}
+#endif
+}
+
+#undef MAX_NAMELEN
+
+
 void init_irq_proc(void)
 {
 	int i;
@@ -160,6 +219,9 @@ void init_irq_proc(void)
 	if (!root_irq_dir)
 		return;
 
+	/* create /proc/irq/prof_cpu_mask */
+	create_prof_cpu_mask(root_irq_dir);
+
 	/*
 	 * Create entries for all existing IRQs.
 	 */
Index: linux/kernel/irq/resend.c
===================================================================
--- /dev/null
+++ linux/kernel/irq/resend.c
@@ -0,0 +1,82 @@
+/*
+ * linux/kernel/irq/resend.c
+ *
+ * Copyright (C) 1992, 1998-2005 Linus Torvalds, Ingo Molnar
+ * Copyright (C) 2005, Thomas Gleixner
+ *
+ * This file contains the tasklet-based IRQ-resend code
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/random.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+/* Bitmap to handle software resend of interrupts: */
+static DECLARE_BITMAP(irqs_resend, NR_IRQS);
+
+/*
+ * Tasklet body: replay every interrupt marked in irqs_resend
+ */
+static void resend_irqs(unsigned long arg)
+{
+	unsigned long flags;
+	int irq;
+
+	while (!bitmap_empty(irqs_resend, NR_IRQS)) {
+		irq = find_first_bit(irqs_resend, NR_IRQS);
+		/* Clear the mark before the handler is invoked */
+		clear_bit(irq, irqs_resend);
+
+		local_irq_save(flags);
+		desc_handle_irq(irq, irq_desc + irq, NULL);
+		local_irq_restore(flags);
+	}
+}
+
+/* Tasklet to handle resend: */
+static DECLARE_TASKLET(resend_tasklet, resend_irqs, 0);
+
+/*
+ * Handle irq resend
+ *
+ * If the interrupt is waiting to be processed, try to re-run it.  We
+ * can't directly run it from here since the caller might be in an
+ * interrupt-protected region. Not all irq controller chips can
+ * retrigger interrupts at hardware level. For edge type interrupts it
+ * is necessary to resend them by software.  When no hardware retrigger
+ * is available, or chip->retrigger() returns non-zero, the irq is
+ * marked in the irqs_resend bitmap and resend_tasklet is scheduled;
+ * the tasklet then calls desc_handle_irq() for every marked irq.
+ * Chipless descriptors are replayed via hw_resend_irq() instead.
+ *
+ * Is called with interrupts disabled and desc->lock held
+ */
+void check_irq_resend(irq_desc_t *desc, unsigned int irq)
+{
+
+	/* Chipless implementation. This should vanish in the long run */
+	if (!desc->chip) {
+		unsigned int status = desc->status;
+		if ((status & (IRQ_PENDING | IRQ_REPLAY)) == IRQ_PENDING) {
+			desc->status = status | IRQ_REPLAY;
+			hw_resend_irq(desc->handler, irq);
+		}
+		return;
+	}
+
+	/* Chip based implementation: desc->chip is known to be set here */
+	if ((desc->status & IRQ_PENDING) && !test_bit(irq, irqs_resend)) {
+		desc->status &= ~IRQ_PENDING;
+		/* Try to retrigger it in hardware */
+		if (!desc->chip->retrigger || desc->chip->retrigger(irq)) {
+			/* No retrigger() or non-zero result: mark it pending */
+			set_bit(irq, irqs_resend);
+			/* ... and let the tasklet resend it by software */
+			tasklet_schedule(&resend_tasklet);
+		}
+	}
+}
+
Index: linux/kernel/irq/spurious.c
===================================================================
--- linux.orig/kernel/irq/spurious.c
+++ linux/kernel/irq/spurious.c
@@ -10,6 +10,10 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
+#ifdef CONFIG_X86_IO_APIC
+# include <asm/apicdef.h>
+# include <asm/io_apic.h>
+#endif
 
 static int irqfixup;
 
@@ -55,9 +59,8 @@ static int misrouted_irq(int irq, struct
 			}
 			action = action->next;
 		}
-		local_irq_disable();
 		/* Now clean up the flags */
-		spin_lock(&desc->lock);
+		spin_lock_irq(&desc->lock);
 		action = desc->action;
 
 		/*
@@ -161,12 +164,19 @@ void note_interrupt(unsigned int irq, ir
 		 * The interrupt is stuck
 		 */
 		__report_bad_irq(irq, desc, action_ret);
+#ifdef CONFIG_X86_IO_APIC
+		if (!sis_apic_bug) {
+			sis_apic_bug = 1;
+			printk(KERN_ERR "turning off IO-APIC fast mode.\n");
+		}
+#else
 		/*
 		 * Now kill the IRQ
 		 */
 		printk(KERN_EMERG "Disabling IRQ #%d\n", irq);
 		desc->status |= IRQ_DISABLED;
 		desc->handler->disable(irq);
+#endif
 	}
 	desc->irqs_unhandled = 0;
 }
Index: linux/kernel/itimer.c
===================================================================
--- linux.orig/kernel/itimer.c
+++ linux/kernel/itimer.c
@@ -12,36 +12,49 @@
 #include <linux/syscalls.h>
 #include <linux/time.h>
 #include <linux/posix-timers.h>
+#include <linux/ktimer.h>
 
 #include <asm/uaccess.h>
 
-static unsigned long it_real_value(struct signal_struct *sig)
+/**
+ * itimer_get_remtime - get remaining time for the timer
+ *
+ * @timer: the timer to read
+ * @fake:  a pending, but expired timer returns fake (itimers kludge)
+ *
+ * Returns the delta between the expiry time and now, which can be
+ * less than zero or the fake value described above.
+ */
+static ktime_t itimer_get_remtime(struct ktimer *timer, long fake)
 {
-	unsigned long val = 0;
-	if (timer_pending(&sig->real_timer)) {
-		val = sig->real_timer.expires - jiffies;
-
-		/* look out for negative/zero itimer.. */
-		if ((long) val <= 0)
-			val = 1;
-	}
-	return val;
+	ktime_t rem = ktimer_get_remtime(timer);
+
+	/*
+	 * Racy but safe: if the itimer expires after the above
+	 * ktimer_get_remtime() call but before this condition
+	 * then we return KTIME_ZERO - which is correct.
+	 */
+	if (ktimer_active(timer)) {
+		if (ktime_cmp_val(rem, <=, KTIME_ZERO))
+			rem = ktime_set(0, fake);
+	} else
+		ktime_set_scalar(rem, KTIME_ZERO);
+
+	return rem;
 }
 
 int do_getitimer(int which, struct itimerval *value)
 {
 	struct task_struct *tsk = current;
-	unsigned long interval, val;
+	ktime_t interval, val;
 	cputime_t cinterval, cval;
 
 	switch (which) {
 	case ITIMER_REAL:
-		spin_lock_irq(&tsk->sighand->siglock);
-		interval = tsk->signal->it_real_incr;
-		val = it_real_value(tsk->signal);
-		spin_unlock_irq(&tsk->sighand->siglock);
-		jiffies_to_timeval(val, &value->it_value);
-		jiffies_to_timeval(interval, &value->it_interval);
+		interval = tsk->signal->real_timer.interval;
+		val = itimer_get_remtime(&tsk->signal->real_timer, NSEC_PER_USEC);
+		ktime_to_timeval(&value->it_value, val);
+		ktime_to_timeval(&value->it_interval, interval);
 		break;
 	case ITIMER_VIRTUAL:
 		read_lock(&tasklist_lock);
@@ -113,59 +126,36 @@ asmlinkage long sys_getitimer(int which,
 }
 
 
-void it_real_fn(unsigned long __data)
+/*
+ * The timer is automagically restarted, when interval != 0
+ */
+void it_real_fn(void *data)
 {
-	struct task_struct * p = (struct task_struct *) __data;
-	unsigned long inc = p->signal->it_real_incr;
-
-	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, p);
-
-	/*
-	 * Now restart the timer if necessary.  We don't need any locking
-	 * here because do_setitimer makes sure we have finished running
-	 * before it touches anything.
-	 * Note, we KNOW we are (or should be) at a jiffie edge here so
-	 * we don't need the +1 stuff.  Also, we want to use the prior
-	 * expire value so as to not "slip" a jiffie if we are late.
-	 * Deal with requesting a time prior to "now" here rather than
-	 * in add_timer.
-	 */
-	if (!inc)
-		return;
-	while (time_before_eq(p->signal->real_timer.expires, jiffies))
-		p->signal->real_timer.expires += inc;
-	add_timer(&p->signal->real_timer);
+	send_group_sig_info(SIGALRM, SEND_SIG_PRIV, data);
 }
 
 int do_setitimer(int which, struct itimerval *value, struct itimerval *ovalue)
 {
 	struct task_struct *tsk = current;
- 	unsigned long val, interval, expires;
+	struct ktimer *timer;
+	ktime_t expires;
 	cputime_t cval, cinterval, nval, ninterval;
 
 	switch (which) {
 	case ITIMER_REAL:
-again:
-		spin_lock_irq(&tsk->sighand->siglock);
-		interval = tsk->signal->it_real_incr;
-		val = it_real_value(tsk->signal);
-		/* We are sharing ->siglock with it_real_fn() */
-		if (try_to_del_timer_sync(&tsk->signal->real_timer) < 0) {
-			spin_unlock_irq(&tsk->sighand->siglock);
-			goto again;
-		}
-		tsk->signal->it_real_incr =
-			timeval_to_jiffies(&value->it_interval);
-		expires = timeval_to_jiffies(&value->it_value);
-		if (expires)
-			mod_timer(&tsk->signal->real_timer,
-				  jiffies + 1 + expires);
-		spin_unlock_irq(&tsk->sighand->siglock);
+		timer = &tsk->signal->real_timer;
 		if (ovalue) {
-			jiffies_to_timeval(val, &ovalue->it_value);
-			jiffies_to_timeval(interval,
-					   &ovalue->it_interval);
-		}
+			ktime_to_timeval(&ovalue->it_value,
+				itimer_get_remtime(timer, NSEC_PER_USEC));
+			ktime_to_timeval(&ovalue->it_interval, timer->interval);
+		}
+		ktimer_cancel(timer); /* after the old value is read */
+		timer->interval = ktimer_round_timeval(timer,
+							&value->it_interval);
+		expires = timeval_to_ktime(value->it_value);
+		if (ktime_cmp_val(expires, != , KTIME_ZERO))
+			ktimer_restart(timer, &expires,
+				KTIMER_REL | KTIMER_NOCHECK | KTIMER_ROUND);
 		break;
 	case ITIMER_VIRTUAL:
 		nval = timeval_to_cputime(&value->it_value);
Index: linux/kernel/ktimers.c
===================================================================
--- /dev/null
+++ linux/kernel/ktimers.c
@@ -0,0 +1,1604 @@
+/*
+ *  linux/kernel/ktimers.c
+ *
+ *  Copyright(C) 2005, Thomas Gleixner <tglx@linutronix.de>
+ *  Copyright(C) 2005, Red Hat, Inc., Ingo Molnar
+ *
+ *  High-precision kernel timers
+ *
+ *  In contrast to the low-resolution timeout API implemented in
+ *  kernel/timer.c, ktimers provide finer resolution and accuracy
+ *  depending on system configuration and capabilities.
+ *
+ *  These timers are currently used for:
+ *   - itimers
+ *   - POSIX timers
+ *   - nanosleep
+ *   - precise in-kernel timing
+ *
+ *  Started by: Thomas Gleixner and Ingo Molnar
+ *
+ *  Credits:
+ *	based on kernel/timer.c
+ *
+ *  For licencing details see kernel-base/COPYING
+ */
+
+#include <linux/clockchips.h>
+#include <linux/cpu.h>
+#include <linux/ktimer.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/syscalls.h>
+#include <linux/timeofday.h>
+#include <linux/interrupt.h>
+
+#include <asm/uaccess.h>
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+static int hrtimer_common_reprogram(struct ktimer *timer,
+				    struct ktimer_base *base, ktime_t now);
+#endif
+
+/*
+ * The timer bases:
+ */
+
+#define MAX_KTIMER_BASES 2
+
+static DEFINE_PER_CPU(struct ktimer_base, ktimer_bases[MAX_KTIMER_BASES]) =
+{
+	{
+		.index = CLOCK_REALTIME,
+		.name = "Realtime",
+		.get_time = &ktime_get_real,
+		.resolution = KTIME_REALTIME_RES,
+#ifdef CONFIG_HIGH_RES_TIMERS
+		.reprogram = &hrtimer_common_reprogram,
+		.getoffset = &get_realtime_offset,
+#endif
+	},
+	{
+		.index = CLOCK_MONOTONIC,
+		.name = "Monotonic",
+		.get_time = &ktime_get,
+		.resolution = KTIME_MONOTONIC_RES,
+#ifdef CONFIG_HIGH_RES_TIMERS
+		.reprogram = &hrtimer_common_reprogram,
+#endif
+	},
+};
+
+#ifndef CONFIG_GENERIC_TIME
+
+/**
+ * ktime_get - get the monotonic time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get(void)
+{
+	struct timespec now;
+
+	ktime_get_ts(&now);
+
+	return timespec_to_ktime(now);
+}
+
+EXPORT_SYMBOL_GPL(ktime_get);
+
+/**
+ * ktime_get_real - get the real (wall-) time in ktime_t format
+ *
+ * returns the time in ktime_t format
+ */
+ktime_t ktime_get_real(void)
+{
+	struct timespec now;
+
+	getnstimeofday(&now);
+
+	return timespec_to_ktime(now);
+}
+
+EXPORT_SYMBOL_GPL(ktime_get_real);
+
+/**
+ * ktime_get_ts - get the monotonic clock in timespec format
+ *
+ * @ts:		pointer to timespec variable
+ *
+ * The function calculates the monotonic clock from the realtime
+ * clock and the wall_to_monotonic offset and stores the result
+ * in normalized timespec format in the variable pointed to by ts.
+ */
+void ktime_get_ts(struct timespec *ts)
+{
+	struct timespec tomono;
+	unsigned long seq;
+
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		getnstimeofday(ts);
+		tomono = wall_to_monotonic;
+
+	} while (read_seqretry(&xtime_lock, seq));
+
+	set_normalized_timespec(ts, ts->tv_sec + tomono.tv_sec,
+				ts->tv_nsec + tomono.tv_nsec);
+}
+#endif
+
+/*
+ * Functions and macros which are different for UP/SMP systems are kept in a
+ * single place
+ */
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+
+#define set_curr_timer(b, t)		(b)->curr_timer = (t)
+#define wake_up_timer_waiters(b)	wake_up(&(b)->wait)
+
+/**
+ * wait_for_ktimer - Wait for a running ktimer
+ *
+ * @timer:	timer to wait for
+ *
+ * The function waits on the waitqueue of the timer base in case
+ * the timer's callback function is currently being executed. The
+ * waitqueue is woken up after the timer callback function has
+ * finished execution.
+ */
+void wait_for_ktimer(struct ktimer *timer)
+{
+	struct ktimer_base *base = timer->base;
+
+	if (base)
+		wait_event(base->wait,
+			   base->curr_timer != timer);
+}
+
+/*
+ * We are using hashed locking: holding per_cpu(ktimer_bases)[n].lock
+ * means that all timers which are tied to this base via timer->base are
+ * locked, and the base itself is locked too.
+ *
+ * So __run_timers/migrate_timers can safely modify all timers which could
+ * be found on the lists/queues.
+ *
+ * When the timer's base is locked, and the timer removed from list, it is
+ * possible to set timer->base = NULL and drop the lock: the timer remains
+ * locked.
+ */
+static struct ktimer_base *lock_ktimer_base(struct ktimer *timer,
+					    unsigned long *flags)
+{
+	struct ktimer_base *base;
+
+	for (;;) {
+		base = timer->base;
+		if (likely(base != NULL)) {
+			spin_lock_irqsave(&base->lock, *flags);
+			if (likely(base == timer->base))
+				return base;
+			/* The timer has migrated to another CPU */
+			spin_unlock_irqrestore(&base->lock, *flags);
+		}
+		cpu_relax();
+	}
+}
+
+/*
+ * Switch the timer base to the current CPU when possible.
+ */
+static inline struct ktimer_base *
+switch_ktimer_base(struct ktimer *timer, struct ktimer_base *base)
+{
+	struct ktimer_base *new_base;
+
+	new_base = &__get_cpu_var(ktimer_bases[base->index]);
+
+	if (base != new_base) {
+		/*
+		 * We are trying to schedule the timer on the local CPU.
+		 * However we can't change timer's base while it is running,
+		 * so we keep it on the same CPU. No hassle vs. reprogramming
+		 * the event source in the high resolution case. The softirq
+		 * code will take care of this when the timer function has
+		 * completed. There is no conflict as we hold the lock until
+		 * the timer is enqueued.
+		 */
+		if (unlikely(base->curr_timer == timer))
+			return base;
+
+		/* See the comment in lock_ktimer_base() */
+		timer->base = NULL;
+		spin_unlock(&base->lock);
+		spin_lock(&new_base->lock);
+		timer->base = new_base;
+	}
+	return new_base;
+}
+
+/*
+ * Get the timer base unlocked
+ *
+ * Take care of timer->base = NULL in switch_ktimer_base !
+ */
+static inline struct ktimer_base *
+get_ktimer_base_unlocked(struct ktimer *timer)
+{
+	struct ktimer_base *base;
+
+	while (!(base = timer->base))
+		cpu_relax();
+
+	return base;
+}
+
+#else /* CONFIG_SMP */
+
+#define set_curr_timer(b, t)		do { } while (0)
+#define wake_up_timer_waiters(b)	do { } while (0)
+
+static inline struct ktimer_base *
+lock_ktimer_base(struct ktimer *timer, unsigned long *flags)
+{
+	struct ktimer_base *base = timer->base;
+
+	spin_lock_irqsave(&base->lock, *flags);
+
+	return base;
+}
+
+#define switch_ktimer_base(t, b)	(b)
+#define get_ktimer_base_unlocked(t)	(t)->base
+
+#endif	/* !CONFIG_SMP */
+
+/* High resolution timer related functions */
+#ifdef CONFIG_HIGH_RES_TIMERS
+
+#define ktimer_hres_active (__get_cpu_var(ktimer_hres).active)
+
+struct ktimer_hres {
+	ktime_t		expires_next;
+	ktime_t		next_tick;
+	ktime_t		tick_incr;
+	int		active;
+	int		dotick;
+	unsigned long	check_clocks;
+};
+
+DEFINE_PER_CPU(struct ktimer_hres, ktimer_hres);
+
+/*
+ * Shared reprogramming for clock_realtime and clock_monotonic
+ *
+ * When a new expires first timer is enqueued, we have
+ * to check, whether it expires earlier than the timer
+ * for which the hrt time source was armed.
+ *
+ * Called with interrupts disabled and base lock held
+ */
+static int hrtimer_common_reprogram(struct ktimer *timer,
+				    struct ktimer_base *base, ktime_t now)
+{
+	ktime_t *expires_next = &__get_cpu_var(ktimer_hres).expires_next;
+	ktime_t expires = timer->expires;
+	int res;
+
+	if (base->getoffset)
+		expires = ktime_sub(expires, base->getoffset());
+
+	if (ktime_cmp(expires, >= ,*expires_next))
+		return 0;
+
+	res = clockevents_set_next_event(timer->expires, now);
+	if (!res)
+		*expires_next = expires;
+	return res;
+}
+
+/*
+ * High resolution timer interrupt
+ * Called with interrupts disabled
+ */
+int ktimer_interrupt(void)
+{
+	struct ktimer_base *base;
+	ktime_t expires_next, now;
+	int i, raise = 0, ret = 0;
+	int cpu = smp_processor_id();
+	struct ktimer_hres *hres = &per_cpu(ktimer_hres, cpu);
+
+	/* As long as we did not switch over to high resolution mode
+	 * we expect, that the event source is running in periodic
+	 * mode when it is a source serving other (tick based)
+	 * functionality than next event
+	 *
+	 */
+	if (!hres->active) {
+		trace_special(-1, -1, -1);
+		return 1;
+	}
+
+	now = ktime_get();
+
+	if (hres->dotick) {
+		while (ktime_cmp(now, >= , hres->next_tick)) {
+			hres->next_tick = ktime_add(hres->next_tick,
+							hres->tick_incr);
+			ret++;
+		}
+		expires_next = hres->next_tick;
+	} else
+		ktime_set_scalar(expires_next, KTIME_MAX);
+
+	base = per_cpu(ktimer_bases, cpu);
+
+	for (i = 0; i < MAX_KTIMER_BASES; i++) {
+		ktime_t basenow;
+		DEFINE_KTIME(offset);
+
+		spin_lock(&base->lock);
+
+ 		if (list_empty(&base->pending)) {
+			trace_special(0, 0, 0);
+ 			spin_unlock(&base->lock);
+ 			base++;
+ 			continue;
+ 		}
+
+		if (base->getoffset) {
+			offset = base->getoffset();
+			basenow = ktime_add(now, offset);
+		} else {
+			basenow = now;
+		}
+
+		while (!list_empty(&base->pending)) {
+			struct ktimer *timer = list_entry(base->pending.next,
+							  struct ktimer, list);
+
+			if (ktime_cmp(basenow, < , timer->expires)) {
+				ktime_t expires;
+
+				expires = ktime_sub(timer->expires, offset);
+				if (ktime_cmp(expires, < , expires_next))
+					expires_next = expires;
+				break;
+			}
+			timer->expired = basenow;
+			list_del_init(&timer->list);
+			timer->state = KTIMER_EXPIRED;
+			/* Timer function executable in irq context ? */
+			if (timer->prio < 0) {
+				timer->function(timer->data);
+			} else {
+				list_add_tail(&timer->list, &base->expired);
+				raise = 1;
+			}
+		}
+		spin_unlock(&base->lock);
+		base++;
+	}
+
+	hres->expires_next = expires_next;
+
+	/* Reprogramming necessary ? */
+	if (ktime_cmp_val(expires_next, !=, KTIME_MAX))
+		clockevents_set_next_event(expires_next, now);
+
+	/* Raise softirq ? */
+	if (raise)
+		raise_softirq(KTIMER_SOFTIRQ);
+
+	return ret;
+}
+
+/*
+ * Retrigger next event is called after clock was set
+ */
+void retrigger_next_event(void *arg)
+{
+	ktime_t expires_next, now;
+	int i, cpu = smp_processor_id();
+	struct ktimer_base *base = per_cpu(ktimer_bases, cpu);
+	struct ktimer_hres *hres = &per_cpu(ktimer_hres, cpu);
+
+	now = ktime_get();
+
+	if (hres->dotick)
+		expires_next = hres->next_tick;
+	else
+		ktime_set_scalar(expires_next, KTIME_MAX);
+
+	for (i = 0; i < MAX_KTIMER_BASES; i++) {
+		ktime_t basenow;
+		DEFINE_KTIME(offset);
+		struct ktimer *timer;
+
+		spin_lock(&base->lock);
+
+ 		if (list_empty(&base->pending)) {
+ 			spin_unlock(&base->lock);
+ 			base++;
+ 			continue;
+ 		}
+
+		if (base->getoffset) {
+			offset = base->getoffset();
+			basenow = ktime_add(now, offset);
+		} else {
+			basenow = now;
+		}
+		timer = list_entry(base->pending.next, struct ktimer, list);
+
+		if (ktime_cmp(basenow, < , timer->expires)) {
+			ktime_t expires;
+
+			expires = ktime_sub(timer->expires, offset);
+			if (ktime_cmp(expires, < , expires_next))
+				expires_next = expires;
+		}
+		spin_unlock(&base->lock);
+		base++;
+	}
+
+	hres->expires_next = expires_next;
+
+	/* Reprogramming necessary ? */
+	if (ktime_cmp_val(expires_next, !=, KTIME_MAX))
+		clockevents_set_next_event(expires_next, now);
+}
+
+/*
+ * Clock realtime was set
+ *
+ * Change the offset of the realtime clock vs. the monotonic
+ * clock. Called with xtime lock held !
+ *
+ * We might have to reprogram the high resolution timer interrupt. On
+ * SMP we call the architecture specific code to retrigger _all_ high
+ * resolution timer interrupts. On UP we just disable interrupts and
+ * call the high resolution interrupt code.
+ */
+void clock_was_set(void)
+{
+	preempt_disable();
+	raw_local_irq_disable();
+
+	if (ktimer_hres_active) {
+		retrigger_next_event(NULL);
+		raw_local_irq_enable();
+
+		if (smp_call_function(retrigger_next_event, NULL, 1, 1))
+			BUG();
+	} else
+		raw_local_irq_enable();
+	preempt_enable();
+}
+
+/**
+ * ktimer_clock_notify - A clock source or a clock event has been installed
+ *
+ * Notify the per cpu softirqs to recheck the clock sources and events
+ */
+void ktimer_clock_notify(void)
+{
+	int i;
+
+	for (i = 0; i < NR_CPUS; i++)
+		set_bit(0, &per_cpu(ktimer_hres, i).check_clocks);
+}
+
+/*
+ * A change in the clock source or clock events was detected.
+ * Check the clock source and the events, whether we can switch to
+ * high resolution mode or not.
+ *
+ * TODO: Handle the removal of clock sources / events
+ */
+static void ktimer_check_clocks(void)
+{
+	struct ktimer_hres *hres = &__get_cpu_var(ktimer_hres);
+	unsigned long flags;
+	int dotick;
+
+	if (!test_and_clear_bit(0, &hres->check_clocks))
+		return;
+
+	if (!timeofday_is_continuous())
+		return;
+
+	if (!(dotick = clockevents_next_event_available()))
+		return;
+
+	raw_local_irq_save(flags);
+	clockevents_init_next_event();
+	hres->active = 1;
+	if (dotick == CLOCK_EVT_SCHEDTICK) {
+		struct ktimer helper;
+		struct timespec tsnow;
+		ktime_t now;
+
+		/* Adjust to resolution */
+		ktimer_init(&helper);
+		ktime_get_ts(&tsnow);
+		now = ktimer_round_timespec(&helper, &tsnow);
+		hres->tick_incr = ktime_set(0, NSEC_PER_SEC/HZ);
+		hres->next_tick = ktime_add(now, hres->tick_incr);
+		hres->dotick = 1;
+	} else
+		hres->dotick = 0;
+
+	/* "Retrigger" the interrupt to get things going */
+	retrigger_next_event(NULL);
+	raw_local_irq_restore(flags);
+	printk(KERN_INFO "Ktimers: Switched to high resolution mode CPU %d\n",
+	       smp_processor_id());
+}
+
+/*
+ * For HRT we move expired timers directly to the expired list and set
+ * the status to KTIMER_EXPIRED_NOQUEUE
+ */
+static inline int hres_enqueue_expired(struct ktimer *timer,
+				       struct ktimer_base *base, ktime_t now)
+{
+	timer->expired = now;
+	timer->expiry_mode = __LINE__;
+	list_add_tail(&timer->list, &base->expired);
+	timer->state = KTIMER_EXPIRED_NOQUEUE;
+	base->count++;
+	raise_softirq(KTIMER_SOFTIRQ);
+	return 1;
+}
+
+static inline void
+hres_requeue_expired(struct ktimer *timer, struct ktimer_base *base)
+{
+	timer->expired = timer->expires;
+	timer->expiry_mode = __LINE__;
+	list_del(&timer->list);
+	list_add_tail(&timer->list, &base->expired);
+	timer->state = KTIMER_EXPIRED;
+	raise_softirq(KTIMER_SOFTIRQ);
+}
+
+#else
+# define ktimer_hres_active		0
+# define hres_enqueue_expired(t,b,n)	0
+# define ktimer_check_clocks()		do { } while (0)
+#endif
+
+/*
+ * Functions for the union type storage format of ktime_t which are
+ * too large for inlining:
+ */
+#if (BITS_PER_LONG < 64)
+
+#ifndef CONFIG_KTIME_SCALAR
+/**
+ * ktime_add_ns - Add a scalar nanoseconds value to a ktime_t variable
+ *
+ * @kt:		addend
+ * @nsec:	the scalar nsec value to add
+ *
+ * Returns the sum of kt and nsec in ktime_t format
+ */
+ktime_t ktime_add_ns(ktime_t kt, u64 nsec)
+{
+	ktime_t tmp;
+
+	if (likely(nsec < NSEC_PER_SEC)) {
+		tmp.tv64 = nsec;
+	} else {
+		unsigned long rem = do_div(nsec, NSEC_PER_SEC);
+
+		tmp = ktime_set((long)nsec, rem);
+	}
+
+	return ktime_add(kt, tmp);
+}
+
+#else
+
+/**
+ * ktime_modulo - Calc ktime_t modulo div
+ *
+ * @kt:		dividend
+ * @div:	divisor
+ *
+ * Return ktime_t modulo div.
+ *
+ * div is less than NSEC_PER_SEC and (NSEC_PER_SEC % div) = 0 !
+ */
+static unsigned long ktime_modulo(ktime_t kt, unsigned long div)
+{
+	return do_div(kt, div);
+}
+
+#endif
+#endif
+
+/*
+ * Counterpart to lock_ktimer_base above.
+ */
+static inline
+void unlock_ktimer_base(struct ktimer *timer, unsigned long *flags)
+{
+	spin_unlock_irqrestore(&timer->base->lock, *flags);
+}
+
+/**
+ * ktimer_round_timespec - convert timespec to ktime_t with resolution
+ *			     adjustment
+ *
+ * @timer:	ktimer to retrieve the base
+ * @ts:		pointer to the timespec value to be converted
+ *
+ * Returns the resolution adjusted ktime_t representation of the
+ * timespec.
+ *
+ * Note: We can access base without locking here, as ktimers can
+ * migrate between CPUs but can not be moved from one clock source to
+ * another. The clock source binding is set at init_ktimer_XXX time.
+ */
+ktime_t ktimer_round_timespec(struct ktimer *timer, struct timespec *ts)
+{
+	struct ktimer_base *base = get_ktimer_base_unlocked(timer);
+	long rem = ts->tv_nsec % base->resolution;
+	ktime_t t;
+
+	t = ktime_set(ts->tv_sec, ts->tv_nsec);
+
+	/* Check, if the value has to be rounded */
+	if (rem)
+		t = ktime_add_ns(t, base->resolution - rem);
+
+	return t;
+}
+
+/**
+ * ktimer_round_timeval - convert timeval to ktime_t with resolution
+ *			    adjustment
+ *
+ * @timer:	ktimer to retrieve the base
+ * @tv:		pointer to the timeval value to be converted
+ *
+ * Returns the resolution adjusted ktime_t representation of the
+ * timeval.
+ */
+ktime_t ktimer_round_timeval(struct ktimer *timer, struct timeval *tv)
+{
+	struct timespec ts;
+
+	ts.tv_sec = tv->tv_sec;
+	ts.tv_nsec = tv->tv_usec * NSEC_PER_USEC;
+
+	return ktimer_round_timespec(timer, &ts);
+}
+
+/*
+ * enqueue_ktimer - internal function to (re)start a timer
+ *
+ * The timer is inserted in expiry order. Insertion into the
+ * red black tree is O(log(n)). Must hold the base lock. Returns 0 when
+ * the timer was queued or moved to the expired list, -1 when the expiry
+ * is in the past and KTIMER_NOCHECK is not set in mode.
+ *
+ * Todo: Reimplement the dynamic priority setting of the timer softirq.
+ * It's a bit more complicated due to the separate bases. The simplest
+ * solution would be split softirqs - yeah I know it's not opportune :)
+ */
+static int enqueue_ktimer(struct ktimer *timer, struct ktimer_base *base,
+			  ktime_t *tim, int mode)
+{
+	struct rb_node **link = &base->active.rb_node;
+	struct list_head *prev = &base->pending;
+	struct rb_node *parent = NULL;
+	struct ktimer *entry;
+	ktime_t now;
+
+	/* Get current time */
+	now = base->get_time();
+
+	/*
+	 * Calculate the absolute expiry time based on the
+	 * timer expiry mode:
+	 */
+	switch (mode & ~(KTIMER_NOCHECK | KTIMER_ROUND)) {
+
+	case KTIMER_ABS:
+		timer->expires = *tim;
+		break;
+
+	case KTIMER_REL:
+		timer->expires = ktime_add(now, *tim);
+		break;
+
+	case KTIMER_INCR:
+		timer->expires = ktime_add(timer->expires, *tim);
+		break;
+
+	case KTIMER_FORWARD:
+		while (ktime_cmp(timer->expires, <= , now)) {
+			timer->expires = ktime_add(timer->expires, *tim);
+			timer->overrun++;
+		}
+		goto nocheck;
+
+	case KTIMER_REARM:
+		while (ktime_cmp(timer->expires, <= , now)) {
+			timer->expires = ktime_add(timer->expires,
+						   timer->interval);
+			timer->overrun++;
+		}
+		goto nocheck;
+
+	case KTIMER_RESTART:
+		break;
+
+	default:
+		/* illegal mode */
+		BUG();
+	}
+
+	/*
+	 * Rounding is requested for one shot timers and the first
+	 * event of interval timers. It's done here, so we don't
+	 * have to read the current time twice for relative timers.
+	 */
+	if (mode & KTIMER_ROUND) {
+		unsigned long rem;
+
+		rem = ktime_modulo(timer->expires, base->resolution);
+		if (rem)
+			timer->expires = ktime_add_ns(timer->expires,
+						      base->resolution - rem);
+	}
+
+	/* Expiry time in the past: */
+	if (unlikely(ktime_cmp(timer->expires, <=, now))) {
+		timer->expired = now;
+		timer->expiry_mode = __LINE__;
+		/* The caller takes care of expiry */
+		if (!(mode & KTIMER_NOCHECK))
+			return -1;
+		if (hres_enqueue_expired(timer, base, now))
+			return 0;
+	}
+ nocheck:
+
+	ktimer_trace(now, 0);
+
+	/*
+	 * Find the right place in the rbtree:
+	 */
+	while (*link) {
+		parent = *link;
+		entry = rb_entry(parent, struct ktimer, node);
+		/*
+		 * We dont care about collisions. Nodes with
+		 * the same expiry time stay together.
+		 */
+		if (ktime_cmp(timer->expires, <, entry->expires))
+			link = &(*link)->rb_left;
+		else {
+			link = &(*link)->rb_right;
+			prev = &entry->list;
+		}
+	}
+
+	/*
+	 * Insert the timer to the rbtree and to the sorted list:
+	 */
+	rb_link_node(&timer->node, parent, link);
+	rb_insert_color(&timer->node, &base->active);
+	if (ktimer_hres_active && prev != &base->pending) {
+		entry = list_entry(prev, struct ktimer, list);
+		if (entry->state != KTIMER_PENDING)
+			prev = &base->pending;
+	}
+	list_add(&timer->list, prev);
+
+	timer->state = KTIMER_PENDING;
+	base->count++;
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+	if (ktimer_hres_active &&
+		base->pending.next == &timer->list &&
+			base->reprogram &&
+				base->reprogram(timer, base, now))
+		hres_requeue_expired(timer, base);
+#endif
+	return 0;
+}
+
+/*
+ * __remove_ktimer - internal function to remove a timer
+ *
+ * The function also allows automatic rearming for interval timers.
+ * Must hold the base lock.
+ */
+static void
+__remove_ktimer(struct ktimer *timer, struct ktimer_base *base,
+		enum ktimer_rearm rearm)
+{
+	/*
+	 * Remove the timer from the sorted list and from the rbtree:
+	 */
+	list_del(&timer->list);
+	if (timer->state != KTIMER_EXPIRED_NOQUEUE)
+		rb_erase(&timer->node, &base->active);
+	timer->node.rb_parent = KTIMER_POISON;
+
+	timer->state = KTIMER_INACTIVE;
+	base->count--;
+	BUG_ON(base->count < 0);
+
+	/* Auto rearm the timer ? */
+	if (rearm && ktime_cmp_val(timer->interval, !=, KTIME_ZERO))
+		enqueue_ktimer(timer, base, NULL, KTIMER_REARM);
+}
+
+/*
+ * remove ktimer, called with base lock held
+ */
+static inline int remove_ktimer(struct ktimer *timer, struct ktimer_base *base)
+{
+	if (ktimer_active(timer)) {
+		__remove_ktimer(timer, base, KTIMER_NOREARM);
+		return 1;
+	}
+	return 0;
+}
+
+/*
+ * Internal function to (re)start a timer.
+ */
+static int
+internal_restart_ktimer(struct ktimer *timer, ktime_t *tim, int mode)
+{
+	struct ktimer_base *base, *new_base;
+	unsigned long flags;
+	int ret;
+
+	BUG_ON(!timer->function);
+
+	base = lock_ktimer_base(timer, &flags);
+
+	/* Remove an active timer from the queue */
+	ret = remove_ktimer(timer, base);
+
+	/* Switch the timer base, if necessary */
+	new_base = switch_ktimer_base(timer, base);
+
+	/*
+	 * When the new timer setting is already expired,
+	 * let the calling code deal with it.
+	 */
+	if (enqueue_ktimer(timer, new_base, tim, mode))
+		ret = -1;
+
+	unlock_ktimer_base(timer, &flags);
+
+	return ret;
+}
+
+/**
+ * ktimer_start - start a timer on the current CPU
+ *
+ * @timer:	the timer to be added
+ * @tim:	expiry time (optional, if not set in the timer)
+ * @mode:	timer setup mode
+ *
+ * Returns:
+ *  0 on success
+ * -1 when the new time setting is already expired
+ */
+int ktimer_start(struct ktimer *timer, ktime_t *tim, int mode)
+{
+	BUG_ON(ktimer_active(timer));
+
+	return internal_restart_ktimer(timer, tim, mode);
+}
+
+EXPORT_SYMBOL_GPL(ktimer_start);
+
+/**
+ * ktimer_restart - modify a running timer
+ *
+ * @timer:	the timer to be modified
+ * @tim:	expiry time (required)
+ * @mode:	timer setup mode
+ *
+ * Returns:
+ *  0 when the timer was not active
+ *  1 when the timer was active
+ * -1 when the new time setting is already expired
+ */
+int ktimer_restart(struct ktimer *timer, ktime_t *tim, int mode)
+{
+	BUG_ON(!tim);
+
+	return internal_restart_ktimer(timer, tim, mode);
+}
+
+EXPORT_SYMBOL_GPL(ktimer_restart);
+
+/**
+ * ktimer_try_to_cancel - try to deactivate a timer
+ *
+ * @timer:	ktimer to stop
+ *
+ * Returns:
+ *  0 when the timer was not active
+ *  1 when the timer was active
+ * -1 when the timer is currently executing the callback function and
+ *    can not be stopped
+ */
+int ktimer_try_to_cancel(struct ktimer *timer)
+{
+	struct ktimer_base *base;
+	unsigned long flags;
+	int ret = -1;
+
+	base = lock_ktimer_base(timer, &flags);
+
+	if (base->curr_timer != timer) {
+		ret = remove_ktimer(timer, base);
+		if (ret) {
+			timer->expired = base->get_time();
+			timer->expiry_mode = __LINE__;
+		}
+	}
+
+	unlock_ktimer_base(timer, &flags);
+
+	return ret;
+
+}
+
+EXPORT_SYMBOL_GPL(ktimer_try_to_cancel);
+
+/**
+ * ktimer_cancel - deactivate a timer, waiting out a running handler
+ *
+ * @timer:	the timer to be cancelled
+ *
+ * Returns:
+ *  0 when the timer was not active
+ *  1 when the timer was active
+ */
+int ktimer_cancel(struct ktimer *timer)
+{
+	int was_active;
+
+	/* A negative result means the callback is running: wait and retry. */
+	while ((was_active = ktimer_try_to_cancel(timer)) < 0)
+		wait_for_ktimer(timer);
+
+	return was_active;
+}
+
+EXPORT_SYMBOL_GPL(ktimer_cancel);
+
+/**
+ * ktimer_get_remtime - time left until the timer expires
+ *
+ * @timer:	the timer to read
+ *
+ * Returns timer->expires minus the base's current time, read with the
+ * base locked. The result is negative once the expiry time has passed.
+ */
+ktime_t ktimer_get_remtime(struct ktimer *timer)
+{
+	unsigned long irqflags;
+	struct ktimer_base *tbase;
+	ktime_t remaining;
+
+	tbase = lock_ktimer_base(timer, &irqflags);
+	remaining = ktime_sub(timer->expires, tbase->get_time());
+	unlock_ktimer_base(timer, &irqflags);
+
+	return remaining;
+}
+
+/**
+ * ktimer_get_expiry - get expiry time for the timer
+ *
+ * @timer:	the timer to read
+ * @now:	if != NULL, receives base->get_time() read under the base lock
+ */
+ktime_t ktimer_get_expiry(struct ktimer *timer, ktime_t *now)
+{
+	struct ktimer_base *base;
+	unsigned long flags;
+	ktime_t expiry;
+
+	base = lock_ktimer_base(timer, &flags);
+	expiry = timer->expires;
+	if (now)
+		*now = base->get_time();
+	unlock_ktimer_base(timer, &flags);
+
+	return expiry;
+}
+
+/*
+ * Functions related to clock sources
+ */
+
+static inline void ktimer_common_init(struct ktimer *timer)
+{
+	memset(timer, 0, sizeof(*timer));	/* start from an all-zero timer */
+	timer->prio = MAX_RT_PRIO - 1;
+	timer->node.rb_parent = KTIMER_POISON;
+}
+
+/**
+ * ktimer_init - initialize a timer to the monotonic clock
+ *
+ * @timer:	the timer to be initialized (its previous contents are cleared)
+ */
+void ktimer_init(struct ktimer *timer)
+{
+	struct ktimer_base *bases;
+
+	ktimer_common_init(timer);
+	bases = per_cpu(ktimer_bases, raw_smp_processor_id());	/* executing CPU */
+	timer->base = &bases[CLOCK_MONOTONIC];
+}
+
+EXPORT_SYMBOL_GPL(ktimer_init);
+
+/**
+ * ktimer_init_real - initialize a timer to the real (wall-) clock
+ *
+ * @timer:	the timer to be initialized (its previous contents are cleared)
+ */
+void ktimer_init_real(struct ktimer *timer)
+{
+	struct ktimer_base *bases;
+
+	ktimer_common_init(timer);
+	bases = per_cpu(ktimer_bases, raw_smp_processor_id());	/* executing CPU */
+	timer->base = &bases[CLOCK_REALTIME];
+}
+
+EXPORT_SYMBOL_GPL(ktimer_init_real);
+
+/**
+ * ktimer_get_res - get the monotonic timer resolution
+ *
+ * @which_clock: unused parameter for compatibility with the posix timer code
+ * @tp:		 pointer to timespec variable to store the resolution
+ *
+ * Store the resolution of clock monotonic in the variable pointed to
+ * by tp. tv_sec is set to 0; the function always returns 0.
+ */
+int ktimer_get_res(clockid_t which_clock, struct timespec *tp)
+{
+	struct ktimer_base *bases;
+
+	tp->tv_sec = 0;
+	bases = per_cpu(ktimer_bases, raw_smp_processor_id());
+	tp->tv_nsec = bases[CLOCK_MONOTONIC].resolution;
+
+	return 0;
+}
+
+/**
+ * ktimer_get_res_real - get the real timer resolution
+ *
+ * @which_clock: unused parameter for compatibility with the posix timer code
+ * @tp:		 pointer to timespec variable to store the resolution
+ *
+ * Store the resolution of clock realtime in the variable pointed to
+ * by tp. tv_sec is set to 0; the function always returns 0.
+ */
+int ktimer_get_res_real(clockid_t which_clock, struct timespec *tp)
+{
+	struct ktimer_base *bases;
+
+	tp->tv_sec = 0;
+	bases = per_cpu(ktimer_bases, raw_smp_processor_id());
+	tp->tv_nsec = bases[CLOCK_REALTIME].resolution;
+
+	return 0;
+}
+
+#ifdef CONFIG_HIGH_RES_TIMERS
+/*
+ * Expire the per base ktimer-queue in high resolution mode (base->expired):
+ */
+static inline void run_ktimer_hres_queue(struct ktimer_base *base)
+{
+	spin_lock_irq(&base->lock);
+
+	while (!list_empty(&base->expired)) {
+		struct ktimer *timer;
+		void (*fn)(void *);
+		void *data;
+
+		timer = list_entry(base->expired.next, struct ktimer, list);
+		fn = timer->function;	/* snapshot callback + argument under the lock */
+		data = timer->data;
+		__remove_ktimer(timer, base, KTIMER_REARM);
+		set_curr_timer(base, timer);
+		spin_unlock_irq(&base->lock);
+
+		fn(data);	/* callback runs without base->lock held */
+
+		spin_lock_irq(&base->lock);
+		set_curr_timer(base, NULL);
+	}
+	spin_unlock_irq(&base->lock);
+
+	wake_up_timer_waiters(base);
+}
+
+static void run_ktimer_softirq(struct softirq_action *h)
+{
+	struct ktimer_base *cur = per_cpu(ktimer_bases, smp_processor_id());
+	struct ktimer_base *end = cur + MAX_KTIMER_BASES;
+
+	while (cur < end)	/* expire each clock base of this CPU in turn */
+		run_ktimer_hres_queue(cur++);
+}
+
+#endif	/* CONFIG_HIGH_RES_TIMERS */
+
+/*
+ * Expire the per base ktimer-queue. Used for non HRT mode and
+ * as a fallback when HRT init failed:
+ */
+static inline void run_ktimer_queue(struct ktimer_base *base)
+{
+	ktime_t now = base->get_time();	/* sampled once, before taking the lock */
+
+	spin_lock_irq(&base->lock);
+
+	while (!list_empty(&base->pending)) {
+		struct ktimer *timer;
+		void (*fn)(void *);
+		void *data;
+
+		timer = list_entry(base->pending.next, struct ktimer, list);
+		if (ktime_cmp(now, <=, timer->expires))
+			break;	/* head of the pending list is not due yet: stop */
+
+		timer->expired = now;
+		timer->expiry_mode = __LINE__;	/* debug: where 'expired' was set */
+		fn = timer->function;
+		data = timer->data;
+		set_curr_timer(base, timer);
+		__remove_ktimer(timer, base, KTIMER_REARM);
+		spin_unlock_irq(&base->lock);
+
+		fn(data);	/* callback runs without base->lock held */
+
+		spin_lock_irq(&base->lock);
+		set_curr_timer(base, NULL);
+	}
+	spin_unlock_irq(&base->lock);
+
+	wake_up_timer_waiters(base);
+}
+
+/*
+ * Called from timer softirq every jiffy, to expire ktimers.
+ *
+ * For HRT it is the fallback code to run the softirq in the timer
+ * softirq context in case the hrtimer initialization failed or has
+ * not been done yet.
+ */
+void ktimer_run_queues(void)
+{
+	struct ktimer_base *base = __get_cpu_var(ktimer_bases);
+	int i;
+
+	ktimer_check_clocks();
+
+	if (ktimer_hres_active)
+		return;	/* queues are not expired from here in high-res mode */
+
+	for (i = 0; i < MAX_KTIMER_BASES; i++)
+		run_ktimer_queue(&base[i]);
+}
+
+/*
+ * Sleep related functions:
+ */
+
+/*
+ * Timer callback used by the sleep helpers: @data is the task to wake.
+ */
+static void ktimer_wake_up(void *data)
+{
+	struct task_struct *sleeper = data;
+
+	wake_up_process(sleeper);
+}
+
+/**
+ * schedule_ktimer - sleep until timeout
+ *
+ * @timer:	ktimer variable initialized with the correct clock base
+ * @t:		timeout value; overwritten with the absolute expiry time
+ * @mode:	timeout value is abs/rel
+ *
+ * Make the current task sleep until the timeout @t is
+ * elapsed. Returns timer->expires minus timer->expired.
+ *
+ * You can set the task state as follows -
+ *
+ * %TASK_UNINTERRUPTIBLE - at least the timeout @t is guaranteed to
+ * pass before the routine returns. The routine will return 0
+ *
+ * %TASK_INTERRUPTIBLE - the routine may return early if a signal is
+ * delivered to the current task. In this case the remaining time
+ * will be returned
+ *
+ * The current task state is guaranteed to be TASK_RUNNING when this
+ * routine returns.
+ */
+static ktime_t __sched
+schedule_ktimer(struct ktimer *timer, ktime_t *t, int mode)
+{
+	timer->data = current;
+	timer->function = ktimer_wake_up;
+	/*
+	 * The callback function can be executed in irq context so we
+	 * avoid the additional context switch to the softirq
+	 */
+	timer->prio = -1;
+
+	if (unlikely(ktimer_start(timer, t, mode) < 0)) {
+		__set_current_state(TASK_RUNNING);	/* already expired: do not sleep */
+	} else {
+		if (current->state != TASK_RUNNING)
+			schedule();
+		ktimer_cancel(timer);
+	}
+
+	/* Store the absolute expiry time */
+	*t = timer->expires;
+
+	/* Return the remaining time */
+	return ktime_sub(timer->expires, timer->expired);
+}
+
+static ktime_t __sched
+schedule_ktimer_interruptible(struct ktimer *timer, ktime_t *t, int mode)
+{
+	set_current_state(TASK_INTERRUPTIBLE);	/* state must be set before the timer is armed */
+
+	return schedule_ktimer(timer, t, mode);
+}
+
+static int check_ktimer_signal(struct ktimer *timer, ktime_t rem)
+{
+	static int warn_once = 1;	/* cleared before the single dump below */
+
+	struct ktimer_base *base;
+	unsigned long flags;
+	ktime_t now;
+
+	if (signal_pending(current))
+		return 0;	/* the early wakeup is explained by a pending signal */
+	if (!warn_once)
+		return 1;	/* short expiry, but the dump was already printed */
+	warn_once = 0;
+
+	base = lock_ktimer_base(timer, &flags);
+	now = base->get_time();
+	unlock_ktimer_base(timer, &flags);
+
+	printk("BUG: ktimer expired short without user signal! (%s:%d)\n",
+		current->comm, current->pid);
+	printk(".. expires:   %lu/%lu\n",	/* NOTE(review): %lu vs (long) casts */
+		(long)ktime_get_high(timer->expires),
+		(long)ktime_get_low(timer->expires));
+	printk(".. expired:   %lu/%lu\n",
+		(long)ktime_get_high(timer->expired),
+		(long)ktime_get_low(timer->expired));
+	printk(".. at line:   %d\n", timer->expiry_mode);	/* __LINE__ stamp */
+	printk(".. interval:  %lu/%lu\n",
+		(long)ktime_get_high(timer->interval),
+		(long)ktime_get_low(timer->interval));
+	printk(".. now:       %lu/%lu\n",
+		(long)ktime_get_high(now), (long)ktime_get_low(now));
+	printk(".. rem:       %lu/%lu\n",
+		(long)ktime_get_high(rem), (long)ktime_get_low(rem));
+	printk(".. overrun:   %d\n", timer->overrun);
+#ifdef CONFIG_HIGH_RES_TIMERS
+	printk(".. getoffset: %p\n", base->getoffset);	/* read after the unlock */
+#endif
+	dump_stack();
+
+	return 1;
+}
+
+static long __sched
+nanosleep_restart(struct ktimer *timer, struct restart_block *restart)
+{
+	long (*rfn_save)(struct restart_block *) = restart->fn;
+	struct timespec __user *rmtp;
+	struct timespec tu;
+	ktime_t t, rem;
+
+	/* Resume the interrupted sleep; restarting stays off unless re-armed */
+	restart->fn = do_no_restart_syscall;
+	t = ktime_set_low_high(restart->arg0, restart->arg1);	/* saved abs expiry */
+
+	rem = schedule_ktimer_interruptible(timer, &t, KTIMER_ABS);
+
+	if (ktime_cmp_val(rem, <=, KTIME_ZERO))
+		return 0;
+
+	rmtp = (struct timespec __user *) restart->arg2;
+	ktime_to_timespec(&tu, rem);
+	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
+		return -EFAULT;
+
+	restart->fn = rfn_save;	/* re-arm the restart handler we came in with */
+
+	/*
+	 * If there is no signal pending, then the timer expired
+	 * short due to timer hardware bug: do not confuse userspace
+	 * by returning -ERESTART_RESTARTBLOCK to it!
+	 */
+	if (check_ktimer_signal(timer, rem))
+		return 0;
+
+	/* The other values in restart are already filled in */
+	return -ERESTART_RESTARTBLOCK;
+}
+
+static long __sched nanosleep_restart_mono(struct restart_block *restart)
+{
+	struct ktimer timer;	/* on-stack timer, lives only for this call */
+
+	ktimer_init(&timer);	/* CLOCK_MONOTONIC base */
+
+	return nanosleep_restart(&timer, restart);
+}
+
+static long __sched nanosleep_restart_real(struct restart_block *restart)
+{
+	struct ktimer timer;	/* on-stack timer, lives only for this call */
+
+	ktimer_init_real(&timer);	/* CLOCK_REALTIME base */
+
+	return nanosleep_restart(&timer, restart);
+}
+
+static long __ktimer_nanosleep(struct ktimer *timer, struct timespec *rqtp,
+			     struct timespec __user *rmtp, int mode,
+			     long (*rfn)(struct restart_block *))
+{
+	struct timespec tu;
+	ktime_t rem, t;
+	struct restart_block *restart;
+
+	t = timespec_to_ktime(*rqtp);
+
+	/* t is updated to absolute expiry time ! */
+	rem = schedule_ktimer_interruptible(timer, &t, mode | KTIMER_ROUND);
+
+	if (ktime_cmp_val(rem, <=, KTIME_ZERO))
+		return 0;	/* nothing left to sleep */
+
+	ktime_to_timespec(&tu, rem);
+
+	if (rmtp && copy_to_user(rmtp, &tu, sizeof(tu)))
+		return -EFAULT;
+
+	restart = &current_thread_info()->restart_block;
+	restart->fn = rfn;	/* handler that resumes the sleep (rfn argument) */
+	restart->arg0 = ktime_get_low(t);	/* absolute expiry, low part */
+	restart->arg1 = ktime_get_high(t);	/* absolute expiry, high part */
+	restart->arg2 = (unsigned long) rmtp;	/* user buffer for the remainder */
+
+	/*
+	 * If there is no signal pending, then the timer expired
+	 * short due to timer hardware bug: do not confuse userspace
+	 * by returning -ERESTART_RESTARTBLOCK to it!
+	 */
+	if (check_ktimer_signal(timer, rem))
+		return 0;
+
+	return -ERESTART_RESTARTBLOCK;
+}
+
+long ktimer_nanosleep(struct timespec *rqtp,
+			   struct timespec __user *rmtp, int mode)
+{
+	struct ktimer timer;	/* on-stack timer, lives only for this call */
+
+	ktimer_init(&timer);	/* CLOCK_MONOTONIC base */
+
+	return __ktimer_nanosleep(&timer, rqtp, rmtp, mode,
+				nanosleep_restart_mono);
+}
+
+long ktimer_nanosleep_real(struct timespec *rqtp,
+			   struct timespec __user *rmtp, int mode)
+{
+	struct ktimer timer;	/* on-stack timer, lives only for this call */
+
+	ktimer_init_real(&timer);	/* CLOCK_REALTIME base */
+	return __ktimer_nanosleep(&timer, rqtp, rmtp, mode,
+				nanosleep_restart_real);
+}
+
+asmlinkage long
+sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
+{
+	struct timespec tu;	/* kernel copy of the requested sleep time */
+
+	if (copy_from_user(&tu, rqtp, sizeof(tu)))
+		return -EFAULT;
+
+	if (!timespec_valid(&tu))
+		return -EINVAL;
+
+	return ktimer_nanosleep(&tu, rmtp, KTIMER_REL);	/* relative, monotonic */
+}
+
+static nsec_t __nsleep(nsec_t delay, int mode)
+{
+	struct ktimer timer;
+	ktime_t rem, t;
+	struct timespec ts;
+
+	ktimer_init(&timer);
+
+	/* FIXME: add ns_to_ktime() */
+	ns_to_timespec(&ts, delay);
+	t = timespec_to_ktime(ts);
+
+	set_current_state(mode);	/* mode is the task state to sleep in */
+
+	if (mode == TASK_UNINTERRUPTIBLE || !signal_pending(current))
+		rem = schedule_ktimer(&timer, &t, KTIMER_REL);
+	else {
+		set_current_state(TASK_RUNNING);	/* signal pending: skip the sleep */
+		rem = t;	/* the whole delay is reported as remaining */
+	}
+
+	return ktime_to_ns(rem);
+}
+
+nsec_t nsleep(nsec_t delay)
+{
+	return __nsleep(delay, TASK_UNINTERRUPTIBLE);	/* returns __nsleep()'s remainder */
+}
+
+EXPORT_SYMBOL(nsleep);
+
+nsec_t nsleep_interruptible(nsec_t delay)
+{
+	return __nsleep(delay, TASK_INTERRUPTIBLE);	/* returns __nsleep()'s remainder */
+}
+
+EXPORT_SYMBOL(nsleep_interruptible);
+
+/*
+ * Functions related to boot-time initialization:
+ */
+static void __devinit init_ktimers_cpu(int cpu)
+{
+	struct ktimer_base *base = per_cpu(ktimer_bases, cpu);
+	int i;
+
+	for (i = 0; i < MAX_KTIMER_BASES; i++) {
+		spin_lock_init(&base->lock);
+		INIT_LIST_HEAD(&base->pending);
+#ifdef CONFIG_HIGH_RES_TIMERS
+		INIT_LIST_HEAD(&base->expired);
+#endif
+		init_waitqueue_head(&base->wait);
+		base++;	/* step to the next clock base of this CPU */
+	}
+#ifdef CONFIG_HIGH_RES_TIMERS
+	{
+		ktime_t max;
+
+		ktime_set_scalar(max, KTIME_MAX);
+		per_cpu(ktimer_hres, cpu).expires_next = max;	/* no event programmed */
+		set_bit(0, &per_cpu(ktimer_hres, cpu).check_clocks);
+		per_cpu(ktimer_hres, cpu).active = 0;
+	}
+#endif
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+static void migrate_ktimer_list(struct ktimer_base *old_base,
+				struct ktimer_base *new_base)
+{
+	struct ktimer *timer;
+	struct rb_node *node;
+
+	while ((node = rb_first(&old_base->active))) {	/* until the old tree is empty */
+		timer = rb_entry(node, struct ktimer, node);
+		remove_ktimer(timer, old_base);
+		timer->base = new_base;
+		enqueue_ktimer(timer, new_base, NULL, KTIMER_RESTART);
+	}
+}
+
+static void migrate_ktimers(int cpu)
+{
+	struct ktimer_base *old_base, *new_base;
+	int i;
+
+	BUG_ON(cpu_online(cpu));	/* only valid for a CPU that is already offline */
+	old_base = per_cpu(ktimer_bases, cpu);
+	new_base = get_cpu_var(ktimer_bases);	/* bases of the executing CPU */
+
+	raw_local_irq_disable();
+
+	for (i = 0; i < MAX_KTIMER_BASES; i++) {
+
+		spin_lock(&new_base->lock);	/* lock order: new base, then old base */
+		spin_lock(&old_base->lock);
+
+		BUG_ON(old_base->curr_timer);	/* no callback may be running there */
+
+		migrate_ktimer_list(old_base, new_base);
+
+		spin_unlock(&old_base->lock);
+		spin_unlock(&new_base->lock);
+		old_base++;
+		new_base++;
+	}
+
+	raw_local_irq_enable();
+	put_cpu_var(ktimer_bases);
+}
+#endif /* CONFIG_HOTPLUG_CPU */
+
+static int __devinit ktimer_cpu_notify(struct notifier_block *self,
+				       unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;	/* the CPU number is passed in the pointer value */
+
+	switch(action) {
+
+	case CPU_UP_PREPARE:
+		init_ktimers_cpu(cpu);
+		break;
+
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:
+		migrate_ktimers(cpu);	/* move its queued timers to this CPU */
+		break;
+#endif
+
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;	/* returned for every action, handled or not */
+}
+
+static struct notifier_block __devinitdata ktimers_nb = {
+	.notifier_call	= ktimer_cpu_notify,	/* registered in ktimers_init() */
+};
+
+void __init ktimers_init(void)
+{
+	ktimer_cpu_notify(&ktimers_nb, (unsigned long)CPU_UP_PREPARE,
+			  (void *)(long)smp_processor_id());	/* init the calling CPU by hand */
+	register_cpu_notifier(&ktimers_nb);
+#ifdef CONFIG_HIGH_RES_TIMERS
+	open_softirq(KTIMER_SOFTIRQ, run_ktimer_softirq, NULL);
+#endif
+}
+
Index: linux/kernel/latency.c
===================================================================
--- /dev/null
+++ linux/kernel/latency.c
@@ -0,0 +1,2377 @@
+/*
+ *  kernel/latency.c
+ *
+ *  Copyright (C) 2004, 2005 Ingo Molnar
+ *  Copyright (C) 2004 William Lee Irwin III
+ */
+
+#include <linux/mm.h>
+#include <linux/nmi.h>
+#include <linux/rtc.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/profile.h>
+#include <linux/bootmem.h>
+#include <linux/version.h>
+#include <linux/notifier.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/interrupt.h>
+#include <linux/proc_fs.h>
+#include <linux/latency_hist.h>
+#include <asm/uaccess.h>
+#include <asm/unistd.h>
+#include <asm/rtc.h>
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+# ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+#  define irqs_off_preempt_count() preempt_count()
+# else
+#  define irqs_off_preempt_count() 0	/* preempt timing not configured */
+# endif
+#endif
+
+#ifdef CONFIG_WAKEUP_TIMING
+struct sch_struct {
+	raw_spinlock_t trace_lock;
+	struct task_struct *task;
+	int cpu;		/* ___trace() records only on this CPU in wakeup mode */
+	struct cpu_trace *tr;	/* buffer ___trace() uses in wakeup mode; may be NULL */
+} ____cacheline_aligned_in_smp;
+
+static __cacheline_aligned_in_smp struct sch_struct sch =
+#ifdef CONFIG_PREEMPT_RT
+		{ trace_lock: RAW_SPIN_LOCK_UNLOCKED };
+#else
+		{ trace_lock: SPIN_LOCK_UNLOCKED(sch.trace_lock) };
+#endif
+
+int wakeup_timing = 1;	/* non-zero: ___trace() follows sch.cpu / sch.tr */
+#endif
+
+#ifdef CONFIG_LATENCY_TIMING
+
+/*
+ * Maximum preemption latency measured. Initialize to maximum
+ * (to 0 with CONFIG_LATENCY_HIST), we clear it after bootup.
+ */
+#ifdef CONFIG_LATENCY_HIST
+static cycles_t preempt_max_latency = (cycles_t)0UL;
+#else
+static cycles_t preempt_max_latency = (cycles_t)ULONG_MAX;
+#endif
+
+static cycles_t preempt_thresh;	/* non-zero: report_latency() compares against this */
+
+/*
+ * Decide whether a measured latency of @delta gets reported/recorded.
+ */
+static int report_latency(cycles_t delta)
+{
+	int above;
+
+	/* latency_hist_flag without a user-triggered trace: always report. */
+	if (latency_hist_flag && !trace_user_triggered)
+		return 1;
+
+	/* A non-zero threshold replaces the comparison against the maximum. */
+	above = preempt_thresh ? delta >= preempt_thresh
+			       : delta > preempt_max_latency;
+
+	return above;
+}
+
+/*
+ * Track maximum latencies and save the trace:
+ */
+
+/*
+ * trace_stop_sched_switched must not be called with runqueue locks held!
+ */
+static __cacheline_aligned_in_smp DECLARE_MUTEX(max_mutex);
+
+/*
+ * Sequence count - we record it when starting a measurement and
+ * skip the latency if the sequence has changed - some other section
+ * did a maximum and could disturb our measurement with serial console
+ * printouts, etc. Truly coinciding maximum latencies should be rare
+ * and what happens together happens separately as well, so this doesn't
+ * decrease the validity of the maximum found:
+ */
+static __cacheline_aligned_in_smp int max_sequence;
+
+enum trace_type
+{
+	__TRACE_FIRST_TYPE = 0,
+
+	TRACE_FN,		/* entry stores eip and parent_eip */
+	TRACE_SPECIAL,		/* entry stores eip plus v1, v2, v3 */
+	TRACE_SPECIAL_PID,	/* same layout as TRACE_SPECIAL */
+	TRACE_SPECIAL_U64,	/* same layout as TRACE_SPECIAL */
+	TRACE_CMDLINE,		/* entry stores a copy of current->comm */
+	TRACE_SYSCALL,		/* entry stores nr plus p1, p2, p3 */
+	TRACE_SYSRET,		/* entry stores the return value */
+
+	__TRACE_LAST_TYPE
+};
+
+enum trace_flag_type
+{
+	TRACE_FLAG_IRQS_OFF		= 0x01,	/* irqs_off() was true */
+	TRACE_FLAG_NEED_RESCHED		= 0x02,	/* need_resched() was true */
+	TRACE_FLAG_HARDIRQ		= 0x04,	/* preempt_count had HARDIRQ_MASK bits */
+	TRACE_FLAG_SOFTIRQ		= 0x08,	/* preempt_count had SOFTIRQ_MASK bits */
+	TRACE_FLAG_IRQS_HARD_OFF	= 0x10,	/* raw_irqs_disabled_flags() was true */
+};
+
+
+#ifdef CONFIG_LATENCY_TRACE
+
+/*
+ * On DEBUG_PAGEALLOC && SMP there's not too much lowmem, so reduce
+ * the # of trace entries, or else we OOM on bootup. Same applies for
+ * ARM where we have only 4MB boot window for kernel text+data+bss.
+ *
+ * The large buffer allocates 8MB memory, which might also be more
+ * than the available memory on a small embedded box. This needs more
+ * thought for embedded devices and should be initialized at runtime
+ * under consideration of the available memory resources.
+ */
+#if !defined(CONFIG_DEBUG_PAGEALLOC) && !defined(CONFIG_SMP) && !defined(CONFIG_ARM)
+# define MAX_TRACE (unsigned long)(8192*16-1)
+#else
+# define MAX_TRACE (unsigned long)(8192*2-1)
+#endif
+
+#define CMDLINE_BYTES 16	/* bytes of current->comm kept per TRACE_CMDLINE entry */
+
+/*
+ * 32 bytes on 32-bit platforms:
+ */
+struct trace_entry {
+	char type;		/* enum trace_type value */
+	char cpu;		/* written only under CONFIG_SMP */
+	char flags;		/* TRACE_FLAG_* bits */
+	char preempt_count; /* assumes PREEMPT_MASK is 8 bits or less */
+	int pid;
+	cycles_t timestamp;
+	union {
+		struct {
+			unsigned long eip;
+			unsigned long parent_eip;
+		} fn;
+		struct {
+			unsigned long eip;
+			unsigned long v1, v2, v3;
+		} special;
+		struct {
+			unsigned char str[CMDLINE_BYTES];
+		} cmdline;
+		struct {
+			unsigned int nr;
+			unsigned long p1, p2, p3;
+		} syscall;
+		struct {
+			unsigned int ret;
+		} sysret;
+		struct {
+			int __pad3[4];
+		} pad;
+	} u;
+} __attribute__((packed));
+
+#endif
+
+struct cpu_trace {
+	atomic_t disabled;	/* ___trace() records only when its increment yields 1 */
+	unsigned long trace_idx;	/* next slot of trace[] to hand out */
+	cycles_t preempt_timestamp;
+	unsigned long critical_start, critical_end;
+	int critical_sequence;
+	atomic_t overrun;	/* entries dropped because the buffer was full */
+	int early_warning;
+	int latency_type;
+	int cpu;
+
+#ifdef CONFIG_LATENCY_TRACE
+	struct trace_entry trace[MAX_TRACE];
+	char comm[CMDLINE_BYTES];
+	pid_t pid;
+	unsigned long uid;
+	unsigned long nice;
+	unsigned long policy;
+	unsigned long rt_priority;
+	unsigned long saved_latency;
+#endif
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+	unsigned long stack_check;	/* 0 disables debug_stackoverflow() for this CPU */
+#endif
+} ____cacheline_aligned_in_smp;
+
+static struct cpu_trace cpu_traces[NR_CPUS] ____cacheline_aligned_in_smp =
+{ [0 ... NR_CPUS-1] = {	/* GCC range initializer: same value for every CPU */
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+ .stack_check = 1
+#endif
+ } };
+
+static unsigned long notrace cycles_to_usecs(cycles_t delta)
+{
+#ifdef CONFIG_X86
+	do_div(delta, cpu_khz/1000+1);	/* divides delta in place */
+#elif defined(CONFIG_PPC)
+	delta = mulhwu(tb_to_us, delta);
+#elif defined(CONFIG_ARM)
+	delta = mach_cycles_to_usecs(delta);
+#else
+	#error Implement cycles_to_usecs.
+#endif
+
+	return (unsigned long) delta;	/* result is truncated to unsigned long */
+}
+
+static cycles_t notrace usecs_to_cycles(unsigned long delta)
+{
+#if defined(CONFIG_X86) || defined(CONFIG_PPC)
+	return (cycles_t) delta * (cycles_t) (cpu_khz/1000+1);	/* same factor as the x86 divide */
+#elif defined(CONFIG_ARM)
+	return mach_usecs_to_cycles(delta);
+#else
+	#error Implement usecs_to_cycles
+#endif
+}
+
+#ifdef CONFIG_LATENCY_TRACE
+
+int trace_enabled = 1;	/* ___trace() returns immediately when this is <= 0 */
+int mcount_enabled = 1;
+int trace_freerunning = 0;	/* non-zero: ____trace() wraps instead of overrunning */
+int trace_print_at_crash = 0;
+int trace_verbose = 0;
+int trace_all_cpus = 0;
+
+/*
+ * user-triggered via gettimeofday(0,1)/gettimeofday(0,0)
+ */
+int trace_user_triggered = 0;
+int trace_user_trigger_irq = -1;
+
+struct saved_trace_struct {
+	int cpu;
+	cycles_t first_timestamp, last_timestamp;
+	struct cpu_trace traces[NR_CPUS];	/* one full trace buffer per CPU */
+} ____cacheline_aligned_in_smp;
+
+/*
+ * The current worst-case trace:
+ */
+static struct saved_trace_struct max_tr;
+
+/*
+ * /proc/latency_trace atomicity:
+ */
+static DECLARE_MUTEX(out_mutex);
+
+static struct saved_trace_struct out_tr;	/* read by the pid -> cmdline helpers */
+
+static void notrace printk_name(unsigned long eip)
+{
+	unsigned long sym_size, sym_off;
+	char buf[KSYM_NAME_LEN+1];
+	const char *name;
+	char *module;
+
+	name = kallsyms_lookup(eip, &sym_size, &sym_off, &module, buf);
+	if (!name)
+		printk("<%08lx>", eip);	/* unresolved: raw address only */
+	else
+		printk("%s+%#lx/%#lx", name, sym_off, sym_size);
+}
+
+#ifdef CONFIG_DEBUG_STACKOVERFLOW
+
+#define MIN_STACK_NEEDED (sizeof(struct thread_info) + STACK_WARN)
+#define MAX_STACK (THREAD_SIZE - sizeof(struct thread_info))
+
+#if (defined(__i386__) || defined(__x86_64__)) && defined(CONFIG_FRAME_POINTER)
+# define PRINT_EXACT_STACKFRAME
+#endif
+
+#ifdef PRINT_EXACT_STACKFRAME
+static unsigned long *worst_stack_bp;	/* frame pointer captured in fill_worst_stack() */
+#endif
+static DEFINE_RAW_SPINLOCK(worst_stack_lock);
+unsigned long worst_stack_left = THREAD_SIZE;	/* smallest stack_left value recorded */
+static unsigned long worst_stack_printed = THREAD_SIZE;	/* value last printed */
+static char worst_stack_comm[TASK_COMM_LEN+1];
+static int worst_stack_pid;
+static unsigned long worst_stack_sp;
+static char worst_stack[THREAD_SIZE];	/* copy of the thread_info/stack area */
+
+static notrace void fill_worst_stack(unsigned long stack_left)
+{
+	unsigned long flags;
+
+	/*
+	 * On x64, we must not read the PDA during early bootup:
+	 */
+#ifdef CONFIG_X86_64
+	if (system_state == SYSTEM_BOOTING)
+		return;
+#endif
+	spin_lock_irqsave(&worst_stack_lock, flags);
+	if (likely(stack_left < worst_stack_left)) {	/* new record: snapshot it */
+		worst_stack_left = stack_left;
+		memcpy(worst_stack, current_thread_info(), THREAD_SIZE);
+		worst_stack_sp = (unsigned long)&stack_left;	/* address of a local */
+		memcpy(worst_stack_comm, current->comm, TASK_COMM_LEN);
+		worst_stack_pid = current->pid;
+#ifdef PRINT_EXACT_STACKFRAME
+# ifdef __i386__
+		asm ("mov %%ebp, %0\n" :"=g"(worst_stack_bp));
+# elif defined(__x86_64__)
+		asm ("mov %%rbp, %0\n" :"=g"(worst_stack_bp));
+# else
+#  error Poke the author of above asm code lines !
+# endif
+#endif
+	}
+	spin_unlock_irqrestore(&worst_stack_lock, flags);
+}
+
+#ifdef PRINT_EXACT_STACKFRAME
+
+/*
+ * This takes a BP offset to point the BP back into the saved stack,
+ * the original stack might be long gone (but the stackframe within
+ * the saved copy still contains references to it).
+ */
+#define CONVERT_TO_SAVED_STACK(bp) \
+	((void *)worst_stack + ((unsigned long)bp & (THREAD_SIZE-1)))
+
+static void show_stackframe(void)
+{
+	unsigned long addr, frame_size, *bp, *prev_bp, sum = 0;
+
+	bp = CONVERT_TO_SAVED_STACK(worst_stack_bp);
+
+	while (bp[0]) {	/* bp[0]: saved frame pointer, bp[1]: return address */
+		addr = bp[1];
+		if (!kernel_text_address(addr))
+			break;
+
+		prev_bp = bp;
+		bp = CONVERT_TO_SAVED_STACK((unsigned long *)bp[0]);
+
+		frame_size = (bp - prev_bp) * sizeof(long);
+
+		if (frame_size < THREAD_SIZE) {
+			printk("{ %4ld} ", frame_size);
+			sum += frame_size;
+		} else
+			printk("{=%4ld} ", sum);	/* implausible frame: print the total */
+
+		printk("[<%08lx>] ", addr);
+		printk_name(addr);
+		printk("\n");
+	}
+}
+
+#else
+
+static inline int valid_stack_ptr(void *p)
+{
+	char *q = p;	/* compare as byte addresses against worst_stack[] */
+
+	return q > worst_stack && q < worst_stack + THREAD_SIZE - 3;
+}
+
+static void show_stackframe(void)
+{
+	unsigned long prev_frame, addr;
+	unsigned long *stack;
+
+	prev_frame = (unsigned long)(worst_stack +
+					(worst_stack_sp & (THREAD_SIZE-1)));
+	stack = (unsigned long *)prev_frame;
+
+	while (valid_stack_ptr(stack)) {	/* scan word by word for text addresses */
+		addr = *stack++;
+		if (__kernel_text_address(addr)) {
+			printk("(%4ld) ", (unsigned long)stack - prev_frame);
+			printk("[<%08lx>] ", addr);
+			print_symbol("%s\n", addr);
+			prev_frame = (unsigned long)stack;
+		}
+		if ((char *)stack >= worst_stack + THREAD_SIZE)
+			break;
+	}
+}
+
+#endif
+
+static notrace void __print_worst_stack(void)
+{
+	printk("----------------------------->\n");
+	printk("| new stack-footprint maximum: %s/%d, %ld bytes (out of %ld bytes).\n",
+		worst_stack_comm, worst_stack_pid,
+		MAX_STACK-worst_stack_left, (long)MAX_STACK);	/* used vs. usable bytes */
+	printk("------------|\n");
+
+	show_stackframe();
+	printk("<---------------------------\n\n");
+}
+
+static notrace void print_worst_stack(void)
+{
+	unsigned long irqflags;
+	int is_new;
+
+	if (raw_irqs_disabled())
+		return;
+
+	/* Claim the current record under the lock; print outside of it. */
+	spin_lock_irqsave(&worst_stack_lock, irqflags);
+	is_new = (worst_stack_printed != worst_stack_left);
+	worst_stack_printed = worst_stack_left;
+	spin_unlock_irqrestore(&worst_stack_lock, irqflags);
+
+	if (is_new)
+		__print_worst_stack();
+}
+
+static notrace void debug_stackoverflow(struct cpu_trace *tr)
+{
+	long stack_left;
+
+	if (unlikely(tr->stack_check <= 0))	/* checking is switched off */
+		return;
+	atomic_inc(&tr->disabled);	/* keeps ___trace() from recording meanwhile */
+
+	/* Debugging check for stack overflow: is there less than 1KB free? */
+#ifdef __i386__
+	__asm__ __volatile__("and %%esp,%0" :
+				"=r" (stack_left) : "0" (THREAD_SIZE - 1));
+#elif defined(__x86_64__)
+	__asm__ __volatile__("and %%rsp,%0" :
+				"=r" (stack_left) : "0" (THREAD_SIZE - 1));
+#else
+# error Poke the author of above asm code lines !
+#endif
+	if (unlikely(stack_left < MIN_STACK_NEEDED)) {
+		tr->stack_check = 0;	/* no further checks through this cpu_trace */
+		printk(KERN_ALERT "BUG: stack overflow: only %ld bytes left! [%08lx...(%08lx-%08lx)]\n",
+			stack_left - sizeof(struct thread_info),
+			(long)&stack_left,
+			(long)current_thread_info(),
+			(long)current_thread_info() + THREAD_SIZE);
+		fill_worst_stack(stack_left);
+		__print_worst_stack();
+		goto out;
+	}
+	if (unlikely(stack_left < worst_stack_left)) {
+		tr->stack_check--;	/* dropped around the calls, restored below */
+		fill_worst_stack(stack_left);
+		print_worst_stack();
+		tr->stack_check++;
+	} else
+		if (worst_stack_printed != worst_stack_left) {
+			tr->stack_check--;
+			print_worst_stack();
+			tr->stack_check++;
+		}
+out:
+	atomic_dec(&tr->disabled);
+}
+
+#endif
+
+static void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+	  unsigned long eip, unsigned long parent_eip,
+	  unsigned long v1, unsigned long v2, unsigned long v3,
+	  unsigned long flags)
+{
+	struct trace_entry *entry;
+	unsigned long idx, idx_next;
+	cycles_t timestamp;
+	u32 pc;
+
+#ifdef CONFIG_DEBUG_PREEMPT
+//	WARN_ON(!atomic_read(&tr->disabled));
+#endif
+	if (!tr->critical_start && !trace_user_triggered && !trace_all_cpus && !trace_print_at_crash)
+		goto out;
+	/*
+	 * Allocate the next index. Make sure an NMI (or interrupt)
+	 * has not taken it away. Potentially redo the timestamp as
+	 * well to make sure the trace timestamps are in chronological
+	 * order.
+	 */
+again:
+	idx = tr->trace_idx;
+	idx_next = idx + 1;
+	timestamp = get_cycles();
+
+	if (unlikely(trace_freerunning && (idx_next >= MAX_TRACE)))
+		idx_next = 0;	/* freerunning mode wraps around */
+	if (unlikely(idx_next >= MAX_TRACE)) {
+		atomic_inc(&tr->overrun);	/* buffer full: count the dropped entry */
+		goto out;
+	}
+#ifdef __HAVE_ARCH_CMPXCHG
+	if (unlikely(cmpxchg(&tr->trace_idx, idx, idx_next) != idx))
+		goto again;
+#else
+# ifdef CONFIG_SMP
+#  error CMPXHG missing
+# else
+	/* No worry, we are protected by the atomic_inc(&tr->disabled)
+	 * in ___trace further down
+	 */
+	tr->trace_idx = idx_next;
+# endif
+#endif
+	pc = preempt_count();
+
+	entry = tr->trace + idx;
+	entry->type = type;
+#ifdef CONFIG_SMP
+	entry->cpu = cpu;
+#endif
+	entry->flags = (irqs_off() ? TRACE_FLAG_IRQS_OFF : 0) |
+		(raw_irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_HARD_OFF : 0)|
+		((pc & HARDIRQ_MASK) ? TRACE_FLAG_HARDIRQ : 0) |
+		((pc & SOFTIRQ_MASK) ? TRACE_FLAG_SOFTIRQ : 0) |
+		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
+	entry->preempt_count = pc & 0xff;
+	entry->pid = current->pid;
+	entry->timestamp = timestamp;
+
+	switch (type) {
+	case TRACE_FN:
+		entry->u.fn.eip = eip;
+		entry->u.fn.parent_eip = parent_eip;
+		break;
+	case TRACE_SPECIAL:
+	case TRACE_SPECIAL_PID:
+	case TRACE_SPECIAL_U64:
+		entry->u.special.eip = eip;
+		entry->u.special.v1 = v1;
+		entry->u.special.v2 = v2;
+		entry->u.special.v3 = v3;
+		break;
+	case TRACE_SYSCALL:
+		entry->u.syscall.nr = eip;	/* syscall number travels in eip */
+		entry->u.syscall.p1 = v1;
+		entry->u.syscall.p2 = v2;
+		entry->u.syscall.p3 = v3;
+		break;
+	case TRACE_SYSRET:
+		entry->u.sysret.ret = eip;	/* return value travels in eip */
+		break;
+	case TRACE_CMDLINE:
+		memcpy(entry->u.cmdline.str, current->comm, CMDLINE_BYTES);
+		break;
+	default:
+		break;
+	}
+out:
+	;
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+		unsigned long v1, unsigned long v2,
+			unsigned long v3)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+	int cpu;
+
+	if (unlikely(trace_enabled <= 0))
+		return;
+
+#if defined(CONFIG_DEBUG_STACKOVERFLOW) && defined(CONFIG_X86)
+	debug_stackoverflow(cpu_traces + raw_smp_processor_id());
+#endif
+
+	__raw_local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	/*
+	 * Trace on the CPU where the current highest-prio task
+	 * is waiting to become runnable:
+	 */
+#ifdef CONFIG_WAKEUP_TIMING
+	if (wakeup_timing && !trace_all_cpus && !trace_print_at_crash) {
+		if (!sch.tr || cpu != sch.cpu)
+			goto out;	/* not the CPU being traced: record nothing */
+		tr = sch.tr;
+	} else
+		tr = cpu_traces + cpu;
+#else
+	tr = cpu_traces + cpu;
+#endif
+	atomic_inc(&tr->disabled);
+	if (likely(atomic_read(&tr->disabled) == 1)) {	/* not nested: record */
+//#define DEBUG_STACK_POISON
+#ifdef DEBUG_STACK_POISON
+		char stack;
+
+		memset(&stack - 128, 0x34, 128);
+#endif
+		____trace(cpu, type, tr, eip, parent_eip, v1, v2, v3, flags);
+	}
+	atomic_dec(&tr->disabled);
+#ifdef CONFIG_WAKEUP_TIMING
+out:
+#endif
+	__raw_local_irq_restore(flags);
+}
+
+/*
+ * Special, ad-hoc tracepoints (record CALLER_ADDR0 plus three values):
+ */
+void notrace trace_special(unsigned long v1, unsigned long v2, unsigned long v3)
+{
+	___trace(TRACE_SPECIAL, CALLER_ADDR0, 0, v1, v2, v3);
+}
+
+EXPORT_SYMBOL(trace_special);
+
+void notrace trace_special_pid(int pid, unsigned long v1, unsigned long v2)
+{
+	___trace(TRACE_SPECIAL_PID, CALLER_ADDR0, 0, pid, v1, v2);	/* pid goes into the v1 slot */
+}
+
+EXPORT_SYMBOL(trace_special_pid);
+
+void notrace trace_special_u64(unsigned long long v1, unsigned long v2)
+{
+	___trace(TRACE_SPECIAL_U64, CALLER_ADDR0, 0,	/* v1 split: high 32 bits, low 32 bits */
+		 (unsigned long) (v1 >> 32), (unsigned long) (v1 & 0xFFFFFFFF), v2);
+}
+
+EXPORT_SYMBOL(trace_special_u64);
+
+/*
+ * Non-inlined function: records a TRACE_FN entry for eip/parent_eip.
+ */
+void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{
+	___trace(TRACE_FN, eip, parent_eip, 0, 0, 0);
+}
+
+extern void mcount(void);	/* defined outside this part of the file */
+
+EXPORT_SYMBOL(mcount);
+
+void notrace __mcount(void)
+{
+	___trace(TRACE_FN, CALLER_ADDR1, CALLER_ADDR2, 0, 0, 0);	/* NOTE(review): presumably caller and caller's caller - confirm */
+}
+
+void notrace
+sys_call(int nr, unsigned long p1, unsigned long p2, unsigned long p3)
+{
+	___trace(TRACE_SYSCALL, nr, 0, p1, p2, p3);	/* nr is passed in the eip slot */
+}
+
+void notrace sys_ret(int ret)
+{
+	___trace(TRACE_SYSRET, ret, 0, 0, 0, 0);	/* ret is passed in the eip slot */
+}
+
+static void notrace print_name(struct seq_file *m, unsigned long eip)
+{
+	char namebuf[KSYM_NAME_LEN+1];
+	unsigned long size, offset;
+	const char *sym_name;
+	char *modname;
+
+	/*
+	 * Special trace values (small magnitudes are printed as numbers):
+	 */
+	if (((long)eip < 10000L) && ((long)eip > -10000L)) {
+		seq_printf(m, "(%ld)", eip);	/* NOTE(review): unsigned long printed with %ld */
+		return;
+	}
+	sym_name = kallsyms_lookup(eip, &size, &offset, &modname, namebuf);
+	if (sym_name)
+		seq_puts(m, sym_name);
+	else
+		seq_printf(m, "<%08lx>", eip);
+}
+
+static void notrace print_name_offset(struct seq_file *m, unsigned long eip)
+{
+	unsigned long sym_size, sym_off;
+	char buf[KSYM_NAME_LEN+1];
+	const char *name;
+	char *module;
+
+	name = kallsyms_lookup(eip, &sym_size, &sym_off, &module, buf);
+	if (!name) {
+		seq_printf(m, "<%08lx>", eip);	/* unresolved: raw address only */
+		return;
+	}
+	seq_printf(m, "%s+%#lx/%#lx <%08lx>", name, sym_off, sym_size, eip);
+}
+
+static unsigned int out_sequence = -1;
+static int pid_to_cmdline_array[PID_MAX_DEFAULT+1];
+
+static void notrace _trace_cmdline(int cpu, struct cpu_trace *tr)
+{	/* emit a TRACE_CMDLINE event into the given cpu/tr via ____trace() */
+	unsigned long flags;
+
+	raw_local_save_flags(flags);
+	____trace(cpu, TRACE_CMDLINE, tr, 0, 0, 0, 0, 0, flags);
+}
+
+void notrace trace_cmdline(void)
+{	/* TRACE_CMDLINE event through the generic ___trace() entry point */
+	___trace(TRACE_CMDLINE, 0, 0, 0, 0, 0);
+}
+
+static void construct_pid_to_cmdline(void)
+{	/* rebuild pid_to_cmdline_array[] from the TRACE_CMDLINE entries of the output trace */
+	struct cpu_trace *tr = out_tr.traces;
+	unsigned int i, j, entries, pid;
+
+	if (tr->critical_sequence == out_sequence)
+		return;
+	out_sequence = tr->critical_sequence;
+	/* mark every pid slot as "no cmdline entry known" (-1): */
+	memset(pid_to_cmdline_array, -1, sizeof(int) * (PID_MAX_DEFAULT + 1));
+
+	entries = min(tr->trace_idx, MAX_TRACE-1);
+
+	for (i = 0; i < entries; i++) {
+		struct trace_entry *entry = tr->trace + i;
+
+		if (entry->type != TRACE_CMDLINE)
+			continue;
+		pid = entry->pid;
+		if (pid < PID_MAX_DEFAULT) {
+			pid_to_cmdline_array[pid] = i;	/* a later entry for the same pid overwrites */
+			/*
+			 * Replace space with underline - makes it easier
+			 * to process for tools:
+			 */
+			for (j = 0; j < CMDLINE_BYTES; j++)
+				if (entry->u.cmdline.str[j] == ' ')
+					entry->u.cmdline.str[j] = '_';
+		}
+	}
+}
+
+char *pid_to_cmdline(unsigned long pid)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	int slot;
+
+	/* clamp: every pid above PID_MAX_DEFAULT shares the last array slot */
+	pid = min(pid, (unsigned long)PID_MAX_DEFAULT);
+	if (pid == 0)
+		return "<idle>";
+
+	slot = pid_to_cmdline_array[pid];
+	/* -1 means no TRACE_CMDLINE entry was indexed for this pid */
+	if (slot == -1 || tr->trace[slot].type != TRACE_CMDLINE)
+		return "<...>";
+
+	return tr->trace[slot].u.cmdline.str;
+}
+
+struct block_idx {
+	int idx[NR_CPUS];	/* per-CPU position of the next entry to consume */
+};
+
+/*
+ * Return the CPU whose next unread max_tr entry (bidx->idx[cpu]) has the
+ * smallest timestamp, or -1 if no online CPU has an entry left in range:
+ */
+static int min_idx(struct block_idx *bidx)
+{
+	cycles_t min_stamp = (cycles_t) -1;
+	struct trace_entry *entry;
+	int cpu, min_cpu = -1, idx;
+
+	for_each_online_cpu(cpu) {
+		idx = bidx->idx[cpu];
+		if (idx >= min(max_tr.traces[cpu].trace_idx, MAX_TRACE-1))
+			continue;	/* this CPU's trace is exhausted */
+		if (idx >= MAX_TRACE*NR_CPUS) {
+			printk("huh: idx (%d) > %ld*%d!\n", idx, MAX_TRACE, NR_CPUS);
+			WARN_ON(1);
+			break;
+		}
+		entry = max_tr.traces[cpu].trace + bidx->idx[cpu];
+		if (entry->timestamp < min_stamp) {
+			min_cpu = cpu;
+			min_stamp = entry->timestamp;
+		}
+	}
+
+	return min_cpu;
+}
+
+/*
+ * This code is called to construct an output trace from
+ * the maximum trace. Having separate traces serves both
+ * atomicity (a new max might be saved while we are busy
+ * accessing /proc/latency_trace) and it is also used to
+ * delay the (expensive) sorting of the output trace by
+ * timestamps, in the trace_all_cpus case.
+ */
+static void update_out_trace(void)
+{
+	int cpu, sum, entries, overrun_sum;
+	struct cpu_trace *tmp_max, *tmp_out;
+	struct trace_entry *out_entry, *entry;
+	struct block_idx bidx = { { 0, }, };
+	cycles_t stamp, first_stamp, last_stamp;
+
+	/*
+	 * Nasty trick. We might overflow the first array but
+	 * there are NR_CPUS of them so we use it as a 'big'
+	 * trace buffer.
+	 */
+	tmp_out = out_tr.traces + 0;
+	*tmp_out = max_tr.traces[max_tr.cpu];
+	out_tr.cpu = max_tr.cpu;
+	out_entry = tmp_out->trace + 0;
+
+	if (!trace_all_cpus) {
+		entries = min(tmp_out->trace_idx, MAX_TRACE-1);
+		if (!entries)
+			return;
+		out_tr.first_timestamp = tmp_out->trace[0].timestamp;
+		out_tr.last_timestamp = tmp_out->trace[entries-1].timestamp;
+		return;
+	}
+	/*
+	 * Find the range of timestamps that are fully traced in
+	 * all CPU traces. (since CPU traces can cover a variable
+	 * range of time, we have to find the best range.)
+	 */
+	first_stamp = 0;
+	for_each_online_cpu(cpu) {
+		tmp_max = max_tr.traces + cpu;
+		stamp = tmp_max->trace[0].timestamp;
+		if (stamp > first_stamp)
+			first_stamp = stamp;
+	}
+	/*
+	 * Save the timestamp range:
+	 */
+	tmp_max = max_tr.traces + max_tr.cpu;
+	entries = min(tmp_max->trace_idx, MAX_TRACE-1);
+	/*
+	 * No saved trace yet?
+	 */
+	if (!entries) {
+		out_tr.traces[0].trace_idx = 0;
+		return;
+	}
+
+	last_stamp = tmp_max->trace[entries-1].timestamp;
+
+	if (last_stamp < first_stamp) {
+		WARN_ON(1);
+		/* dump per-CPU stamps; indices are clamped so short traces stay in bounds: */
+		for_each_online_cpu(cpu) {
+			tmp_max = max_tr.traces + cpu;
+			entries = min(tmp_max->trace_idx, MAX_TRACE-1);
+			printk("CPU%d: %016Lx (%016Lx) ... #%d (%016Lx) %016Lx\n", cpu,
+				tmp_max->trace[0].timestamp,
+				tmp_max->trace[1].timestamp,
+				entries,
+				tmp_max->trace[entries > 1 ? entries-2 : 0].timestamp,
+				tmp_max->trace[entries > 0 ? entries-1 : 0].timestamp);
+		}
+		tmp_max = max_tr.traces + max_tr.cpu;
+		entries = min(tmp_max->trace_idx, MAX_TRACE-1);
+
+		printk("CPU%d entries: %d\n", max_tr.cpu, entries);
+		printk("first stamp: %016Lx\n", first_stamp);
+		printk(" last stamp: %016Lx\n", last_stamp);
+	}
+
+#if 0
+	printk("first_stamp: %Ld [%016Lx]\n", first_stamp, first_stamp);
+	printk(" last_stamp: %Ld [%016Lx]\n", last_stamp, last_stamp);
+	printk("   +1 stamp: %Ld [%016Lx]\n",
+		tmp_max->trace[entries].timestamp,
+		tmp_max->trace[entries].timestamp);
+	printk("   +2 stamp: %Ld [%016Lx]\n",
+		tmp_max->trace[entries+1].timestamp,
+		tmp_max->trace[entries+1].timestamp);
+	printk("      delta: %Ld\n", last_stamp-first_stamp);
+	printk("    entries: %d\n", entries);
+#endif
+
+	out_tr.first_timestamp = first_stamp;
+	out_tr.last_timestamp = last_stamp;
+
+	/*
+	 * Fetch trace entries one by one, in increasing timestamp
+	 * order. Start at first_stamp, stop at last_stamp:
+	 */
+	sum = 0;
+	for (;;) {
+		cpu = min_idx(&bidx);
+		if (cpu == -1)
+			break;
+		entry = max_tr.traces[cpu].trace + bidx.idx[cpu];
+		if (entry->timestamp > last_stamp)
+			break;
+
+		bidx.idx[cpu]++;
+		if (entry->timestamp < first_stamp)
+			continue;
+		*out_entry = *entry;
+		out_entry++;
+		sum++;
+		if (sum >= MAX_TRACE*NR_CPUS) {
+			printk("huh: sum (%d) > %ld*%d!\n", sum, MAX_TRACE, NR_CPUS);
+			WARN_ON(1);
+			break;
+		}
+	}
+	/* publish totals: trace_idx and overrun are summed over all online CPUs */
+	sum = 0;
+	overrun_sum = 0;
+	for_each_online_cpu(cpu) {
+		sum += max_tr.traces[cpu].trace_idx;
+		overrun_sum += atomic_read(&max_tr.traces[cpu].overrun);
+	}
+	tmp_out->trace_idx = sum;
+	atomic_set(&tmp_out->overrun, overrun_sum);
+}
+
+static void notrace print_help_header(struct seq_file *m)
+{	/* ASCII legend for the non-verbose columns; l_start() prints it when !trace_verbose */
+	seq_puts(m, "                 _------=> CPU#            \n");
+	seq_puts(m, "                / _-----=> irqs-off        \n");
+	seq_puts(m, "               | / _----=> need-resched    \n");
+	seq_puts(m, "               || / _---=> hardirq/softirq \n");
+	seq_puts(m, "               ||| / _--=> preempt-depth   \n");
+	seq_puts(m, "               |||| /                      \n");
+	seq_puts(m, "               |||||     delay             \n");
+	seq_puts(m, "   cmd     pid ||||| time  |   caller      \n");
+	seq_puts(m, "      \\   /    |||||   \\   |   /           \n");
+}
+
+static void * notrace l_start(struct seq_file *m, loff_t *pos)
+{
+	loff_t n = *pos;
+	unsigned long entries;
+	struct cpu_trace *tr;
+	/* seq_file ->start: out_mutex is taken here and dropped only in l_stop() */
+	down(&out_mutex);
+	/*
+	 * if the file is being read newly, update the output trace:
+	 */
+	if (!n) {
+		// TODO: use the sequence counter here to optimize
+		down(&max_mutex);
+		update_out_trace();
+		up(&max_mutex);
+		if (!out_tr.traces[0].trace_idx) {
+			/* no up() here: seq_file calls ->stop (l_stop) even when ->start returns NULL */
+			return NULL;
+		}
+		construct_pid_to_cmdline();
+	}
+	tr = out_tr.traces;
+	entries = min(tr->trace_idx, MAX_TRACE-1);
+
+	if (!n) {
+		seq_printf(m, "preemption latency trace v1.1.5 on %s\n", UTS_RELEASE);
+		seq_puts(m, "--------------------------------------------------------------------\n");
+		seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d | (M:%s VP:%d, KP:%d, SP:%d HP:%d",
+			cycles_to_usecs(tr->saved_latency),
+			entries, entries + atomic_read(&tr->overrun),
+			out_tr.cpu,
+#if defined(CONFIG_PREEMPT_NONE)
+			"server",
+#elif defined(CONFIG_PREEMPT_VOLUNTARY)
+			"desktop",
+#elif defined(CONFIG_PREEMPT_DESKTOP)
+			"preempt",
+#else
+			"rt",
+#endif
+			0, 0,
+			softirq_preemption, hardirq_preemption);
+#ifdef CONFIG_SMP
+		seq_printf(m, " #P:%d)\n", num_online_cpus());
+#else
+		seq_puts(m, ")\n");
+#endif
+		seq_puts(m, "    -----------------\n");
+		seq_printf(m, "    | task: %.16s-%d (uid:%ld nice:%ld policy:%ld rt_prio:%ld)\n",
+			tr->comm, tr->pid, tr->uid, tr->nice,
+			tr->policy, tr->rt_priority);
+		seq_puts(m, "    -----------------\n");
+		if (trace_user_triggered) {
+			seq_puts(m, " => started at: ");
+			print_name_offset(m, tr->critical_start);
+			seq_puts(m, "\n => ended at:   ");
+			print_name_offset(m, tr->critical_end);
+			seq_puts(m, "\n");
+		}
+		seq_puts(m, "\n");
+
+		if (!trace_verbose)
+			print_help_header(m);
+	}
+	if (n >= entries)
+		return NULL;
+
+	return tr->trace + n;
+}
+
+static void * notrace l_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	unsigned long entries = min(tr->trace_idx, MAX_TRACE-1);
+	/* advance *pos; the vim modeline footer is printed only when it lands exactly on the end */
+	if (++*pos >= entries) {
+		if (*pos == entries)
+			seq_puts(m, "\n\nvim:ft=help\n");
+		return NULL;
+	}
+	return tr->trace + *pos;
+}
+
+static void notrace l_stop(struct seq_file *m, void *p)
+{	/* releases the out_mutex that l_start() acquired */
+	up(&out_mutex);
+}
+
+static void print_timestamp(struct seq_file *m, unsigned long abs_usecs,
+						unsigned long rel_usecs)
+{
+	/* delay marker: '!' above 100 usecs, '+' above 1 usec, blank otherwise */
+	const char *mark = (rel_usecs > 100) ? "!: " :
+			   (rel_usecs > 1) ? "+: " : " : ";
+
+	/* absolute time first, then the marker with its ": " separator */
+	seq_printf(m, " %4ldus", abs_usecs);
+	seq_puts(m, mark);
+}
+
+static void
+print_timestamp_short(struct seq_file *m, unsigned long abs_usecs,
+			unsigned long rel_usecs)
+{
+	/*
+	 * Single marker character: '!' above 100 usecs, '+' above 1 usec, else ' '
+	 */
+	char mark = (rel_usecs > 100) ? '!' : (rel_usecs > 1) ? '+' : ' ';
+
+	seq_printf(m, " %4ldus", abs_usecs);
+	seq_putc(m, mark);
+}
+
+static void
+print_generic(struct seq_file *m, struct trace_entry *entry)
+{
+	int hardirq, softirq;
+
+	seq_printf(m, "%8.8s-%-5d ", pid_to_cmdline(entry->pid), entry->pid);
+	seq_printf(m, "%d", entry->cpu);
+	seq_printf(m, "%c%c",
+		(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+		(entry->flags & TRACE_FLAG_IRQS_HARD_OFF) ? 'D' : '.',
+		(entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'n' : '.');
+
+	/*
+	 * 'H' = hardirq and softirq, 'h' = hardirq, 's' = softirq, '.' = neither:
+	 */
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq)
+		seq_putc(m, 'H');
+	else if (hardirq)
+		seq_putc(m, 'h');
+	else if (softirq)
+		seq_putc(m, 's');
+	else
+		seq_putc(m, '.');
+
+	/* preempt count in hex, or '.' when it is zero */
+	if (!entry->preempt_count)
+		seq_puts(m, ".");
+	else
+		seq_printf(m, "%x", entry->preempt_count);
+}
+
+
+static int notrace l_show_fn(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	/* abs: time since entry0; rel: gap to next_entry (both converted to usecs) */
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+	/* verbose: raw fields plus symbol+offset; otherwise generic columns plus bare symbol */
+	if (trace_verbose) {
+		seq_printf(m, "%16s %5d %d %d %08x %08lx [%016Lx] %ld.%03ldms (+%ld.%03ldms): ",
+			pid_to_cmdline(entry->pid),
+			entry->pid, entry->cpu, entry->flags,
+			entry->preempt_count, trace_idx,
+			entry->timestamp, abs_usecs/1000,
+			abs_usecs % 1000, rel_usecs/1000, rel_usecs % 1000);
+		print_name_offset(m, entry->u.fn.eip);
+		seq_puts(m, " (");
+		print_name_offset(m, entry->u.fn.parent_eip);
+		seq_puts(m, ")\n");
+	} else {
+		print_generic(m, entry);
+		print_timestamp(m, abs_usecs, rel_usecs);
+		print_name(m, entry->u.fn.eip);
+		seq_puts(m, " (");
+		print_name(m, entry->u.fn.parent_eip);
+		seq_puts(m, ")\n");
+	}
+	return 0;
+}
+
+static int notrace l_show_special(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry, int mode64)
+{
+	unsigned long abs_usecs, rel_usecs;
+	/* abs: time since entry0; rel: gap to next_entry (both converted to usecs) */
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+	/* mode64: v1/v2 are the high/low 32-bit halves of one value, so zero-pad the low half */
+	if (!mode64) {
+		seq_printf(m, " (%lx %lx %lx)\n",
+			   entry->u.special.v1, entry->u.special.v2, entry->u.special.v3);
+	} else {
+		seq_printf(m, " (%lx%08lx %lx)\n",
+			   entry->u.special.v1, entry->u.special.v2, entry->u.special.v3);
+	}
+	return 0;
+}
+
+static int notrace
+l_show_special_pid(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	unsigned int pid;
+	/* for TRACE_SPECIAL_PID entries the first value slot holds a pid */
+	pid = entry->u.special.v1;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp(m, abs_usecs, rel_usecs);
+	if (trace_verbose)
+		print_name_offset(m, entry->u.special.eip);
+	else
+		print_name(m, entry->u.special.eip);
+	seq_printf(m, " <%.8s-%d> (%lx %lx)\n",
+		pid_to_cmdline(pid), pid,
+		entry->u.special.v2, entry->u.special.v3);
+
+	return 0;
+}
+
+static int notrace l_show_cmdline(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	/* TRACE_CMDLINE entries are printed in verbose mode only */
+	if (!trace_verbose)
+		return 0;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	seq_printf(m,
+		"[ => %16s ] %ld.%03ldms (+%ld.%03ldms)\n",
+			entry->u.cmdline.str,
+			abs_usecs/1000, abs_usecs % 1000,
+			rel_usecs/1000, rel_usecs % 1000);
+
+	return 0;
+}
+
+extern unsigned long sys_call_table[NR_syscalls];
+
+static int notrace l_show_syscall(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	unsigned int nr;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp_short(m, abs_usecs, rel_usecs);
+	/* "> name (args)": the name is resolved from sys_call_table[] when nr is in range */
+	seq_puts(m, "> ");
+	nr = entry->u.syscall.nr;
+	if (nr < NR_syscalls)
+		print_name(m, sys_call_table[nr]);
+	else
+		seq_printf(m, "<badsys(%u)>", nr);
+
+	seq_printf(m, " (%08lx %08lx %08lx)\n",
+		entry->u.syscall.p1, entry->u.syscall.p2, entry->u.syscall.p3);
+
+	return 0;
+}
+
+static int notrace l_show_sysret(struct seq_file *m, unsigned long trace_idx,
+		struct trace_entry *entry, struct trace_entry *entry0,
+		struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+
+	print_generic(m, entry);
+	print_timestamp_short(m, abs_usecs, rel_usecs);
+	/* "< (ret)": the recorded return value, printed in decimal */
+	seq_printf(m, "< (%d)\n", entry->u.sysret.ret);
+
+	return 0;
+}
+
+
+static int notrace l_show(struct seq_file *m, void *p)
+{
+	struct cpu_trace *tr = out_tr.traces;
+	struct trace_entry *entry, *entry0, *next_entry;
+	unsigned long trace_idx;
+	/* seq_file ->show: print one entry; entries outside the out_tr stamp range are skipped */
+	cond_resched();
+	entry = p;
+	if (entry->timestamp < out_tr.first_timestamp)
+		return 0;
+	if (entry->timestamp > out_tr.last_timestamp)
+		return 0;
+	/* p points into tr->trace (see l_start/l_next), so the difference is its index */
+	entry0 = tr->trace;
+	trace_idx = entry - entry0;
+	/* the last entry has no successor: use itself, which yields a zero relative delay */
+	if (trace_idx + 1 < tr->trace_idx)
+		next_entry = entry + 1;
+	else
+		next_entry = entry;
+
+	if (trace_verbose)
+		seq_printf(m, "(T%d/#%ld) ", entry->type, trace_idx);
+
+	switch (entry->type) {
+		case TRACE_FN:
+			l_show_fn(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SPECIAL:
+			l_show_special(m, trace_idx, entry, entry0, next_entry, 0);
+			break;
+		case TRACE_SPECIAL_PID:
+			l_show_special_pid(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SPECIAL_U64:
+			l_show_special(m, trace_idx, entry, entry0, next_entry, 1);
+			break;
+		case TRACE_CMDLINE:
+			l_show_cmdline(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SYSCALL:
+			l_show_syscall(m, trace_idx, entry, entry0, next_entry);
+			break;
+		case TRACE_SYSRET:
+			l_show_sysret(m, trace_idx, entry, entry0, next_entry);
+			break;
+		default:
+			seq_printf(m, "unknown trace type %d\n", entry->type);
+	}
+	return 0;
+}
+
+struct seq_operations latency_trace_op = {
+	.start	= l_start,	/* takes out_mutex, refreshes out_tr at pos 0 */
+	.next	= l_next,	/* steps to the next output entry */
+	.stop	= l_stop,	/* drops out_mutex */
+	.show	= l_show	/* formats one entry by type */
+};
+
+static void copy_trace(struct cpu_trace *save, struct cpu_trace *tr)
+{	/* copy tr's entries, trace_idx and overrun counter into save */
+	/* free-running needs reordering */
+	if (trace_freerunning) {
+		int i, idx, idx0 = tr->trace_idx;
+		/* rotate the ring so the entry at idx0 becomes save->trace[0] */
+		for (i = 0; i < MAX_TRACE; i++) {
+			idx = (idx0 + i) % MAX_TRACE;
+			save->trace[i] = tr->trace[idx];
+		}
+		save->trace_idx = MAX_TRACE-1;
+	} else {
+		save->trace_idx = tr->trace_idx;
+		/* linear buffer: copy trace_idx+1 entries, capped at MAX_TRACE-1 */
+		memcpy(save->trace, tr->trace,
+			min(save->trace_idx + 1, MAX_TRACE-1) *
+					sizeof(struct trace_entry));
+	}
+	save->overrun = tr->overrun;
+}
+
+/*
+ * Copy the new maximum trace into the separate maximum-trace
+ * structure. (this way the maximum trace is permanently saved,
+ * for later retrieval via /proc/latency_trace)
+ */
+static void update_max_tr(struct cpu_trace *tr)
+{
+	struct cpu_trace *save;
+	int cpu, all_cpus = 0;
+
+#ifdef CONFIG_PREEMPT
+	WARN_ON(!preempt_count() && !raw_irqs_disabled());
+#endif
+
+	max_tr.cpu = tr->cpu;
+	save = max_tr.traces + tr->cpu;
+	/* when snapshotting all CPUs, raise each CPU's disabled count until its copy is done */
+	if ((wakeup_timing || trace_user_triggered || trace_print_at_crash) &&
+				trace_all_cpus) {
+		all_cpus = 1;
+		for_each_online_cpu(cpu)
+			atomic_inc(&cpu_traces[cpu].disabled);
+	}
+	/* record the latency and the critical-section bookkeeping taken from tr */
+	save->saved_latency = preempt_max_latency;
+	save->preempt_timestamp = tr->preempt_timestamp;
+	save->critical_start = tr->critical_start;
+	save->critical_end = tr->critical_end;
+	save->critical_sequence = tr->critical_sequence;
+	/* snapshot identity and scheduling parameters of the current task */
+	memcpy(save->comm, current->comm, CMDLINE_BYTES);
+	save->pid = current->pid;
+	save->uid = current->uid;
+	save->nice = current->static_prio - 20 - MAX_RT_PRIO;
+	save->policy = current->policy;
+	save->rt_priority = current->rt_priority;
+
+	if (all_cpus) {
+		for_each_online_cpu(cpu) {
+			copy_trace(max_tr.traces + cpu, cpu_traces + cpu);
+			atomic_dec(&cpu_traces[cpu].disabled);
+		}
+	} else
+		copy_trace(save, tr);
+}
+
+#else /* !LATENCY_TRACE */
+
+static inline void notrace
+____trace(int cpu, enum trace_type type, struct cpu_trace *tr,
+	  unsigned long eip, unsigned long parent_eip,
+	  unsigned long v1, unsigned long v2, unsigned long v3,
+	  unsigned long flags)
+{	/* no-op stub for the !LATENCY_TRACE build */
+}
+
+static inline void notrace
+___trace(enum trace_type type, unsigned long eip, unsigned long parent_eip,
+		unsigned long v1, unsigned long v2,
+			unsigned long v3)
+{	/* no-op stub for the !LATENCY_TRACE build */
+}
+
+static inline void notrace __trace(unsigned long eip, unsigned long parent_eip)
+{	/* no-op stub for the !LATENCY_TRACE build */
+}
+
+static inline void update_max_tr(struct cpu_trace *tr)
+{	/* no-op stub for the !LATENCY_TRACE build */
+}
+
+static inline void notrace _trace_cmdline(int cpu, struct cpu_trace *tr)
+{	/* no-op stub for the !LATENCY_TRACE build */
+}
+
+#endif
+
+static int setup_preempt_thresh(char *s)
+{	/* "preempt_thresh=<usecs>" boot option; only positive values are applied */
+	int thresh = 0;	/* stays 0 if get_option() finds nothing to parse */
+
+	get_option(&s, &thresh);
+	if (thresh > 0) {
+		preempt_thresh = usecs_to_cycles(thresh);
+		printk("Preemption threshold = %d us\n", thresh);
+	}
+	return 1;
+}
+__setup("preempt_thresh=", setup_preempt_thresh);
+
+static inline void notrace reset_trace_idx(int cpu, struct cpu_trace *tr)
+{	/* rewind every online CPU's buffer in trace_all_cpus mode (cpu is reused as the iterator), else only tr */
+	if (trace_all_cpus)
+		for_each_online_cpu(cpu)
+			cpu_traces[cpu].trace_idx = 0;
+	else
+		tr->trace_idx = 0;
+}
+
+#ifdef CONFIG_CRITICAL_TIMING
+
+static void notrace
+check_critical_timing(int cpu, struct cpu_trace *tr, unsigned long parent_eip)
+{
+	unsigned long latency, t0, t1;
+	cycles_t T0, T1, T2, delta;
+	unsigned long flags;
+	/* measures the section that began at tr->preempt_timestamp; skipped in user-triggered mode */
+	if (trace_user_triggered)
+		return;
+	/*
+	 * usecs conversion is slow so we try to delay the conversion
+	 * as long as possible:
+	 */
+	T0 = tr->preempt_timestamp;
+	T1 = get_cycles();
+	delta = T1-T0;
+
+	raw_local_save_flags(flags);
+
+	if (!report_latency(delta))
+		goto out;
+
+	____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+	/*
+	 * Update the timestamp, because the trace entry above
+	 * might change it (it can only get larger so the latency
+	 * is fair to be reported):
+	 */
+	T2 = get_cycles();
+	if (T2 < T1)
+		printk("bug: %016Lx < %016Lx!\n", T2, T1);
+	delta = T2-T0;
+
+	latency = cycles_to_usecs(delta);
+	latency_hist(tr->latency_type, cpu, latency);
+
+	if (latency_hist_flag) {
+		if (preempt_max_latency >= delta)
+			goto out;
+	}
+	/* give up if a newer maximum was recorded meanwhile or max_mutex is contended */
+	if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+		goto out;
+
+#ifndef CONFIG_CRITICAL_LATENCY_HIST
+	if (!preempt_thresh && preempt_max_latency > delta) {
+		printk("bug: updating %016Lx > %016Lx?\n",
+			preempt_max_latency, delta);
+		printk("  [%016Lx %016Lx %016Lx]\n", T0, T1, T2);
+	}
+#endif
+
+	preempt_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+
+	tr->critical_end = parent_eip;
+
+	update_max_tr(tr);
+
+#ifndef CONFIG_CRITICAL_LATENCY_HIST
+	if (preempt_thresh)
+		printk("(%16s-%-5d|#%d): %lu us critical section "
+			"violates %lu us threshold.\n"
+			" => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, cycles_to_usecs(preempt_thresh), t0);
+	else
+		printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+			"critical section.\n => started at timestamp %lu: ",
+				current->comm, current->pid,
+				raw_smp_processor_id(),
+				latency, t0);
+
+	print_symbol("<%s>\n", tr->critical_start);
+	printk(" =>   ended at timestamp %lu: ", t1);
+	print_symbol("<%s>\n", tr->critical_end);
+	dump_stack();
+	t1 = cycles_to_usecs(get_cycles());
+	printk(" =>   dump-end timestamp %lu\n\n", t1);
+#endif
+
+	max_sequence++;
+
+	up(&max_mutex);
+	/* every path restarts the measurement: new timestamp, rewound buffer, fresh cmdline + fn entries */
+out:
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = get_cycles();
+	tr->early_warning = 0;
+	reset_trace_idx(cpu, tr);
+	_trace_cmdline(cpu, tr);
+	____trace(cpu, TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+}
+
+void notrace touch_critical_timing(void)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	/* only acts while a critical section is being timed and tracing is not disabled */
+	if (!tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+	/* check the section so far, then restart it from the caller's address */
+	if (preempt_count() > 0 && tr->critical_start) {
+		atomic_inc(&tr->disabled);
+		check_critical_timing(cpu, tr, CALLER_ADDR0);
+		tr->critical_start = CALLER_ADDR0;
+		tr->critical_sequence = max_sequence;
+		atomic_dec(&tr->disabled);
+	}
+}
+EXPORT_SYMBOL(touch_critical_timing);
+
+void notrace stop_critical_timing(void)
+{
+	struct cpu_trace *tr = cpu_traces + raw_smp_processor_id();
+	/* clears critical_start without running check_critical_timing() */
+	tr->critical_start = 0;
+}
+EXPORT_SYMBOL(stop_critical_timing);
+
+static inline void notrace
+__start_critical_timing(unsigned long eip, unsigned long parent_eip, int latency_type)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	unsigned long flags;
+	/* no-op if a section is already being timed, tracing is disabled, or another mode is active */
+	if (tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+
+	atomic_inc(&tr->disabled);
+	/* arm the measurement: sequence, start stamp, start address, cleared overrun, rewound buffer */
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = get_cycles();
+	tr->critical_start = eip;
+	atomic_set(&tr->overrun, 0);
+	reset_trace_idx(cpu, tr);
+	tr->latency_type = latency_type;
+	_trace_cmdline(cpu, tr);
+
+	raw_local_save_flags(flags);
+	____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags);
+
+	atomic_dec(&tr->disabled);
+}
+
+static inline void notrace
+__stop_critical_timing(unsigned long eip, unsigned long parent_eip)
+{
+	int cpu = raw_smp_processor_id();
+	struct cpu_trace *tr = cpu_traces + cpu;
+	unsigned long flags;
+	/* no-op unless a section is being timed and tracing is enabled in this mode */
+	if (!tr->critical_start || atomic_read(&tr->disabled) ||
+			trace_user_triggered || wakeup_timing)
+		return;
+	/* log the end point, evaluate the section, then mark it finished */
+	atomic_inc(&tr->disabled);
+	raw_local_save_flags(flags);
+	____trace(cpu, TRACE_FN, tr, eip, parent_eip, 0, 0, 0, flags);
+	check_critical_timing(cpu, tr, eip);
+	tr->critical_start = 0;
+	atomic_dec(&tr->disabled);
+}
+
+#endif
+
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+
+void notrace trace_irqs_off_lowlevel(void)
+{
+	unsigned long flags;
+
+	raw_local_save_flags(flags);
+	/* start INTERRUPT_LATENCY timing with parent_eip 0 when irqs are off and irqs_off_preempt_count() is zero */
+	if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags))
+		__start_critical_timing(CALLER_ADDR0, 0, INTERRUPT_LATENCY);
+}
+
+void notrace trace_irqs_off(void)
+{
+	unsigned long flags;
+
+	raw_local_save_flags(flags);
+	/* start INTERRUPT_LATENCY timing when irqs are off and irqs_off_preempt_count() is zero */
+	if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags))
+		__start_critical_timing(CALLER_ADDR0, CALLER_ADDR1, INTERRUPT_LATENCY);
+}
+
+EXPORT_SYMBOL(trace_irqs_off);
+
+void notrace trace_irqs_on(void)
+{
+	unsigned long flags;
+
+	raw_local_save_flags(flags);
+	/* stop the timing while irqs are still off; the saved flags are restored afterwards */
+	if (!irqs_off_preempt_count() && raw_irqs_disabled_flags(flags))
+		__stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+
+	__raw_local_irq_restore(flags);
+}
+
+EXPORT_SYMBOL(trace_irqs_on);
+
+#endif
+
+#endif /* LATENCY_TIMING */
+
+#if defined(CONFIG_DEBUG_PREEMPT) || defined(CONFIG_CRITICAL_TIMING)
+
+static inline unsigned long get_parent_eip(void)
+{
+	unsigned long parent_eip = CALLER_ADDR1;
+	/* step out (at most two levels) while the address lies inside the lock functions */
+	if (in_lock_functions(parent_eip)) {
+		parent_eip = CALLER_ADDR2;
+		if (in_lock_functions(parent_eip))
+			parent_eip = CALLER_ADDR3;
+	}
+
+	return parent_eip;
+}
+
+void notrace add_preempt_count_ti(struct thread_info *ti, unsigned int val)
+{
+	unsigned long eip = CALLER_ADDR0;
+	unsigned long parent_eip = get_parent_eip();
+	/* adds val to ti's preempt count; may record the caller and start preempt-off timing */
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(((int)preempt_count_ti(ti) < 0));
+	/*
+	 * Spinlock count overflowing soon?
+	 */
+	BUG_ON((preempt_count_ti(ti) & PREEMPT_MASK) >= PREEMPT_MASK-10);
+#endif
+
+	preempt_count_ti(ti) += val;
+#ifdef CONFIG_PREEMPT_TRACE
+	if (val <= 10) {
+		unsigned int idx = preempt_count_ti(ti) & PREEMPT_MASK;
+		if (idx < MAX_PREEMPT_TRACE) {
+			current->preempt_trace_eip[idx] = eip;
+			current->preempt_trace_parent_eip[idx] = parent_eip;
+		}
+	}
+#endif
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		raw_local_save_flags(flags);
+		/* NOTE(review): tests preempt_count(), not preempt_count_ti(ti) as sub_preempt_count_ti() does - confirm */
+		if (!raw_irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == val)
+				__start_critical_timing(eip, parent_eip, PREEMPT_LATENCY);
+	}
+#endif
+	(void)eip, (void)parent_eip;
+}
+EXPORT_SYMBOL(add_preempt_count_ti);
+
+void notrace add_preempt_count(unsigned int val)
+{	/* convenience wrapper operating on current_thread_info() */
+	add_preempt_count_ti(current_thread_info(), val);
+}
+
+EXPORT_SYMBOL(add_preempt_count);
+
+void notrace sub_preempt_count_ti(struct thread_info *ti, unsigned int val)
+{	/* subtracts val from ti's preempt count; stops preempt-off timing when the count will hit zero */
+#ifdef CONFIG_DEBUG_PREEMPT
+	/*
+	 * Underflow?
+	 */
+	BUG_ON(unlikely(val > preempt_count_ti(ti)));
+
+	/*
+	 * Is the spinlock portion underflowing?
+	 */
+	BUG_ON((val < PREEMPT_MASK) && !(preempt_count_ti(ti) & PREEMPT_MASK));
+#endif
+
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		raw_local_save_flags(flags);
+		/* with irqs-off timing configured, irqs-off sections are left to that code */
+		if (!raw_irqs_disabled_flags(flags))
+#endif
+			if (preempt_count_ti(ti) == val)
+				__stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+	}
+#endif
+	preempt_count_ti(ti) -= val;
+}
+
+EXPORT_SYMBOL(sub_preempt_count_ti);
+
+void notrace sub_preempt_count(unsigned int val)
+{	/* convenience wrapper operating on current_thread_info() */
+	sub_preempt_count_ti(current_thread_info(), val);
+}
+
+EXPORT_SYMBOL(sub_preempt_count);
+
+void notrace mask_preempt_count(unsigned int mask)
+{
+	unsigned long eip = CALLER_ADDR0;
+	unsigned long parent_eip = get_parent_eip();
+	/* OR mask into the preempt count; start timing if mask is now the whole count */
+	preempt_count() |= mask;
+
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		raw_local_save_flags(flags);
+
+		if (!raw_irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == mask)
+				__start_critical_timing(eip, parent_eip, PREEMPT_LATENCY);
+	}
+#endif
+	(void) eip, (void) parent_eip;
+}
+EXPORT_SYMBOL(mask_preempt_count);
+
+void notrace unmask_preempt_count(unsigned int mask)
+{	/* stop timing if mask is the whole preempt count, then clear the mask bits */
+#ifdef CONFIG_CRITICAL_PREEMPT_TIMING
+	{
+#ifdef CONFIG_CRITICAL_IRQSOFF_TIMING
+		unsigned long flags;
+
+		raw_local_save_flags(flags);
+
+		if (!raw_irqs_disabled_flags(flags))
+#endif
+			if (preempt_count() == mask)
+				__stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
+	}
+#endif
+	preempt_count() &= ~mask;
+}
+EXPORT_SYMBOL(unmask_preempt_count);
+
+
+#endif
+
+/*
+ * Wakeup latency timing/tracing. We get upcalls from the scheduler
+ * when a task is being woken up and we time/trace it until it gets
+ * to a CPU - or an even-higher-prio task supersedes it. (in that
+ * case we throw away the currently traced task - we don't try to
+ * handle nesting, that simplifies things significantly)
+ */
+#ifdef CONFIG_WAKEUP_TIMING
+
+static void notrace
+check_wakeup_timing(struct cpu_trace *tr, unsigned long parent_eip)
+{
+	unsigned long latency, t0, t1, flags;
+	cycles_t T0, T1, T2, delta;
+	int cpu = raw_smp_processor_id();
+
+	if (trace_user_triggered)
+		return;
+	/* proceed only if our increment is the sole holder of tr->disabled */
+	atomic_inc(&tr->disabled);
+	if (atomic_read(&tr->disabled) != 1)
+		goto out;
+
+	T0 = tr->preempt_timestamp;
+	T1 = get_cycles();
+	/*
+	 * maybe preempt_timestamp originated on another CPU,
+	 * with a TSC drift:
+	 */
+	if (T0 > T1)
+		T0 = T1;
+	delta = T1-T0;
+
+	if (!report_latency(delta))
+		goto out;
+
+	raw_local_save_flags(flags);
+	____trace(smp_processor_id(), TRACE_FN, tr, CALLER_ADDR0, parent_eip, 0, 0, 0, flags);
+	T2 = get_cycles();
+	if (T2 < T1)
+		printk("bug2: %016Lx < %016Lx!\n", T2, T1);
+	delta = T2-T0;
+
+	latency = cycles_to_usecs(delta);
+	latency_hist(tr->latency_type, cpu, latency);
+
+	if (latency_hist_flag) {
+		if (preempt_max_latency >= delta)
+			goto out;
+	}
+	/* give up if a newer maximum was recorded meanwhile or max_mutex is contended */
+	if (tr->critical_sequence != max_sequence || down_trylock(&max_mutex))
+		goto out;
+
+#ifndef CONFIG_WAKEUP_LATENCY_HIST
+	if (!preempt_thresh && preempt_max_latency > delta) {
+		printk("bug2: updating %016Lx > %016Lx?\n",
+			preempt_max_latency, delta);
+		printk("  [%016Lx %016Lx %016Lx]\n", T0, T1, T2);
+	}
+#endif
+
+	preempt_max_latency = delta;
+	t0 = cycles_to_usecs(T0);
+	t1 = cycles_to_usecs(T1);
+	tr->critical_end = parent_eip;
+
+	update_max_tr(tr);
+
+#ifndef CONFIG_WAKEUP_LATENCY_HIST
+	if (preempt_thresh)
+		printk("(%16s-%-5d|#%d): %lu us wakeup latency "
+			"violates %lu us threshold.\n",
+				current->comm, current->pid,
+				raw_smp_processor_id(), latency,
+				cycles_to_usecs(preempt_thresh));
+	else
+		printk("(%16s-%-5d|#%d): new %lu us maximum-latency "
+			"wakeup.\n", current->comm, current->pid,
+				raw_smp_processor_id(), latency);
+#endif
+
+	max_sequence++;
+
+	up(&max_mutex);
+
+out:
+	atomic_dec(&tr->disabled);
+}
+
+/*
+ * Start wakeup latency tracing - called with the runqueue held
+ * and interrupts disabled:
+ */
+void __trace_start_sched_wakeup(struct task_struct *p)
+{
+	struct cpu_trace *tr;
+	int cpu;
+
+	if (trace_user_triggered || !wakeup_timing)
+		return;
+
+	spin_lock(&sch.trace_lock);
+	if (sch.task && (sch.task->prio >= p->prio))
+		goto out_unlock;
+
+	/*
+	 * New highest-prio task just woke up - start tracing:
+	 */
+	sch.task = p;
+	cpu = task_cpu(p);
+	sch.cpu = cpu;
+	/*
+	 * We keep using this CPU's trace buffer even if the task
+	 * gets migrated to another CPU. Tracing only happens on
+	 * the CPU that 'owns' the highest-prio task so it's
+	 * fundamentally single-threaded.
+	 */
+	sch.tr = tr = cpu_traces + cpu;
+	reset_trace_idx(cpu, tr);
+	/* arm the measurement while tr->disabled is raised: */
+//	if (!atomic_read(&tr->disabled)) {
+		atomic_inc(&tr->disabled);
+		tr->critical_sequence = max_sequence;
+		tr->preempt_timestamp = get_cycles();
+		tr->latency_type = WAKEUP_LATENCY;
+		tr->critical_start = CALLER_ADDR0;
+		atomic_set(&tr->overrun, 0);
+		_trace_cmdline(raw_smp_processor_id(), tr);
+		atomic_dec(&tr->disabled);
+//	}
+
+	mcount();
+	trace_special_pid(p->pid, p->prio, cpu);
+out_unlock:
+	spin_unlock(&sch.trace_lock);
+}
+
+void trace_stop_sched_switched(struct task_struct *p)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+	/* if p is the task being wakeup-traced, finish the measurement; otherwise only log */
+	if (trace_user_triggered || !wakeup_timing)
+		return;
+
+	raw_local_irq_save(flags);
+	spin_lock(&sch.trace_lock);
+	if (p == sch.task) {
+		trace_special_pid(p->pid, p->prio, task_cpu(p));
+
+		sch.task = NULL;
+		tr = sch.tr;
+		sch.tr = NULL;
+		WARN_ON(!tr);
+		/*
+		 * Somewhat racy but safer - the printks within
+		 * check_wakeup_timing() can call back into the
+		 * wakeup-timing code and deadlock:
+		 */
+//		atomic_inc(&tr->disabled);
+		preempt_disable();
+		spin_unlock(&sch.trace_lock);
+		check_wakeup_timing(tr, CALLER_ADDR0);
+		preempt_enable();
+//		atomic_dec(&tr->disabled);
+	} else {
+		if (sch.task)
+			trace_special_pid(sch.task->pid, sch.task->prio, p->prio);
+		if (sch.task && (sch.task->prio >= p->prio))
+			sch.task = NULL;
+		spin_unlock(&sch.trace_lock);
+	}
+	raw_local_irq_restore(flags);
+}
+
+void trace_change_sched_cpu(struct task_struct *p, int new_cpu)
+{
+	unsigned long flags;
+
+	if (!wakeup_timing)
+		return;
+
+	trace_special(task_cpu(p), task_cpu(p), new_cpu);
+	raw_local_irq_save(flags);
+	spin_lock(&sch.trace_lock);
+	if (p == sch.task && task_cpu(p) != new_cpu) {	/* the tracked task really changes CPU */
+		sch.cpu = new_cpu;	/* only sch.cpu is updated; sch.tr is left as it was */
+		trace_special(task_cpu(p), new_cpu, 0);
+	}
+	spin_unlock(&sch.trace_lock);
+	raw_local_irq_restore(flags);
+}
+
+#endif
+
+#ifdef CONFIG_LATENCY_TRACE
+
+long user_trace_start(void)
+{
+	struct cpu_trace *tr;
+	unsigned long flags;
+	int cpu;
+
+	if (!trace_user_triggered || trace_print_at_crash)	/* only valid in user-triggered mode */
+		return -EINVAL;
+
+	/*
+	 * user_trace_start() might be called from hardirq
+	 * context, if trace_user_triggered_irq is set, so
+	 * be careful about locking:
+	 */
+	if (preempt_count()) {
+		if (down_trylock(&max_mutex))	/* non-zero preempt count: never block on the mutex */
+			return -EAGAIN;
+	} else
+		down(&max_mutex);
+
+	raw_local_irq_save(flags);
+	cpu = smp_processor_id();
+	tr = cpu_traces + cpu;	/* this CPU's trace buffer */
+
+#ifdef CONFIG_WAKEUP_TIMING
+	if (wakeup_timing) {
+		spin_lock(&sch.trace_lock);
+		sch.task = current;	/* user_trace_stop() insists on being called by this task */
+		sch.cpu = cpu;
+		sch.tr = tr;
+		spin_unlock(&sch.trace_lock);
+	}
+#endif
+	reset_trace_idx(cpu, tr);
+
+	tr->critical_sequence = max_sequence;
+	tr->preempt_timestamp = get_cycles();	/* start time; user_trace_stop() measures from here */
+	tr->critical_start = CALLER_ADDR0;
+	atomic_set(&tr->overrun, 0);
+	_trace_cmdline(cpu, tr);
+	mcount();
+
+	WARN_ON(!raw_irqs_disabled());
+	raw_local_irq_restore(flags);
+
+	up(&max_mutex);
+
+	return 0;	/* armed; -EINVAL / -EAGAIN are the failure returns above */
+}
+
+EXPORT_SYMBOL_GPL(user_trace_start);
+
+long user_trace_stop(void)
+{
+	unsigned long latency, flags;
+	struct cpu_trace *tr;
+	cycles_t delta;
+
+
+	if (!trace_user_triggered || trace_print_at_crash)	/* only valid in user-triggered mode */
+		return -EINVAL;
+
+	preempt_disable();
+	mcount();
+
+	raw_local_irq_save(flags);
+#ifdef CONFIG_WAKEUP_TIMING
+	if (wakeup_timing) {
+		spin_lock(&sch.trace_lock);
+		if (current != sch.task) {	/* only the task recorded by user_trace_start() may stop */
+			spin_unlock(&sch.trace_lock);
+			raw_local_irq_restore(flags);
+			preempt_enable();
+			return -EINVAL;
+		}
+		sch.task = NULL;
+		tr = sch.tr;	/* the buffer chosen at start time */
+		sch.tr = NULL;
+		spin_unlock(&sch.trace_lock);
+	} else
+#endif
+		tr = cpu_traces + smp_processor_id();	/* doubles as the "else" body when CONFIG_WAKEUP_TIMING is set */
+
+	atomic_inc(&tr->disabled);
+	if (tr->preempt_timestamp) {	/* zero means no measurement is pending */
+		cycles_t T0, T1;
+		unsigned long long tmp0;
+
+		T0 = tr->preempt_timestamp;
+		T1 = get_cycles();
+		tmp0 = preempt_max_latency;	/* snapshot, used only by the "bug3" diagnostic */
+		if (T1 < T0)	/* clamp: delta becomes 0 if the counter appears to run backwards */
+			T0 = T1;
+		delta = T1 - T0;
+		if (!report_latency(delta))
+			goto out;
+		if (tr->critical_sequence != max_sequence ||
+						down_trylock(&max_mutex))
+			goto out;
+
+		if (!preempt_thresh && preempt_max_latency > delta) {
+			raw_local_irq_restore(flags);
+			printk("bug3: updating %016Lx > %016Lx [%016Lx]?\n",
+				preempt_max_latency, delta, tmp0);
+			printk("  [%016Lx %016Lx]\n", T0, T1);
+			raw_local_irq_save(flags);
+		}
+
+		preempt_max_latency = delta;
+		update_max_tr(tr);
+
+		latency = cycles_to_usecs(delta);
+
+		raw_local_irq_restore(flags);	/* interrupt state is restored around the printk */
+		if (preempt_thresh)
+			printk("(%16s-%-5d|#%d): %lu us user-latency "
+				"violates %lu us threshold.\n",
+					current->comm, current->pid,
+					raw_smp_processor_id(), latency,
+					cycles_to_usecs(preempt_thresh));
+		else
+			printk("(%16s-%-5d|#%d): new %lu us user-latency.\n",
+				current->comm, current->pid,
+					raw_smp_processor_id(), latency);
+		raw_local_irq_save(flags);
+
+		max_sequence++;
+		up(&max_mutex);
+out:
+		tr->preempt_timestamp = 0;	/* disarm until the next start */
+	}
+	atomic_dec(&tr->disabled);
+	raw_local_irq_restore(flags);
+	preempt_enable();
+
+	return 0;
+}
+
+EXPORT_SYMBOL(user_trace_stop);
+
+void stop_trace(void)
+{
+	if (trace_print_at_crash)
+		trace_enabled = -1;	/* -1 is the state print_last_trace() requires */
+}
+
+/* Print one function-trace entry: pid, state flags, time since entry0, call site. */
+static void print_entry(struct trace_entry *entry, struct trace_entry *entry0,
+			struct trace_entry *next_entry)
+{
+	unsigned long abs_usecs, rel_usecs;
+	char irq_state = '.', resched = '.';
+	int in_hardirq, in_softirq;
+
+	abs_usecs = cycles_to_usecs(entry->timestamp - entry0->timestamp);
+	rel_usecs = cycles_to_usecs(next_entry->timestamp - entry->timestamp);
+	printk("%-5d ", entry->pid);
+
+	/* 'd' wins over 'D' when both irqs-off flags are set. */
+	if (entry->flags & TRACE_FLAG_IRQS_OFF)
+		irq_state = 'd';
+	else if (entry->flags & TRACE_FLAG_IRQS_HARD_OFF)
+		irq_state = 'D';
+	if (entry->flags & TRACE_FLAG_NEED_RESCHED)
+		resched = 'n';
+	printk("%c%c", irq_state, resched);
+
+	/* Context: 'H' both, 'h' hardirq only, 's' softirq only, '.' neither. */
+	in_hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	in_softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (in_hardirq && in_softirq)
+		printk("H");
+	else if (in_hardirq)
+		printk("h");
+	else if (in_softirq)
+		printk("s");
+	else
+		printk(".");
+	printk(":%d %ld.%03ldms: ",
+		entry->preempt_count, abs_usecs/1000, abs_usecs % 1000);
+
+	printk_name(entry->u.fn.eip);
+	printk("  <= (");
+	printk_name(entry->u.fn.parent_eip);
+	printk(")\n");
+}
+
+/*
+ * Print the current trace at crash time.
+ *
+ * We print it backwards, so that the newest (most interesting) entries
+ * are printed first.
+ */
+void print_last_trace(void)
+{
+	unsigned int idx0, idx, i;
+	struct cpu_trace *tr;
+	struct trace_entry *entry0, *entry, *next_entry;
+
+	if (trace_enabled != -1 || !trace_print_at_crash)	/* needs the -1 state that stop_trace() sets */
+		return;
+
+	trace_print_at_crash = 0;	/* one-shot: later calls return early */
+
+	preempt_disable();
+	tr = cpu_traces + smp_processor_id();
+
+	printk("Last %ld trace entries:\n", MAX_TRACE);
+	idx0 = tr->trace_idx;
+	printk("curr idx: %d\n", idx0);
+	if (idx0 >= MAX_TRACE)	/* clamp to the last valid slot */
+		idx0 = MAX_TRACE-1;
+	idx = idx0;
+	entry0 = tr->trace + idx0;	/* reference entry for the printed time offsets */
+
+	for (i = 0; i < MAX_TRACE; i++) {
+		next_entry = tr->trace + idx;
+		if (idx == 0)	/* step backwards, wrapping at the start of the buffer */
+			idx = MAX_TRACE-1;
+		else
+			idx--;
+		entry = tr->trace + idx;
+		if (entry->type == TRACE_FN)	/* other entry types are skipped */
+			print_entry(entry, entry0, next_entry);
+	}
+	printk("printed %ld entries\n", MAX_TRACE);	/* reports MAX_TRACE, not the number actually shown */
+
+	preempt_enable();
+}
+
+#ifdef CONFIG_SMP
+/*
+ * On SMP, try to 'peek' on other CPU's traces and record them
+ * in this CPU's trace. This way we get a rough idea about what's
+ * going on there, without the overhead of global tracing.
+ *
+ * (no need to make this PER_CPU, we bounce it around anyway.)
+ */
+unsigned long nmi_eips[NR_CPUS];
+unsigned long nmi_flags[NR_CPUS];
+
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags)
+{
+	int cpu, this_cpu = smp_processor_id();
+
+	__trace(eip, parent_eip);
+
+	nmi_eips[this_cpu] = parent_eip;	/* publish this CPU's values for the other CPUs */
+	nmi_flags[this_cpu] = flags;
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		if (cpu_online(cpu) && cpu != this_cpu) {
+			__trace(eip, nmi_eips[cpu]);	/* the other CPU's last published parent_eip ... */
+			__trace(eip, nmi_flags[cpu]);	/* ... and its flags, passed in the parent_eip slot */
+		}
+}
+#else
+/*
+ * On UP, NMI tracing is quite simple:
+ */
+void notrace nmi_trace(unsigned long eip, unsigned long parent_eip,
+			unsigned long flags)
+{
+	__trace(eip, parent_eip);	/* 'flags' is not used in this variant */
+}
+#endif
+
+#endif
+
+#ifdef CONFIG_PREEMPT_TRACE
+
+static void print_preempt_trace(struct task_struct *task)
+{
+	unsigned int count = task->thread_info->preempt_count;
+	unsigned int i, lim = count & PREEMPT_MASK;	/* nesting depth: the PREEMPT_MASK bits of the count */
+	if (lim >= MAX_PREEMPT_TRACE)	/* presumably the size of the preempt_trace_*[] arrays - TODO confirm */
+		lim = MAX_PREEMPT_TRACE-1;
+	printk("---------------------------\n");
+	printk("| preempt count: %08x ]\n", count);
+	printk("| %d-level deep critical section nesting:\n", lim);
+	printk("----------------------------------------\n");
+	for (i = 1; i <= lim; i++) {	/* levels are 1-based; slot 0 is never printed */
+		printk(".. [<%08lx>] .... ", task->preempt_trace_eip[i]);
+		print_symbol("%s\n", task->preempt_trace_eip[i]);
+		printk(".....[<%08lx>] ..   ( <= ",
+				task->preempt_trace_parent_eip[i]);
+		print_symbol("%s)\n", task->preempt_trace_parent_eip[i]);
+	}
+	printk("\n");
+}
+
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACE) || defined(CONFIG_LATENCY_TRACE)
+void print_traces(struct task_struct *task)
+{
+#ifdef CONFIG_PREEMPT_TRACE
+	print_preempt_trace(task);	/* per-task preempt-count nesting dump */
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	print_last_trace();	/* this CPU's trace buffer; 'task' is not used here */
+#endif
+}
+#endif
+
+#ifdef CONFIG_LATENCY_TIMING
+
+static int preempt_read_proc(char *page, char **start, off_t off,
+			     int count, int *eof, void *data)
+{
+	cycles_t *max = data;	/* the cycle value this proc entry exposes */
+	/* Shown in usecs; %lu matches how this file prints cycles_to_usecs() elsewhere. */
+	return sprintf(page, "%lu\n", cycles_to_usecs(*max));
+}
+
+static int preempt_write_proc(struct file *file, const char __user *buffer,
+			      unsigned long count, void *data)
+{
+	unsigned int c, done = 0, val, sum = 0;
+	cycles_t *max = data;	/* the cycle value this proc entry controls */
+
+	while (count) {
+		if (get_user(c, buffer))
+			return -EFAULT;
+		val = c - '0';	/* unsigned: characters below '0' wrap to a value > 9 */
+		buffer++;
+		done++;
+		count--;
+		if (c == 0 || c == '\n')	/* the terminator is consumed and counted in the return value */
+			break;
+		if (val > 9)
+			return -EINVAL;
+		sum *= 10;	/* decimal accumulate; there is no overflow check */
+		sum += val;
+	}
+	*max = usecs_to_cycles(sum);	/* input is in microseconds, stored as cycles */
+	return done;
+}
+
+#define	PROCNAME_PML	"sys/kernel/preempt_max_latency"
+#define PROCNAME_PT	"sys/kernel/preempt_thresh"
+
+static __init int latency_init(void)
+{
+	struct proc_dir_entry *entry;
+	int cpu;
+
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		cpu_traces[cpu].cpu = cpu;	/* each trace buffer records its own CPU number */
+
+	if (!(entry = create_proc_entry(PROCNAME_PML, 0644, NULL)))
+		printk("latency_init(): can't create %s\n", PROCNAME_PML);	/* failure is only logged */
+	else {
+		entry->nlink = 1;
+		entry->data = &preempt_max_latency;	/* arrives as 'data' in the read/write handlers */
+		entry->read_proc = preempt_read_proc;
+		entry->write_proc = preempt_write_proc;
+	}
+
+	if (!(entry = create_proc_entry(PROCNAME_PT, 0644, NULL)))
+		printk("latency_init(): can't create %s\n", PROCNAME_PT);
+	else {
+		entry->nlink = 1;
+		entry->data = &preempt_thresh;	/* same handlers, different variable */
+		entry->read_proc = preempt_read_proc;
+		entry->write_proc = preempt_write_proc;
+	}
+	return 0;	/* reports success even if a proc entry could not be created */
+}
+__initcall(latency_init);
+
+#endif
Index: linux/kernel/latency_hist.c
===================================================================
--- /dev/null
+++ linux/kernel/latency_hist.c
@@ -0,0 +1,267 @@
+/*
+ * kernel/latency_hist.c
+ *
+ * Add support for histograms of preemption-off latency,
+ * interrupt-off latency and wakeup latency. It depends on
+ * Real-Time Preemption Support.
+ *
+ *  Copyright (C) 2005 MontaVista Software, Inc.
+ *  Yi Yang <yyang@ch.mvista.com>
+ *
+ */
+#include <linux/config.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/percpu.h>
+#include <linux/latency_hist.h>
+#include <asm/atomic.h>
+
+typedef struct hist_data_struct {
+	atomic_t hist_mode; /* 0 don't log, 1 log */
+	unsigned long min_lat;	/* running minimum kept by latency_hist() */
+	unsigned long avg_lat;	/* accumulate_lat / total_samples */
+	unsigned long max_lat;	/* running maximum kept by latency_hist() */
+	unsigned long long beyond_hist_bound_samples;	/* samples >= MAX_ENTRY_NUM */
+	unsigned long long accumulate_lat;	/* sum of all samples */
+	unsigned long long total_samples;
+	unsigned long long hist_array[MAX_ENTRY_NUM];	/* hist_array[n]: number of samples equal to n */
+} hist_data_t;
+
+static struct proc_dir_entry * latency_hist_root = NULL;	/* set by latency_hist_init() */
+static char * latency_hist_proc_dir_root = "latency_hist";
+
+static char * percpu_proc_name = "CPU";	/* prefix of the per-CPU file names; the CPU number is appended */
+
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+static DEFINE_PER_CPU(hist_data_t, interrupt_off_hist);
+static char * interrupt_off_hist_proc_dir = "interrupt_off_latency";
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+static DEFINE_PER_CPU(hist_data_t, preempt_off_hist);
+static char * preempt_off_hist_proc_dir = "preempt_off_latency";
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+static DEFINE_PER_CPU(hist_data_t, wakeup_latency_hist);
+static char * wakeup_latency_hist_proc_dir = "wakeup_latency";
+#endif
+
+static struct proc_dir_entry *entry[LATENCY_TYPE_NUM][NR_CPUS];	/* proc entries, indexed by latency type and CPU */
+
+static inline u64 u64_div(u64 x, u64 y)
+{
+        do_div(x, y);	/* leaves the quotient in x. NOTE(review): do_div() traditionally takes a 32-bit divisor - confirm y fits */
+        return x;
+}
+
+/*
+ * Account one latency sample (in usecs) of type @latency_type on @cpu.
+ */
+void latency_hist(int latency_type, int cpu, unsigned long latency)
+{
+	hist_data_t * my_hist;
+
+	if ((cpu < 0) || (cpu >= NR_CPUS) || (latency_type < INTERRUPT_LATENCY)
+			|| (latency_type > WAKEUP_LATENCY))
+		return;
+
+	switch(latency_type) {
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	case INTERRUPT_LATENCY:
+		my_hist = (hist_data_t *)&per_cpu(interrupt_off_hist, cpu);
+		break;
+#endif
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	case PREEMPT_LATENCY:
+		my_hist = (hist_data_t *)&per_cpu(preempt_off_hist, cpu);
+		break;
+#endif
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	case WAKEUP_LATENCY:
+		my_hist = (hist_data_t *)&per_cpu(wakeup_latency_hist, cpu);
+		break;
+#endif
+	default:	/* histogram type not configured in */
+		return;
+	}
+
+	if (atomic_read(&my_hist->hist_mode) == 0)	/* sampling is switched off */
+		return;
+
+	if (latency >= MAX_ENTRY_NUM)	/* too large for a bucket: only counted */
+		my_hist->beyond_hist_bound_samples++;
+	else
+		my_hist->hist_array[latency]++;
+
+	if (latency < my_hist->min_lat)
+		my_hist->min_lat = latency;
+	if (latency > my_hist->max_lat)	/* not "else if": a new minimum can also be the maximum */
+		my_hist->max_lat = latency;
+
+	my_hist->total_samples++;
+	my_hist->accumulate_lat += latency;
+	my_hist->avg_lat = (unsigned long) u64_div(my_hist->accumulate_lat,
+						  my_hist->total_samples);
+}
+
+/* seq_file start op: emit the summary at position 0 and hand out a cursor. */
+static void *l_start(struct seq_file *m, loff_t * pos)
+{
+	loff_t *index_ptr;
+	loff_t index = *pos;
+	hist_data_t *my_hist = (hist_data_t *) m->private;
+
+	if (index >= MAX_ENTRY_NUM)	/* checked before allocating, so nothing leaks at EOF */
+		return NULL;
+	index_ptr = kmalloc(sizeof(loff_t), GFP_KERNEL);
+	if (!index_ptr)
+		return NULL;
+	if (index == 0) {
+		atomic_dec(&my_hist->hist_mode);	/* 1 -> 0: latency_hist() stops sampling during the dump */
+		seq_printf(m, "#Minimum latency: %lu microseconds.\n"
+			   "#Average latency: %lu microseconds.\n"
+			   "#Maximum latency: %lu microseconds.\n"
+			   "#Total samples: %llu\n"
+			   "#There are %llu samples greater or equal than %d microseconds\n"
+			   "#usecs\t%16s\n"
+			   , my_hist->min_lat
+			   , my_hist->avg_lat
+			   , my_hist->max_lat
+			   , my_hist->total_samples
+			   , my_hist->beyond_hist_bound_samples
+			   , MAX_ENTRY_NUM, "samples");
+	}
+	*index_ptr = index;
+	return index_ptr;
+}
+
+static void *l_next(struct seq_file *m, void *p, loff_t * pos)
+{
+	hist_data_t *my_hist = (hist_data_t *) m->private;
+
+	if (++*pos >= MAX_ENTRY_NUM) {
+		atomic_inc(&my_hist->hist_mode);	/* pairs with the decrement in l_start() */
+		kfree(p);	/* free the cursor here: ->stop() only sees the NULL returned below */
+		return NULL;
+	}
+	*(loff_t *) p = *pos;	/* p is the cursor allocated by l_start() */
+	return p;
+}
+
+static void l_stop(struct seq_file *m, void *p)
+{
+	kfree(p);	/* release the cursor allocated by l_start() */
+}
+
+static int l_show(struct seq_file *m, void *p)
+{
+	hist_data_t *hist = m->private;	/* histogram attached at open time */
+	int slot = *(loff_t *) p;	/* bucket number carried by the cursor */
+
+	seq_printf(m, "%5d\t%16llu\n", slot, hist->hist_array[slot]);
+	return 0;
+}
+
+static struct seq_operations latency_hist_seq_op = {
+	.start = l_start,	/* prints the summary header at position 0 */
+	.next  = l_next,
+	.stop  = l_stop,
+	.show  = l_show	/* one histogram bucket per line */
+};
+
+/* Open a histogram proc file: find its slot in entry[][] and attach the data. */
+static int latency_hist_seq_open(struct inode *inode, struct file *file)
+{
+	struct proc_dir_entry *entry_ptr = PDE(file->f_dentry->d_inode);
+	int ret, i, j, break_flags = 0;
+	struct seq_file *seq;
+
+	for (i = 0; i < LATENCY_TYPE_NUM; i++) {
+		for (j = 0; j < NR_CPUS; j++) {
+			if (entry[i][j] == NULL)
+				continue;
+			if (entry_ptr->low_ino == entry[i][j]->low_ino) {	/* matched by inode number */
+				break_flags = 1;
+				break;
+			}
+		}
+		if (break_flags == 1)
+			break;
+	}
+	ret = seq_open(file, &latency_hist_seq_op);
+	if (!ret && break_flags == 1) {	/* private_data is only valid when seq_open() succeeded */
+		seq = (struct seq_file *)file->private_data;
+		seq->private = entry[i][j]->data;
+	}
+	return ret;
+}
+
+static struct file_operations latency_hist_seq_fops = {
+	.open = latency_hist_seq_open,	/* attaches the per-CPU histogram to the seq_file */
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+/* Enable all per-CPU histograms and create latency_hist/<type>/CPU<n> for each. */
+static __init int latency_hist_init(void)
+{
+	struct proc_dir_entry *tmp_parent_proc_dir, *pde;
+	hist_data_t *my_hist;
+	char procname[64];
+	int i;
+
+	latency_hist_root = proc_mkdir(latency_hist_proc_dir_root, NULL);
+
+	/* A failed create_proc_entry() costs only the file: sampling stays enabled. */
+#ifdef CONFIG_INTERRUPT_OFF_HIST
+	tmp_parent_proc_dir = proc_mkdir(interrupt_off_hist_proc_dir, latency_hist_root);
+	for (i = 0; i < NR_CPUS; i++) {
+		my_hist = &per_cpu(interrupt_off_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+		snprintf(procname, sizeof(procname), "%s%d", percpu_proc_name, i);
+		pde = create_proc_entry(procname, 0, tmp_parent_proc_dir);
+		if ((entry[INTERRUPT_LATENCY][i] = pde) == NULL)
+			continue;
+		pde->data = my_hist;
+		pde->proc_fops = &latency_hist_seq_fops;
+	}
+#endif
+
+#ifdef CONFIG_PREEMPT_OFF_HIST
+	tmp_parent_proc_dir = proc_mkdir(preempt_off_hist_proc_dir, latency_hist_root);
+	for (i = 0; i < NR_CPUS; i++) {
+		my_hist = &per_cpu(preempt_off_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+		snprintf(procname, sizeof(procname), "%s%d", percpu_proc_name, i);
+		pde = create_proc_entry(procname, 0, tmp_parent_proc_dir);
+		if ((entry[PREEMPT_LATENCY][i] = pde) == NULL)
+			continue;
+		pde->data = my_hist;
+		pde->proc_fops = &latency_hist_seq_fops;
+	}
+#endif
+
+#ifdef CONFIG_WAKEUP_LATENCY_HIST
+	tmp_parent_proc_dir = proc_mkdir(wakeup_latency_hist_proc_dir, latency_hist_root);
+	for (i = 0; i < NR_CPUS; i++) {
+		my_hist = &per_cpu(wakeup_latency_hist, i);
+		atomic_set(&my_hist->hist_mode, 1);
+		my_hist->min_lat = 0xFFFFFFFFUL;
+		snprintf(procname, sizeof(procname), "%s%d", percpu_proc_name, i);
+		pde = create_proc_entry(procname, 0, tmp_parent_proc_dir);
+		if ((entry[WAKEUP_LATENCY][i] = pde) == NULL)
+			continue;
+		pde->data = my_hist;
+		pde->proc_fops = &latency_hist_seq_fops;
+	}
+#endif
+	return 0;
+}
+
+__initcall(latency_hist_init);
+
Index: linux/kernel/panic.c
===================================================================
--- linux.orig/kernel/panic.c
+++ linux/kernel/panic.c
@@ -128,7 +128,7 @@ NORET_TYPE void panic(const char * fmt, 
 #if defined(CONFIG_ARCH_S390)
         disabled_wait(caller);
 #endif
-	local_irq_enable();
+	raw_local_irq_enable();
 	for (i = 0;;) {
 		i += panic_blink(i);
 		mdelay(1);
Index: linux/kernel/pid.c
===================================================================
--- linux.orig/kernel/pid.c
+++ linux/kernel/pid.c
@@ -60,7 +60,7 @@ typedef struct pidmap {
 static pidmap_t pidmap_array[PIDMAP_ENTRIES] =
 	 { [ 0 ... PIDMAP_ENTRIES-1 ] = { ATOMIC_INIT(BITS_PER_PAGE), NULL } };
 
-static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(pidmap_lock);
+static DEFINE_SPINLOCK(pidmap_lock);
 
 fastcall void free_pidmap(int pid)
 {
@@ -136,7 +136,7 @@ struct pid * fastcall find_pid(enum pid_
 	struct hlist_node *elem;
 	struct pid *pid;
 
-	hlist_for_each_entry(pid, elem,
+	hlist_for_each_entry_rcu(pid, elem,
 			&pid_hash[type][pid_hashfn(nr)], pid_chain) {
 		if (pid->nr == nr)
 			return pid;
@@ -151,12 +151,12 @@ int fastcall attach_pid(task_t *task, en
 	task_pid = &task->pids[type];
 	pid = find_pid(type, nr);
 	if (pid == NULL) {
-		hlist_add_head(&task_pid->pid_chain,
-				&pid_hash[type][pid_hashfn(nr)]);
 		INIT_LIST_HEAD(&task_pid->pid_list);
+		hlist_add_head_rcu(&task_pid->pid_chain,
+				   &pid_hash[type][pid_hashfn(nr)]);
 	} else {
 		INIT_HLIST_NODE(&task_pid->pid_chain);
-		list_add_tail(&task_pid->pid_list, &pid->pid_list);
+		list_add_tail_rcu(&task_pid->pid_list, &pid->pid_list);
 	}
 	task_pid->nr = nr;
 
@@ -170,20 +170,20 @@ static fastcall int __detach_pid(task_t 
 
 	pid = &task->pids[type];
 	if (!hlist_unhashed(&pid->pid_chain)) {
-		hlist_del(&pid->pid_chain);
 
-		if (list_empty(&pid->pid_list))
+		if (list_empty(&pid->pid_list)) {
 			nr = pid->nr;
-		else {
+			hlist_del_rcu(&pid->pid_chain);
+		} else {
 			pid_next = list_entry(pid->pid_list.next,
 						struct pid, pid_list);
 			/* insert next pid from pid_list to hash */
-			hlist_add_head(&pid_next->pid_chain,
-				&pid_hash[type][pid_hashfn(pid_next->nr)]);
+			hlist_replace_rcu(&pid->pid_chain,
+					  &pid_next->pid_chain);
 		}
 	}
 
-	list_del(&pid->pid_list);
+	list_del_rcu(&pid->pid_list);
 	pid->nr = 0;
 
 	return nr;
Index: linux/kernel/posix-cpu-timers.c
===================================================================
--- linux.orig/kernel/posix-cpu-timers.c
+++ linux/kernel/posix-cpu-timers.c
@@ -302,7 +302,7 @@ int posix_cpu_clock_get(clockid_t which_
 		 * should be able to see it.
 		 */
 		struct task_struct *p;
-		read_lock(&tasklist_lock);
+		rcu_read_lock();
 		p = find_task_by_pid(pid);
 		if (p) {
 			if (CPUCLOCK_PERTHREAD(which_clock)) {
@@ -311,11 +311,13 @@ int posix_cpu_clock_get(clockid_t which_
 								 p, &rtn);
 				}
 			} else if (p->tgid == pid && p->signal) {
+				read_lock(&tasklist_lock);
 				error = cpu_clock_sample_group(which_clock,
 							       p, &rtn);
+				read_unlock(&tasklist_lock);
 			}
 		}
-		read_unlock(&tasklist_lock);
+		rcu_read_unlock();
 	}
 
 	if (error)
@@ -1407,7 +1409,7 @@ void set_process_cpu_timer(struct task_s
 static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
 
 int posix_cpu_nsleep(clockid_t which_clock, int flags,
-		     struct timespec *rqtp)
+		     struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	struct restart_block *restart_block =
 	    &current_thread_info()->restart_block;
@@ -1432,7 +1434,6 @@ int posix_cpu_nsleep(clockid_t which_clo
 	error = posix_cpu_timer_create(&timer);
 	timer.it_process = current;
 	if (!error) {
-		struct timespec __user *rmtp;
 		static struct itimerspec zero_it;
 		struct itimerspec it = { .it_value = *rqtp,
 					 .it_interval = {} };
@@ -1479,7 +1480,6 @@ int posix_cpu_nsleep(clockid_t which_clo
 		/*
 		 * Report back to the user the time still remaining.
 		 */
-		rmtp = (struct timespec __user *) restart_block->arg1;
 		if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
 		    copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
 			return -EFAULT;
@@ -1487,6 +1487,7 @@ int posix_cpu_nsleep(clockid_t which_clo
 		restart_block->fn = posix_cpu_clock_nanosleep_restart;
 		/* Caller already set restart_block->arg1 */
 		restart_block->arg0 = which_clock;
+		restart_block->arg1 = (unsigned long) rmtp;
 		restart_block->arg2 = rqtp->tv_sec;
 		restart_block->arg3 = rqtp->tv_nsec;
 
@@ -1500,10 +1501,15 @@ static long
 posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
 {
 	clockid_t which_clock = restart_block->arg0;
-	struct timespec t = { .tv_sec = restart_block->arg2,
-			      .tv_nsec = restart_block->arg3 };
+	struct timespec __user *rmtp;
+	struct timespec t;
+
+	rmtp = (struct timespec __user *) restart_block->arg1;
+	t.tv_sec = restart_block->arg2;
+	t.tv_nsec = restart_block->arg3;
+
 	restart_block->fn = do_no_restart_syscall;
-	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
+	return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t, rmtp);
 }
 
 
@@ -1524,9 +1530,10 @@ static int process_cpu_timer_create(stru
 	return posix_cpu_timer_create(timer);
 }
 static int process_cpu_nsleep(clockid_t which_clock, int flags,
-			      struct timespec *rqtp)
+			      struct timespec *rqtp,
+			      struct timespec __user *rmtp)
 {
-	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
+	return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
 }
 static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
 {
@@ -1542,7 +1549,7 @@ static int thread_cpu_timer_create(struc
 	return posix_cpu_timer_create(timer);
 }
 static int thread_cpu_nsleep(clockid_t which_clock, int flags,
-			      struct timespec *rqtp)
+			      struct timespec *rqtp, struct timespec __user *rmtp)
 {
 	return -EINVAL;
 }
Index: linux/kernel/posix-timers.c
===================================================================
--- linux.orig/kernel/posix-timers.c
+++ linux/kernel/posix-timers.c
@@ -34,7 +34,7 @@
 #include <linux/smp_lock.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
-#include <linux/time.h>
+#include <linux/timeofday.h>
 
 #include <asm/uaccess.h>
 #include <asm/semaphore.h>
@@ -48,21 +48,6 @@
 #include <linux/workqueue.h>
 #include <linux/module.h>
 
-#ifndef div_long_long_rem
-#include <asm/div64.h>
-
-#define div_long_long_rem(dividend,divisor,remainder) ({ \
-		       u64 result = dividend;		\
-		       *remainder = do_div(result,divisor); \
-		       result; })
-
-#endif
-#define CLOCK_REALTIME_RES TICK_NSEC  /* In nano seconds. */
-
-static inline u64  mpy_l_X_l_ll(unsigned long mpy1,unsigned long mpy2)
-{
-	return (u64)mpy1 * mpy2;
-}
 /*
  * Management arrays for POSIX timers.	 Timers are kept in slab memory
  * Timer ids are allocated by an external routine that keeps track of the
@@ -148,18 +133,18 @@ static DEFINE_SPINLOCK(idr_lock);
  */
 
 static struct k_clock posix_clocks[MAX_CLOCKS];
+
 /*
- * We only have one real clock that can be set so we need only one abs list,
- * even if we should want to have several clocks with differing resolutions.
+ * These ones are defined below.
  */
-static struct k_clock_abs abs_list = {.list = LIST_HEAD_INIT(abs_list.list),
-				      .lock = SPIN_LOCK_UNLOCKED};
+static int common_nsleep(clockid_t, int flags, struct timespec *t,
+			 struct timespec __user *rmtp);
+static void common_timer_get(struct k_itimer *, struct itimerspec *);
+static int common_timer_set(struct k_itimer *, int,
+			    struct itimerspec *, struct itimerspec *);
+static int common_timer_del(struct k_itimer *timer);
 
-static void posix_timer_fn(unsigned long);
-static u64 do_posix_clock_monotonic_gettime_parts(
-	struct timespec *tp, struct timespec *mo);
-int do_posix_clock_monotonic_gettime(struct timespec *tp);
-static int do_posix_clock_monotonic_get(clockid_t, struct timespec *tp);
+static void posix_timer_fn(void *data);
 
 static struct k_itimer *lock_timer(timer_t timer_id, unsigned long *flags);
 
@@ -205,21 +190,25 @@ static inline int common_clock_set(clock
 
 static inline int common_timer_create(struct k_itimer *new_timer)
 {
-	INIT_LIST_HEAD(&new_timer->it.real.abs_timer_entry);
-	init_timer(&new_timer->it.real.timer);
-	new_timer->it.real.timer.data = (unsigned long) new_timer;
+	return -EINVAL;
+}
+
+static int timer_create_mono(struct k_itimer *new_timer)
+{
+	ktimer_init(&new_timer->it.real.timer);
+	new_timer->it.real.timer.data = new_timer;
+	new_timer->it.real.timer.function = posix_timer_fn;
+	return 0;
+}
+
+static int timer_create_real(struct k_itimer *new_timer)
+{
+	ktimer_init_real(&new_timer->it.real.timer);
+	new_timer->it.real.timer.data = new_timer;
 	new_timer->it.real.timer.function = posix_timer_fn;
 	return 0;
 }
 
-/*
- * These ones are defined below.
- */
-static int common_nsleep(clockid_t, int flags, struct timespec *t);
-static void common_timer_get(struct k_itimer *, struct itimerspec *);
-static int common_timer_set(struct k_itimer *, int,
-			    struct itimerspec *, struct itimerspec *);
-static int common_timer_del(struct k_itimer *timer);
 
 /*
  * Return nonzero iff we know a priori this clockid_t value is bogus.
@@ -239,19 +228,44 @@ static inline int invalid_clockid(clocki
 	return 1;
 }
 
+/*
+ * Get real time for posix timers
+ */
+static int posix_ktime_get_real_ts(clockid_t which_clock, struct timespec *tp)
+{
+	ktime_get_real_ts(tp);
+	return 0;
+}
+
+/*
+ * Get monotonic time for posix timers
+ */
+static int posix_ktime_get_ts(clockid_t which_clock, struct timespec *tp)
+{
+	ktime_get_ts(tp);
+	return 0;
+}
+
+void do_posix_clock_monotonic_gettime(struct timespec *ts)
+{
+	ktime_get_ts(ts);
+}
 
 /*
  * Initialize everything, well, just everything in Posix clocks/timers ;)
  */
 static __init int init_posix_timers(void)
 {
-	struct k_clock clock_realtime = {.res = CLOCK_REALTIME_RES,
-					 .abs_struct = &abs_list
+	struct k_clock clock_realtime = {
+		.clock_getres = ktimer_get_res_real,
+		.clock_get = posix_ktime_get_real_ts,
+		.timer_create = timer_create_real,
 	};
-	struct k_clock clock_monotonic = {.res = CLOCK_REALTIME_RES,
-		.abs_struct = NULL,
-		.clock_get = do_posix_clock_monotonic_get,
-		.clock_set = do_posix_clock_nosettime
+	struct k_clock clock_monotonic = {
+		.clock_getres = ktimer_get_res,
+		.clock_get = posix_ktime_get_ts,
+		.clock_set = do_posix_clock_nosettime,
+		.timer_create = timer_create_mono,
 	};
 
 	register_posix_clock(CLOCK_REALTIME, &clock_realtime);
@@ -265,117 +279,16 @@ static __init int init_posix_timers(void
 
 __initcall(init_posix_timers);
 
-static void tstojiffie(struct timespec *tp, int res, u64 *jiff)
-{
-	long sec = tp->tv_sec;
-	long nsec = tp->tv_nsec + res - 1;
-
-	if (nsec > NSEC_PER_SEC) {
-		sec++;
-		nsec -= NSEC_PER_SEC;
-	}
-
-	/*
-	 * The scaling constants are defined in <linux/time.h>
-	 * The difference between there and here is that we do the
-	 * res rounding and compute a 64-bit result (well so does that
-	 * but it then throws away the high bits).
-  	 */
-	*jiff =  (mpy_l_X_l_ll(sec, SEC_CONVERSION) +
-		  (mpy_l_X_l_ll(nsec, NSEC_CONVERSION) >> 
-		   (NSEC_JIFFIE_SC - SEC_JIFFIE_SC))) >> SEC_JIFFIE_SC;
-}
-
-/*
- * This function adjusts the timer as needed as a result of the clock
- * being set.  It should only be called for absolute timers, and then
- * under the abs_list lock.  It computes the time difference and sets
- * the new jiffies value in the timer.  It also updates the timers
- * reference wall_to_monotonic value.  It is complicated by the fact
- * that tstojiffies() only handles positive times and it needs to work
- * with both positive and negative times.  Also, for negative offsets,
- * we need to defeat the res round up.
- *
- * Return is true if there is a new time, else false.
- */
-static long add_clockset_delta(struct k_itimer *timr,
-			       struct timespec *new_wall_to)
-{
-	struct timespec delta;
-	int sign = 0;
-	u64 exp;
-
-	set_normalized_timespec(&delta,
-				new_wall_to->tv_sec -
-				timr->it.real.wall_to_prev.tv_sec,
-				new_wall_to->tv_nsec -
-				timr->it.real.wall_to_prev.tv_nsec);
-	if (likely(!(delta.tv_sec | delta.tv_nsec)))
-		return 0;
-	if (delta.tv_sec < 0) {
-		set_normalized_timespec(&delta,
-					-delta.tv_sec,
-					1 - delta.tv_nsec -
-					posix_clocks[timr->it_clock].res);
-		sign++;
-	}
-	tstojiffie(&delta, posix_clocks[timr->it_clock].res, &exp);
-	timr->it.real.wall_to_prev = *new_wall_to;
-	timr->it.real.timer.expires += (sign ? -exp : exp);
-	return 1;
-}
-
-static void remove_from_abslist(struct k_itimer *timr)
-{
-	if (!list_empty(&timr->it.real.abs_timer_entry)) {
-		spin_lock(&abs_list.lock);
-		list_del_init(&timr->it.real.abs_timer_entry);
-		spin_unlock(&abs_list.lock);
-	}
-}
-
 static void schedule_next_timer(struct k_itimer *timr)
 {
-	struct timespec new_wall_to;
-	struct now_struct now;
-	unsigned long seq;
-
-	/*
-	 * Set up the timer for the next interval (if there is one).
-	 * Note: this code uses the abs_timer_lock to protect
-	 * it.real.wall_to_prev and must hold it until exp is set, not exactly
-	 * obvious...
-
-	 * This function is used for CLOCK_REALTIME* and
-	 * CLOCK_MONOTONIC* timers.  If we ever want to handle other
-	 * CLOCKs, the calling code (do_schedule_next_timer) would need
-	 * to pull the "clock" info from the timer and dispatch the
-	 * "other" CLOCKs "next timer" code (which, I suppose should
-	 * also be added to the k_clock structure).
-	 */
-	if (!timr->it.real.incr)
+	if (ktime_cmp_val(timr->it.real.incr, ==, KTIME_ZERO))
 		return;
 
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		new_wall_to =	wall_to_monotonic;
-		posix_get_now(&now);
-	} while (read_seqretry(&xtime_lock, seq));
-
-	if (!list_empty(&timr->it.real.abs_timer_entry)) {
-		spin_lock(&abs_list.lock);
-		add_clockset_delta(timr, &new_wall_to);
-
-		posix_bump_timer(timr, now);
-
-		spin_unlock(&abs_list.lock);
-	} else {
-		posix_bump_timer(timr, now);
-	}
-	timr->it_overrun_last = timr->it_overrun;
-	timr->it_overrun = -1;
+	timr->it_overrun_last = timr->it.real.overrun;
+	timr->it.real.overrun = timr->it.real.timer.overrun = -1;
 	++timr->it_requeue_pending;
-	add_timer(&timr->it.real.timer);
+	ktimer_start(&timr->it.real.timer, &timr->it.real.incr, KTIMER_FORWARD);
+	timr->it.real.overrun = timr->it.real.timer.overrun;
 }
 
 /*
@@ -413,14 +326,7 @@ int posix_timer_event(struct k_itimer *t
 {
 	memset(&timr->sigq->info, 0, sizeof(siginfo_t));
 	timr->sigq->info.si_sys_private = si_private;
-	/*
-	 * Send signal to the process that owns this timer.
-
-	 * This code assumes that all the possible abs_lists share the
-	 * same lock (there is only one list at this time). If this is
-	 * not the case, the CLOCK info would need to be used to find
-	 * the proper abs list lock.
-	 */
+	/* Send signal to the process that owns this timer. */
 
 	timr->sigq->info.si_signo = timr->it_sigev_signo;
 	timr->sigq->info.si_errno = 0;
@@ -454,65 +360,28 @@ EXPORT_SYMBOL_GPL(posix_timer_event);
 
  * This code is for CLOCK_REALTIME* and CLOCK_MONOTONIC* timers.
  */
-static void posix_timer_fn(unsigned long __data)
+static void posix_timer_fn(void *data)
 {
-	struct k_itimer *timr = (struct k_itimer *) __data;
+	struct k_itimer *timr = data;
 	unsigned long flags;
-	unsigned long seq;
-	struct timespec delta, new_wall_to;
-	u64 exp = 0;
-	int do_notify = 1;
+	int si_private = 0;
 
 	spin_lock_irqsave(&timr->it_lock, flags);
-	if (!list_empty(&timr->it.real.abs_timer_entry)) {
-		spin_lock(&abs_list.lock);
-		do {
-			seq = read_seqbegin(&xtime_lock);
-			new_wall_to =	wall_to_monotonic;
-		} while (read_seqretry(&xtime_lock, seq));
-		set_normalized_timespec(&delta,
-					new_wall_to.tv_sec -
-					timr->it.real.wall_to_prev.tv_sec,
-					new_wall_to.tv_nsec -
-					timr->it.real.wall_to_prev.tv_nsec);
-		if (likely((delta.tv_sec | delta.tv_nsec ) == 0)) {
-			/* do nothing, timer is on time */
-		} else if (delta.tv_sec < 0) {
-			/* do nothing, timer is already late */
-		} else {
-			/* timer is early due to a clock set */
-			tstojiffie(&delta,
-				   posix_clocks[timr->it_clock].res,
-				   &exp);
-			timr->it.real.wall_to_prev = new_wall_to;
-			timr->it.real.timer.expires += exp;
-			add_timer(&timr->it.real.timer);
-			do_notify = 0;
-		}
-		spin_unlock(&abs_list.lock);
 
-	}
-	if (do_notify)  {
-		int si_private=0;
+	if (ktime_cmp_val(timr->it.real.incr, !=, KTIME_ZERO))
+		si_private = ++timr->it_requeue_pending;
 
-		if (timr->it.real.incr)
-			si_private = ++timr->it_requeue_pending;
-		else {
-			remove_from_abslist(timr);
-		}
+	if (posix_timer_event(timr, si_private))
+		/*
+		 * The signal was not sent because it is ignored (sig_ignor),
+		 * so we will not get a call back to restart the timer, AND
+		 * it should be restarted.
+		 */
+		schedule_next_timer(timr);
 
-		if (posix_timer_event(timr, si_private))
-			/*
-			 * signal was not sent because of sig_ignor
-			 * we will not get a call back to restart it AND
-			 * it should be restarted.
-			 */
-			schedule_next_timer(timr);
-	}
 	unlock_timer(timr, flags); /* hold thru abs lock to keep irq off */
 }
 
-
 static inline struct task_struct * good_sigevent(sigevent_t * event)
 {
 	struct task_struct *rtn = current->group_leader;
@@ -776,39 +645,41 @@ static struct k_itimer * lock_timer(time
 static void
 common_timer_get(struct k_itimer *timr, struct itimerspec *cur_setting)
 {
-	unsigned long expires;
-	struct now_struct now;
+	ktime_t expires, now, remaining;
+	struct ktimer *timer = &timr->it.real.timer;
 
-	do
-		expires = timr->it.real.timer.expires;
-	while ((volatile long) (timr->it.real.timer.expires) != expires);
-
-	posix_get_now(&now);
-
-	if (expires &&
-	    ((timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) &&
-	    !timr->it.real.incr &&
-	    posix_time_before(&timr->it.real.timer, &now))
-		timr->it.real.timer.expires = expires = 0;
-	if (expires) {
-		if (timr->it_requeue_pending & REQUEUE_PENDING ||
-		    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
-			posix_bump_timer(timr, now);
-			expires = timr->it.real.timer.expires;
-		}
-		else
-			if (!timer_pending(&timr->it.real.timer))
-				expires = 0;
-		if (expires)
-			expires -= now.jiffies;
-	}
-	jiffies_to_timespec(expires, &cur_setting->it_value);
-	jiffies_to_timespec(timr->it.real.incr, &cur_setting->it_interval);
-
-	if (cur_setting->it_value.tv_sec < 0) {
+	memset(cur_setting, 0, sizeof(struct itimerspec));
+	expires = ktimer_get_expiry(timer, &now);
+	remaining = ktime_sub(expires, now);
+
+	/* Time left, or timer pending? */
+	if (ktime_cmp_val(remaining, >, KTIME_ZERO) || ktimer_active(timer))
+		goto calci;
+	/* interval timer ? */
+	if (ktime_cmp_val(timr->it.real.incr, ==, 0))
+		return;
+	/*
+	 * When a requeue is pending or this is a SIGEV_NONE timer
+	 * move the expiry time forward by intervals, so expiry is >
+	 * now.
+	 * The active (non SIGEV_NONE) rearm should be done
+	 * automatically by the ktimer REARM mode. That's the next
+	 * iteration.  The REQUEUE_PENDING part will go away!
+	 */
+	if (timr->it_requeue_pending & REQUEUE_PENDING ||
+	    (timr->it_sigev_notify & ~SIGEV_THREAD_ID) == SIGEV_NONE) {
+		remaining = forward_posix_timer(timr, now);
+	}
+ calci:
+	/* interval timer ? */
+	if (ktime_cmp_val(timr->it.real.incr, !=, KTIME_ZERO))
+		ktime_to_timespec(&cur_setting->it_interval,
+				  timr->it.real.incr);
+	/* Return 0 only when the timer is expired and not pending */
+	if (ktime_cmp_val(remaining, <=, KTIME_ZERO))
 		cur_setting->it_value.tv_nsec = 1;
-		cur_setting->it_value.tv_sec = 0;
-	}
+	else
+		ktime_to_timespec(&cur_setting->it_value, remaining);
 }
 
 /* Get the time remaining on a POSIX.1b interval timer. */
@@ -832,6 +703,7 @@ sys_timer_gettime(timer_t timer_id, stru
 
 	return 0;
 }
+
 /*
  * Get the number of overruns of a POSIX.1b interval timer.  This is to
  * be the overrun of the timer last delivered.  At the same time we are
@@ -858,84 +730,6 @@ sys_timer_getoverrun(timer_t timer_id)
 
 	return overrun;
 }
-/*
- * Adjust for absolute time
- *
- * If absolute time is given and it is not CLOCK_MONOTONIC, we need to
- * adjust for the offset between the timer clock (CLOCK_MONOTONIC) and
- * what ever clock he is using.
- *
- * If it is relative time, we need to add the current (CLOCK_MONOTONIC)
- * time to it to get the proper time for the timer.
- */
-static int adjust_abs_time(struct k_clock *clock, struct timespec *tp, 
-			   int abs, u64 *exp, struct timespec *wall_to)
-{
-	struct timespec now;
-	struct timespec oc = *tp;
-	u64 jiffies_64_f;
-	int rtn =0;
-
-	if (abs) {
-		/*
-		 * The mask pick up the 4 basic clocks 
-		 */
-		if (!((clock - &posix_clocks[0]) & ~CLOCKS_MASK)) {
-			jiffies_64_f = do_posix_clock_monotonic_gettime_parts(
-				&now,  wall_to);
-			/*
-			 * If we are doing a MONOTONIC clock
-			 */
-			if((clock - &posix_clocks[0]) & CLOCKS_MONO){
-				now.tv_sec += wall_to->tv_sec;
-				now.tv_nsec += wall_to->tv_nsec;
-			}
-		} else {
-			/*
-			 * Not one of the basic clocks
-			 */
-			clock->clock_get(clock - posix_clocks, &now);
-			jiffies_64_f = get_jiffies_64();
-		}
-		/*
-		 * Take away now to get delta and normalize
-		 */
-		set_normalized_timespec(&oc, oc.tv_sec - now.tv_sec,
-					oc.tv_nsec - now.tv_nsec);
-	}else{
-		jiffies_64_f = get_jiffies_64();
-	}
-	/*
-	 * Check if the requested time is prior to now (if so set now)
-	 */
-	if (oc.tv_sec < 0)
-		oc.tv_sec = oc.tv_nsec = 0;
-
-	if (oc.tv_sec | oc.tv_nsec)
-		set_normalized_timespec(&oc, oc.tv_sec,
-					oc.tv_nsec + clock->res);
-	tstojiffie(&oc, clock->res, exp);
-
-	/*
-	 * Check if the requested time is more than the timer code
-	 * can handle (if so we error out but return the value too).
-	 */
-	if (*exp > ((u64)MAX_JIFFY_OFFSET))
-			/*
-			 * This is a considered response, not exactly in
-			 * line with the standard (in fact it is silent on
-			 * possible overflows).  We assume such a large 
-			 * value is ALMOST always a programming error and
-			 * try not to compound it by setting a really dumb
-			 * value.
-			 */
-			rtn = -EINVAL;
-	/*
-	 * return the actual jiffies expire time, full 64 bits
-	 */
-	*exp += jiffies_64_f;
-	return rtn;
-}
 
 /* Set a POSIX.1b interval timer. */
 /* timr->it_lock is taken. */
@@ -943,68 +737,52 @@ static inline int
 common_timer_set(struct k_itimer *timr, int flags,
 		 struct itimerspec *new_setting, struct itimerspec *old_setting)
 {
-	struct k_clock *clock = &posix_clocks[timr->it_clock];
-	u64 expire_64;
+	ktime_t expires;
+	int mode;
 
 	if (old_setting)
 		common_timer_get(timr, old_setting);
 
 	/* disable the timer */
-	timr->it.real.incr = 0;
+	ktime_set_scalar(timr->it.real.incr, KTIME_ZERO);
 	/*
 	 * careful here.  If smp we could be in the "fire" routine which will
 	 * be spinning as we hold the lock.  But this is ONLY an SMP issue.
 	 */
-	if (try_to_del_timer_sync(&timr->it.real.timer) < 0) {
-#ifdef CONFIG_SMP
-		/*
-		 * It can only be active if on an other cpu.  Since
-		 * we have cleared the interval stuff above, it should
-		 * clear once we release the spin lock.  Of course once
-		 * we do that anything could happen, including the
-		 * complete melt down of the timer.  So return with
-		 * a "retry" exit status.
-		 */
+	if (ktimer_try_to_cancel(&timr->it.real.timer) < 0)
 		return TIMER_RETRY;
-#endif
-	}
-
-	remove_from_abslist(timr);
 
 	timr->it_requeue_pending = (timr->it_requeue_pending + 2) & 
 		~REQUEUE_PENDING;
 	timr->it_overrun_last = 0;
 	timr->it_overrun = -1;
-	/*
-	 *switch off the timer when it_value is zero
-	 */
-	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec) {
-		timr->it.real.timer.expires = 0;
+
+	/* switch off the timer when it_value is zero */
+	if (!new_setting->it_value.tv_sec && !new_setting->it_value.tv_nsec)
 		return 0;
-	}
 
-	if (adjust_abs_time(clock,
-			    &new_setting->it_value, flags & TIMER_ABSTIME, 
-			    &expire_64, &(timr->it.real.wall_to_prev))) {
-		return -EINVAL;
-	}
-	timr->it.real.timer.expires = (unsigned long)expire_64;
-	tstojiffie(&new_setting->it_interval, clock->res, &expire_64);
-	timr->it.real.incr = (unsigned long)expire_64;
+	mode = flags & TIMER_ABSTIME ? KTIMER_ABS : KTIMER_REL;
 
-	/*
-	 * We do not even queue SIGEV_NONE timers!  But we do put them
-	 * in the abs list so we can do that right.
+	/* Posix madness. Only absolute CLOCK_REALTIME timers
+	 * are affected by clock sets. So we must reinitialize
+	 * the timer.
 	 */
+	if (timr->it_clock == CLOCK_REALTIME && mode == KTIMER_ABS)
+		timer_create_real(timr);
+	else
+		timer_create_mono(timr);
+
+	expires = timespec_to_ktime(new_setting->it_value);
+
+	/* Convert and round the interval */
+	timr->it.real.incr = ktimer_round_timespec(&timr->it.real.timer,
+						     &new_setting->it_interval);
+
+	/* SIGEV_NONE timers are not queued! See common_timer_get() */
 	if (((timr->it_sigev_notify & ~SIGEV_THREAD_ID) != SIGEV_NONE))
-		add_timer(&timr->it.real.timer);
+		ktimer_start(&timr->it.real.timer, &expires,
+			     mode | KTIMER_NOCHECK | KTIMER_ROUND);
 
-	if (flags & TIMER_ABSTIME && clock->abs_struct) {
-		spin_lock(&clock->abs_struct->lock);
-		list_add_tail(&(timr->it.real.abs_timer_entry),
-			      &(clock->abs_struct->list));
-		spin_unlock(&clock->abs_struct->lock);
-	}
 	return 0;
 }
 
@@ -1039,6 +817,7 @@ retry:
 
 	unlock_timer(timr, flag);
 	if (error == TIMER_RETRY) {
+		wait_for_ktimer(&timr->it.real.timer);
 		rtn = NULL;	// We already got the old time...
 		goto retry;
 	}
@@ -1052,24 +831,10 @@ retry:
 
 static inline int common_timer_del(struct k_itimer *timer)
 {
-	timer->it.real.incr = 0;
+	ktime_set_scalar(timer->it.real.incr, KTIME_ZERO);
 
-	if (try_to_del_timer_sync(&timer->it.real.timer) < 0) {
-#ifdef CONFIG_SMP
-		/*
-		 * It can only be active if on an other cpu.  Since
-		 * we have cleared the interval stuff above, it should
-		 * clear once we release the spin lock.  Of course once
-		 * we do that anything could happen, including the
-		 * complete melt down of the timer.  So return with
-		 * a "retry" exit status.
-		 */
+	if (ktimer_try_to_cancel(&timer->it.real.timer) < 0)
 		return TIMER_RETRY;
-#endif
-	}
-
-	remove_from_abslist(timer);
-
 	return 0;
 }
 
@@ -1085,24 +850,17 @@ sys_timer_delete(timer_t timer_id)
 	struct k_itimer *timer;
 	long flags;
 
-#ifdef CONFIG_SMP
-	int error;
 retry_delete:
-#endif
 	timer = lock_timer(timer_id, &flags);
 	if (!timer)
 		return -EINVAL;
 
-#ifdef CONFIG_SMP
-	error = timer_delete_hook(timer);
-
-	if (error == TIMER_RETRY) {
+	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
+		wait_for_ktimer(&timer->it.real.timer);
 		goto retry_delete;
 	}
-#else
-	timer_delete_hook(timer);
-#endif
+
 	spin_lock(&current->sighand->siglock);
 	list_del(&timer->list);
 	spin_unlock(&current->sighand->siglock);
@@ -1119,6 +877,7 @@ retry_delete:
 	release_posix_timer(timer, IT_ID_SET);
 	return 0;
 }
+
 /*
  * return timer owned by the process, used by exit_itimers
  */
@@ -1126,22 +885,14 @@ static inline void itimer_delete(struct 
 {
 	unsigned long flags;
 
-#ifdef CONFIG_SMP
-	int error;
 retry_delete:
-#endif
 	spin_lock_irqsave(&timer->it_lock, flags);
 
-#ifdef CONFIG_SMP
-	error = timer_delete_hook(timer);
-
-	if (error == TIMER_RETRY) {
+	if (timer_delete_hook(timer) == TIMER_RETRY) {
 		unlock_timer(timer, flags);
+		wait_for_ktimer(&timer->it.real.timer);
 		goto retry_delete;
 	}
-#else
-	timer_delete_hook(timer);
-#endif
 	list_del(&timer->list);
 	/*
 	 * This keeps any tasks waiting on the spin lock from thinking
@@ -1170,60 +921,7 @@ void exit_itimers(struct signal_struct *
 	}
 }
 
-/*
- * And now for the "clock" calls
- *
- * These functions are called both from timer functions (with the timer
- * spin_lock_irq() held and from clock calls with no locking.	They must
- * use the save flags versions of locks.
- */
-
-/*
- * We do ticks here to avoid the irq lock ( they take sooo long).
- * The seqlock is great here.  Since we a reader, we don't really care
- * if we are interrupted since we don't take lock that will stall us or
- * any other cpu. Voila, no irq lock is needed.
- *
- */
-
-static u64 do_posix_clock_monotonic_gettime_parts(
-	struct timespec *tp, struct timespec *mo)
-{
-	u64 jiff;
-	unsigned int seq;
-
-	do {
-		seq = read_seqbegin(&xtime_lock);
-		getnstimeofday(tp);
-		*mo = wall_to_monotonic;
-		jiff = jiffies_64;
-
-	} while(read_seqretry(&xtime_lock, seq));
-
-	return jiff;
-}
-
-static int do_posix_clock_monotonic_get(clockid_t clock, struct timespec *tp)
-{
-	struct timespec wall_to_mono;
-
-	do_posix_clock_monotonic_gettime_parts(tp, &wall_to_mono);
-
-	tp->tv_sec += wall_to_mono.tv_sec;
-	tp->tv_nsec += wall_to_mono.tv_nsec;
-
-	if ((tp->tv_nsec - NSEC_PER_SEC) > 0) {
-		tp->tv_nsec -= NSEC_PER_SEC;
-		tp->tv_sec++;
-	}
-	return 0;
-}
-
-int do_posix_clock_monotonic_gettime(struct timespec *tp)
-{
-	return do_posix_clock_monotonic_get(CLOCK_MONOTONIC, tp);
-}
-
+/* Not available / possible... functions */
 int do_posix_clock_nosettime(clockid_t clockid, struct timespec *tp)
 {
 	return -EINVAL;
@@ -1236,7 +934,8 @@ int do_posix_clock_notimer_create(struct
 }
 EXPORT_SYMBOL_GPL(do_posix_clock_notimer_create);
 
-int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t)
+int do_posix_clock_nonanosleep(clockid_t clock, int flags, struct timespec *t,
+			       struct timespec __user *r)
 {
 #ifndef ENOTSUP
 	return -EOPNOTSUPP;	/* aka ENOTSUP in userland for POSIX */
@@ -1295,125 +994,34 @@ sys_clock_getres(clockid_t which_clock, 
 	return error;
 }
 
-static void nanosleep_wake_up(unsigned long __data)
-{
-	struct task_struct *p = (struct task_struct *) __data;
-
-	wake_up_process(p);
-}
-
 /*
- * The standard says that an absolute nanosleep call MUST wake up at
- * the requested time in spite of clock settings.  Here is what we do:
- * For each nanosleep call that needs it (only absolute and not on
- * CLOCK_MONOTONIC* (as it can not be set)) we thread a little structure
- * into the "nanosleep_abs_list".  All we need is the task_struct pointer.
- * When ever the clock is set we just wake up all those tasks.	 The rest
- * is done by the while loop in clock_nanosleep().
- *
- * On locking, clock_was_set() is called from update_wall_clock which
- * holds (or has held for it) a write_lock_irq( xtime_lock) and is
- * called from the timer bh code.  Thus we need the irq save locks.
- *
- * Also, on the call from update_wall_clock, that is done as part of a
- * softirq thing.  We don't want to delay the system that much (possibly
- * long list of timers to fix), so we defer that work to keventd.
+ * nanosleep for monotonic and realtime clocks
  */
-
-static DECLARE_WAIT_QUEUE_HEAD(nanosleep_abs_wqueue);
-static DECLARE_WORK(clock_was_set_work, (void(*)(void*))clock_was_set, NULL);
-
-static DECLARE_MUTEX(clock_was_set_lock);
-
-void clock_was_set(void)
+static int common_nsleep(clockid_t which_clock, int flags,
+			 struct timespec *tsave, struct timespec __user *rmtp)
 {
-	struct k_itimer *timr;
-	struct timespec new_wall_to;
-	LIST_HEAD(cws_list);
-	unsigned long seq;
+	int mode = flags & TIMER_ABSTIME ? KTIMER_ABS : KTIMER_REL;
 
-
-	if (unlikely(in_interrupt())) {
-		schedule_work(&clock_was_set_work);
-		return;
+	switch (which_clock) {
+	case CLOCK_REALTIME:
+		/* Posix madness. Only absolute timers on clock realtime
+		   are affected by clock set; relative ones fall through. */
+		if (mode == KTIMER_ABS)
+			return ktimer_nanosleep_real(tsave, rmtp, mode);
+	case CLOCK_MONOTONIC:
+		return ktimer_nanosleep(tsave, rmtp, mode);
+	default:
+		break;
 	}
-	wake_up_all(&nanosleep_abs_wqueue);
-
-	/*
-	 * Check if there exist TIMER_ABSTIME timers to correct.
-	 *
-	 * Notes on locking: This code is run in task context with irq
-	 * on.  We CAN be interrupted!  All other usage of the abs list
-	 * lock is under the timer lock which holds the irq lock as
-	 * well.  We REALLY don't want to scan the whole list with the
-	 * interrupt system off, AND we would like a sequence lock on
-	 * this code as well.  Since we assume that the clock will not
-	 * be set often, it seems ok to take and release the irq lock
-	 * for each timer.  In fact add_timer will do this, so this is
-	 * not an issue.  So we know when we are done, we will move the
-	 * whole list to a new location.  Then as we process each entry,
-	 * we will move it to the actual list again.  This way, when our
-	 * copy is empty, we are done.  We are not all that concerned
-	 * about preemption so we will use a semaphore lock to protect
-	 * aginst reentry.  This way we will not stall another
-	 * processor.  It is possible that this may delay some timers
-	 * that should have expired, given the new clock, but even this
-	 * will be minimal as we will always update to the current time,
-	 * even if it was set by a task that is waiting for entry to
-	 * this code.  Timers that expire too early will be caught by
-	 * the expire code and restarted.
-
-	 * Absolute timers that repeat are left in the abs list while
-	 * waiting for the task to pick up the signal.  This means we
-	 * may find timers that are not in the "add_timer" list, but are
-	 * in the abs list.  We do the same thing for these, save
-	 * putting them back in the "add_timer" list.  (Note, these are
-	 * left in the abs list mainly to indicate that they are
-	 * ABSOLUTE timers, a fact that is used by the re-arm code, and
-	 * for which we have no other flag.)
-
-	 */
-
-	down(&clock_was_set_lock);
-	spin_lock_irq(&abs_list.lock);
-	list_splice_init(&abs_list.list, &cws_list);
-	spin_unlock_irq(&abs_list.lock);
-	do {
-		do {
-			seq = read_seqbegin(&xtime_lock);
-			new_wall_to =	wall_to_monotonic;
-		} while (read_seqretry(&xtime_lock, seq));
-
-		spin_lock_irq(&abs_list.lock);
-		if (list_empty(&cws_list)) {
-			spin_unlock_irq(&abs_list.lock);
-			break;
-		}
-		timr = list_entry(cws_list.next, struct k_itimer,
-				  it.real.abs_timer_entry);
-
-		list_del_init(&timr->it.real.abs_timer_entry);
-		if (add_clockset_delta(timr, &new_wall_to) &&
-		    del_timer(&timr->it.real.timer))  /* timer run yet? */
-			add_timer(&timr->it.real.timer);
-		list_add(&timr->it.real.abs_timer_entry, &abs_list.list);
-		spin_unlock_irq(&abs_list.lock);
-	} while (1);
-
-	up(&clock_was_set_lock);
+	return -EINVAL;
 }
 
-long clock_nanosleep_restart(struct restart_block *restart_block);
-
 asmlinkage long
 sys_clock_nanosleep(clockid_t which_clock, int flags,
 		    const struct timespec __user *rqtp,
 		    struct timespec __user *rmtp)
 {
 	struct timespec t;
-	struct restart_block *restart_block =
-	    &(current_thread_info()->restart_block);
-	int ret;
 
 	if (invalid_clockid(which_clock))
 		return -EINVAL;
@@ -1421,135 +1029,8 @@ sys_clock_nanosleep(clockid_t which_cloc
 	if (copy_from_user(&t, rqtp, sizeof (struct timespec)))
 		return -EFAULT;
 
-	if ((unsigned) t.tv_nsec >= NSEC_PER_SEC || t.tv_sec < 0)
+	if (!timespec_valid(&t))
 		return -EINVAL;
 
-	/*
-	 * Do this here as nsleep function does not have the real address.
-	 */
-	restart_block->arg1 = (unsigned long)rmtp;
-
-	ret = CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t));
-
-	if ((ret == -ERESTART_RESTARTBLOCK) && rmtp &&
-					copy_to_user(rmtp, &t, sizeof (t)))
-		return -EFAULT;
-	return ret;
-}
-
-
-static int common_nsleep(clockid_t which_clock,
-			 int flags, struct timespec *tsave)
-{
-	struct timespec t, dum;
-	struct timer_list new_timer;
-	DECLARE_WAITQUEUE(abs_wqueue, current);
-	u64 rq_time = (u64)0;
-	s64 left;
-	int abs;
-	struct restart_block *restart_block =
-	    &current_thread_info()->restart_block;
-
-	abs_wqueue.flags = 0;
-	init_timer(&new_timer);
-	new_timer.expires = 0;
-	new_timer.data = (unsigned long) current;
-	new_timer.function = nanosleep_wake_up;
-	abs = flags & TIMER_ABSTIME;
-
-	if (restart_block->fn == clock_nanosleep_restart) {
-		/*
-		 * Interrupted by a non-delivered signal, pick up remaining
-		 * time and continue.  Remaining time is in arg2 & 3.
-		 */
-		restart_block->fn = do_no_restart_syscall;
-
-		rq_time = restart_block->arg3;
-		rq_time = (rq_time << 32) + restart_block->arg2;
-		if (!rq_time)
-			return -EINTR;
-		left = rq_time - get_jiffies_64();
-		if (left <= (s64)0)
-			return 0;	/* Already passed */
-	}
-
-	if (abs && (posix_clocks[which_clock].clock_get !=
-			    posix_clocks[CLOCK_MONOTONIC].clock_get))
-		add_wait_queue(&nanosleep_abs_wqueue, &abs_wqueue);
-
-	do {
-		t = *tsave;
-		if (abs || !rq_time) {
-			adjust_abs_time(&posix_clocks[which_clock], &t, abs,
-					&rq_time, &dum);
-		}
-
-		left = rq_time - get_jiffies_64();
-		if (left >= (s64)MAX_JIFFY_OFFSET)
-			left = (s64)MAX_JIFFY_OFFSET;
-		if (left < (s64)0)
-			break;
-
-		new_timer.expires = jiffies + left;
-		__set_current_state(TASK_INTERRUPTIBLE);
-		add_timer(&new_timer);
-
-		schedule();
-
-		del_timer_sync(&new_timer);
-		left = rq_time - get_jiffies_64();
-	} while (left > (s64)0 && !test_thread_flag(TIF_SIGPENDING));
-
-	if (abs_wqueue.task_list.next)
-		finish_wait(&nanosleep_abs_wqueue, &abs_wqueue);
-
-	if (left > (s64)0) {
-
-		/*
-		 * Always restart abs calls from scratch to pick up any
-		 * clock shifting that happened while we are away.
-		 */
-		if (abs)
-			return -ERESTARTNOHAND;
-
-		left *= TICK_NSEC;
-		tsave->tv_sec = div_long_long_rem(left, 
-						  NSEC_PER_SEC, 
-						  &tsave->tv_nsec);
-		/*
-		 * Restart works by saving the time remaing in 
-		 * arg2 & 3 (it is 64-bits of jiffies).  The other
-		 * info we need is the clock_id (saved in arg0). 
-		 * The sys_call interface needs the users 
-		 * timespec return address which _it_ saves in arg1.
-		 * Since we have cast the nanosleep call to a clock_nanosleep
-		 * both can be restarted with the same code.
-		 */
-		restart_block->fn = clock_nanosleep_restart;
-		restart_block->arg0 = which_clock;
-		/*
-		 * Caller sets arg1
-		 */
-		restart_block->arg2 = rq_time & 0xffffffffLL;
-		restart_block->arg3 = rq_time >> 32;
-
-		return -ERESTART_RESTARTBLOCK;
-	}
-
-	return 0;
-}
-/*
- * This will restart clock_nanosleep.
- */
-long
-clock_nanosleep_restart(struct restart_block *restart_block)
-{
-	struct timespec t;
-	int ret = common_nsleep(restart_block->arg0, 0, &t);
-
-	if ((ret == -ERESTART_RESTARTBLOCK) && restart_block->arg1 &&
-	    copy_to_user((struct timespec __user *)(restart_block->arg1), &t,
-			 sizeof (t)))
-		return -EFAULT;
-	return ret;
+	return CLOCK_DISPATCH(which_clock, nsleep, (which_clock, flags, &t, rmtp));
 }
Index: linux/kernel/power/swsusp.c
===================================================================
--- linux.orig/kernel/power/swsusp.c
+++ linux/kernel/power/swsusp.c
@@ -1045,6 +1045,7 @@ int swsusp_suspend(void)
 	restore_processor_state();
 	BUG_ON (nr_copy_pages_check != nr_copy_pages);
 	restore_highmem();
+	touch_softlockup_watchdog();
 	device_power_up();
 	local_irq_enable();
 	return error;
Index: linux/kernel/printk.c
===================================================================
--- linux.orig/kernel/printk.c
+++ linux/kernel/printk.c
@@ -83,7 +83,7 @@ static int console_locked;
  * It is also used in interesting ways to provide interlocking in
  * release_console_sem().
  */
-static DEFINE_SPINLOCK(logbuf_lock);
+static DEFINE_RAW_SPINLOCK(logbuf_lock);
 
 #define LOG_BUF_MASK	(log_buf_len-1)
 #define LOG_BUF(idx) (log_buf[(idx) & LOG_BUF_MASK])
@@ -363,10 +363,12 @@ static void __call_console_drivers(unsig
 {
 	struct console *con;
 
+	touch_critical_timing();
 	for (con = console_drivers; con; con = con->next) {
 		if ((con->flags & CON_ENABLED) && con->write)
 			con->write(con, &LOG_BUF(start), end - start);
 	}
+	touch_critical_timing();
 }
 
 /*
@@ -375,7 +377,10 @@ static void __call_console_drivers(unsig
 static void _call_console_drivers(unsigned long start,
 				unsigned long end, int msg_log_level)
 {
-	if (msg_log_level < console_loglevel &&
+	if (
+#ifndef CONFIG_PRINTK_IGNORE_LOGLEVEL
+			msg_log_level < console_loglevel &&
+#endif
 			console_drivers && start != end) {
 		if ((start & LOG_BUF_MASK) > (end & LOG_BUF_MASK)) {
 			/* wrapped write */
@@ -470,6 +475,7 @@ static void zap_locks(void)
 	spin_lock_init(&logbuf_lock);
 	/* And make sure that we print immediately */
 	init_MUTEX(&console_sem);
+	zap_rt_locks();
 }
 
 #if defined(CONFIG_PRINTK_TIME)
@@ -539,6 +545,7 @@ asmlinkage int vprintk(const char *fmt, 
 	/* This stops the holder of console_sem just where we want him */
 	spin_lock_irqsave(&logbuf_lock, flags);
 	printk_cpu = smp_processor_id();
+	preempt_enable();
 
 	/* Emit the output into the temporary buffer */
 	printed_len = vscnprintf(printk_buf, sizeof(printk_buf), fmt, args);
@@ -631,7 +638,6 @@ asmlinkage int vprintk(const char *fmt, 
 		spin_unlock_irqrestore(&logbuf_lock, flags);
 	}
 out:
-	preempt_enable();
 	return printed_len;
 }
 EXPORT_SYMBOL(printk);
@@ -747,14 +753,31 @@ void release_console_sem(void)
 		_con_start = con_start;
 		_log_end = log_end;
 		con_start = log_end;		/* Flush */
+		/*
+		 * on PREEMPT_RT, call console drivers with
+		 * interrupts enabled (unless we are debugging):
+		 */
+#if defined(CONFIG_PREEMPT_RT) && !defined(CONFIG_PRINTK_IGNORE_LOGLEVEL) && !defined(CONFIG_PPC)
+		spin_unlock_irq(&logbuf_lock);
+#else
 		spin_unlock(&logbuf_lock);
+#endif
 		call_console_drivers(_con_start, _log_end);
-		local_irq_restore(flags);
+		raw_local_irq_restore(flags);
 	}
 	console_locked = 0;
 	console_may_schedule = 0;
-	up(&console_sem);
 	spin_unlock_irqrestore(&logbuf_lock, flags);
+	up(&console_sem);
+	/*
+	 * On PREEMPT_RT kernels __wake_up may sleep, so wake syslogd
+	 * up only if we are in a preemptible section. We normally don't
+	 * printk from non-preemptible sections so this is for the emergency
+	 * case only.
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	if (!in_atomic() && !irqs_disabled() && !raw_irqs_disabled())
+#endif
 	if (wake_klogd && !oops_in_progress && waitqueue_active(&log_wait))
 		wake_up_interruptible(&log_wait);
 }
@@ -993,7 +1016,7 @@ void tty_write_message(struct tty_struct
  */
 int __printk_ratelimit(int ratelimit_jiffies, int ratelimit_burst)
 {
-	static DEFINE_SPINLOCK(ratelimit_lock);
+	static DEFINE_RAW_SPINLOCK(ratelimit_lock);
 	static unsigned long toks = 10*5*HZ;
 	static unsigned long last_msg;
 	static int missed;
@@ -1032,3 +1055,20 @@ int printk_ratelimit(void)
 				printk_ratelimit_burst);
 }
 EXPORT_SYMBOL(printk_ratelimit);
+
+static DEFINE_RAW_SPINLOCK(warn_lock);	/* held around each report below */
+/* Log "BUG in func at file:line" with current task and CPU, then dump the stack. */
+void __WARN_ON(const char *func, const char *file, const int line)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&warn_lock, flags);
+	printk("%s/%d[CPU#%d]: BUG in %s at %s:%d\n",
+		current->comm, current->pid, raw_smp_processor_id(),
+		func, file, line);
+	dump_stack();
+	spin_unlock_irqrestore(&warn_lock, flags);
+}
+
+EXPORT_SYMBOL(__WARN_ON);
+
Index: linux/kernel/profile.c
===================================================================
--- linux.orig/kernel/profile.c
+++ linux/kernel/profile.c
@@ -41,6 +41,7 @@ static atomic_t *prof_buffer;
 static unsigned long prof_len, prof_shift;
 static int prof_on __read_mostly;
 static cpumask_t prof_cpu_mask = CPU_MASK_ALL;
+int prof_pid = -1;
 #ifdef CONFIG_SMP
 static DEFINE_PER_CPU(struct profile_hit *[2], cpu_profile_hits);
 static DEFINE_PER_CPU(int, cpu_profile_flip);
@@ -294,7 +295,7 @@ void profile_hit(int type, void *__pc)
 		put_cpu();
 		return;
 	}
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	do {
 		for (j = 0; j < PROFILE_GRPSZ; ++j) {
 			if (hits[i + j].pc == pc) {
@@ -314,7 +315,7 @@ void profile_hit(int type, void *__pc)
 		hits[i].pc = hits[i].hits = 0;
 	}
 out:
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	put_cpu();
 }
 
@@ -387,7 +388,7 @@ void profile_tick(int type, struct pt_re
 {
 	if (type == CPU_PROFILING && timer_hook)
 		timer_hook(regs);
-	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask))
+	if (!user_mode(regs) && cpu_isset(smp_processor_id(), prof_cpu_mask) && (prof_pid == -1 || prof_pid == current->pid))
 		profile_hit(type, (void *)profile_pc(regs));
 }
 
Index: linux/kernel/rcupdate.c
===================================================================
--- linux.orig/kernel/rcupdate.c
+++ linux/kernel/rcupdate.c
@@ -19,15 +19,15 @@
  *
  * Authors: Dipankar Sarma <dipankar@in.ibm.com>
  *	    Manfred Spraul <manfred@colorfullife.com>
+ *	    Paul E. McKenney <paulmck@us.ibm.com> (PREEMPT_RCU)
  * 
  * Based on the original work by Paul McKenney <paulmck@us.ibm.com>
  * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen.
- * Papers:
- * http://www.rdrop.com/users/paulmck/paper/rclockpdcsproof.pdf
- * http://lse.sourceforge.net/locking/rclock_OLS.2001.05.01c.sc.pdf (OLS2001)
+ *
+ * Papers:  http://www.rdrop.com/users/paulmck/RCU
  *
  * For detailed explanation of Read-Copy Update mechanism see -
- * 		http://lse.sourceforge.net/locking/rcupdate.html
+ * 		Documentation/RCU/ *.txt
  *
  */
 #include <linux/types.h>
@@ -35,6 +35,7 @@
 #include <linux/init.h>
 #include <linux/spinlock.h>
 #include <linux/smp.h>
+#include <linux/rcupdate.h>
 #include <linux/interrupt.h>
 #include <linux/sched.h>
 #include <asm/atomic.h>
@@ -47,6 +48,69 @@
 #include <linux/rcupdate.h>
 #include <linux/rcuref.h>
 #include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+
+struct rcu_synchronize {
+	struct rcu_head head;		/* passed to call_rcu() by synchronize_rcu() */
+	struct completion completion;	/* completed from wakeme_after_rcu() */
+};
+
+/* RCU callback waking synchronize_rcu(); wrapper as complete() is FASTCALL */
+static void wakeme_after_rcu(struct rcu_head  *head)
+{
+	struct rcu_synchronize *rcu;
+
+	rcu = container_of(head, struct rcu_synchronize, head);
+	complete(&rcu->completion);
+}
+
+/**
+ * synchronize_rcu - wait until a grace period has elapsed.
+ *
+ * Control will return to the caller some time after a full grace
+ * period has elapsed, in other words after all currently executing RCU
+ * read-side critical sections have completed.  RCU read-side critical
+ * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
+ * and may be nested.
+ *
+ * If your read-side code is not protected by rcu_read_lock(), do -not-
+ * use synchronize_rcu().
+ */
+void synchronize_rcu(void)
+{
+	struct rcu_synchronize rcu;	/* on-stack: must outlive the callback */
+
+	init_completion(&rcu.completion);
+	/* Queue wakeme_after_rcu(), which completes rcu.completion */
+	call_rcu(&rcu.head, wakeme_after_rcu);
+
+	/* Wait until that callback has signalled completion */
+	wait_for_completion(&rcu.completion);
+}
+
+#ifndef __HAVE_ARCH_CMPXCHG
+/*
+ * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
+ * 32 bit atomic_t implementations, and a hash function similar to that
+ * for our refcounting needs.
+ * Can't help multiprocessors which do not have cmpxchg :(
+ */
+spinlock_t __rcuref_hash[RCUREF_HASH_SIZE];
+
+static inline void init_rcurefs(void)
+{
+	int i;
+
+	for (i = 0; i < RCUREF_HASH_SIZE; i++)
+		spin_lock_init(&__rcuref_hash[i]);
+}
+#else /* __HAVE_ARCH_CMPXCHG */
+#define init_rcurefs()	do { } while (0)	/* no lock array to initialize */
+#endif /* __HAVE_ARCH_CMPXCHG */
+
+#ifndef CONFIG_PREEMPT_RCU
 
 /* Definition for rcupdate control block. */
 struct rcu_ctrlblk rcu_ctrlblk = 
@@ -62,9 +126,9 @@ struct rcu_state {
 };
 
 static struct rcu_state rcu_state ____cacheline_maxaligned_in_smp =
-	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
+	  {.lock = SPIN_LOCK_UNLOCKED(rcu_state.lock), .cpumask = CPU_MASK_NONE };
 static struct rcu_state rcu_bh_state ____cacheline_maxaligned_in_smp =
-	  {.lock = SPIN_LOCK_UNLOCKED, .cpumask = CPU_MASK_NONE };
+	  {.lock = SPIN_LOCK_UNLOCKED(rcu_bh_state.lock), .cpumask = CPU_MASK_NONE };
 
 DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
@@ -73,18 +137,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_bh_d
 static DEFINE_PER_CPU(struct tasklet_struct, rcu_tasklet) = {NULL};
 static int maxbatch = 10000;
 
-#ifndef __HAVE_ARCH_CMPXCHG
-/*
- * We use an array of spinlocks for the rcurefs -- similar to ones in sparc
- * 32 bit atomic_t implementations, and a hash function similar to that
- * for our refcounting needs.
- * Can't help multiprocessors which donot have cmpxchg :(
- */
-
-spinlock_t __rcuref_hash[RCUREF_HASH_SIZE] = {
-	[0 ... (RCUREF_HASH_SIZE-1)] = SPIN_LOCK_UNLOCKED
-};
-#endif
 
 /**
  * call_rcu - Queue an RCU callback for invocation after a grace period.
@@ -105,7 +157,7 @@ void fastcall call_rcu(struct rcu_head *
 
 	head->func = func;
 	head->next = NULL;
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	rdp = &__get_cpu_var(rcu_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
@@ -113,7 +165,7 @@ void fastcall call_rcu(struct rcu_head *
 	if (unlikely(++rdp->count > 10000))
 		set_need_resched();
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /**
@@ -140,7 +192,7 @@ void fastcall call_rcu_bh(struct rcu_hea
 
 	head->func = func;
 	head->next = NULL;
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	rdp = &__get_cpu_var(rcu_bh_data);
 	*rdp->nxttail = head;
 	rdp->nxttail = &head->next;
@@ -150,7 +202,16 @@ void fastcall call_rcu_bh(struct rcu_hea
  *  if (unlikely(rdp->count > 10000))
  *      rcu_do_batch(rdp);
  */
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
+}
+
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
+{
+	return rcu_ctrlblk.completed;
 }
 
 /*
@@ -287,11 +348,11 @@ static void rcu_check_quiescent_state(st
 static void rcu_move_batch(struct rcu_data *this_rdp, struct rcu_head *list,
 				struct rcu_head **tail)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	*this_rdp->nxttail = list;
 	if (list)
 		this_rdp->nxttail = tail;
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 static void __rcu_offline_cpu(struct rcu_data *this_rdp,
@@ -344,13 +405,13 @@ static void __rcu_process_callbacks(stru
 		rdp->curtail = &rdp->curlist;
 	}
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	if (rdp->nxtlist && !rdp->curlist) {
 		rdp->curlist = rdp->nxtlist;
 		rdp->curtail = rdp->nxttail;
 		rdp->nxtlist = NULL;
 		rdp->nxttail = &rdp->nxtlist;
-		local_irq_enable();
+		raw_local_irq_enable();
 
 		/*
 		 * start the next batch of callbacks
@@ -370,7 +431,7 @@ static void __rcu_process_callbacks(stru
 			spin_unlock(&rsp->lock);
 		}
 	} else {
-		local_irq_enable();
+		raw_local_irq_enable();
 	}
 	rcu_check_quiescent_state(rcp, rsp, rdp);
 	if (rdp->donelist)
@@ -448,48 +509,364 @@ static struct notifier_block __devinitda
  */
 void __init rcu_init(void)
 {
+	init_rcurefs();
 	rcu_cpu_notify(&rcu_nb, CPU_UP_PREPARE,
 			(void *)(long)smp_processor_id());
 	/* Register notifier for non-boot CPUs */
 	register_cpu_notifier(&rcu_nb);
 }
 
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
+/*
+ * Deprecated, use synchronize_rcu() or synchronize_sched() instead.
+ */
+void synchronize_kernel(void)
+{
+	synchronize_rcu();
+}
+
+module_param(maxbatch, int, 0);
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(synchronize_rcu);
+EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+
+#else /* #ifndef CONFIG_PREEMPT_RCU */
+
+struct rcu_data {
+	raw_spinlock_t	lock;
+	long		completed;	/* Number of last completed batch. */
+	struct tasklet_struct rcu_tasklet;
+	struct rcu_head *nextlist;
+	struct rcu_head **nexttail;
+	struct rcu_head *waitlist;
+	struct rcu_head **waittail;
+	struct rcu_head *donelist;
+	struct rcu_head **donetail;
+#ifdef CONFIG_RCU_STATS
+	long		n_next_length;
+	long		n_next_add;
+	long		n_wait_length;
+	long		n_wait_add;
+	long		n_done_length;
+	long		n_done_add;
+	long		n_done_remove;
+	atomic_t	n_done_invoked;
+	long		n_rcu_check_callbacks;
+	atomic_t	n_rcu_try_flip1;
+	long		n_rcu_try_flip2;
+	long		n_rcu_try_flip3;
+	atomic_t	n_rcu_try_flip_e1;
+	long		n_rcu_try_flip_e2;
+	long		n_rcu_try_flip_e3;
+#endif /* #ifdef CONFIG_RCU_STATS */
 };
+struct rcu_ctrlblk {
+	raw_spinlock_t	fliplock;
+	long		completed;	/* Number of last completed batch. */
+};
+static struct rcu_data rcu_data;
+static struct rcu_ctrlblk rcu_ctrlblk = {
+	.fliplock = RAW_SPIN_LOCK_UNLOCKED,
+	.completed = 0,
+};
+static DEFINE_PER_CPU(atomic_t [2], rcu_flipctr) =
+	{ ATOMIC_INIT(0), ATOMIC_INIT(0) };
 
-/* Because of FASTCALL declaration of complete, we use this wrapper */
-static void wakeme_after_rcu(struct rcu_head  *head)
+/*
+ * Return the number of RCU batches processed thus far.  Useful
+ * for debug and statistics.
+ */
+long rcu_batches_completed(void)
 {
-	struct rcu_synchronize *rcu;
+	return rcu_ctrlblk.completed;
+}
 
-	rcu = container_of(head, struct rcu_synchronize, head);
-	complete(&rcu->completion);
+void
+rcu_read_lock(void)
+{
+	int flipctr;
+	unsigned long oldirq;
+
+	raw_local_irq_save(oldirq);
+	if (current->rcu_read_lock_nesting++ == 0) {
+
+		/*
+		 * Outermost nesting of rcu_read_lock(), so atomically
+		 * increment the current counter for the current CPU.
+		 */
+
+		flipctr = rcu_ctrlblk.completed & 0x1;
+		smp_read_barrier_depends();
+		current->rcu_flipctr1 = &(__get_cpu_var(rcu_flipctr)[flipctr]);
+		/* Can optimize to non-atomic on fastpath, but start simple. */
+		atomic_inc(current->rcu_flipctr1);
+		smp_mb__after_atomic_inc();  /* might optimize out... */
+		if (unlikely(flipctr != (rcu_ctrlblk.completed & 0x1))) {
+
+			/*
+			 * We raced with grace-period processing (flip).
+			 * Although we cannot be preempted here, there
+			 * could be interrupts, ECC errors and the like,
+			 * so just nail down both sides of the rcu_flipctr
+			 * array for the duration of our RCU read-side
+			 * critical section, preventing a second flip
+			 * from racing with us.  At some point, it would
+			 * be safe to decrement one of the counters, but
+			 * we have no way of knowing when that would be.
+			 * So just decrement them both in rcu_read_unlock().
+			 */
+
+			current->rcu_flipctr2 =
+				&(__get_cpu_var(rcu_flipctr)[!flipctr]);
+			/* Can again optimize to non-atomic on fastpath. */
+			atomic_inc(current->rcu_flipctr2);
+			smp_mb__after_atomic_inc();  /* might optimize out... */
+		}
+	}
+	raw_local_irq_restore(oldirq);
 }
 
-/**
- * synchronize_rcu - wait until a grace period has elapsed.
- *
- * Control will return to the caller some time after a full grace
- * period has elapsed, in other words after all currently executing RCU
- * read-side critical sections have completed.  RCU read-side critical
- * sections are delimited by rcu_read_lock() and rcu_read_unlock(),
- * and may be nested.
+void
+rcu_read_unlock(void)
+{
+	unsigned long oldirq;
+
+	raw_local_irq_save(oldirq);
+	if (--current->rcu_read_lock_nesting == 0) {
+
+		/*
+		 * Just atomically decrement whatever we incremented.
+		 * Might later want to awaken some task waiting for the
+		 * grace period to complete, but keep it simple for the
+		 * moment.
+		 */
+
+		smp_mb__before_atomic_dec();
+		atomic_dec(current->rcu_flipctr1);
+		current->rcu_flipctr1 = NULL;
+		if (unlikely(current->rcu_flipctr2 != NULL)) {
+			atomic_dec(current->rcu_flipctr2);
+			current->rcu_flipctr2 = NULL;
+		}
+	}
+	raw_local_irq_restore(oldirq);
+}
+
+static void
+__rcu_advance_callbacks(void)
+{
+
+	if (rcu_data.completed != rcu_ctrlblk.completed) {
+		if (rcu_data.waitlist != NULL) {
+			*rcu_data.donetail = rcu_data.waitlist;
+			rcu_data.donetail = rcu_data.waittail;
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_done_length += rcu_data.n_wait_length;
+			rcu_data.n_done_add += rcu_data.n_wait_length;
+			rcu_data.n_wait_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		}
+		if (rcu_data.nextlist != NULL) {
+			rcu_data.waitlist = rcu_data.nextlist;
+			rcu_data.waittail = rcu_data.nexttail;
+			rcu_data.nextlist = NULL;
+			rcu_data.nexttail = &rcu_data.nextlist;
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_wait_length += rcu_data.n_next_length;
+			rcu_data.n_wait_add += rcu_data.n_next_length;
+			rcu_data.n_next_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		} else {
+			rcu_data.waitlist = NULL;
+			rcu_data.waittail = &rcu_data.waitlist;
+		}
+		rcu_data.completed = rcu_ctrlblk.completed;
+	}
+}
+
+/*
+ * Attempt a single flip of the counters.  Remember, a single flip does
+ * -not- constitute a grace period.  Instead, the interval between
+ * a pair of consecutive flips is a grace period.
  *
- * If your read-side code is not protected by rcu_read_lock(), do -not-
- * use synchronize_rcu().
+ * If anyone is nuts enough to run this CONFIG_PREEMPT_RCU implementation
+ * on a large SMP, they might want to use a hierarchical organization of
+ * the per-CPU-counter pairs.
+ */
+static void
+rcu_try_flip(void)
+{
+	int cpu;
+	long flipctr;
+	unsigned long oldirq;
+
+	flipctr = rcu_ctrlblk.completed;
+#ifdef CONFIG_RCU_STATS
+	atomic_inc(&rcu_data.n_rcu_try_flip1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	if (unlikely(!spin_trylock_irqsave(&rcu_ctrlblk.fliplock, oldirq))) {
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_rcu_try_flip_e1);
+#endif /* #ifdef CONFIG_RCU_STATS */
+		return;
+	}
+	if (unlikely(flipctr != rcu_ctrlblk.completed)) {
+
+		/* Our work is done!  ;-) */
+
+#ifdef CONFIG_RCU_STATS
+		rcu_data.n_rcu_try_flip_e2++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+		spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+		return;
+	}
+	flipctr &= 0x1;
+
+	/*
+	 * Check for completion of all RCU read-side critical sections
+	 * that started prior to the previous flip.
+	 */
+
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_try_flip2++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	for_each_cpu(cpu) {
+		if (atomic_read(&per_cpu(rcu_flipctr, cpu)[!flipctr]) != 0) {
+#ifdef CONFIG_RCU_STATS
+			rcu_data.n_rcu_try_flip_e3++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+			spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+			return;
+		}
+	}
+
+	/* Do the flip. */
+
+	smp_mb();
+	rcu_ctrlblk.completed++;
+
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_try_flip3++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	spin_unlock_irqrestore(&rcu_ctrlblk.fliplock, oldirq);
+}
+
+void
+rcu_check_callbacks(int cpu, int user)
+{
+	unsigned long oldirq;
+
+	if (rcu_ctrlblk.completed == rcu_data.completed) {
+		rcu_try_flip();
+		if (rcu_ctrlblk.completed == rcu_data.completed) {
+			return;
+		}
+	}
+	spin_lock_irqsave(&rcu_data.lock, oldirq);
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_rcu_check_callbacks++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	__rcu_advance_callbacks();
+	if (rcu_data.donelist == NULL) {
+		spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+	} else {
+		spin_unlock_irqrestore(&rcu_data.lock, oldirq);
+		tasklet_schedule(&rcu_data.rcu_tasklet);
+	}
+}
+
+static
+void rcu_process_callbacks(unsigned long data)
+{
+	unsigned long flags;
+	struct rcu_head *next, *list;
+
+	spin_lock_irqsave(&rcu_data.lock, flags);
+	list = rcu_data.donelist;
+	if (list == NULL) {
+		spin_unlock_irqrestore(&rcu_data.lock, flags);
+		return;
+	}
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_done_remove += rcu_data.n_done_length;
+	rcu_data.n_done_length = 0;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	spin_unlock_irqrestore(&rcu_data.lock, flags);
+	while (list) {
+		next = list->next;
+		list->func(list);
+		list = next;
+#ifdef CONFIG_RCU_STATS
+		atomic_inc(&rcu_data.n_done_invoked);
+#endif /* #ifdef CONFIG_RCU_STATS */
+	}
+}
+
+void fastcall
+call_rcu(struct rcu_head *head,
+	 void (*func)(struct rcu_head *rcu))
+{
+	unsigned long flags;
+
+	head->func = func;
+	head->next = NULL;
+	spin_lock_irqsave(&rcu_data.lock, flags);
+	__rcu_advance_callbacks();
+	*rcu_data.nexttail = head;
+	rcu_data.nexttail = &head->next;
+#ifdef CONFIG_RCU_STATS
+	rcu_data.n_next_add++;
+	rcu_data.n_next_length++;
+#endif /* #ifdef CONFIG_RCU_STATS */
+	spin_unlock_irqrestore(&rcu_data.lock, flags);
+}
+
+/*
+ * Crude hack, reduces but does not eliminate possibility of failure.
+ * Needs to wait for all CPUs to pass through a -voluntary- context
+ * switch to eliminate possibility of failure.  (Maybe just crank
+ * priority down...)
  */
-void synchronize_rcu(void)
+void
+synchronize_sched(void)
 {
-	struct rcu_synchronize rcu;
+	cpumask_t oldmask;
+	int cpu;
 
-	init_completion(&rcu.completion);
-	/* Will wake me after RCU finished */
-	call_rcu(&rcu.head, wakeme_after_rcu);
+	if (sched_getaffinity(0, &oldmask) < 0) {
+		oldmask = cpu_possible_map;
+	}
+	for_each_cpu(cpu) {
+		sched_setaffinity(0, cpumask_of_cpu(cpu));
+		schedule();
+	}
+	sched_setaffinity(0, oldmask);
+}
 
-	/* Wait for it */
-	wait_for_completion(&rcu.completion);
+int
+rcu_pending(int cpu)
+{
+	return (rcu_data.donelist != NULL ||
+		rcu_data.waitlist != NULL ||
+		rcu_data.nextlist != NULL);
+}
+
+void __init rcu_init(void)
+{
+	init_rcurefs();
+/*&&&&*/printk("WARNING: experimental RCU implementation.\n");
+	spin_lock_init(&rcu_data.lock);
+	rcu_data.completed = 0;
+	rcu_data.nextlist = NULL;
+	rcu_data.nexttail = &rcu_data.nextlist;
+	rcu_data.waitlist = NULL;
+	rcu_data.waittail = &rcu_data.waitlist;
+	rcu_data.donelist = NULL;
+	rcu_data.donetail = &rcu_data.donelist;
+	tasklet_init(&rcu_data.rcu_tasklet, rcu_process_callbacks, 0UL);
 }
 
 /*
@@ -500,8 +877,79 @@ void synchronize_kernel(void)
 	synchronize_rcu();
 }
 
-module_param(maxbatch, int, 0);
-EXPORT_SYMBOL(call_rcu);  /* WARNING: GPL-only in April 2006. */
-EXPORT_SYMBOL(call_rcu_bh);  /* WARNING: GPL-only in April 2006. */
+#ifdef CONFIG_RCU_STATS
+int rcu_read_proc_data(char *page)
+{
+	return sprintf(page,
+		       "ggp=%ld lgp=%ld rcc=%ld\n"
+		       "na=%ld nl=%ld wa=%ld wl=%ld da=%ld dl=%ld dr=%ld di=%d\n"
+		       "rtf1=%d rtf2=%ld rtf3=%ld rtfe1=%d rtfe2=%ld rtfe3=%ld\n",
+
+		       rcu_ctrlblk.completed,
+		       rcu_data.completed,
+		       rcu_data.n_rcu_check_callbacks,
+
+		       rcu_data.n_next_add,
+		       rcu_data.n_next_length,
+		       rcu_data.n_wait_add,
+		       rcu_data.n_wait_length,
+		       rcu_data.n_done_add,
+		       rcu_data.n_done_length,
+		       rcu_data.n_done_remove,
+		       atomic_read(&rcu_data.n_done_invoked),
+
+		       atomic_read(&rcu_data.n_rcu_try_flip1),
+		       rcu_data.n_rcu_try_flip2,
+		       rcu_data.n_rcu_try_flip3,
+		       atomic_read(&rcu_data.n_rcu_try_flip_e1),
+		       rcu_data.n_rcu_try_flip_e2,
+		       rcu_data.n_rcu_try_flip_e3);
+}
+
+int rcu_read_proc_gp_data(char *page)
+{
+	long oldgp = rcu_ctrlblk.completed;
+
+	synchronize_rcu();
+	return sprintf(page, "oldggp=%ld  newggp=%ld\n",
+		       oldgp, rcu_ctrlblk.completed);
+}
+
+int rcu_read_proc_ptrs_data(char *page)
+{
+	return sprintf(page,
+		       "nl=%p/%p nt=%p\n wl=%p/%p wt=%p dl=%p/%p dt=%p\n",
+		       &rcu_data.nextlist, rcu_data.nextlist, rcu_data.nexttail,
+		       &rcu_data.waitlist, rcu_data.waitlist, rcu_data.waittail,
+		       &rcu_data.donelist, rcu_data.donelist, rcu_data.donetail
+		      );
+}
+
+int rcu_read_proc_ctrs_data(char *page)
+{
+	int cnt = 0;
+	int cpu;
+	int f = rcu_data.completed & 0x1;
+
+	cnt += sprintf(&page[cnt], "CPU last cur\n");
+	for_each_cpu(cpu) {
+		cnt += sprintf(&page[cnt], "%3d %4d %3d\n",
+			       cpu,
+			       atomic_read(&per_cpu(rcu_flipctr, cpu)[!f]),
+			       atomic_read(&per_cpu(rcu_flipctr, cpu)[f]));
+	}
+	cnt += sprintf(&page[cnt], "ggp = %ld\n", rcu_data.completed);
+	return (cnt);
+}
+
+#endif /* #ifdef CONFIG_RCU_STATS */
+
+EXPORT_SYMBOL_GPL(rcu_batches_completed);
+EXPORT_SYMBOL(call_rcu); /* WARNING: GPL-only in April 2006. */
 EXPORT_SYMBOL_GPL(synchronize_rcu);
-EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL_GPL(synchronize_sched);
+EXPORT_SYMBOL(rcu_read_lock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(rcu_read_unlock);  /* WARNING: GPL-only in April 2006. */
+EXPORT_SYMBOL(synchronize_kernel);  /* WARNING: Removal in April 2006. */
+
+#endif /* #else #ifndef CONFIG_PREEMPT_RCU */
Index: linux/kernel/rcutorture.c
===================================================================
--- /dev/null
+++ linux/kernel/rcutorture.c
@@ -0,0 +1,512 @@
+/*
+ * Read-Copy Update module-based torture test facility
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) IBM Corporation, 2005
+ *
+ * Authors: Paul E. McKenney <paulmck@us.ibm.com>
+ *
+ * See also:  Documentation/RCU/torture.txt
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/kthread.h>
+#include <linux/err.h>
+#include <linux/spinlock.h>
+#include <linux/smp.h>
+#include <linux/rcupdate.h>
+#include <linux/interrupt.h>
+#include <linux/sched.h>
+#include <asm/atomic.h>
+#include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/completion.h>
+#include <linux/moduleparam.h>
+#include <linux/percpu.h>
+#include <linux/notifier.h>
+#include <linux/rcuref.h>
+#include <linux/cpu.h>
+#include <linux/random.h>
+#include <linux/delay.h>
+#include <linux/byteorder/swabb.h>
+#include <linux/stat.h>
+
+MODULE_LICENSE("GPL");
+
+static int nreaders = -1;	/* # reader threads, defaults to 2*ncpus */
+static int stat_interval = 0;	/* Interval between stats, in seconds. */
+				/*  Defaults to "only at end of test". */
+static int verbose = 0;		/* Print more debug info. */
+
+MODULE_PARM(nreaders, "i");
+MODULE_PARM_DESC(nreaders, "Number of RCU reader threads");
+MODULE_PARM(stat_interval, "i");
+MODULE_PARM_DESC(stat_interval, "Number of seconds between stats printk()s");
+MODULE_PARM(verbose, "i");
+MODULE_PARM_DESC(verbose, "Enable verbose debugging printk()s");
+#define TORTURE_FLAG "rcutorture: "
+#define PRINTK_STRING(s) \
+	do { printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+#define VERBOSE_PRINTK_STRING(s) \
+	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG s "\n"); } while (0)
+#define VERBOSE_PRINTK_ERRSTRING(s) \
+	do { if (verbose) printk(KERN_ALERT TORTURE_FLAG "!!! " s "\n"); } while (0)
+
+static char printk_buf[4096];
+
+static int nrealreaders;
+static struct task_struct *writer_task;
+static struct task_struct **reader_tasks;
+static struct task_struct *stats_task;
+
+#define RCU_TORTURE_PIPE_LEN 10
+
+struct rcu_torture {
+	struct rcu_head rtort_rcu;
+	int rtort_pipe_count;
+	struct list_head rtort_free;
+	int rtort_mbtest;
+};
+
+static int fullstop = 0;	/* stop generating callbacks at test end. */
+static LIST_HEAD(rcu_torture_freelist);
+static struct rcu_torture *rcu_torture_current = NULL;
+static long rcu_torture_current_version = 0;
+static struct rcu_torture rcu_tortures[10 * RCU_TORTURE_PIPE_LEN];
+static DEFINE_SPINLOCK(rcu_torture_lock);
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_count) =
+	{ 0 };
+static DEFINE_PER_CPU(long [RCU_TORTURE_PIPE_LEN + 1], rcu_torture_batch) =
+	{ 0 };
+static atomic_t rcu_torture_wcount[RCU_TORTURE_PIPE_LEN + 1];
+atomic_t n_rcu_torture_alloc;
+atomic_t n_rcu_torture_alloc_fail;
+atomic_t n_rcu_torture_free;
+atomic_t n_rcu_torture_mberror;
+atomic_t n_rcu_torture_error;
+
+/*
+ * Allocate an element from the rcu_tortures pool.
+ */
+struct rcu_torture *
+rcu_torture_alloc(void)
+{
+	struct list_head *p;
+
+	spin_lock(&rcu_torture_lock);
+	if (list_empty(&rcu_torture_freelist)) {
+		atomic_inc(&n_rcu_torture_alloc_fail);
+		spin_unlock(&rcu_torture_lock);
+		return NULL;
+	}
+	atomic_inc(&n_rcu_torture_alloc);
+	p = rcu_torture_freelist.next;
+	list_del_init(p);
+	spin_unlock(&rcu_torture_lock);
+	return container_of(p, struct rcu_torture, rtort_free);
+}
+
+/*
+ * Free an element to the rcu_tortures pool.
+ */
+static void
+rcu_torture_free(struct rcu_torture *p)
+{
+	atomic_inc(&n_rcu_torture_free);
+	spin_lock(&rcu_torture_lock);
+	list_add_tail(&p->rtort_free, &rcu_torture_freelist);
+	spin_unlock(&rcu_torture_lock);
+}
+
+static void
+rcu_torture_cb(struct rcu_head *p)
+{
+	int i;
+	struct rcu_torture *rp = container_of(p, struct rcu_torture, rtort_rcu);
+
+	if (fullstop) {
+		/* Test is ending, just drop callbacks on the floor. */
+		/* The next initialization will pick up the pieces. */
+		return;
+	}
+	i = rp->rtort_pipe_count;
+	if (i > RCU_TORTURE_PIPE_LEN)
+		i = RCU_TORTURE_PIPE_LEN;
+	atomic_inc(&rcu_torture_wcount[i]);
+	if (++rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) {
+		rp->rtort_mbtest = 0;
+		rcu_torture_free(rp);
+	} else
+		call_rcu(p, rcu_torture_cb);
+}
+
+struct rcu_random_state {
+	unsigned long rrs_state;	/* current generator state */
+	long rrs_count;		/* signed so "--rrs_count < 0" can fire */
+};
+
+#define RCU_RANDOM_MULT 39916801  /* prime */
+#define RCU_RANDOM_ADD	479001701 /* prime */
+#define RCU_RANDOM_REFRESH 10000
+
+#define DEFINE_RCU_RANDOM(name) struct rcu_random_state name = { 0, 0 }
+
+/*
+ * Crude but fast random-number generator.  Uses a linear congruential
+ * generator, with occasional help from get_random_bytes().
+ */
+static long
+rcu_random(struct rcu_random_state *rrsp)
+{
+	long refresh;
+
+	if (--rrsp->rrs_count < 0) {
+		get_random_bytes(&refresh, sizeof(refresh));
+		rrsp->rrs_state += refresh;
+		rrsp->rrs_count = RCU_RANDOM_REFRESH;
+	}
+	rrsp->rrs_state = rrsp->rrs_state * RCU_RANDOM_MULT + RCU_RANDOM_ADD;
+	return swahw32(rrsp->rrs_state);
+}
+
+/*
+ * RCU torture writer kthread.  Repeatedly substitutes a new structure
+ * for that pointed to by rcu_torture_current, freeing the old structure
+ * after a series of grace periods (the "pipeline").
+ */
+static int
+rcu_torture_writer(void *arg)
+{
+	int i;
+	long oldbatch = rcu_batches_completed();
+	struct rcu_torture *rp;
+	struct rcu_torture *old_rp;
+	static DEFINE_RCU_RANDOM(rand);
+
+	VERBOSE_PRINTK_STRING("rcu_torture_writer task started");
+	set_user_nice(current, 19);
+	do {
+		schedule_timeout_uninterruptible(1);
+		if (rcu_batches_completed() == oldbatch)
+			continue;
+		if ((rp = rcu_torture_alloc()) == NULL)
+			continue;
+		rp->rtort_pipe_count = 0;
+		udelay(rcu_random(&rand) & 0x3ff);
+		old_rp = rcu_torture_current;
+		rp->rtort_mbtest = 1;
+		rcu_assign_pointer(rcu_torture_current, rp);
+		smp_wmb();
+		if (old_rp != NULL) {
+			i = old_rp->rtort_pipe_count;
+			if (i > RCU_TORTURE_PIPE_LEN)
+				i = RCU_TORTURE_PIPE_LEN;
+			atomic_inc(&rcu_torture_wcount[i]);
+			old_rp->rtort_pipe_count++;
+			call_rcu(&old_rp->rtort_rcu, rcu_torture_cb);
+		}
+		rcu_torture_current_version++;
+		oldbatch = rcu_batches_completed();
+	} while (!kthread_should_stop() && !fullstop);
+	VERBOSE_PRINTK_STRING("rcu_torture_writer task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * RCU torture reader kthread.  Repeatedly dereferences rcu_torture_current,
+ * incrementing the corresponding element of the pipeline array.  The
+ * counter in the element should never be greater than 1, otherwise, the
+ * RCU implementation is broken.
+ */
+static int
+rcu_torture_reader(void *arg)
+{
+	int completed;
+	DEFINE_RCU_RANDOM(rand);
+	struct rcu_torture *p;
+	int pipe_count;
+
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task started");
+	set_user_nice(current, 19);
+	do {
+		rcu_read_lock();
+		completed = rcu_batches_completed();
+		p = rcu_dereference(rcu_torture_current);
+		if (p == NULL) {
+			/* Wait for rcu_torture_writer to get underway */
+			rcu_read_unlock();
+			msleep(1000);
+			continue;
+		}
+		if (p->rtort_mbtest == 0)
+			atomic_inc(&n_rcu_torture_mberror);
+		udelay(rcu_random(&rand) & 0x7f);
+		preempt_disable();
+		pipe_count = p->rtort_pipe_count;
+		if (pipe_count > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			pipe_count = RCU_TORTURE_PIPE_LEN;
+		}
+		++__get_cpu_var(rcu_torture_count)[pipe_count];
+		completed = rcu_batches_completed() - completed;
+		if (completed > RCU_TORTURE_PIPE_LEN) {
+			/* Should not happen, but... */
+			completed = RCU_TORTURE_PIPE_LEN;
+		}
+		++__get_cpu_var(rcu_torture_batch)[completed];
+		preempt_enable();
+		rcu_read_unlock();
+		cond_resched();
+	} while (!kthread_should_stop() && !fullstop);
+	VERBOSE_PRINTK_STRING("rcu_torture_reader task stopping");
+	while (!kthread_should_stop())
+		schedule_timeout_uninterruptible(1);
+	return 0;
+}
+
+/*
+ * Format the RCU-torture statistics into page; returns the byte count.
+ */
+static int
+rcu_torture_printk(char *page)
+{
+	int cnt = 0;
+	int cpu;
+	int i;
+	long pipesummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+	long batchsummary[RCU_TORTURE_PIPE_LEN + 1] = { 0 };
+
+	for_each_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			pipesummary[i] += per_cpu(rcu_torture_count, cpu)[i];
+			batchsummary[i] += per_cpu(rcu_torture_batch, cpu)[i];
+		}
+	}
+	for (i = RCU_TORTURE_PIPE_LEN; i >= 0; i--) { /* scan last slot too */
+		if (pipesummary[i] != 0)
+			break;
+	}
+	cnt += sprintf(&page[cnt], "rcutorture: ");
+	cnt += sprintf(&page[cnt],
+		       "rtc: %p ver: %ld tfle: %d rta: %d rtaf: %d rtf: %d "
+		       "rtmbe: %d",
+		       rcu_torture_current,
+		       rcu_torture_current_version,
+		       list_empty(&rcu_torture_freelist),
+		       atomic_read(&n_rcu_torture_alloc),
+		       atomic_read(&n_rcu_torture_alloc_fail),
+		       atomic_read(&n_rcu_torture_free),
+		       atomic_read(&n_rcu_torture_mberror));
+	if (atomic_read(&n_rcu_torture_mberror) != 0)
+		cnt += sprintf(&page[cnt], " !!!");
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	if (i > 1) {	/* highest non-zero reader-pipe slot is beyond 1 */
+		cnt += sprintf(&page[cnt], "!!! ");
+		atomic_inc(&n_rcu_torture_error);
+	}
+	cnt += sprintf(&page[cnt], "Reader Pipe: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		cnt += sprintf(&page[cnt], " %ld", pipesummary[i]);
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "Reader Batch: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) /* all buckets */
+		cnt += sprintf(&page[cnt], " %ld", batchsummary[i]);
+	cnt += sprintf(&page[cnt], "\nrcutorture: ");
+	cnt += sprintf(&page[cnt], "Free-Block Circulation: ");
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+		cnt += sprintf(&page[cnt], " %d",
+			       atomic_read(&rcu_torture_wcount[i]));
+	}
+	cnt += sprintf(&page[cnt], "\n");
+	return cnt;
+}
+
+/*
+ * Print torture statistics.  Caller must ensure that there is only
+ * one call to this function at a given time!!!  This is normally
+ * accomplished by relying on the module system to only have one copy
+ * of the module loaded, and then by giving the rcu_torture_stats
+ * kthread full control (or the init/cleanup functions when rcu_torture_stats
+ * thread is not running).
+ */
+static void
+rcu_torture_stats_print(void)
+{
+	int cnt;
+
+	cnt = rcu_torture_printk(printk_buf);
+	printk(KERN_ALERT "%s", printk_buf);
+}
+
+/*
+ * Periodically prints torture statistics, if periodic statistics printing
+ * was specified via the stat_interval module parameter.
+ *
+ * No need to worry about fullstop here, since this one doesn't reference
+ * volatile state or register callbacks.
+ */
+static int
+rcu_torture_stats(void *arg)
+{
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task started");
+	do {
+		schedule_timeout_interruptible(stat_interval * HZ);
+		rcu_torture_stats_print();
+	} while (!kthread_should_stop());
+	VERBOSE_PRINTK_STRING("rcu_torture_stats task stopping");
+	return 0;
+}
+
+static void
+rcu_torture_cleanup(void)
+{
+	int i;
+
+	fullstop = 1;
+	if (writer_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_writer task");
+		kthread_stop(writer_task);
+	}
+	writer_task = NULL;
+
+	if (reader_tasks != NULL) {
+		for (i = 0; i < nrealreaders; i++) {
+			if (reader_tasks[i] != NULL) {
+				VERBOSE_PRINTK_STRING(
+					"Stopping rcu_torture_reader task");
+				kthread_stop(reader_tasks[i]);
+			}
+			reader_tasks[i] = NULL;
+		}
+		kfree(reader_tasks);
+		reader_tasks = NULL;
+	}
+	rcu_torture_current = NULL;
+
+	if (stats_task != NULL) {
+		VERBOSE_PRINTK_STRING("Stopping rcu_torture_stats task");
+		kthread_stop(stats_task);
+	}
+	stats_task = NULL;
+
+	/* Wait for all RCU callbacks to fire.  */
+
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN; i++)
+		synchronize_rcu();
+	rcu_torture_stats_print();  /* -After- the stats thread is stopped! */
+	printk(KERN_ALERT TORTURE_FLAG
+	       "--- End of test: %s\n",
+	       atomic_read(&n_rcu_torture_error) == 0 ? "SUCCESS" : "FAILURE");
+}
+
+static int
+rcu_torture_init(void)
+{
+	int i;
+	int cpu;
+	int firsterr = 0;
+
+	/* Process args and tell the world that the torturer is on the job. */
+
+	if (nreaders >= 0)
+		nrealreaders = nreaders;
+	else
+		nrealreaders = 2 * num_online_cpus();
+	printk(KERN_ALERT TORTURE_FLAG
+	       "--- Start of test: nreaders=%d stat_interval=%d verbose=%d\n",
+	       nrealreaders, stat_interval, verbose);
+	fullstop = 0;
+
+	/* Set up the freelist. */
+
+	INIT_LIST_HEAD(&rcu_torture_freelist);
+	for (i = 0; i < sizeof(rcu_tortures) / sizeof(rcu_tortures[0]); i++) {
+		rcu_tortures[i].rtort_mbtest = 0;
+		list_add_tail(&rcu_tortures[i].rtort_free,
+			      &rcu_torture_freelist);
+	}
+
+	/* Initialize the statistics so that each run gets its own numbers. */
+
+	rcu_torture_current = NULL;
+	rcu_torture_current_version = 0;
+	atomic_set(&n_rcu_torture_alloc, 0);
+	atomic_set(&n_rcu_torture_alloc_fail, 0);
+	atomic_set(&n_rcu_torture_free, 0);
+	atomic_set(&n_rcu_torture_mberror, 0);
+	atomic_set(&n_rcu_torture_error, 0);
+	for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++)
+		atomic_set(&rcu_torture_wcount[i], 0);
+	for_each_cpu(cpu) {
+		for (i = 0; i < RCU_TORTURE_PIPE_LEN + 1; i++) {
+			per_cpu(rcu_torture_count, cpu)[i] = 0;
+			per_cpu(rcu_torture_batch, cpu)[i] = 0;
+		}
+	}
+
+	/* Start up the kthreads; any failure unwinds via the cleanup path. */
+
+	VERBOSE_PRINTK_STRING("Creating rcu_torture_writer task");
+	writer_task = kthread_run(rcu_torture_writer, NULL,
+				  "rcu_torture_writer");
+	if (IS_ERR(writer_task)) {
+		firsterr = PTR_ERR(writer_task);
+		VERBOSE_PRINTK_ERRSTRING("Failed to create writer");
+		writer_task = NULL;
+		goto unwind;
+	}
+	reader_tasks = kcalloc(nrealreaders, sizeof(reader_tasks[0]),
+			       GFP_KERNEL); /* zeroed: cleanup scans all slots */
+	if (reader_tasks == NULL) {
+		VERBOSE_PRINTK_ERRSTRING("out of memory");
+		firsterr = -ENOMEM;
+		goto unwind;
+	}
+	for (i = 0; i < nrealreaders; i++) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_reader task");
+		reader_tasks[i] = kthread_run(rcu_torture_reader, NULL,
+					      "rcu_torture_reader");
+		if (IS_ERR(reader_tasks[i])) {
+			firsterr = PTR_ERR(reader_tasks[i]);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create reader");
+			reader_tasks[i] = NULL;
+			goto unwind;
+		}
+	}
+	if (stat_interval > 0) {
+		VERBOSE_PRINTK_STRING("Creating rcu_torture_stats task");
+		stats_task = kthread_run(rcu_torture_stats, NULL,
+					"rcu_torture_stats");
+		if (IS_ERR(stats_task)) {
+			firsterr = PTR_ERR(stats_task);
+			VERBOSE_PRINTK_ERRSTRING("Failed to create stats");
+			stats_task = NULL;
+			goto unwind;
+		}
+	}
+	return 0;
+
+unwind:
+	rcu_torture_cleanup();
+	return firsterr;
+}
+
+module_init(rcu_torture_init);
+module_exit(rcu_torture_cleanup);
Index: linux/kernel/rt.c
===================================================================
--- /dev/null
+++ linux/kernel/rt.c
@@ -0,0 +1,3045 @@
+/*
+ * kernel/rt.c
+ *
+ * Real-Time Preemption Support
+ *
+ * started by Ingo Molnar:
+ *
+ *  Copyright (C) 2004, 2005 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * historic credit for proving that Linux spinlocks can be implemented via
+ * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow
+ * and others) who prototyped it on 2.4 and did lots of comparative
+ * research and analysis; TimeSys, for proving that you can implement a
+ * fully preemptible kernel via the use of IRQ threading and mutexes;
+ * Bill Huey for persuasively arguing on lkml that the mutex model is the
+ * right one; and to MontaVista, who ported pmutexes to 2.6.
+ *
+ * This code is a from-scratch implementation and is not based on pmutexes,
+ * but the idea of converting spinlocks to mutexes is used here too.
+ *
+ * lock debugging, locking tree, deadlock detection:
+ *
+ *  Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey
+ *  Released under the General Public License (GPL).
+ *
+ * Includes portions of the generic R/W semaphore implementation from:
+ *
+ *  Copyright (c) 2001   David Howells (dhowells@redhat.com).
+ *  - Derived partially from idea by Andrea Arcangeli <andrea@suse.de>
+ *  - Derived also from comments by Linus
+ *
+ * Pending ownership of locks and ownership stealing:
+ *
+ *  Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt
+ *
+ *   (also by Steven Rostedt)
+ *    - Converted single pi_lock to individual task locks.
+ *
+ */
+#include <linux/config.h>
+#include <linux/rt_lock.h>
+#include <linux/sched.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/interrupt.h>
+#include <linux/plist.h>
+#include <linux/fs.h>
+#include <linux/futex.h>
+
+#define CAPTURE_LOCK
+
+/*
+ * Lock order:
+ *
+ *  To keep from having a single lock for PI, each task and lock
+ *  has their own locking. The order is as follows:
+ *
+ * blocked task->pi_lock -> lock->wait_lock -> owner task->pi_lock.
+ *
+ * This is safe since an owner task should never block on a lock that
+ * is owned by a blocking task.  Otherwise you would have a deadlock
+ * in the normal system.
+ * The same goes for the locks. A lock held by one task should not be
+ * taken by a task that holds a lock that is blocking this lock's owner.
+ *
+ * A task that is about to grab a lock is first considered to be a
+ * blocking task, even if the task successfully acquires the lock.
+ * This is because the taking of the locks happens before the
+ * task becomes the owner.
+ */
+
+/*
+ * These flags are used to allow stealing of lock ownership.
+ */
+#define RT_PENDOWNER	1	/* pending owner on a lock */
+
+#define TASK_PENDING(task) \
+	((task)->rt_flags & RT_PENDOWNER)
+
+/*
+ * We can speed up the lock-acquire if the architecture supports
+ * cmpxchg, if the lists don't have to be initialized (!SMP), and if
+ * there's no debugging state to be set up (!DEBUG_DEADLOCKS).
+ */
+#if defined(__HAVE_ARCH_CMPXCHG) && !defined(CONFIG_SMP) && \
+    !defined(CONFIG_DEBUG_DEADLOCKS)
+# define RT_ATOMIC_ACQUIRE
+/*
+ * trick: we can use cmpxchg on the release side too, if bit 0 of
+ * lock->owner is set if there is at least a single pending task in
+ * the wait_list. This way the release atomic-fastpath can be a mirror
+ * image of the acquire path.
+ */
+//# define RT_ATOMIC_RELEASE
+#endif
+
+/*
+ * lock->owner's bit 0 encodes 'are other tasks pending on this lock'
+ * information. This way we can use cmpxchg to release the lock
+ * atomically. It also means that the value has to be decoded before use.
+ */
+#ifdef RT_ATOMIC_RELEASE
+# define lock_owner(lock) ({ typecheck(struct rt_mutex *,(lock)); ((struct thread_info *)((unsigned long)((lock)->owner) & ~1UL)); })
+# define lock_owner_pending(lock) ({ typecheck(struct rt_mutex *,(lock)); ((unsigned long)((lock)->owner) & 1UL); })
+# define set_lock_owner_pending(lock) do { typecheck(struct rt_mutex *,(lock)); (lock)->owner = ((struct thread_info *)((unsigned long)((lock)->owner) | 1UL)); } while (0)
+#else
+# define lock_owner(lock) ((lock)->owner)
+# define set_lock_owner_pending(lock) do { } while (0)
+#endif
+
+#define __local_save_flags_inline(flags) \
+	do { (flags) = irqs_off() | RAW_LOCAL_ILLEGAL_MASK; } while (0)
+/*
+ * This flag is good for debugging the PI code - it makes all tasks
+ * in the system fall under PI handling. Normally only SCHED_FIFO/RR
+ * tasks are PI-handled:
+ */
+#define ALL_TASKS_PI 0
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+# define __EIP_DECL__ , unsigned long eip
+# define __EIP__ , eip
+# define __W_EIP__(waiter) , (waiter)->eip
+# define __CALLER0__ , CALLER_ADDR0
+#else
+# define __EIP_DECL__
+# define __EIP__
+# define __W_EIP__(waiter)
+# define __CALLER0__
+#endif
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+/*
+ * We need a global lock when we walk through the multi-process
+ * lock tree...
+ */
+static raw_spinlock_t trace_lock = RAW_SPIN_LOCK_UNLOCKED;
+
+static LIST_HEAD(held_locks);
+
+/*
+ * Deadlock detection flag. We turn it off when we detect
+ * the first problem because we don't want to recurse back
+ * into the tracing code when doing error printk or
+ * executing a BUG():
+ */
+static int trace_on = 1;
+
+void deadlock_trace_off(void)
+{
+	trace_on = 0;
+}
+
+#define trace_lock_irq(lock, ti)		\
+	do {					\
+		(void)(ti);			\
+		raw_local_irq_disable();	\
+		if (trace_on)			\
+			spin_lock(lock);	\
+	} while (0)
+
+#define trace_unlock(lock, ti)			\
+	do {					\
+		(void)(ti);			\
+		if (trace_on)			\
+			spin_unlock(lock);	\
+	} while (0)
+
+#define trace_unlock_irq(lock, ti)		\
+	do {					\
+		(void)(ti);			\
+		if (trace_on)			\
+			spin_unlock(lock);	\
+		raw_local_irq_enable();		\
+		preempt_check_resched();	\
+	} while (0)
+
+#define trace_lock_irqsave(lock, flags, ti)	\
+	do {					\
+		(void)(ti);			\
+		raw_local_irq_save(flags);	\
+		if (trace_on)			\
+			spin_lock(lock);	\
+	} while (0)
+
+#define trace_unlock_irqrestore(lock, flags, ti)\
+	do {					\
+		(void)(ti);			\
+		if (trace_on)			\
+			spin_unlock(lock);	\
+		raw_local_irq_restore(flags);	\
+		preempt_check_resched();	\
+	} while (0)
+
+# define TRACE_WARN_ON(x)			WARN_ON(x)
+# define TRACE_BUG_ON(x)			BUG_ON(x)
+
+#define TRACE_OFF()				\
+do {						\
+	if (trace_on) {				\
+		trace_on = 0;			\
+		console_verbose();		\
+		if (spin_is_locked(&current->pi_lock)) \
+			_raw_spin_unlock(&current->pi_lock); \
+		spin_unlock(&trace_lock);	\
+	}					\
+} while (0)
+
+#define TRACE_BUG_LOCKED()			\
+do {						\
+	TRACE_OFF();				\
+	BUG();					\
+} while (0)
+
+#define TRACE_WARN_ON_LOCKED(c)			\
+do {						\
+	if (unlikely(c)) {			\
+		TRACE_OFF();			\
+		WARN_ON(1);			\
+	}					\
+} while (0)
+
+# define TRACE_BUG_ON_LOCKED(c)			\
+do {						\
+	if (unlikely(c))			\
+		TRACE_BUG_LOCKED();		\
+} while (0)
+
+#ifdef CONFIG_SMP
+# define SMP_TRACE_BUG_ON_LOCKED(c)	TRACE_BUG_ON_LOCKED(c)
+#else
+# define SMP_TRACE_BUG_ON_LOCKED(c)	do { } while (0)
+#endif
+
+# define trace_local_irq_disable(ti)		raw_local_irq_disable()
+# define trace_local_irq_enable(ti)		raw_local_irq_enable()
+# define trace_local_irq_restore(flags, ti)	raw_local_irq_restore(flags)
+
+#else /* !CONFIG_DEBUG_DEADLOCKS */
+
+# define trace_lock_irq(lock, ti)		do { raw_local_irq_disable(); (void)(ti); } while (0)
+# define trace_lock_irqsave(lock, flags, ti)	do { raw_local_irq_save(flags); (void)(ti); } while (0)
+# define trace_unlock_irq(lock, ti)		raw_local_irq_enable()
+# define trace_unlock_irqrestore(lock, flags, ti)	\
+	do {						\
+		(void)(ti);				\
+		raw_local_irq_restore(flags);		\
+		preempt_check_resched();		\
+	} while (0)
+# define trace_local_irq_disable(ti)		raw_local_irq_disable()
+# define trace_local_irq_enable(ti)		raw_local_irq_enable()
+# define trace_local_irq_restore(flags, ti)	raw_local_irq_restore(flags)
+
+# define trace_unlock(lock, ti)			do { } while (0)
+
+# define TRACE_WARN_ON(x)			do { } while (0)
+# define TRACE_BUG_ON(x)			do { } while (0)
+# define TRACE_BUG_LOCKED()			do { } while (0)
+# define TRACE_WARN_ON_LOCKED(c)		do { } while (0)
+# define TRACE_OFF()				do { } while (0)
+# define TRACE_BUG_ON_LOCKED(c)			do { } while (0)
+# define SMP_TRACE_BUG_ON_LOCKED(c)		do { } while (0)
+
+#endif /* CONFIG_DEBUG_DEADLOCKS */
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Unlock these on crash:
+ */
+void zap_rt_locks(void)
+{
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	spin_lock_init(&trace_lock);
+#endif
+}
+#endif
+
+#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPT_RT)
+int check_locking_preempt_off(struct task_struct *p)
+{
+	int slot = 0;
+	/* Return 1 if any of p's owned_lock[] entries has was_preempt_off set. */
+	while (slot < p->lock_count)
+		if (p->owned_lock[slot++]->was_preempt_off)
+			return 1;
+	return 0;
+}
+
+void check_preempt_wakeup(struct task_struct * p)
+{
+	/*
+	 * Possible PREEMPT_RT race scenario:
+	 * wake_up_process() is usually called with
+	 * preemption off, but PREEMPT_RT enables
+	 * it. If the task is dependent on preventing
+	 * context switches either with spinlocks
+	 * or rcu locks, then this could result in
+	 * hangs and race conditions.
+	 */
+	if (!preempt_count() &&
+		!__raw_irqs_disabled() &&
+		p->prio < current->prio &&
+		rt_task(p) &&
+		(current->rcu_read_lock_nesting != 0 ||
+				check_locking_preempt_off(current))) {
+
+			printk("BUG: %s/%d, possible wake_up race on %s/%d\n",
+				current->comm, current->pid, p->comm, p->pid);
+			dump_stack();
+		}
+}
+
+static inline void
+account_mutex_owner_down(struct task_struct *task, struct rt_mutex *lock)
+{
+	/* Push lock onto the task's owned_lock[] stack if there is room. */
+	if (task->lock_count < MAX_LOCK_STACK) {
+		task->owned_lock[task->lock_count++] = lock;
+		return;
+	}
+	TRACE_OFF();
+	printk("BUG: %s/%d: lock count overflow!\n",
+		task->comm, task->pid);
+	dump_stack();
+}
+
+static inline void
+account_mutex_owner_up(struct task_struct *task)
+{
+	/* Pop (and clear) the top entry of the task's owned_lock[] stack. */
+	if (task->lock_count) {
+		task->owned_lock[--task->lock_count] = NULL;
+		return;
+	}
+	TRACE_OFF();
+	printk("BUG: %s/%d: lock count underflow!\n",
+		task->comm, task->pid);
+	dump_stack();
+}
+
+#else
+# define account_mutex_owner_down(task, lock)	do { } while(0)
+# define account_mutex_owner_up(task)		do { } while(0)
+#endif
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+
+static void printk_task(struct task_struct *p)
+{
+	if (p)
+		printk("%16s:%5d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_ti(struct thread_info *ti)
+{
+	if (ti)
+		printk_task(ti->task);
+	else
+		printk("<none>");
+}
+
+static void printk_task_short(struct task_struct *p)
+{
+	if (p)
+		printk("%s/%d [%p, %3d]", p->comm, p->pid, p, p->prio);
+	else
+		printk("<none>");
+}
+
+static void printk_lock(struct rt_mutex *lock, int print_owner)
+{
+	if (lock->name)
+		printk(" [%p] {%s}\n",
+			lock, lock->name);
+	else
+		printk(" [%p] {%s:%d}\n",
+			lock, lock->file, lock->line);
+
+	if (print_owner && lock_owner(lock)) {
+		printk(".. held by:  ");
+		printk_ti(lock_owner(lock));
+		printk("\n");
+	}
+	if (lock_owner(lock)) {
+		printk("... acquired at:               ");
+		print_symbol("%s\n", lock->acquire_eip);
+	}
+}
+
+static void printk_waiter(struct rt_mutex_waiter *w)
+{
+	printk("-------------------------\n");
+	printk("| waiter struct %p:\n", w);
+	printk("| w->list: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", w->list.dp_node.prev, w->list.dp_node.next, w->list.sp_node.prev, w->list.sp_node.next, w->list.prio);
+	printk("| w->pi_list: [DP:%p/%p|SP:%p/%p|PRI:%d]\n", w->pi_list.dp_node.prev, w->pi_list.dp_node.next, w->pi_list.sp_node.prev, w->pi_list.sp_node.next, w->pi_list.prio);
+	printk("\n| lock:\n");
+	printk_lock(w->lock, 1);
+	printk("| w->ti->task:\n");
+	printk_ti(w->ti);
+	printk("| blocked at:  ");
+	print_symbol("%s\n", w->eip);
+	printk("-------------------------\n");
+}
+
+static void show_task_locks(struct task_struct *p)
+{
+	switch (p->state) {
+	case TASK_RUNNING:		printk("R"); break;
+	case TASK_RUNNING_MUTEX:	printk("M"); break;
+	case TASK_INTERRUPTIBLE:	printk("S"); break;
+	case TASK_UNINTERRUPTIBLE:	printk("D"); break;
+	case TASK_STOPPED:		printk("T"); break;
+	case EXIT_ZOMBIE:		printk("Z"); break;
+	case EXIT_DEAD:			printk("X"); break;
+	default:			printk("?"); break;
+	}
+	printk_task(p);
+	if (p->blocked_on) {
+		struct rt_mutex *lock = p->blocked_on->lock;
+
+		printk(" blocked on:");
+		printk_lock(lock, 1);
+	} else
+		printk(" (not blocked)\n");
+}
+
+/* Print all held locks, or only those owned by filter when it is non-NULL. */
+void show_held_locks(struct task_struct *filter)
+{
+	struct thread_info *ti = current_thread_info();
+	struct list_head *curr, *cursor = NULL;
+	struct rt_mutex *lock;
+	unsigned long flags;
+	int count = 0;
+
+	if (filter) {
+		printk("------------------------------\n");
+		printk("| showing all locks held by: |  (");
+		printk_task_short(filter);
+		printk("):\n");
+		printk("------------------------------\n");
+	} else {
+		printk("---------------------------\n");
+		printk("| showing all locks held: |\n");
+		printk("---------------------------\n");
+	}
+
+	/*
+	 * Play safe and acquire the global trace lock. We cannot printk
+	 * with that lock held, so we drop it for each lock printed and
+	 * rescan from the list head, skipping up to the saved cursor:
+	 */
+next:
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	list_for_each(curr, &held_locks) {
+		if (cursor && curr != cursor)
+			continue;
+		/* Resume point reached: stop skipping even if it is filtered out. */
+		cursor = NULL;
+		lock = list_entry(curr, struct rt_mutex, held_list);
+		if (filter && (lock_owner(lock) != filter->thread_info))
+			continue;
+		count++;
+		cursor = curr->next;
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+		printk("\n#%03d:            ", count);
+		printk_lock(lock, filter ? 0 : 1);
+		goto next;
+	}
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+	printk("\n");
+}
+
+void show_all_locks(void)
+{
+	struct task_struct *g, *p;
+	int count = 10;
+	int unlock = 1;
+
+	printk("\nshowing all tasks:\n");
+
+	/*
+	 * Here we try to get the tasklist_lock as hard as possible,
+	 * if not successful after 2 seconds we ignore it (but keep
+	 * trying). This is to enable a debug printout even if a
+	 * tasklist_lock-holding task deadlocks or crashes.
+	 */
+retry:
+	if (!read_trylock(&tasklist_lock)) {
+		if (count == 10)
+			printk("hm, tasklist_lock locked, retrying... ");
+		if (count) {
+			count--;
+			printk(" #%d", 10-count);
+			mdelay(200);
+			goto retry;
+		}
+		printk(" ignoring it.\n");
+		unlock = 0;
+	}
+	if (count != 10)
+		printk(" locked it.\n");
+
+	do_each_thread(g, p) {
+		show_task_locks(p);
+		if (!unlock)
+			if (read_trylock(&tasklist_lock))
+				unlock = 1;
+	} while_each_thread(g, p);
+
+	printk("\n");
+	show_held_locks(NULL);
+	printk("=============================================\n\n");
+
+	if (unlock)
+		read_unlock(&tasklist_lock);
+}
+
+static int check_deadlock(struct rt_mutex *lock, int depth,
+			  struct thread_info *ti, unsigned long eip)
+{
+	struct rt_mutex *lockblk;
+	struct task_struct *task;
+
+	if (!trace_on)
+		return 0;
+	/*
+	 * Special-case: the BKL self-releases at schedule()
+	 * time so it can never deadlock:
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	if (lock == &kernel_sem.lock)
+		return 0;
+#endif
+	ti = lock_owner(lock);
+	if (!ti)
+		return 0;
+	task = ti->task;
+	lockblk = NULL;
+	if (task->blocked_on)
+		lockblk = task->blocked_on->lock;
+	if (current == task) {
+		TRACE_OFF();
+		if (depth)
+			return 1;
+		printk("\n==========================================\n");
+		printk(  "[ BUG: lock recursion deadlock detected! |\n");
+		printk(  "------------------------------------------\n");
+		printk("already locked: ");
+		printk_lock(lock, 1);
+		show_held_locks(task);
+		printk("-{current task's backtrace}----------------->\n");
+		dump_stack();
+		show_all_locks();
+		printk("[ turning off deadlock detection. Please report this trace. ]\n\n");
+		trace_local_irq_disable(ti);
+		return 0;
+	}
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Skip the BKL:
+	 */
+	if (lockblk == &kernel_sem.lock)
+		return 0;
+#endif
+	/*
+	 * Ugh, something corrupted the lock data structure?
+	 */
+	if (depth > 20) {
+		TRACE_OFF();
+		printk("\n===========================================\n");
+		printk(  "[ BUG: infinite lock dependency detected!? |\n");
+		printk(  "-------------------------------------------\n");
+		goto print_it;
+	}
+	barrier();
+	if (lockblk && check_deadlock(lockblk, depth+1, ti, eip)) {
+		printk("\n============================================\n");
+		printk(  "[ BUG: circular locking deadlock detected! ]\n");
+		printk(  "--------------------------------------------\n");
+print_it:
+		printk("%s/%d is deadlocking current task %s/%d\n\n",
+			task->comm, task->pid, current->comm, current->pid);
+		printk("\n1) %s/%d is trying to acquire this lock:\n",
+			current->comm, current->pid);
+		printk_lock(lock, 1);
+
+		printk("... trying at:                 ");
+		print_symbol("%s\n", eip);
+
+		printk("\n2) %s/%d is blocked on this lock:\n",
+			task->comm, task->pid);
+		printk_lock(lockblk, 1);
+
+		show_held_locks(current);
+		show_held_locks(task);
+
+		printk("\n%s/%d's [blocked] stackdump:\n\n", task->comm, task->pid);
+		show_stack(task, NULL);
+		printk("\n%s/%d's [current] stackdump:\n\n",
+			current->comm, current->pid);
+		dump_stack();
+		show_all_locks();
+		printk("[ turning off deadlock detection. Please report this trace. ]\n\n");
+		trace_local_irq_disable(ti);
+		return 0;
+	}
+	return 0;
+}
+
+/* Exit-time check: report locks (unlinked here) and PI waiters left on task. */
+void check_no_held_locks(struct task_struct *task)
+{
+	struct thread_info *ti = task->thread_info;
+	struct list_head *curr, *next, *cursor = NULL;
+	struct plist *curr1;
+	struct rt_mutex *lock;
+	struct rt_mutex_waiter *w;
+	unsigned long flags;
+
+	if (!trace_on)
+		return;
+#ifdef CONFIG_DEBUG_PREEMPT
+	if (task->lock_count) {
+		static int once = 1;
+		if (once) {
+			once = 0;
+			printk("BUG: nonzero lock count %d at exit time?\n",
+				task->lock_count);
+			printk_task(task);
+			printk("\n");
+			dump_stack();
+		}
+	}
+#endif
+	if (!rt_prio(task->normal_prio) && rt_prio(task->prio)) {
+		printk("BUG: PI priority boost leaked!\n");
+		printk_task(task);
+		printk("\n");
+	}
+restart:
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	list_for_each_safe(curr, next, &held_locks) {
+		if (cursor && curr != cursor)
+			continue;
+		/* Resume point reached: stop skipping even if it is not ours. */
+		cursor = NULL;
+		lock = list_entry(curr, struct rt_mutex, held_list);
+		if (lock_owner(lock) != task->thread_info)
+			continue;
+		cursor = next;
+		list_del_init(curr);
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+		if (lock == &kernel_sem.lock) {
+			printk("BUG: %s/%d, BKL held at task exit time!\n",
+				task->comm, task->pid);
+			printk("BKL acquired at: ");
+			print_symbol("%s\n",
+				(unsigned long) task->last_kernel_lock);
+		} else
+			printk("BUG: %s/%d, lock held at task exit time!\n",
+				task->comm, task->pid);
+		printk_lock(lock, 1);
+		if (lock_owner(lock) != task->thread_info)
+			printk("exiting task is not even the owner??\n");
+		goto restart;
+	}
+	_raw_spin_lock(&task->pi_lock);
+	plist_for_each(curr1, &task->pi_waiters) {
+		w = plist_entry(curr1, struct rt_mutex_waiter, pi_list);
+		TRACE_OFF();
+		_raw_spin_unlock(&task->pi_lock);
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+		printk("hm, PI interest held at exit time? Task:\n");
+		printk_task(task);
+		printk_waiter(w);
+		return;
+	}
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+}
+
+/* Report (and unlink) held locks located in [from, to); returns 1 if any. */
+int check_no_locks_freed(const void *from, const void *to)
+{
+	struct thread_info *ti = current_thread_info();
+	struct list_head *curr, *next, *cursor = NULL;
+	struct rt_mutex *lock;
+	unsigned long flags;
+	int err = 0;
+
+	if (!trace_on)
+		return err;
+restart:
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	list_for_each_safe(curr, next, &held_locks) {
+		if (cursor && curr != cursor)
+			continue;
+		/* Resume point reached: stop skipping even if out of range. */
+		cursor = NULL;
+		lock = list_entry(curr, struct rt_mutex, held_list);
+		if ((const void *)lock < from || (const void *)lock >= to)
+			continue;
+		cursor = next;
+		list_del_init(curr);
+		TRACE_OFF();
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+		err = 1;
+		printk("BUG: %s/%d, active lock [%p(%p-%p)] freed!\n",
+			current->comm, current->pid, lock, from, to);
+		dump_stack();
+		printk_lock(lock, 1);
+		if (lock_owner(lock) != current_thread_info())
+			printk("freeing task is not even the owner??\n");
+		goto restart;
+	}
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+	return err;
+}
+
+#endif
+
+#if ALL_TASKS_PI && defined(CONFIG_DEBUG_DEADLOCKS)
+
+static void
+check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter,
+		      struct thread_info *old_owner)
+{
+	struct rt_mutex_waiter *w;
+	struct plist *curr1;
+
+	_raw_spin_lock(&old_owner->task->pi_lock);
+	TRACE_WARN_ON_LOCKED(plist_empty(&waiter->pi_list));
+
+	plist_for_each(curr1, &old_owner->task->pi_waiters) {
+		w = plist_entry(curr1, struct rt_mutex_waiter, pi_list);
+		if (w == waiter)
+			goto ok;
+	}
+	TRACE_WARN_ON_LOCKED(1);
+ok:
+	_raw_spin_unlock(&old_owner->task->pi_lock);
+	return;
+}
+
+static void
+check_pi_list_empty(struct rt_mutex *lock, struct thread_info *old_owner)
+{
+	struct rt_mutex_waiter *w;
+	struct plist *curr1;
+
+	_raw_spin_lock(&old_owner->task->pi_lock);
+	plist_for_each(curr1, &old_owner->task->pi_waiters) {
+		w = plist_entry(curr1, struct rt_mutex_waiter, pi_list);
+		if (w->lock == lock) {
+			TRACE_OFF();
+			printk("hm, PI interest but no waiter? Old owner:\n");
+			printk_waiter(w);
+			printk("\n");
+			TRACE_WARN_ON(1);
+			break;
+		}
+	}
+	_raw_spin_unlock(&old_owner->task->pi_lock);
+}
+
+#else
+
+static inline void
+check_pi_list_present(struct rt_mutex *lock, struct rt_mutex_waiter *waiter,
+		      struct thread_info *old_owner)
+{
+}
+
+static inline void
+check_pi_list_empty(struct rt_mutex *lock, struct thread_info *old_owner)
+{
+}
+
+#endif
+
+/*
+ * Move PI waiters of this lock to the new owner:
+ */
+static void
+change_owner(struct rt_mutex *lock, struct thread_info *old_owner,
+	     struct thread_info *new_owner)
+{
+	struct plist *next1, *curr1;
+	struct rt_mutex_waiter *w;
+	int requeued = 0, sum = 0;
+
+	if (old_owner == new_owner)
+		return;
+
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&old_owner->task->pi_lock));
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&new_owner->task->pi_lock));
+	plist_for_each_safe(curr1, next1, &old_owner->task->pi_waiters) {
+		w = plist_entry(curr1, struct rt_mutex_waiter, pi_list);
+		if (w->lock == lock) {
+			trace_special_pid(w->ti->task->pid, w->ti->task->prio, w->ti->task->normal_prio);
+			plist_del(&w->pi_list, &old_owner->task->pi_waiters);
+			plist_init(&w->pi_list, w->ti->task->prio);
+			plist_add(&w->pi_list, &new_owner->task->pi_waiters);
+			requeued++;
+		}
+		sum++;
+	}
+	trace_special(sum, requeued, 0);
+}
+
+int pi_walk, pi_null, pi_prio, pi_initialized;
+
+/*
+ * Set task's priority to prio and walk the chain of lock owners that
+ * task is (transitively) blocked on, requeueing and re-prioritizing them.
+ * The lock->wait_lock and p->pi_lock must be held.
+ */
+static void pi_setprio(struct rt_mutex *lock, struct task_struct *task, int prio)
+{
+	struct rt_mutex *l = lock;
+	struct task_struct *p = task;
+	/* We don't want to release the parameters' locks. */
+
+	if (unlikely(!p->pid)) {
+		pi_null++;
+		return;
+	}
+
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&lock->wait_lock));
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&p->pi_lock));
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	pi_prio++;
+	if (p->policy != SCHED_NORMAL && prio > normal_prio(p)) {
+		TRACE_OFF();
+
+		printk("huh? (%d->%d??)\n", p->prio, prio);
+		printk("owner:\n");
+		printk_task(p);
+		printk("\ncurrent:\n");
+		printk_task(current);
+		printk("\nlock:\n");
+		printk_lock(lock, 1);
+		dump_stack();
+		trace_local_irq_disable(ti);
+	}
+#endif
+	/*
+	 * If the task is blocked on some other task then boost that
+	 * other task (or tasks) too:
+	 */
+	for (;;) {
+		struct rt_mutex_waiter *w = p->blocked_on;
+#ifdef CONFIG_DEBUG_DEADLOCKS
+		int was_rt = rt_task(p);
+#endif
+
+		mutex_setprio(p, prio);
+
+		/*
+		 * The BKL can really be a pain. It can happen where the
+		 * BKL is being held by one task that is just about to
+		 * block on another task that is waiting for the BKL.
+		 * This isn't a deadlock, since the BKL is released
+		 * when the task goes to sleep.  This also means that
+		 * all holders of the BKL are not blocked, or are just
+		 * about to be blocked.
+		 *
+		 * Another side-effect of this is that there's a small
+		 * window where the spinlocks are not held, and the blocked
+		 * process hasn't released the BKL.  So if we are going
+		 * to boost the owner of the BKL, stop after that,
+		 * since that owner is either running, or about to sleep
+		 * but don't go any further or we are in a loop.
+		 */
+		if (!w || unlikely(p->lock_depth >= 0))
+			break;
+		/*
+		 * If the task is blocked on a lock, and we just made
+		 * it RT, then register the task in the PI list and
+		 * requeue it to the wait list:
+		 */
+
+		/*
+		 * Don't unlock the original lock->wait_lock
+		 */
+		if (l != lock)
+			_raw_spin_unlock(&l->wait_lock);
+		l = w->lock;
+		TRACE_BUG_ON_LOCKED(!l);
+
+#ifdef CONFIG_PREEMPT_RT
+		/*
+		 * The current task that is blocking can also be the one
+		 * holding the BKL, and blocking on a task that wants
+		 * it.  So if it were to get this far, we would deadlock.
+		 */
+		if (unlikely(l == &kernel_sem.lock) && lock_owner(l) == current_thread_info()) {
+			/*
+			 * Only the original wait_lock is held here, so fool the unlocking
+			 * code into thinking the last lock was the original.
+			 */
+			l = lock;
+			break;
+		}
+#endif
+
+		if (l != lock)
+			_raw_spin_lock(&l->wait_lock);
+
+		TRACE_BUG_ON_LOCKED(!lock_owner(l));
+
+		if (!plist_empty(&w->pi_list)) {
+			TRACE_BUG_ON_LOCKED(!was_rt && !ALL_TASKS_PI && !rt_task(p));
+			/*
+			 * If the task is blocked on a lock, and we just restored
+			 * it from RT to non-RT then unregister the task from
+			 * the PI list and requeue it to the wait list.
+			 *
+			 * (TODO: this can be unfair to SCHED_NORMAL tasks if they
+			 *        get PI handled.)
+			 */
+			plist_del(&w->pi_list, &lock_owner(l)->task->pi_waiters);
+		} else
+			TRACE_BUG_ON_LOCKED((ALL_TASKS_PI || rt_task(p)) && was_rt);
+
+		if (ALL_TASKS_PI || rt_task(p)) {
+			plist_init(&w->pi_list,prio);
+			plist_add(&w->pi_list, &lock_owner(l)->task->pi_waiters);
+		}
+
+		plist_del(&w->list, &l->wait_list);
+		plist_init(&w->list, prio);
+		plist_add(&w->list, &l->wait_list);
+
+		pi_walk++;
+
+		if (p != task)
+			_raw_spin_unlock(&p->pi_lock);
+
+		p = lock_owner(l)->task;
+		TRACE_BUG_ON_LOCKED(!p);
+		_raw_spin_lock(&p->pi_lock);
+		/*
+		 * If the dependee is already higher-prio then
+		 * no need to boost it, and all further tasks down
+		 * the dependency chain are already boosted:
+		 */
+		if (p->prio <= prio)
+			break;
+	}
+	if (l != lock)
+		_raw_spin_unlock(&l->wait_lock);
+	if (p != task)
+		_raw_spin_unlock(&p->pi_lock);
+}
+
+/*
+ * Change priority of a task pi aware
+ *
+ * There are several aspects to consider:
+ * - task is priority boosted
+ * - task is blocked on a mutex
+ *
+ */
+void pi_changeprio(struct task_struct *p, int prio)
+{
+	unsigned long flags;
+	int oldprio;
+
+	spin_lock_irqsave(&p->pi_lock,flags);
+	if (p->blocked_on)
+		spin_lock(&p->blocked_on->lock->wait_lock);
+
+	oldprio = p->normal_prio;
+	if (oldprio == prio)
+		goto out;
+
+	/* Set normal prio in any case */
+	p->normal_prio = prio;
+
+	/* Check, if we can safely lower the priority */
+	if (prio > p->prio && !plist_empty(&p->pi_waiters)) {
+		struct rt_mutex_waiter *w;
+		w = plist_first_entry(&p->pi_waiters,
+				      struct rt_mutex_waiter, pi_list);
+		if (w->ti->task->prio < prio)
+			prio = w->ti->task->prio;
+	}
+
+	if (prio == p->prio)
+		goto out;
+
+	/* Is task blocked on a mutex ? */
+	if (p->blocked_on)
+		pi_setprio(p->blocked_on->lock, p, prio);
+	else
+		mutex_setprio(p, prio);
+ out:
+	if (p->blocked_on)
+		spin_unlock(&p->blocked_on->lock->wait_lock);
+
+	spin_unlock_irqrestore(&p->pi_lock, flags);
+
+}
+
+/*
+ * This is called with both the waiter->task->pi_lock and
+ * lock->wait_lock held.
+ */
+static void
+task_blocks_on_lock(struct rt_mutex_waiter *waiter, struct thread_info *ti,
+		    struct rt_mutex *lock __EIP_DECL__)
+{
+	struct task_struct *task = ti->task;
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	check_deadlock(lock, 0, ti, eip);
+	/* mark the current thread as blocked on the lock */
+	waiter->eip = eip;
+#endif
+	task->blocked_on = waiter;
+	waiter->lock = lock;
+	waiter->ti = ti;
+	plist_init(&waiter->pi_list, task->prio);
+	/*
+	 * Add SCHED_NORMAL tasks to the end of the waitqueue (FIFO):
+	 */
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&task->pi_lock));
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&lock->wait_lock));
+#if !ALL_TASKS_PI
+	if ((!rt_task(task) &&
+		!(lock->mutex_attr & FUTEX_ATTR_PRIORITY_INHERITANCE))) {
+		plist_add(&waiter->list, &lock->wait_list);
+		set_lock_owner_pending(lock);
+		return;
+	}
+#endif
+	_raw_spin_lock(&lock_owner(lock)->task->pi_lock);
+	plist_add(&waiter->pi_list, &lock_owner(lock)->task->pi_waiters);
+	/*
+	 * Add RT tasks to the head:
+	 */
+	plist_add(&waiter->list, &lock->wait_list);
+	set_lock_owner_pending(lock);
+	/*
+	 * If the waiter has higher priority than the owner
+	 * then temporarily boost the owner:
+	 */
+	if (task->prio < lock_owner(lock)->task->prio)
+		pi_setprio(lock, lock_owner(lock)->task, task->prio);
+	_raw_spin_unlock(&lock_owner(lock)->task->pi_lock);
+}
+
+/*
+ * initialise the lock:
+ */
+static void __init_rt_mutex(struct rt_mutex *lock, int save_state,
+				char *name, char *file, int line)
+{
+	lock->owner = NULL;
+	spin_lock_init(&lock->wait_lock);
+	preempt_disable();
+	plist_init(&lock->wait_list, MAX_PRIO);
+	preempt_enable();
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	lock->save_state = save_state;
+	INIT_LIST_HEAD(&lock->held_list);
+	lock->name = name;
+	lock->file = file;
+	lock->line = line;
+#endif
+#ifdef CONFIG_DEBUG_PREEMPT
+	lock->was_preempt_off = 0;
+#endif
+}
+
+#ifdef CONFIG_PREEMPT_RT
+void fastcall __init_rwsem(struct rw_semaphore *rwsem, int save_state,
+			char *name, char *file, int line)
+{
+	__init_rt_mutex(&rwsem->lock, save_state, name, file, line);
+	rwsem->read_depth = 0;
+}
+EXPORT_SYMBOL(__init_rwsem);
+#endif
+
+/*
+ * Call with old_owner/new_owner pi_locks and lock->wait_lock held.
+ * NOTE(review): new_owner is NULL-checked once, yet ->task is used later.
+ */
+static inline
+void set_new_owner(struct rt_mutex *lock, struct thread_info *old_owner,
+			struct thread_info *new_owner __EIP_DECL__)
+{
+	if (new_owner)
+		trace_special_pid(new_owner->task->pid, new_owner->task->prio, 0);
+	if (unlikely(old_owner))
+		change_owner(lock, old_owner, new_owner);
+	lock->owner = new_owner;
+	if (!plist_empty(&lock->wait_list))
+		set_lock_owner_pending(lock);
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	TRACE_WARN_ON_LOCKED(!list_empty(&lock->held_list));
+	list_add_tail(&lock->held_list, &held_locks);
+	lock->acquire_eip = eip;
+#endif
+	account_mutex_owner_down(new_owner->task, lock);
+}
+
+/*
+ * handle the lock release when processes blocked on it that can now run
+ * - the spinlock must be held by the caller
+ *
+ * The lock->wait_lock must be held, and the lock's owner->pi_lock must not.
+ */
+static struct thread_info *
+pick_new_owner(struct rt_mutex *lock, struct thread_info *old_owner,
+	       int save_state __EIP_DECL__)
+{
+	struct rt_mutex_waiter *waiter = NULL;
+	struct thread_info *new_owner;
+
+	SMP_TRACE_BUG_ON_LOCKED(!spin_is_locked(&lock->wait_lock));
+	/*
+	 * Get the highest prio one:
+	 *
+	 * (same-prio RT tasks go FIFO)
+	 */
+	waiter = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list);
+
+#ifdef CONFIG_SMP
+ try_again:
+#endif
+	trace_special_pid(waiter->ti->task->pid, waiter->ti->task->prio, 0);
+
+#if ALL_TASKS_PI
+	check_pi_list_present(lock, waiter, old_owner);
+#endif
+	new_owner = waiter->ti;
+	/*
+	 * The new owner is still blocked on this lock, so we
+	 * must release the lock->wait_lock before grabing
+	 * the new_owner lock.
+	 */
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_lock(&new_owner->task->pi_lock);
+	_raw_spin_lock(&lock->wait_lock);
+	/*
+	 * In this split second of releasing the lock, a high priority
+	 * process could have come along and blocked as well.
+	 */
+#ifdef CONFIG_SMP
+	waiter = plist_first_entry(&lock->wait_list, struct rt_mutex_waiter, list);
+	if (unlikely(waiter->ti != new_owner)) {
+		_raw_spin_unlock(&new_owner->task->pi_lock);
+		goto try_again;
+	}
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Once again the BKL comes to play.  Since the BKL can be grabbed and released
+	 * out of the normal P1->L1->P2 order, there's a chance that someone has the
+	 * BKL owner's lock and is waiting on the new owner lock.
+	 */
+	if (unlikely(lock == &kernel_sem.lock)) {
+		if (!_raw_spin_trylock(&old_owner->task->pi_lock)) {
+			_raw_spin_unlock(&new_owner->task->pi_lock);
+			goto try_again;
+		}
+	} else
+#endif
+#endif
+		_raw_spin_lock(&old_owner->task->pi_lock);
+
+	plist_del_init(&waiter->list, &lock->wait_list);
+	plist_del(&waiter->pi_list, &old_owner->task->pi_waiters);
+	plist_init(&waiter->pi_list, waiter->ti->task->prio);
+
+	set_new_owner(lock, old_owner, new_owner __W_EIP__(waiter));
+	/* Don't touch waiter after ->ti has been NULLed */
+	mb();
+	waiter->ti = NULL;
+	new_owner->task->blocked_on = NULL;
+	TRACE_WARN_ON(save_state != lock->save_state);
+
+	_raw_spin_unlock(&old_owner->task->pi_lock);
+	_raw_spin_unlock(&new_owner->task->pi_lock);
+
+	return new_owner;
+}
+
+/*
+ * TODO: on SMP we still have to initialize the wait_list runtime,
+ *       due to percpu.data.
+ */
+static inline void init_lists(struct rt_mutex *lock)
+{
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_DEADLOCKS)
+	// we have to do this until the static initializers get fixed:
+	if (unlikely(!lock->wait_list.dp_node.prev)) {
+		plist_init(&lock->wait_list, MAX_PRIO);
+#ifdef CONFIG_DEBUG_DEADLOCKS
+		pi_initialized++;
+#endif
+#ifndef CONFIG_SMP
+		{
+			static int once = 1;
+			if (once) {
+				once = 0;
+				printk("BUG: lock wait_list not initialized?\n");
+				printk_lock(lock, 1);
+				WARN_ON(1);
+			}
+		}
+#endif
+	}
+#endif
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	if (!lock->held_list.prev && !lock->held_list.next)
+		INIT_LIST_HEAD(&lock->held_list);
+#endif
+}
+
+/*
+ * Try to grab a lock, and if it is owned but the owner
+ * hasn't woken up yet, see if we can steal it.
+ *
+ * Return: 1 if task can grab lock.
+ *         0 if not.
+ */
+static int __grab_lock(struct rt_mutex *lock, struct task_struct *task, struct task_struct *owner)
+{
+#ifndef CAPTURE_LOCK
+	return 0;
+#endif
+	/*
+	 * The lock is owned, but now test to see if the owner
+	 * is still sleeping and hasn't woken up to get the lock.
+	 */
+
+	TRACE_BUG_ON_LOCKED(!owner);
+
+	/* The owner is pending on a lock, but is it this lock? */
+	if (owner->pending_owner != lock)
+		return 0;
+
+	/*
+	 * There's an owner, but it hasn't woken up to take the lock yet.
+	 * See if we should steal it from him.
+	 */
+	if (task->prio > owner->prio)
+		return 0;
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * The BKL is a PITA. Don't ever steal it
+	 */
+	if (lock == &kernel_sem.lock)
+		return 0;
+#endif
+	/*
+	 * This task has at least the priority of the current pending
+	 * owner, so we may steal it.
+	 */
+	owner->rt_flags &= ~RT_PENDOWNER;
+	owner->pending_owner = NULL;
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	/*
+	 * This task will be taking the ownership away, and
+	 * when it does, the lock can't be on the held list.
+	 */
+	TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list));
+	list_del_init(&lock->held_list);
+#endif
+	account_mutex_owner_up(owner);
+
+	return 1;
+}
+
+/*
+ * Bring a task from pending ownership to owning a lock.
+ *
+ * Return 0 if we secured it, otherwise non-zero if it was
+ * stolen.
+ */
+static int
+capture_lock(struct rt_mutex_waiter *waiter, struct thread_info *ti,
+	     struct task_struct *task)
+{
+	struct rt_mutex *lock = waiter->lock;
+	struct thread_info *old_owner;
+	unsigned long flags;
+	int ret = 0;
+
+#ifndef CAPTURE_LOCK
+	return 0;
+#endif
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * The BKL is special, we always get it.
+	 */
+	if (lock == &kernel_sem.lock)
+		return 0;
+#endif
+
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	/*
+	 * We are no longer blocked on the lock, so we are considered an
+	 * owner. So we must grab the lock->wait_lock first.
+	 */
+	_raw_spin_lock(&lock->wait_lock);
+	_raw_spin_lock(&task->pi_lock);
+
+	if (!(task->rt_flags & RT_PENDOWNER)) {
+		/*
+		 * Someone else stole it.
+		 */
+		old_owner = lock_owner(lock);
+		TRACE_BUG_ON_LOCKED(old_owner == ti);
+		if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
+			/* we got it back! */
+			if (old_owner) {
+				_raw_spin_lock(&old_owner->task->pi_lock);
+				set_new_owner(lock, old_owner, ti __W_EIP__(waiter));
+				_raw_spin_unlock(&old_owner->task->pi_lock);
+			} else
+				set_new_owner(lock, old_owner, ti __W_EIP__(waiter));
+			ret = 0;
+		} else {
+			/* Add ourselves back to the list */
+			TRACE_BUG_ON_LOCKED(!plist_empty(&waiter->list));
+			plist_init(&waiter->list, task->prio);
+			task_blocks_on_lock(waiter, ti, lock __W_EIP__(waiter));
+			ret = 1;
+		}
+	} else {
+		task->rt_flags &= ~RT_PENDOWNER;
+		task->pending_owner = NULL;
+	}
+
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+	return ret;
+}
+
+static inline void INIT_WAITER(struct rt_mutex_waiter *waiter)
+{
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	memset(waiter, 0x11, sizeof(*waiter));
+	plist_init(&waiter->list, MAX_PRIO);
+	plist_init(&waiter->pi_list, MAX_PRIO);
+#endif
+}
+
+static inline void FREE_WAITER(struct rt_mutex_waiter *waiter)
+{
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	TRACE_WARN_ON(!plist_empty(&waiter->list));
+	TRACE_WARN_ON(!plist_empty(&waiter->pi_list));
+	TRACE_WARN_ON(current->blocked_on);
+	memset(waiter, 0x22, sizeof(*waiter));
+#endif
+}
+
+/*
+ * lock it semaphore-style: no worries about missed wakeups.
+ */
+static inline void
+____down(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info(), *old_owner;
+	struct task_struct *task = ti->task;
+	unsigned long flags, nosched_flag;
+	struct rt_mutex_waiter waiter;
+
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	_raw_spin_lock(&task->pi_lock);
+	_raw_spin_lock(&lock->wait_lock);
+	INIT_WAITER(&waiter);
+
+	old_owner = lock_owner(lock);
+	init_lists(lock);
+
+	if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
+		/* granted */
+		TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
+		if (old_owner) {
+			_raw_spin_lock(&old_owner->task->pi_lock);
+			set_new_owner(lock, old_owner, ti __EIP__);
+			_raw_spin_unlock(&old_owner->task->pi_lock);
+		} else
+			set_new_owner(lock, old_owner, ti __EIP__);
+		_raw_spin_unlock(&lock->wait_lock);
+		_raw_spin_unlock(&task->pi_lock);
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+		FREE_WAITER(&waiter);
+		return;
+	}
+
+	set_task_state(task, TASK_UNINTERRUPTIBLE);
+
+	plist_init(&waiter.list, task->prio);
+	task_blocks_on_lock(&waiter, ti, lock __EIP__);
+
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	/* we don't need to touch the lock struct anymore */
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+	might_sleep();
+
+	nosched_flag = current->flags & PF_NOSCHED;
+	current->flags &= ~PF_NOSCHED;
+
+wait_again:
+	/* wait to be given the lock */
+	for (;;) {
+		if (!waiter.ti)
+			break;
+		schedule();
+		set_task_state(task, TASK_UNINTERRUPTIBLE);
+	}
+	/*
+	 * Check to see if we didn't have ownership stolen.
+	 */
+	if (capture_lock(&waiter, ti, task)) {
+		set_task_state(task, TASK_UNINTERRUPTIBLE);
+		goto wait_again;
+	}
+
+	current->flags |= nosched_flag;
+	task->state = TASK_RUNNING;
+	FREE_WAITER(&waiter);
+}
+
+/*
+ * lock it mutex-style: this variant is very careful not to
+ * miss any non-mutex wakeups.
+ *
+ * The wakeup side uses wake_up_process_mutex, which, combined with
+ * the xchg code of this function is a transparent sleep/wakeup
+ * mechanism nested within any existing sleep/wakeup mechanism. This
+ * enables the seamless use of arbitrary (blocking) spinlocks within
+ * sleep/wakeup event loops.
+ */
+static inline void
+____down_mutex(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info(), *old_owner;
+	unsigned long state, saved_state, nosched_flag;
+	struct task_struct *task = ti->task;
+	struct rt_mutex_waiter waiter;
+	unsigned long flags;
+	int got_wakeup = 0, saved_lock_depth;
+
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	_raw_spin_lock(&task->pi_lock);
+	_raw_spin_lock(&lock->wait_lock);
+	INIT_WAITER(&waiter);
+
+	old_owner = lock_owner(lock);
+	init_lists(lock);
+
+	if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
+		/* granted */
+		TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
+		if (old_owner) {
+			_raw_spin_lock(&old_owner->task->pi_lock);
+			set_new_owner(lock, old_owner, ti __EIP__);
+			_raw_spin_unlock(&old_owner->task->pi_lock);
+		} else
+			set_new_owner(lock, old_owner, ti __EIP__);
+		_raw_spin_unlock(&lock->wait_lock);
+		_raw_spin_unlock(&task->pi_lock);
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+		FREE_WAITER(&waiter);
+		return;
+	}
+
+	plist_init(&waiter.list, task->prio);
+	task_blocks_on_lock(&waiter, ti, lock __EIP__);
+
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	/*
+	 * Here we save whatever state the task was in originally,
+	 * we'll restore it at the end of the function and we'll
+	 * take any intermediate wakeup into account as well,
+	 * independently of the mutex sleep/wakeup mechanism:
+	 */
+	saved_state = xchg(&task->state, TASK_UNINTERRUPTIBLE);
+
+	/* we don't need to touch the lock struct anymore */
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock(&trace_lock, ti);
+
+	/*
+	 * TODO: check 'flags' for the IRQ bit here - it is illegal to
+	 * call down() from an IRQs-off section that results in
+	 * an actual reschedule.
+	 */
+
+	nosched_flag = current->flags & PF_NOSCHED;
+	current->flags &= ~PF_NOSCHED;
+
+	/*
+	 * BKL users expect the BKL to be held across spinlock/rwlock-acquire.
+	 * Save and clear it, this will cause the scheduler to not drop the
+	 * BKL semaphore if we end up scheduling:
+	 */
+	saved_lock_depth = task->lock_depth;
+	task->lock_depth = -1;
+
+wait_again:
+	/* wait to be given the lock */
+	for (;;) {
+		unsigned long saved_flags = current->flags & PF_NOSCHED;
+
+		if (!waiter.ti)
+			break;
+		trace_local_irq_enable(ti);
+		// no need to check for preemption here, we schedule().
+		current->flags &= ~PF_NOSCHED;
+
+		schedule();
+
+		current->flags |= saved_flags;
+		trace_local_irq_disable(ti);
+		state = xchg(&task->state, TASK_UNINTERRUPTIBLE);
+		if (state == TASK_RUNNING)
+			got_wakeup = 1;
+	}
+	/*
+	 * Check to see if we didn't have ownership stolen.
+	 */
+	if (capture_lock(&waiter, ti, task)) {
+		state = xchg(&task->state, TASK_UNINTERRUPTIBLE);
+		if (state == TASK_RUNNING)
+			got_wakeup = 1;
+		goto wait_again;
+	}
+	/*
+	 * Only set the task's state to TASK_RUNNING if it got
+	 * a non-mutex wakeup. We keep the original state otherwise.
+	 * A mutex wakeup changes the task's state to TASK_RUNNING_MUTEX,
+	 * not TASK_RUNNING - hence we can differentiate between the two
+	 * cases:
+	 */
+	state = xchg(&task->state, saved_state);
+	if (state == TASK_RUNNING)
+		got_wakeup = 1;
+	if (got_wakeup)
+		task->state = TASK_RUNNING;
+	trace_local_irq_enable(ti);
+	preempt_check_resched();
+
+	task->lock_depth = saved_lock_depth;
+	current->flags |= nosched_flag;
+	FREE_WAITER(&waiter);
+}
+
+static void __up_mutex_waiter_savestate(struct rt_mutex *lock __EIP_DECL__);
+static void __up_mutex_waiter_nosavestate(struct rt_mutex *lock __EIP_DECL__);
+
+/*
+ * release the lock:
+ */
+static inline void
+____up_mutex(struct rt_mutex *lock, int save_state __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info();
+	unsigned long flags;
+
+	TRACE_WARN_ON(lock->save_state != save_state);
+
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	_raw_spin_lock(&lock->wait_lock);
+	TRACE_BUG_ON_LOCKED(!lock->wait_list.dp_node.prev && !lock->wait_list.dp_node.next);
+
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	TRACE_WARN_ON_LOCKED(lock_owner(lock) != ti);
+	TRACE_WARN_ON_LOCKED(list_empty(&lock->held_list));
+	list_del_init(&lock->held_list);
+#endif
+
+#if ALL_TASKS_PI
+	if (plist_empty(&lock->wait_list))
+		check_pi_list_empty(lock, lock_owner(lock));
+#endif
+	if (unlikely(!plist_empty(&lock->wait_list))) {
+		if (save_state)
+			__up_mutex_waiter_savestate(lock __EIP__);
+		else
+			__up_mutex_waiter_nosavestate(lock __EIP__);
+	} else
+		lock->owner = NULL;
+	_raw_spin_unlock(&lock->wait_lock);
+#if defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPT_RT)
+	account_mutex_owner_up(current);
+	if (!current->lock_count && !rt_prio(current->normal_prio) &&
+					rt_prio(current->prio)) {
+		static int once = 1;
+
+		if (once) {
+			once = 0;
+			TRACE_OFF();
+			printk("BUG: %s/%d: leaked RT prio %d (%d)?\n",
+				current->comm, current->pid,
+				current->prio, current->normal_prio);
+			dump_stack();
+		}
+	}
+#endif
+
+#ifdef PREEMPT_DIRECT
+	trace_unlock(&trace_lock, ti);
+	/*
+	 * Common place where preemption is requested - if we can
+	 * reschedule then do it here without enabling interrupts
+	 * again (and lengthening latency):
+	 */
+	if (need_resched() && !raw_irqs_disabled_flags(flags) && !preempt_count())
+		preempt_schedule_irq();
+	trace_local_irq_restore(flags, ti);
+#else
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+#endif
+	/* no need to check for preempt here - we just handled it */
+}
+
+#ifdef RT_ATOMIC_ACQUIRE
+static void __sched
+___down_mutex(struct rt_mutex *lock __EIP_DECL__)
+{
+	____down_mutex(lock __EIP__);
+}
+
+static void __sched
+___down(struct rt_mutex *lock __EIP_DECL__)
+{
+	____down(lock __EIP__);
+}
+
+
+static inline void __down_mutex_inline(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info();
+
+	if (unlikely(cmpxchg(&lock->owner, NULL, ti)))
+		___down_mutex(lock __EIP__);
+	else
+		account_mutex_owner_down(current, lock);
+}
+
+static inline void __down_inline(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info();
+
+	if (unlikely(cmpxchg(&lock->owner, NULL, ti)))
+		___down(lock __EIP__);
+	else
+		account_mutex_owner_down(current, lock);
+}
+
+void __sched __down_mutex(struct rt_mutex *lock __EIP_DECL__)
+{
+	__down_mutex_inline(lock __EIP__);
+}
+
+
+void __sched __down(struct rt_mutex *lock __EIP_DECL__)
+{
+	__down_inline(lock __EIP__);
+}
+
+#else /* RT_ATOMIC_ACQUIRE */
+
+void __sched __down_mutex(struct rt_mutex *lock __EIP_DECL__)
+{
+	____down_mutex(lock __EIP__);
+}
+#define __down_mutex_inline __down_mutex
+
+void __sched __down(struct rt_mutex *lock __EIP_DECL__)
+{
+	____down(lock __EIP__);
+}
+#define __down_inline __down
+
+#endif /* !RT_ATOMIC_ACQUIRE */
+
+EXPORT_SYMBOL(__down_mutex);
+EXPORT_SYMBOL(__down);
+
+#ifdef RT_ATOMIC_RELEASE
+
+static void __sched ___up_mutex_savestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	____up_mutex(lock, 1 __EIP__);
+}
+
+static void __sched ___up_mutex_nosavestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	____up_mutex(lock, 0 __EIP__);
+}
+
+/* Release fast path: clear ->owner with one cmpxchg, else take the slow path. */
+static inline void __up_mutex_savestate_inline(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info();
+
+	if (unlikely(cmpxchg(&lock->owner, ti, NULL) != ti))
+		___up_mutex_savestate(lock __EIP__);
+	else	/* one-argument form, as in __grab_lock() and ____up_mutex() */
+		account_mutex_owner_up(current);
+}
+
+void __sched __up_mutex_savestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	__up_mutex_savestate_inline(lock __EIP__);
+}
+
+/* Release fast path: clear ->owner with one cmpxchg, else take the slow path. */
+static inline void __up_mutex_nosavestate_inline(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info();
+
+	if (unlikely(cmpxchg(&lock->owner, ti, NULL) != ti))
+		___up_mutex_nosavestate(lock __EIP__);
+	else	/* one-argument form, as in __grab_lock() and ____up_mutex() */
+		account_mutex_owner_up(current);
+}
+
+void __sched __up_mutex_nosavestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	__up_mutex_nosavestate_inline(lock __EIP__);
+}
+
+#else
+
+static inline void
+__up_mutex_nosavestate_inline(struct rt_mutex *lock __EIP_DECL__)
+{
+	____up_mutex(lock, 0 __EIP__);
+}
+
+static inline void
+__up_mutex_savestate_inline(struct rt_mutex *lock __EIP_DECL__)
+{
+	____up_mutex(lock, 1 __EIP__);
+}
+
+void __sched __up_mutex_nosavestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	____up_mutex(lock, 0 __EIP__);
+}
+
+void __sched __up_mutex_savestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	____up_mutex(lock, 1 __EIP__);
+}
+
+#endif
+
+EXPORT_SYMBOL(__up_mutex_savestate);
+EXPORT_SYMBOL(__up_mutex_nosavestate);
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * get a write lock on the rw-semaphore
+ */
+void fastcall __sched rt_down_write(struct rw_semaphore *rwsem)
+{
+	__down_inline(&rwsem->lock __CALLER0__);
+}
+EXPORT_SYMBOL(rt_down_write);
+
+/*
+ * get a read lock on the rw-semaphore (same rt_mutex as the write lock)
+ */
+void fastcall __sched rt_down_read(struct rw_semaphore *rwsem)
+{
+	/*
+	 * A read lock taken by the current owner only bumps read_depth.
+	 */
+	if (lock_owner(&rwsem->lock) == current_thread_info()) {
+		rwsem->read_depth++;
+		return;
+	}
+	__down_inline(&rwsem->lock __CALLER0__);
+}
+EXPORT_SYMBOL(rt_down_read);
+
+
+static inline
+void __sched down_write_mutex(struct rw_semaphore *rwsem,
+			      struct task_struct *tsk __EIP_DECL__)
+{
+	__down_mutex_inline(&rwsem->lock __EIP__);
+}
+
+static inline
+void __sched down_read_mutex(struct rw_semaphore *rwsem,
+			     struct thread_info *ti __EIP_DECL__)
+{
+	/*
+	 * Read locks within the write lock succeed.
+	 */
+	if (lock_owner(&rwsem->lock) == ti) {
+		rwsem->read_depth++;
+		return;
+	}
+	__down_mutex_inline(&rwsem->lock __EIP__);
+}
+#endif
+
+static void process_timeout(unsigned long __data)
+{
+	wake_up_process((task_t *)__data);
+}
+/*
+ * get a lock - interruptible, optional timeout of 'time' jiffies (0: none)
+ */
+static int __sched __down_interruptible(struct rt_mutex *lock, unsigned long time __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info(), *old_owner;
+	struct task_struct *task = ti->task;
+	unsigned long flags, nosched_flag;
+	struct rt_mutex_waiter waiter;
+	struct timer_list timer;
+	unsigned long expire = 0;
+	int ret;
+
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	_raw_spin_lock(&task->pi_lock);
+	_raw_spin_lock(&lock->wait_lock);
+	INIT_WAITER(&waiter);
+
+	old_owner = lock_owner(lock);
+	init_lists(lock);
+
+	if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
+		/* granted */
+		TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
+		if (old_owner) {
+			_raw_spin_lock(&old_owner->task->pi_lock);
+			set_new_owner(lock, old_owner, ti __EIP__);
+			_raw_spin_unlock(&old_owner->task->pi_lock);
+		} else
+			set_new_owner(lock, old_owner, ti __EIP__);
+		_raw_spin_unlock(&lock->wait_lock);
+		_raw_spin_unlock(&task->pi_lock);
+		trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+		FREE_WAITER(&waiter);
+		return 0;
+	}
+
+	set_task_state(task, TASK_INTERRUPTIBLE);
+
+	plist_init(&waiter.list, task->prio);
+	task_blocks_on_lock(&waiter, ti, lock __EIP__);
+
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	/* we don't need to touch the lock struct anymore */
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+	might_sleep();
+
+	nosched_flag = current->flags & PF_NOSCHED;
+	current->flags &= ~PF_NOSCHED;
+	if (time) {
+		expire = time + jiffies;
+		init_timer(&timer);
+		timer.expires = expire;
+		timer.data = (unsigned long)current;
+		timer.function = process_timeout;
+		add_timer(&timer);
+	}
+
+	ret = 0;
+wait_again:
+	/* wait to be given the lock */
+	for (;;) {
+		if (signal_pending(current) || (time && !timer_pending(&timer))) {
+			/*
+			 * Remove ourselves from the wait list if we
+			 * didn't get the lock - else return success:
+			 */
+			trace_lock_irq(&trace_lock, ti);
+			_raw_spin_lock(&task->pi_lock);
+			_raw_spin_lock(&lock->wait_lock);
+			if (waiter.ti || time) {
+				plist_del_init(&waiter.list, &lock->wait_list);
+				/*
+				 * Just remove ourselves from the PI list.
+				 * (No big problem if our PI effect lingers
+				 *  a bit - owner will restore prio.)
+				 */
+				TRACE_WARN_ON_LOCKED(waiter.ti != ti);
+				TRACE_WARN_ON_LOCKED(current->blocked_on != &waiter);
+				plist_del(&waiter.pi_list, &task->pi_waiters);
+				plist_init(&waiter.pi_list, task->prio);
+				waiter.ti = NULL;
+				current->blocked_on = NULL;
+				if (time) {
+					ret = (int)(expire - jiffies);
+					if (!timer_pending(&timer)) {
+						del_singleshot_timer_sync(&timer);
+						ret = -ETIMEDOUT;
+					}
+				} else
+					ret = -EINTR;
+			}
+			_raw_spin_unlock(&lock->wait_lock);
+			_raw_spin_unlock(&task->pi_lock);
+			trace_unlock_irq(&trace_lock, ti);
+			break;
+		}
+		if (!waiter.ti)
+			break;
+		schedule();
+		set_task_state(task, TASK_INTERRUPTIBLE);
+	}
+
+	/*
+	 * Check to see if we didn't have ownership stolen.
+	 */
+	if (!ret) {
+		if (capture_lock(&waiter, ti, task)) {
+			set_task_state(task, TASK_INTERRUPTIBLE);
+			goto wait_again;
+		}
+	}
+
+	task->state = TASK_RUNNING;
+	current->flags |= nosched_flag;
+
+	FREE_WAITER(&waiter);
+	return ret;
+}
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * generic rt_mutex trylock -- returns 1 if successful, 0 if contention
+ */
+static int __down_trylock(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct thread_info *ti = current_thread_info(), *old_owner;
+	struct task_struct *task = ti->task;
+	unsigned long flags;
+	int ret = 0;
+
+	trace_lock_irqsave(&trace_lock, flags, ti);
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	_raw_spin_lock(&task->pi_lock);
+	_raw_spin_lock(&lock->wait_lock);
+
+	old_owner = lock_owner(lock);
+	init_lists(lock);
+
+	if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
+		/* granted */
+		TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
+		if (old_owner) {
+			_raw_spin_lock(&old_owner->task->pi_lock);
+			set_new_owner(lock, old_owner, ti __EIP__);
+			_raw_spin_unlock(&old_owner->task->pi_lock);
+		} else
+			set_new_owner(lock, old_owner, ti __EIP__);
+		ret = 1;
+	}
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags, ti);
+
+	return ret;
+}
+
+int fastcall rt_down_write_trylock(struct rw_semaphore *rwsem)
+{
+	return __down_trylock(&rwsem->lock __CALLER0__);
+}
+EXPORT_SYMBOL(rt_down_write_trylock);
+
+/*
+ * trylock for reading -- returns 1 if successful, 0 if contention
+ */
+int fastcall rt_down_read_trylock(struct rw_semaphore *rwsem)
+{
+	/*
+	 * Read locks within the self-held write lock succeed.
+	 */
+	if (lock_owner(&rwsem->lock) == current_thread_info()) {
+		rwsem->read_depth++;
+		return 1;
+	}
+	return __down_trylock(&rwsem->lock __CALLER0__);
+}
+EXPORT_SYMBOL(rt_down_read_trylock);
+
+static int down_write_trylock_mutex(struct rw_semaphore *rwsem)
+{
+	return __down_trylock(&rwsem->lock __CALLER0__);
+}
+
+static int down_read_trylock_mutex(struct rw_semaphore *rwsem)
+{
+	/*
+	 * Read locks within the self-held write lock succeed.
+	 */
+	if (lock_owner(&rwsem->lock) == current_thread_info()) {
+		rwsem->read_depth++;
+		return 1;
+	}
+	return __down_trylock(&rwsem->lock __CALLER0__);
+}
+#endif
+
+/*
+ * Common part of the contended release path, shared by the savestate
+ * and nosavestate variants below:
+ *
+ *  - hand the lock over to the top waiter via pick_new_owner(),
+ *  - restore the priority of the releasing owner if it was boosted,
+ *  - with CAPTURE_LOCK, mark the new owner as pending owner.
+ *
+ * save_state is only passed on to pick_new_owner(). Returns the task of
+ * the new owner; waking it up is left to the caller, since that is the
+ * one step in which the two variants differ.
+ */
+static struct task_struct *
+__up_mutex_waiter(struct rt_mutex *lock, int save_state __EIP_DECL__)
+{
+	struct thread_info *old_owner_ti, *new_owner_ti;
+	struct task_struct *old_owner, *new_owner;
+	struct rt_mutex_waiter *w;
+	int prio;
+
+	old_owner_ti = lock_owner(lock);
+	old_owner = old_owner_ti->task;
+	new_owner_ti = pick_new_owner(lock, old_owner_ti, save_state __EIP__);
+	new_owner = new_owner_ti->task;
+
+	/*
+	 * If the owner got priority-boosted then restore it
+	 * to the previous priority (or to the next highest prio
+	 * waiter's priority):
+	 */
+	_raw_spin_lock(&old_owner->pi_lock);
+	prio = old_owner->normal_prio;
+	if (unlikely(!plist_empty(&old_owner->pi_waiters))) {
+		w = plist_first_entry(&old_owner->pi_waiters, struct rt_mutex_waiter, pi_list);
+		if (w->ti->task->prio < prio)
+			prio = w->ti->task->prio;
+	}
+	if (unlikely(prio != old_owner->prio))
+		pi_setprio(lock, old_owner, prio);
+	_raw_spin_unlock(&old_owner->pi_lock);
+#ifdef CAPTURE_LOCK
+	/*
+	 * Flag the new owner as pending: __grab_lock() checks pending_owner
+	 * to decide whether the lock may still be stolen. Under PREEMPT_RT
+	 * the BKL (kernel_sem.lock) is never flagged.
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	if (lock != &kernel_sem.lock) {
+#endif
+		new_owner->rt_flags |= RT_PENDOWNER;
+		new_owner->pending_owner = lock;
+#ifdef CONFIG_PREEMPT_RT
+	}
+#endif
+#endif
+	return new_owner;
+}
+
+/*
+ * Contended release of a lock with save_state == 0: the new owner is
+ * woken up with a plain wake_up_process().
+ */
+static void __up_mutex_waiter_nosavestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct task_struct *new_owner = __up_mutex_waiter(lock, 0 __EIP__);
+
+	wake_up_process(new_owner);
+}
+
+/*
+ * Contended release of a lock with save_state == 1: the new owner is
+ * woken up with wake_up_process_mutex().
+ */
+static void __up_mutex_waiter_savestate(struct rt_mutex *lock __EIP_DECL__)
+{
+	struct task_struct *new_owner = __up_mutex_waiter(lock, 1 __EIP__);
+
+	wake_up_process_mutex(new_owner);
+}
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Do owner check too:
+ */
+void fastcall rt_up_write(struct rw_semaphore *rwsem)
+{
+	TRACE_WARN_ON(lock_owner(&rwsem->lock) != current_thread_info());
+	TRACE_BUG_ON(rwsem->read_depth);
+	__up_mutex_nosavestate_inline(&rwsem->lock __CALLER0__);
+}
+EXPORT_SYMBOL(rt_up_write);
+
+static inline void _up_write(struct rw_semaphore *rwsem __EIP_DECL__)
+{
+	TRACE_WARN_ON(lock_owner(&rwsem->lock) != current_thread_info());
+	TRACE_BUG_ON(rwsem->read_depth);
+	__up_mutex_nosavestate_inline(&rwsem->lock __EIP__);
+}
+
+static void up_write_mutex(struct rw_semaphore *rwsem __EIP_DECL__)
+{
+	TRACE_WARN_ON(rwsem->lock.save_state != 1);
+	TRACE_WARN_ON(lock_owner(&rwsem->lock) != current_thread_info());
+	TRACE_BUG_ON(rwsem->read_depth);
+	__up_mutex_savestate_inline(&rwsem->lock __EIP__);
+}
+
+/*
+ * release a read lock on the semaphore
+ */
+void fastcall rt_up_read(struct rw_semaphore *rwsem)
+{
+	/*
+	 * A nested read lock of the owner only drops read_depth.
+	 */
+	if (lock_owner(&rwsem->lock) == current_thread_info() &&
+							rwsem->read_depth) {
+		rwsem->read_depth--;
+		return;
+	}
+	_up_write(rwsem __CALLER0__);
+}
+EXPORT_SYMBOL(rt_up_read);
+
+/* Release a read lock on a save_state == 1 rwsem. */
+static void up_read_mutex(struct rw_semaphore *rwsem __EIP_DECL__)
+{
+	TRACE_WARN_ON(rwsem->lock.save_state != 1);
+	/* A nested read lock of the owner only drops read_depth: */
+	if (lock_owner(&rwsem->lock) == current_thread_info() &&
+							rwsem->read_depth) {
+		rwsem->read_depth--;
+		return;
+	}
+	/* Otherwise the lock itself is released, as for a writer: */
+	up_write_mutex(rwsem __EIP__);
+}
+
+/*
+ * downgrade a write lock into a read lock
+ * - not implemented for the rt_mutex based rwsem: any call hits BUG()
+ */
+void fastcall rt_downgrade_write(struct rw_semaphore *rwsem)
+{
+	BUG();
+}
+EXPORT_SYMBOL(rt_downgrade_write);
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+/*
+ * Returns 1 if the lock currently has an owner, 0 if not. The owner is
+ * sampled after a memory barrier, without taking the lock's wait_lock.
+ */
+static int rt_mutex_is_locked(struct rt_mutex *lock)
+{
+	mb();
+	return lock_owner(lock) != NULL;
+}
+
+int fastcall rt_rwsem_is_locked(struct rw_semaphore *rwsem)
+{
+	return rt_mutex_is_locked(&rwsem->lock);
+}
+EXPORT_SYMBOL(rt_rwsem_is_locked);
+
+/*
+ * Set the count and initialize the embedded rt_mutex (save_state 0).
+ * With a count of 0 the mutex is taken right away by the initializing
+ * task; any other count leaves it unlocked.
+ */
+void fastcall __sema_init(struct semaphore *sem, int val,
+			  char *name, char *file, int line)
+{
+	atomic_set(&sem->count, val);
+	__init_rt_mutex(&sem->lock, 0, name, file, line);
+
+	if (!val)
+		__down_inline(&sem->lock __CALLER0__);
+}
+EXPORT_SYMBOL(__sema_init);
+
+void fastcall __init_MUTEX(struct semaphore *sem, char *name, char *file,
+			   int line)
+{
+	__sema_init(sem, 1, name, file, line);
+}
+EXPORT_SYMBOL(__init_MUTEX);
+
+static int down_trylock_mutex(struct rt_mutex *lock __EIP_DECL__)
+{
+	TRACE_WARN_ON(lock->save_state != 1);
+	return __down_trylock(lock __EIP__);
+}
+
+static void fastcall up_mutex(struct rt_mutex *lock __EIP_DECL__)
+{
+	TRACE_WARN_ON(lock->save_state != 1);
+	TRACE_WARN_ON(lock_owner(lock) != current_thread_info());
+	__up_mutex_savestate_inline(lock __EIP__);
+}
+
+/*
+ * Linux Semaphores implemented via RT-mutexes.
+ *
+ * In the down() variants we use the mutex as the semaphore blocking
+ * object: we always acquire it, decrease the counter and keep the lock
+ * locked if we did the 1->0 transition. The next down() will then block.
+ *
+ * In the up() path we atomically increase the counter and do the
+ * unlock if we were the one doing the 0->1 transition.
+ */
+
+static inline void __down_complete(struct semaphore *sem __EIP_DECL__)
+{
+	int count = atomic_dec_return(&sem->count);
+
+	TRACE_WARN_ON(sem->lock.save_state != 0 || count < 0);
+
+	if (unlikely(count > 0))
+		__up_mutex_nosavestate_inline(&sem->lock __EIP__);
+}
+
+void fastcall rt_down(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	__down_inline(&sem->lock __CALLER0__);
+	__down_complete(sem __CALLER0__);
+}
+EXPORT_SYMBOL(rt_down);
+
+int fastcall rt_down_interruptible(struct semaphore *sem)
+{
+	int ret;
+
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	ret = __down_interruptible(&sem->lock, 0 __CALLER0__);
+	if (ret)
+		return ret;
+	__down_complete(sem __CALLER0__);
+	return 0;
+}
+EXPORT_SYMBOL(rt_down_interruptible);
+
+/*
+ * try to down the semaphore, 0 on success and 1 on failure. (inverted)
+ */
+int fastcall rt_down_trylock(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	/*
+	 * Here we are a tiny bit different from ordinary Linux semaphores,
+	 * because we can get 'transient' locking-failures when say a
+	 * process decreases the count from 9 to 8 and locks/releases the
+	 * embedded mutex internally. It would be quite complex to remove
+	 * these transient failures so lets try it the simple way first:
+	 */
+	if (__down_trylock(&sem->lock __CALLER0__)) {
+		__down_complete(sem __CALLER0__);
+		return 0;
+	}
+	return 1;
+}
+EXPORT_SYMBOL(rt_down_trylock);
+
+void fastcall rt_up(struct semaphore *sem)
+{
+	struct thread_info *ti = current_thread_info();
+	int count;
+
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	/*
+	 * Disable preemption to make sure a highprio trylock-er cannot
+	 * preempt us here and get into an infinite loop:
+	 */
+	preempt_disable_ti(ti);
+	count = atomic_inc_return(&sem->count);
+	/*
+	 * If we did the 0 -> 1 transition then we are the ones to unlock it:
+	 */
+	if (likely(count == 1))
+		__up_mutex_nosavestate_inline(&sem->lock __CALLER0__);
+	preempt_enable_ti(ti);
+}
+EXPORT_SYMBOL(rt_up);
+
+int fastcall rt_sem_is_locked(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	return rt_mutex_is_locked(&sem->lock);
+}
+EXPORT_SYMBOL(rt_sem_is_locked);
+
+int fastcall rt_sema_count(struct semaphore *sem)
+{
+	TRACE_WARN_ON(sem->lock.save_state != 0);
+	return atomic_read(&sem->count);
+}
+EXPORT_SYMBOL(rt_sema_count);
+#endif
+
+/*
+ * Spinlock wrappers:
+ *
+ * (DEBUG_RT_LOCKING_MODE is a spinning/preempt-disabling variant of the APIs.
+ * Used for debugging/profiling only.)
+ */
+
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+
+int preempt_locks_user = 0;
+int preempt_locks = 0;
+
+EXPORT_SYMBOL(preempt_locks);
+
+/*
+ * Called from the idle thread - it is not safe to switch the locking
+ * mode at runtime from a normal process context (locks might be in use)
+ */
+void propagate_preempt_locks_value(void)
+{
+	if (preempt_locks != preempt_locks_user)
+		preempt_locks = preempt_locks_user;
+}
+
+#else
+# define preempt_locks 1
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+static inline void __spin_lock(spinlock_t *lock __EIP_DECL__)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		_raw_spin_lock(&lock->lock.debug_slock);
+	else
+#endif
+	{
+		TRACE_WARN_ON(lock->lock.save_state != 1);
+		__down_mutex_inline(&lock->lock __EIP__);
+	}
+}
+
+static inline void __spin_unlock(spinlock_t *lock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		_raw_spin_unlock(&lock->lock.debug_slock);
+	else
+#endif
+		up_mutex(&lock->lock __CALLER0__);
+}
+
+
+#ifdef DEBUG_RT_DONT_INLINE
+void __lockfunc _spin_lock(spinlock_t *spin)
+{
+	__spin_lock(spin __CALLER0__);
+}
+EXPORT_SYMBOL(_spin_lock);
+
+void __lockfunc _spin_lock_bh(spinlock_t *spin)
+{
+	__spin_lock(spin __CALLER0__);
+}
+EXPORT_SYMBOL(_spin_lock_bh);
+
+void __lockfunc _spin_lock_irq(spinlock_t *spin)
+{
+	__spin_lock(spin __CALLER0__);
+}
+EXPORT_SYMBOL(_spin_lock_irq);
+
+void __lockfunc _spin_unlock(spinlock_t *lock)
+{
+	__spin_unlock(lock);
+}
+EXPORT_SYMBOL(_spin_unlock);
+
+void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+{
+	__spin_unlock(lock);
+}
+EXPORT_SYMBOL(_spin_unlock_bh);
+
+void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+{
+	__spin_unlock(lock);
+}
+EXPORT_SYMBOL(_spin_unlock_irq);
+
+void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+{
+#ifdef CONFIG_DEBUG_IRQ_FLAGS
+	if (flags != (RAW_LOCAL_ILLEGAL_MASK | LOCAL_ILLEGAL_MASK)) {
+		static int print_once = 1;
+
+		if (print_once) {
+			print_once = 0;
+			printk("BUG: %s/%d: spin-unlock irq flags assymetry?\n",
+				current->comm, current->pid);
+			dump_stack();
+		}
+		local_irq_enable();
+	}
+#endif
+	__spin_unlock(lock);
+}
+EXPORT_SYMBOL(_spin_unlock_irqrestore);
+
+unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *spin)
+{
+	__spin_lock(spin __CALLER0__);
+	/*
+	 * No irq state is read here; the returned cookie is a constant
+	 * that _spin_unlock_irqrestore() can validate in debug builds.
+	 */
+#ifdef CONFIG_DEBUG_IRQ_FLAGS
+	return RAW_LOCAL_ILLEGAL_MASK | LOCAL_ILLEGAL_MASK;
+#else
+	return 0;
+#endif
+}
+EXPORT_SYMBOL(_spin_lock_irqsave);
+
+#endif /* DEBUG_RT_DONT_INLINE */
+
+void __lockfunc _spin_unlock_wait(spinlock_t *lock)
+{
+	do {
+		barrier();
+	} while (spin_is_locked(&lock->lock.wait_lock));
+}
+EXPORT_SYMBOL(_spin_unlock_wait);
+
+static inline int __spin_trylock(spinlock_t *lock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		return _raw_spin_trylock(&lock->lock.debug_slock);
+	else
+#endif
+		return down_trylock_mutex(&lock->lock __CALLER0__);
+}
+
+int __lockfunc _spin_trylock(spinlock_t *lock)
+{
+	return __spin_trylock(lock);
+}
+EXPORT_SYMBOL(_spin_trylock);
+
+int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+{
+	return __spin_trylock(lock);
+}
+EXPORT_SYMBOL(_spin_trylock_bh);
+
+int __lockfunc _spin_trylock_irq(spinlock_t *lock)
+{
+	return __spin_trylock(lock);
+}
+EXPORT_SYMBOL(_spin_trylock_irq);
+
+int __lockfunc _spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags)
+{
+	/*
+	 * No irq state is read here: *flags gets a constant cookie
+	 * (a recognizable marker value when CONFIG_DEBUG_IRQ_FLAGS is set).
+	 */
+#ifdef CONFIG_DEBUG_IRQ_FLAGS
+	*flags = RAW_LOCAL_ILLEGAL_MASK | LOCAL_ILLEGAL_MASK;
+#else
+	*flags = 0;
+#endif
+
+	return __spin_trylock(lock);
+}
+EXPORT_SYMBOL(_spin_trylock_irqsave);
+
+static inline int __spin_is_locked(spinlock_t *lock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		return _raw_spin_is_locked(&lock->lock.debug_slock);
+	else
+#endif
+		return rt_mutex_is_locked(&lock->lock);
+}
+
+int _spin_is_locked(spinlock_t *lock)
+{
+	return __spin_is_locked(lock);
+}
+EXPORT_SYMBOL(_spin_is_locked);
+
+int _spin_can_lock(spinlock_t *lock)
+{
+	return !__spin_is_locked(lock);
+}
+EXPORT_SYMBOL(_spin_can_lock);
+
+int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock)
+{
+	__spin_lock(lock __CALLER0__);
+	if (!atomic_dec_and_test(atomic)) {
+		__spin_unlock(lock);
+		return 0;
+	}
+	return 1;	/* counter reached zero: return with the lock held */
+}
+EXPORT_SYMBOL(atomic_dec_and_spin_lock);
+
+void _spin_lock_init(spinlock_t *lock, char *name, char *file, int line)
+{
+	__init_rt_mutex(&lock->lock, 1, name, file, line);
+#ifdef CONFIG_DEBUG_PREEMPT
+	lock->lock.was_preempt_off = 1;
+#endif
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	_raw_spin_lock_init(&lock->lock.debug_slock);
+#endif
+}
+EXPORT_SYMBOL(_spin_lock_init);
+
+
+/*
+ * RW-lock wrappers (debug locking mode uses the raw debug_rwlock instead):
+ */
+int __lockfunc _read_trylock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		return _raw_read_trylock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		return down_read_trylock_mutex(&rwlock->lock);
+}
+EXPORT_SYMBOL(_read_trylock);
+
+int __lockfunc _write_trylock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		return _raw_write_trylock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		return down_write_trylock_mutex(&rwlock->lock);
+}
+EXPORT_SYMBOL(_write_trylock);
+
+inline void __lockfunc _write_lock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		_raw_write_lock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		down_write_mutex(&rwlock->lock, current __CALLER0__);
+}
+EXPORT_SYMBOL(_write_lock);
+
+inline void __lockfunc _read_lock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		_raw_read_lock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		down_read_mutex(&rwlock->lock,
+				current_thread_info() __CALLER0__);
+}
+EXPORT_SYMBOL(_read_lock);
+
+inline void __lockfunc _write_unlock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		_raw_write_unlock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		up_write_mutex(&rwlock->lock __CALLER0__);
+}
+EXPORT_SYMBOL(_write_unlock);
+
+static inline void __read_unlock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		_raw_read_unlock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		up_read_mutex(&rwlock->lock __CALLER0__);
+}
+
+void __lockfunc _read_unlock(rwlock_t *rwlock)
+{
+	__read_unlock(rwlock);
+}
+EXPORT_SYMBOL(_read_unlock);
+
+unsigned long __lockfunc _write_lock_irqsave(rwlock_t *rwlock)
+{
+	unsigned long flags;
+
+	_write_lock(rwlock);
+
+	__local_save_flags_inline(flags);
+	return flags;
+}
+EXPORT_SYMBOL(_write_lock_irqsave);
+
+unsigned long __lockfunc _read_lock_irqsave(rwlock_t *rwlock)
+{
+	unsigned long flags;
+
+	_read_lock(rwlock);
+
+	__local_save_flags_inline(flags);
+	return flags;
+}
+EXPORT_SYMBOL(_read_lock_irqsave);
+
+void __lockfunc _write_lock_irq(rwlock_t *rwlock)
+{
+	_write_lock(rwlock);
+}
+EXPORT_SYMBOL(_write_lock_irq);
+
+void __lockfunc _read_lock_irq(rwlock_t *rwlock)
+{
+	_read_lock(rwlock);
+}
+EXPORT_SYMBOL(_read_lock_irq);
+
+void __lockfunc _write_lock_bh(rwlock_t *rwlock)
+{
+	_write_lock(rwlock);
+}
+EXPORT_SYMBOL(_write_lock_bh);
+
+void __lockfunc _read_lock_bh(rwlock_t *rwlock)
+{
+	_read_lock(rwlock);
+}
+EXPORT_SYMBOL(_read_lock_bh);
+
+void __lockfunc _write_unlock_irq(rwlock_t *rwlock)
+{
+	_write_unlock(rwlock);
+}
+EXPORT_SYMBOL(_write_unlock_irq);
+
+void __lockfunc _read_unlock_irq(rwlock_t *rwlock)
+{
+	_read_unlock(rwlock);
+}
+EXPORT_SYMBOL(_read_unlock_irq);
+
+void __lockfunc _write_unlock_bh(rwlock_t *rwlock)
+{
+	_write_unlock(rwlock);
+}
+EXPORT_SYMBOL(_write_unlock_bh);
+
+void __lockfunc _read_unlock_bh(rwlock_t *rwlock)
+{
+	_read_unlock(rwlock);
+}
+EXPORT_SYMBOL(_read_unlock_bh);
+
+void __lockfunc _write_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags)
+{
+	_write_unlock(rwlock);
+}
+EXPORT_SYMBOL(_write_unlock_irqrestore);
+
+void __lockfunc _read_unlock_irqrestore(rwlock_t *rwlock, unsigned long flags)
+{
+	_read_unlock(rwlock);
+}
+EXPORT_SYMBOL(_read_unlock_irqrestore);
+
+void _rwlock_init(rwlock_t *rwlock, char *name, char *file, int line)
+{
+	__init_rwsem(&rwlock->lock, 1, name, file, line);
+#ifdef CONFIG_DEBUG_PREEMPT	/* same option _spin_lock_init() tests */
+	/* NOTE(review): assumes ->lock.lock is the embedded rt_mutex - confirm */
+	rwlock->lock.lock.was_preempt_off = 1;
+#endif
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	_raw_rwlock_init(&rwlock->lock.lock.debug_rwlock);
+#endif
+}
+EXPORT_SYMBOL(_rwlock_init);
+
+/*
+ * _read_can_lock() and _write_can_lock() do the same
+ */
+int _read_can_lock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		return _raw_read_can_lock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		return !rt_rwsem_is_locked(&rwlock->lock);
+}
+EXPORT_SYMBOL(_read_can_lock);
+
+int _write_can_lock(rwlock_t *rwlock)
+{
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	if (!preempt_locks)
+		return _raw_write_can_lock(&rwlock->lock.lock.debug_rwlock);
+	else
+#endif
+		return !rt_rwsem_is_locked(&rwlock->lock);
+}
+EXPORT_SYMBOL(_write_can_lock);
+#endif
+
+/*
+ * Soft irq-flag support:
+ */
+
+#ifdef CONFIG_DEBUG_IRQ_FLAGS
+
+void check_raw_flags(unsigned long flags)
+{
+	if (flags & RAW_LOCAL_ILLEGAL_MASK) {
+		static int print_once = 1;
+		if (print_once) {
+			print_once = 0;
+			printk("BUG: bad raw irq-flag value %08lx, %s/%d!\n",
+				flags, current->comm, current->pid);
+			dump_stack();
+		}
+	}
+}
+
+EXPORT_SYMBOL(check_raw_flags);
+
+static int check_soft_flags(unsigned long flags)
+{
+	if (flags == (RAW_LOCAL_ILLEGAL_MASK | LOCAL_ILLEGAL_MASK)) {
+		static int print_once = 1;
+		if (print_once) {
+			print_once = 0;
+			printk("BUG: %s/%d: spin-lock irq flags assymetry?\n",
+				current->comm, current->pid);
+			dump_stack();
+		}
+		local_irq_enable();
+		return -1;
+	}
+	if ((flags & ~PF_IRQSOFF) != RAW_LOCAL_ILLEGAL_MASK) {
+		static int print_once = 1;
+		if (print_once) {
+			print_once = 0;
+#if 1
+			raw_local_irq_disable();
+			printk("BUG: bad soft irq-flag value %08lx, %s/%d! %08lx/%08lx\n",
+				flags, current->comm, current->pid, CALLER_ADDR0, CALLER_ADDR1);
+#else
+			printk("BUG: bad soft irq-flag value %08lx, %s/%d!\n",
+				flags, current->comm, current->pid);
+#endif
+			dump_stack();
+			raw_local_irq_enable();
+		}
+		return -1;
+	}
+	return 0;
+}
+#else
+static inline int check_soft_flags(unsigned long flags)
+{
+	return 0;
+}
+#endif
+
+#ifdef CONFIG_PREEMPT_RT
+static void illegal_API_call(void)
+{
+	static int print_once = 1;
+
+	if (print_once) {
+		print_once = 0;
+		printk("WARNING: %s/%d changed soft IRQ-flags.\n",
+			current->comm, current->pid);
+		dump_stack();
+	}
+}
+
+static inline void turn_soft_irqs_off(void)
+{
+	current->flags |= PF_IRQSOFF;
+	illegal_API_call();
+}
+
+static inline void turn_soft_irqs_on(void)
+{
+	current->flags &= ~PF_IRQSOFF;
+	illegal_API_call();
+}
+
+void local_irq_enable(void)
+{
+	turn_soft_irqs_on();
+	preempt_check_resched();
+}
+EXPORT_SYMBOL(local_irq_enable);
+
+void local_irq_disable(void)
+{
+	turn_soft_irqs_off();
+}
+EXPORT_SYMBOL(local_irq_disable);
+
+int irqs_disabled_flags(unsigned long flags)
+{
+	check_soft_flags(flags);
+
+	return (flags & PF_IRQSOFF) != 0;
+}
+EXPORT_SYMBOL(irqs_disabled_flags);
+
+void __local_save_flags(unsigned long *flags)
+{
+	__local_save_flags_inline(*flags);
+}
+EXPORT_SYMBOL(__local_save_flags);
+
+void __local_irq_save(unsigned long *flags)
+{
+	*flags = irqs_off() | RAW_LOCAL_ILLEGAL_MASK;
+	turn_soft_irqs_off();
+}
+EXPORT_SYMBOL(__local_irq_save);
+
+void local_irq_restore(unsigned long flags)
+{
+	if (check_soft_flags(flags))
+		return;
+	if (flags & ~RAW_LOCAL_ILLEGAL_MASK)
+		turn_soft_irqs_off();
+	else {
+		turn_soft_irqs_on();
+		preempt_check_resched();
+	}
+}
+EXPORT_SYMBOL(local_irq_restore);
+
+notrace int irqs_disabled(void)
+{
+	return irqs_off();
+}
+EXPORT_SYMBOL(irqs_disabled);
+#endif
+
+/*
+ * This routine changes the owner of a mutex. Its only
+ * caller is the futex code which locks a futex on behalf
+ * of another thread.
+ */
+void fastcall rt_mutex_set_owner(struct rt_mutex *lock, struct thread_info *t)
+{
+	account_mutex_owner_up(current);
+	account_mutex_owner_down(t->task, lock);
+	lock->owner = t;
+}
+
+struct thread_info * fastcall rt_mutex_owner(struct rt_mutex *lock)
+{
+	return lock->owner;	/* NULL when the lock has no owner */
+}
+EXPORT_SYMBOL(rt_mutex_owner);
+
+int fastcall rt_mutex_has_waiters(struct rt_mutex *lock)
+{
+	return !plist_empty(&lock->wait_list);
+}
+EXPORT_SYMBOL(rt_mutex_has_waiters);
+
+int fastcall rt_mutex_free(struct rt_mutex *lock)
+{
+	return plist_empty(&lock->wait_list) && !lock->owner;
+}
+
+int fastcall rt_mutex_owned_by(struct rt_mutex *lock, struct thread_info *t)
+{
+	return lock->owner == t;	/* 1 if @t is the recorded owner, else 0 */
+}
+EXPORT_SYMBOL(rt_mutex_owned_by);
+
+static int
+down_try_futex(struct rt_mutex *lock, struct thread_info *proxy_owner __EIP_DECL__)
+{
+	struct thread_info *old_owner;
+	struct task_struct *task = proxy_owner->task;
+	unsigned long flags;
+	int ret = 0;
+
+	trace_lock_irqsave(&trace_lock, flags, proxy_owner);
+	TRACE_BUG_ON_LOCKED(!raw_irqs_disabled());
+	_raw_spin_lock(&task->pi_lock);
+	_raw_spin_lock(&lock->wait_lock);
+
+	old_owner = lock_owner(lock);
+	init_lists(lock);
+
+	if (likely(!old_owner) || __grab_lock(lock, task, old_owner->task)) {
+		/* granted */
+		TRACE_WARN_ON_LOCKED(!plist_empty(&lock->wait_list) && !old_owner);
+		if (old_owner) {
+			_raw_spin_lock(&old_owner->task->pi_lock);
+			set_new_owner(lock, old_owner, proxy_owner __EIP__);
+			_raw_spin_unlock(&old_owner->task->pi_lock);
+		} else
+			set_new_owner(lock, old_owner, proxy_owner __EIP__);
+		ret = 1;
+	}
+	_raw_spin_unlock(&lock->wait_lock);
+	_raw_spin_unlock(&task->pi_lock);
+	trace_unlock_irqrestore(&trace_lock, flags, proxy_owner);
+
+	return ret;
+}
+
+/*
+ * This call has two functions.  The first is to lock the lock on behalf of
+ * another thread if the rt_mutex has no owner (in which case the futex was
+ * locked in user space).  The second is to block on the rt_mutex for a
+ * specified amount of time.  We can only block if the rt_mutex has already
+ * been locked and has an owner.
+ *
+ * Releases @sem and current->mm->mmap_sem on every path.  Returns -EAGAIN
+ * if the owner can't be found or is us; otherwise zero if we waited
+ * successfully for the futex and now own the lock, negative values for
+ * failure, or positive values for the time waited before getting the lock.
+ */
+int fastcall
+down_futex(struct rt_mutex *lock, unsigned long time, pid_t owner_pid, struct semaphore *sem)
+{
+	struct task_struct *owner_task = NULL;
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	unsigned long eip = CALLER_ADDR0;
+#endif
+	int ret = 0;
+
+	rcu_read_lock();
+	owner_task = find_task_by_pid(owner_pid);
+	if (owner_task && !get_task_struct_rcu(owner_task))
+		owner_task = NULL;
+	rcu_read_unlock();
+
+	/*
+	 * if the owner can't be found or has changed to us then just
+	 * return, dropping the reference taken above (if any).
+	 */
+	if (!owner_task || owner_task == current) {
+		if (owner_task)
+			put_task_struct(owner_task);
+		up(sem);
+		up_read(&current->mm->mmap_sem);
+		return -EAGAIN;
+	}
+
+	/*
+	 * This works for both outcomes of down_try_futex().
+	 * If it gets the lock then we are the first waiter (This
+	 * is being called from wait_robust because the lock is
+	 * contended) and we've just locked the lock on behalf of
+	 * the owning thread.  If it finds contention then we aren't
+	 * the first waiter and we'll just block on the down_interruptible.
+	 */
+	down_try_futex(lock, owner_task->thread_info __EIP__);
+
+	/*
+	 * we can now drop the locks because the rt_mutex is held.
+	 * and we'll just block on the down interruptible OR
+	 * we'll get the lock and return without blocking, if
+	 * it was unlocked between the down_try_futex and the
+	 * down interruptible.
+	 */
+
+	up(sem);
+	up_read(&current->mm->mmap_sem);
+
+	ret = __down_interruptible(lock, time __EIP__);
+	put_task_struct(owner_task);
+
+	return ret;
+}
+EXPORT_SYMBOL(down_futex);
+
+void fastcall up_futex(struct rt_mutex *lock)
+{
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	unsigned long eip = CALLER_ADDR0;
+#endif
+	____up_mutex(lock, 1 __EIP__);
+}
+EXPORT_SYMBOL(up_futex);
+
+void fastcall init_rt_mutex(struct rt_mutex *lock, int save_state,
+				char *name, char *file, int line)
+{
+	__init_rt_mutex(lock, save_state, name, file, line);
+}
+EXPORT_SYMBOL(init_rt_mutex);
Index: linux/kernel/sched.c
===================================================================
--- linux.orig/kernel/sched.c
+++ linux/kernel/sched.c
@@ -4,6 +4,7 @@
  *  Kernel scheduler and related syscalls
  *
  *  Copyright (C) 1991-2002  Linus Torvalds
+ *  Copyright (C) 2004 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
  *
  *  1996-12-23  Modified by Dave Grothe to fix bugs in semaphores and
  *		make semaphores SMP safe
@@ -16,6 +17,7 @@
  *		by Davide Libenzi, preemptible kernel bits by Robert Love.
  *  2003-09-03	Interactivity tuning by Con Kolivas.
  *  2004-04-02	Scheduler domains code by Nick Piggin
+ *  2004-10-13  Real-Time Preemption support by Ingo Molnar
  */
 
 #include <linux/mm.h>
@@ -46,6 +48,7 @@
 #include <linux/seq_file.h>
 #include <linux/syscalls.h>
 #include <linux/times.h>
+#include <linux/kallsyms.h>
 #include <linux/acct.h>
 #include <asm/tlb.h>
 
@@ -176,6 +179,13 @@ static unsigned int task_timeslice(task_
 #define task_hot(p, now, sd) ((long long) ((now) - (p)->last_ran)	\
 				< (long long) (sd)->cache_hot_time)
 
+void __put_task_struct_cb(struct rcu_head *rhp)
+{
+	__put_task_struct(container_of(rhp, struct task_struct, rcu));
+}
+
+EXPORT_SYMBOL_GPL(__put_task_struct_cb);
+
 /*
  * These are the runqueue data structures:
  */
@@ -184,7 +194,27 @@ static unsigned int task_timeslice(task_
 
 typedef struct runqueue runqueue_t;
 
+#ifdef CURRENT_PTR
+struct task_struct * const ___current = &init_task;
+struct task_struct ** const current_ptr = (struct task_struct ** const)&___current;
+struct thread_info * const current_ti = &init_thread_union.thread_info;
+struct thread_info ** const current_ti_ptr = (struct thread_info ** const)&current_ti;
+
+EXPORT_SYMBOL(___current);
+EXPORT_SYMBOL(current_ti);
+
+/*
+ * The scheduler itself doesn't want 'current' to be cached
+ * during context-switches:
+ */
+# undef current
+# define current __current()
+# undef current_thread_info
+# define current_thread_info() __current_thread_info()
+#endif
+
 struct prio_array {
+	runqueue_t *rq;
 	unsigned int nr_active;
 	unsigned long bitmap[BITMAP_SIZE];
 	struct list_head queue[MAX_PRIO];
@@ -198,7 +228,7 @@ struct prio_array {
  * acquire operations must be ordered by ascending &runqueue.
  */
 struct runqueue {
-	spinlock_t lock;
+	raw_spinlock_t lock;
 
 	/*
 	 * nr_running and cpu_load should be in the same cacheline because
@@ -206,6 +236,9 @@ struct runqueue {
 	 */
 	unsigned long nr_running;
 #ifdef CONFIG_SMP
+# ifdef CONFIG_PREEMPT_RT
+	unsigned long rt_nr_running;
+# endif
 	unsigned long cpu_load[3];
 #endif
 	unsigned long long nr_switches;
@@ -275,11 +308,23 @@ for (domain = rcu_dereference(cpu_rq(cpu
 #define task_rq(p)		cpu_rq(task_cpu(p))
 #define cpu_curr(cpu)		(cpu_rq(cpu)->curr)
 
+/*
+ * We really don't want to do anything complex within switch_to()
+ * on PREEMPT_RT - this check enforces this.
+ */
+#ifdef prepare_arch_switch
+# ifdef CONFIG_PREEMPT_RT
+#   error FIXME
+# else
+#  define _finish_arch_switch finish_arch_switch
+# endif
+#endif
+
 #ifndef prepare_arch_switch
 # define prepare_arch_switch(next)	do { } while (0)
 #endif
 #ifndef finish_arch_switch
-# define finish_arch_switch(prev)	do { } while (0)
+# define _finish_arch_switch(prev)	do { } while (0)
 #endif
 
 #ifndef __ARCH_WANT_UNLOCKED_CTXSW
@@ -340,7 +385,7 @@ static inline void finish_lock_switch(ru
 	prev->oncpu = 0;
 #endif
 #ifndef __ARCH_WANT_INTERRUPTS_ON_CTXSW
-	local_irq_enable();
+	raw_local_irq_enable();
 #endif
 }
 #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
@@ -356,7 +401,7 @@ static inline runqueue_t *task_rq_lock(t
 	struct runqueue *rq;
 
 repeat_lock_task:
-	local_irq_save(*flags);
+	raw_local_irq_save(*flags);
 	rq = task_rq(p);
 	spin_lock(&rq->lock);
 	if (unlikely(rq != task_rq(p))) {
@@ -478,7 +523,7 @@ static inline runqueue_t *this_rq_lock(v
 {
 	runqueue_t *rq;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	rq = this_rq();
 	spin_lock(&rq->lock);
 
@@ -591,6 +636,33 @@ static inline void sched_info_switch(tas
 #define sched_info_switch(t, next)	do { } while (0)
 #endif /* CONFIG_SCHEDSTATS */
 
+int rt_overload_schedule, rt_overload_wakeup, rt_overload_pulled;
+
+__cacheline_aligned_in_smp atomic_t rt_overload;
+
+static inline void inc_rt_tasks(task_t *p, runqueue_t *rq)
+{
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	if (rt_task(p)) {
+		rq->rt_nr_running++;
+		if (rq->rt_nr_running == 2)
+			atomic_inc(&rt_overload);
+	}
+#endif
+}
+
+static inline void dec_rt_tasks(task_t *p, runqueue_t *rq)
+{
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	if (rt_task(p)) {
+		WARN_ON(!rq->rt_nr_running);
+		rq->rt_nr_running--;
+		if (rq->rt_nr_running == 1)
+			atomic_dec(&rt_overload);
+	}
+#endif
+}
+
 /*
  * Adding/removing a task to/from a priority array:
  */
@@ -600,15 +672,21 @@ static void dequeue_task(struct task_str
 	list_del(&p->run_list);
 	if (list_empty(array->queue + p->prio))
 		__clear_bit(p->prio, array->bitmap);
+	dec_rt_tasks(p, array->rq);
 }
 
 static void enqueue_task(struct task_struct *p, prio_array_t *array)
 {
+	if (p->flags & PF_DEAD) {
+		printk("BUG: %s/%d: dead task enqueued!\n", p->comm, p->pid);
+		dump_stack();
+	}
 	sched_info_queued(p);
 	list_add_tail(&p->run_list, array->queue + p->prio);
 	__set_bit(p->prio, array->bitmap);
 	array->nr_active++;
 	p->array = array;
+	inc_rt_tasks(p, array->rq);
 }
 
 /*
@@ -629,7 +707,7 @@ static inline void enqueue_task_head(str
 }
 
 /*
- * effective_prio - return the priority that is based on the static
+ * __normal_prio - return the priority that is based on the static
  * priority but is modified by bonuses/penalties.
  *
  * We scale the actual sleep average [0 .... MAX_SLEEP_AVG]
@@ -642,13 +720,11 @@ static inline void enqueue_task_head(str
  *
  * Both properties are important to certain workloads.
  */
-static int effective_prio(task_t *p)
+
+static inline int __normal_prio(task_t *p)
 {
 	int bonus, prio;
 
-	if (rt_task(p))
-		return p->prio;
-
 	bonus = CURRENT_BONUS(p) - MAX_BONUS / 2;
 
 	prio = p->static_prio - bonus;
@@ -660,23 +736,49 @@ static int effective_prio(task_t *p)
 }
 
 /*
- * __activate_task - move a task to the runqueue.
+ * Calculate the expected normal priority: i.e. priority
+ * without taking RT-inheritance into account. Might be
+ * boosted by interactivity modifiers. Changes upon fork,
+ * setprio syscalls, and whenever the interactivity
+ * estimator recalculates.
  */
-static inline void __activate_task(task_t *p, runqueue_t *rq)
+inline int normal_prio(task_t *p)
 {
-	enqueue_task(p, rq->active);
-	rq->nr_running++;
+	int prio;
+
+	if (p->policy != SCHED_NORMAL)
+		prio = MAX_RT_PRIO-1 - p->rt_priority;
+	else
+		prio = __normal_prio(p);
+
+	trace_special_pid(p->pid, p->prio, prio);
+	return prio;
 }
 
 /*
- * __activate_idle_task - move idle task to the _front_ of runqueue.
+ * Calculate the current priority, i.e. the priority
+ * taken into account by the scheduler. This value might
+ * be boosted by RT tasks, or might be boosted by
+ * interactivity modifiers. Will be RT if the task got
+ * RT-boosted. If not then it returns p->normal_prio.
  */
-static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+static int __recalc_task_prio(task_t *p)
 {
-	enqueue_task_head(p, rq->active);
-	rq->nr_running++;
+	p->normal_prio = normal_prio(p);
+	/*
+	 * If we are RT tasks or we were boosted to RT priority,
+	 * keep the priority unchanged. Otherwise, update priority
+	 * to the normal priority:
+	 */
+	if (!rt_prio(p->prio))
+		return p->normal_prio;
+	return p->prio;
 }
 
+/*
+ * Recalculate p->normal_prio and p->prio after having slept,
+ * updating the sleep-average too:
+ */
 static int recalc_task_prio(task_t *p, unsigned long long now)
 {
 	/* Caller must always ensure 'now >= p->timestamp' */
@@ -736,10 +838,50 @@ static int recalc_task_prio(task_t *p, u
 		}
 	}
 
-	return effective_prio(p);
+	return __recalc_task_prio(p);
+}
+
+static inline void trace_start_sched_wakeup(task_t *p, runqueue_t *rq)
+{
+	if (TASK_PREEMPTS_CURR(p, rq) && (p != rq->curr))
+		__trace_start_sched_wakeup(p);
 }
 
 /*
+ * __activate_task - move a task to the runqueue.
+ */
+static inline void __activate_task(task_t *p, runqueue_t *rq)
+{
+	trace_special_pid(p->pid, p->prio, rq->nr_running);
+	enqueue_task(p, rq->active);
+	rq->nr_running++;
+}
+
+/*
+ * __activate_task_after - move a task to the runqueue,
+ *                         to execute after a specific task.
+ */
+static inline
+void __activate_task_after(task_t *p, task_t *parent, runqueue_t *rq)
+{
+	// FIXME: to head rather?
+	list_add_tail(&p->run_list, &parent->run_list);
+	p->array = parent->array;
+	p->array->nr_active++;
+	rq->nr_running++;
+	inc_rt_tasks(p, rq);
+}
+
+/*
+ * __activate_idle_task - move idle task to the _front_ of runqueue.
+ */
+static inline void __activate_idle_task(task_t *p, runqueue_t *rq)
+{
+	enqueue_task_head(p, rq->active);
+	rq->nr_running++;
+	WARN_ON(rt_task(p));
+}
+/*
  * activate_task - move a task to the runqueue and do priority recalculation
  *
  * Update all the scheduling statistics stuff. (sleep average
@@ -1008,6 +1150,101 @@ nextgroup:
 	return idlest;
 }
 
+#ifdef CONFIG_PREEMPT_RT
+
+static task_t * pick_rt_task(runqueue_t *src_rq, int this_cpu)
+{
+	struct list_head *head, *curr;
+	prio_array_t *array;
+	task_t *tmp;
+	int idx;
+
+	WARN_ON(!spin_is_locked(&src_rq->lock));
+	/*
+	 * Only consider the active array - we are looking for
+	 * RT tasks. Must have 2 tasks at least:
+	 */
+	array = src_rq->active;
+	if (unlikely(array->nr_active < 2))
+		return NULL;
+
+	idx = sched_find_first_bit(array->bitmap);
+next_in_bitmap:
+	/*
+	 * Only non-RT tasks available - abort the search:
+	 */
+	if (idx >= MAX_RT_PRIO)
+		return NULL;
+
+	head = array->queue + idx;
+	curr = head->next;
+next_in_queue:
+	tmp = list_entry(curr, task_t, run_list);
+	/*
+	 * Return the highest-prio non-running RT task (if task
+	 * may run on this CPU):
+	 */
+	if (!task_running(src_rq, tmp) &&
+				cpu_isset(this_cpu, tmp->cpus_allowed))
+		return tmp;
+
+	curr = curr->next;
+	if (curr != head)
+		goto next_in_queue;
+
+	idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx + 1);
+	goto next_in_bitmap;
+}
+
+static void double_lock_balance(runqueue_t *this_rq, runqueue_t *busiest);
+
+/*
+ * Pull RT tasks from other CPUs in the RT-overload
+ * case. Interrupts are disabled, local rq is locked.
+ */
+static void pull_rt_tasks(runqueue_t *this_rq, int this_cpu)
+{
+	runqueue_t *src_rq;
+	task_t *p;
+	int cpu;
+
+	WARN_ON(!raw_irqs_disabled());
+
+	for_each_online_cpu(cpu) {
+		if (cpu == this_cpu)
+			continue;
+		src_rq = cpu_rq(cpu);
+		if (src_rq->rt_nr_running <= 1)
+			continue;
+
+		double_lock_balance(this_rq, src_rq);
+
+		p = pick_rt_task(src_rq, this_cpu);
+
+		if (p /* && TASK_PREEMPTS_CURR(p, this_rq) */ ) {
+			WARN_ON(p == src_rq->curr);
+			WARN_ON(!p->array);
+			rt_overload_pulled++;
+
+			set_task_cpu(p, this_cpu);
+
+			p->timestamp = p->timestamp -
+				src_rq->timestamp_last_tick
+				+ this_rq->timestamp_last_tick;
+			deactivate_task(p, src_rq);
+			activate_task(p, this_rq, 0);
+			/*
+			 * We continue with the search, just in
+			 * case there's an even higher prio task
+			 * in another runqueue.
+			 */
+		}
+		spin_unlock(&src_rq->lock);
+	}
+}
+
+#endif
+
 /*
  * find_idlest_queue - find the idlest runqueue among the cpus in group.
  */
@@ -1140,7 +1377,7 @@ static inline int wake_idle(int cpu, tas
  *
  * returns failure only if the task is already active.
  */
-static int try_to_wake_up(task_t *p, unsigned int state, int sync)
+static int try_to_wake_up(task_t *p, unsigned int state, int sync, int mutex)
 {
 	int cpu, this_cpu, success = 0;
 	unsigned long flags;
@@ -1152,6 +1389,13 @@ static int try_to_wake_up(task_t *p, uns
 	int new_cpu;
 #endif
 
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * sync wakeups can increase wakeup latencies:
+	 */
+	if (rt_task(p))
+		sync = 0;
+#endif
 	rq = task_rq_lock(p, &flags);
 	old_state = p->state;
 	if (!(old_state & state))
@@ -1251,6 +1495,16 @@ out_set_cpu:
 
 		this_cpu = smp_processor_id();
 		cpu = task_cpu(p);
+	} else {
+		/*
+		 * If a newly woken up RT task cannot preempt the
+		 * current (RT) task then try to find another
+		 * CPU it can preempt:
+		 */
+		if (rt_task(p) && !TASK_PREEMPTS_CURR(p, rq)) {
+			smp_send_reschedule_allbutself();
+			rt_overload_wakeup++;
+		}
 	}
 
 out_activate:
@@ -1265,48 +1519,113 @@ out_activate:
 	}
 
 	/*
-	 * Tasks that have marked their sleep as noninteractive get
-	 * woken up without updating their sleep average. (i.e. their
-	 * sleep is handled in a priority-neutral manner, no priority
-	 * boost and no penalty.)
-	 */
-	if (old_state & TASK_NONINTERACTIVE)
-		__activate_task(p, rq);
-	else
-		activate_task(p, rq, cpu == this_cpu);
-	/*
 	 * Sync wakeups (i.e. those types of wakeups where the waker
 	 * has indicated that it will leave the CPU in short order)
-	 * don't trigger a preemption, if the woken up task will run on
-	 * this cpu. (in this case the 'I will reschedule' promise of
-	 * the waker guarantees that the freshly woken up task is going
-	 * to be considered on this CPU.)
+	 * trigger a 'delayed preemption', if the woken up task will run on
+	 * this cpu. Delayed preemption is guaranteed to happen upon
+	 * return to userspace.
 	 */
 	if (!sync || cpu != this_cpu) {
-		if (TASK_PREEMPTS_CURR(p, rq))
+		/*
+		 * Mutex wakeups cause no boosting:
+		 */
+		if (mutex)
+			__activate_task(p, rq);
+		else
+			activate_task(p, rq, cpu == this_cpu);
+		if (TASK_PREEMPTS_CURR(p, rq)) {
+			trace_start_sched_wakeup(p, rq);
 			resched_task(rq->curr);
+		}
+	} else {
+		activate_task(p, rq, cpu == this_cpu);
+		if (TASK_PREEMPTS_CURR(p, rq))
+			set_tsk_need_resched_delayed(rq->curr);
 	}
+	if (rq->curr && p && rq && _need_resched())
+		trace_special_pid(p->pid, p->prio, rq->curr->prio);
 	success = 1;
 
 out_running:
-	p->state = TASK_RUNNING;
+	if (mutex)
+		p->state = TASK_RUNNING_MUTEX;
+	else
+		p->state = TASK_RUNNING;
 out:
-	task_rq_unlock(rq, &flags);
+#ifdef PREEMPT_DIRECT
+	spin_unlock(&rq->lock);
+	/*
+	 * Common place where preemption is requested - if we can
+	 * reschedule then do it here without enabling interrupts
+	 * again (and lengthening latency):
+	 */
+	if (_need_resched() && !irqs_disabled_flags(flags) && !preempt_count())
+		preempt_schedule_irq();
+	raw_local_irq_restore(flags);
+#else
+	spin_unlock_irqrestore(&rq->lock, flags);
+#endif
+	/* no need to check for preempt here - we just handled it */
 
 	return success;
 }
 
-int fastcall wake_up_process(task_t *p)
+int fastcall wake_up_process(task_t * p)
 {
-	return try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
-				 TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE, 0);
+	int ret;
+
+	check_preempt_wakeup(p);
+	ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE |
+				 TASK_UNINTERRUPTIBLE, 0, 0);
+	mcount();
+	return ret;
 }
 
 EXPORT_SYMBOL(wake_up_process);
 
+int fastcall wake_up_process_sync(task_t * p)
+{
+	int ret;
+
+	check_preempt_wakeup(p);
+	ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE |
+				 TASK_UNINTERRUPTIBLE, 1, 0);
+	mcount();
+	return ret;
+}
+
+EXPORT_SYMBOL(wake_up_process_sync);
+
+
+int fastcall wake_up_process_mutex(task_t * p)
+{
+	int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE |
+				 TASK_UNINTERRUPTIBLE, 0, 1);
+	mcount();
+	return ret;
+}
+
+EXPORT_SYMBOL(wake_up_process_mutex);
+
+int fastcall wake_up_process_mutex_sync(task_t * p)
+{
+	int ret = try_to_wake_up(p, TASK_STOPPED | TASK_TRACED |
+				 TASK_RUNNING_MUTEX | TASK_INTERRUPTIBLE |
+				 TASK_UNINTERRUPTIBLE, 1, 1);
+	mcount();
+	return ret;
+}
+
+EXPORT_SYMBOL(wake_up_process_mutex_sync);
+
 int fastcall wake_up_state(task_t *p, unsigned int state)
 {
-	return try_to_wake_up(p, state, 0);
+	int ret = try_to_wake_up(p, state | TASK_RUNNING_MUTEX, 0, 0);
+	mcount();
+	return ret;
 }
 
 /*
@@ -1329,6 +1648,27 @@ void fastcall sched_fork(task_t *p, int 
 	 * event cannot wake it up and insert it on the runqueue either.
 	 */
 	p->state = TASK_RUNNING;
+#ifdef CONFIG_PREEMPT_RT
+	/*
+	 * Some callers of copy_process() (e.g. kernel_thread()) might
+	 * hold locks/semaphores, which might cause the current thread
+	 * to be boosted. To make sure it does not leak to the child,
+	 * we restore the parent's normal prio into the child:
+	 */
+	{
+		if (!rt_prio(current->normal_prio) && rt_prio(current->prio)) {
+			static int once = 1;
+
+			if (once) {
+				once = 0;
+				printk("BUG in %s/%d: priority-boost leaks to child! fixed it up.\n",
+					current->comm, current->pid);
+				dump_stack();
+			}
+			p->prio = current->normal_prio;
+		}
+	}
+#endif
 	INIT_LIST_HEAD(&p->run_list);
 	p->array = NULL;
 #ifdef CONFIG_SCHEDSTATS
@@ -1346,7 +1686,7 @@ void fastcall sched_fork(task_t *p, int 
 	 * total amount of pending timeslices in the system doesn't change,
 	 * resulting in more scheduling fairness.
 	 */
-	local_irq_disable();
+	raw_local_irq_disable();
 	p->time_slice = (current->time_slice + 1) >> 1;
 	/*
 	 * The remainder of the first timeslice might be recovered by
@@ -1364,7 +1704,7 @@ void fastcall sched_fork(task_t *p, int 
 		current->time_slice = 1;
 		scheduler_tick();
 	}
-	local_irq_enable();
+	raw_local_irq_enable();
 	put_cpu();
 }
 
@@ -1395,7 +1735,7 @@ void fastcall wake_up_new_task(task_t *p
 	p->sleep_avg = JIFFIES_TO_NS(CURRENT_BONUS(p) *
 		CHILD_PENALTY / 100 * MAX_SLEEP_AVG / MAX_BONUS);
 
-	p->prio = effective_prio(p);
+	p->prio = __recalc_task_prio(p);
 
 	if (likely(cpu == this_cpu)) {
 		if (!(clone_flags & CLONE_VM)) {
@@ -1408,15 +1748,17 @@ void fastcall wake_up_new_task(task_t *p
 				__activate_task(p, rq);
 			else {
 				p->prio = current->prio;
-				list_add_tail(&p->run_list, &current->run_list);
-				p->array = current->array;
-				p->array->nr_active++;
-				rq->nr_running++;
+				p->normal_prio = current->normal_prio;
+				__activate_task_after(p, current, rq);
 			}
 			set_need_resched();
-		} else
+			trace_start_sched_wakeup(p, rq);
+		} else {
 			/* Run child last */
 			__activate_task(p, rq);
+			if (rt_task(p) && TASK_PREEMPTS_CURR(p, rq))
+				set_need_resched();
+		}
 		/*
 		 * We skip the following code due to cpu == this_cpu
 	 	 *
@@ -1533,10 +1875,25 @@ static inline void finish_task_switch(ru
 	 *		Manfred Spraul <manfred@colorfullife.com>
 	 */
 	prev_task_flags = prev->flags;
-	finish_arch_switch(prev);
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	/*
+	 * If we pushed an RT task off the runqueue,
+	 * then kick other CPUs, they might run it:
+	 */
+	if (unlikely(rt_task(current) && prev->array && rt_task(prev))) {
+		rt_overload_schedule++;
+		smp_send_reschedule_allbutself();
+	}
+#endif
+	_finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
+	trace_stop_sched_switched(current);
+	/*
+	 * Delay the final freeing of the mm or task, so that we don't have
+	 * to do complex work from within the scheduler:
+	 */
 	if (mm)
-		mmdrop(mm);
+		mmdrop_delayed(mm);
 	if (unlikely(prev_task_flags & PF_DEAD))
 		put_task_struct(prev);
 }
@@ -1548,12 +1905,17 @@ static inline void finish_task_switch(ru
 asmlinkage void schedule_tail(task_t *prev)
 	__releases(rq->lock)
 {
-	runqueue_t *rq = this_rq();
-	finish_task_switch(rq, prev);
+	preempt_disable(); // TODO: move this to fork setup
+	finish_task_switch(this_rq(), prev);
+	__preempt_enable_no_resched();
+	raw_local_irq_enable();
 #ifdef __ARCH_WANT_UNLOCKED_CTXSW
 	/* In this case, finish_task_switch does not reenable preemption */
 	preempt_enable();
+#else
+	preempt_check_resched();
 #endif
+
 	if (current->set_child_tid)
 		put_user(current->pid, current->set_child_tid);
 }
@@ -1581,6 +1943,13 @@ task_t * context_switch(runqueue_t *rq, 
 		rq->prev_mm = oldmm;
 	}
 
+	trace_cmdline();
+
+#ifdef CURRENT_PTR
+	barrier();
+	*current_ptr = next;
+	*current_ti_ptr = next->thread_info;
+#endif
 	/* Here we just switch the register state and the stack. */
 	switch_to(prev, next, prev);
 
@@ -1621,6 +1990,21 @@ unsigned long nr_uninterruptible(void)
 	return sum;
 }
 
+unsigned long nr_uninterruptible_cpu(int cpu)
+{
+	return cpu_rq(cpu)->nr_uninterruptible;	/* this one runqueue's counter only; read without taking rq->lock */
+}
+
+unsigned long rt_nr_running_cpu(int cpu)
+{
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	return cpu_rq(cpu)->rt_nr_running;	/* read without taking rq->lock */
+#else
+	return 0;	/* rt_nr_running is only read on PREEMPT_RT + SMP builds */
+#endif /* CONFIG_PREEMPT_RT && CONFIG_SMP */
+}
+
+
 unsigned long long nr_context_switches(void)
 {
 	unsigned long long i, sum = 0;
@@ -2438,10 +2822,11 @@ unsigned long long current_sched_time(co
 {
 	unsigned long long ns;
 	unsigned long flags;
-	local_irq_save(flags);
+
+	raw_local_irq_save(flags);
 	ns = max(tsk->timestamp, task_rq(tsk)->timestamp_last_tick);
 	ns = tsk->sched_time + (sched_clock() - ns);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 	return ns;
 }
 
@@ -2550,6 +2935,8 @@ void scheduler_tick(void)
 	task_t *p = current;
 	unsigned long long now = sched_clock();
 
+	BUG_ON(!raw_irqs_disabled());
+
 	update_cpu_clock(p, rq, now);
 
 	rq->timestamp_last_tick = now;
@@ -2573,11 +2960,17 @@ void scheduler_tick(void)
 	 * priority until it either goes to sleep or uses up its
 	 * timeslice. This makes it possible for interactive tasks
 	 * to use up their timeslices at their highest priority levels.
+	 *
+	 * Priority-boosted SCHED_NORMAL tasks may go here too.
 	 */
 	if (rt_task(p)) {
 		/*
 		 * RR tasks need a special form of timeslice management.
 		 * FIFO tasks have no timeslices.
+		 *
+		 * On PREEMPT_RT, boosted tasks will also get into this
+		 * branch and won't get their timeslice decreased until
+		 * they have done their work.
 		 */
 		if ((p->policy == SCHED_RR) && !--p->time_slice) {
 			p->time_slice = task_timeslice(p);
@@ -2592,7 +2985,7 @@ void scheduler_tick(void)
 	if (!--p->time_slice) {
 		dequeue_task(p, rq->active);
 		set_tsk_need_resched(p);
-		p->prio = effective_prio(p);
+		p->prio = __recalc_task_prio(p);
 		p->time_slice = task_timeslice(p);
 		p->first_time_slice = 0;
 
@@ -2698,13 +3091,13 @@ static inline unsigned long smt_slice(ta
 	return p->time_slice * (100 - sd->per_cpu_gain) / 100;
 }
 
-static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
+static int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
 {
 	struct sched_domain *tmp, *sd = NULL;
 	cpumask_t sibling_map;
 	prio_array_t *array;
-	int ret = 0, i;
-	task_t *p;
+	int ret = 0, i, stop_sched_switched = 0;
+	task_t *p = NULL;
 
 	for_each_domain(this_cpu, tmp)
 		if (tmp->flags & SD_SHARE_CPUPOWER)
@@ -2766,6 +3159,12 @@ static inline int dependent_sleeper(int 
 				!TASK_PREEMPTS_CURR(p, smt_rq) &&
 				smt_slice(smt_curr, sd) > task_timeslice(p))
 					ret = 1;
+		if (ret) {
+			trace_special_pid(smt_curr->pid, smt_curr->prio,
+						smt_curr->static_prio);
+			trace_special_pid(p->pid, p->prio, p->static_prio);
+			stop_sched_switched = 1;
+		}
 
 check_smt_task:
 		if ((!smt_curr->mm && smt_curr != smt_rq->idle) ||
@@ -2796,6 +3195,9 @@ check_smt_task:
 out_unlock:
 	for_each_cpu_mask(i, sibling_map)
 		spin_unlock(&cpu_rq(i)->lock);
+	if (p && stop_sched_switched)
+		trace_stop_sched_switched(p);
+
 	return ret;
 }
 #else
@@ -2809,42 +3211,51 @@ static inline int dependent_sleeper(int 
 }
 #endif
 
-#if defined(CONFIG_PREEMPT) && defined(CONFIG_DEBUG_PREEMPT)
+#if defined(CONFIG_LATENCY_TRACE) && defined(CONFIG_DEBUG_DEADLOCKS)
 
-void fastcall add_preempt_count(int val)
+static void trace_array(prio_array_t *array)
 {
-	/*
-	 * Underflow?
-	 */
-	BUG_ON((preempt_count() < 0));
-	preempt_count() += val;
-	/*
-	 * Spinlock count overflowing soon?
-	 */
-	BUG_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10);
+	int i;
+	task_t *p;
+	struct list_head *head, *tmp;
+
+	for (i = 0; i < MAX_PRIO; i++) {
+		head = array->queue + i;
+		if (list_empty(head)) {
+			WARN_ON(test_bit(i, array->bitmap));
+			continue;
+		}
+		WARN_ON(!test_bit(i, array->bitmap));
+		list_for_each(tmp, head) {
+			p = list_entry(tmp, task_t, run_list);
+			trace_special_pid(p->pid, p->prio,
+				p->policy == SCHED_NORMAL ?
+					p->static_prio :
+					(MAX_RT_PRIO-1) - p->rt_priority);
+		}
+	}
 }
-EXPORT_SYMBOL(add_preempt_count);
 
-void fastcall sub_preempt_count(int val)
+static inline void trace_all_runnable_tasks(runqueue_t *rq)
+{
+	if (trace_enabled) {
+		trace_array(rq->active);
+		trace_array(rq->expired);
+	}
+}
+
+#else
+
+static inline void trace_all_runnable_tasks(runqueue_t *rq)
 {
-	/*
-	 * Underflow?
-	 */
-	BUG_ON(val > preempt_count());
-	/*
-	 * Is the spinlock portion underflowing?
-	 */
-	BUG_ON((val < PREEMPT_MASK) && !(preempt_count() & PREEMPT_MASK));
-	preempt_count() -= val;
 }
-EXPORT_SYMBOL(sub_preempt_count);
 
 #endif
 
 /*
- * schedule() is the main scheduler function.
+ * __schedule() is the main scheduler function.
  */
-asmlinkage void __sched schedule(void)
+void __sched __schedule(void)
 {
 	long *switch_count;
 	task_t *prev, *next;
@@ -2855,26 +3266,24 @@ asmlinkage void __sched schedule(void)
 	unsigned long run_time;
 	int cpu, idx, new_prio;
 
+	WARN_ON(system_state == SYSTEM_BOOTING);
 	/*
-	 * Test if we are atomic.  Since do_exit() needs to call into
-	 * schedule() atomically, we ignore that path for now.
-	 * Otherwise, whine if we are scheduling when we should not be.
-	 */
-	if (likely(!current->exit_state)) {
-		if (unlikely(in_atomic())) {
-			printk(KERN_ERR "scheduling while atomic: "
-				"%s/0x%08x/%d\n",
-				current->comm, preempt_count(), current->pid);
-			dump_stack();
-		}
+	 * Test if we are atomic.
+	 */
+	if (unlikely(in_atomic())) {
+		stop_trace();
+		printk(KERN_ERR "BUG: scheduling while atomic: "
+			"%s/0x%08x/%d\n",
+			current->comm, preempt_count(), current->pid);
+		print_symbol("caller is %s\n",
+			(long)__builtin_return_address(0));
+		dump_stack();
 	}
 	profile_hit(SCHED_PROFILING, __builtin_return_address(0));
 
-need_resched:
-	preempt_disable();
+	preempt_disable(); // FIXME: disable irqs here
 	prev = current;
 	release_kernel_lock(prev);
-need_resched_nonpreemptible:
 	rq = this_rq();
 
 	/*
@@ -2882,7 +3291,7 @@ need_resched_nonpreemptible:
 	 * Remove this check after it has been exercised a bit.
 	 */
 	if (unlikely(prev == rq->idle) && prev->state != TASK_RUNNING) {
-		printk(KERN_ERR "bad: scheduling from the idle thread!\n");
+		printk(KERN_ERR "BUG: scheduling from the idle thread!\n");
 		dump_stack();
 	}
 
@@ -2901,13 +3310,12 @@ need_resched_nonpreemptible:
 	 */
 	run_time /= (CURRENT_BONUS(prev) ? : 1);
 
+	cpu = smp_processor_id();
 	spin_lock_irq(&rq->lock);
 
-	if (unlikely(prev->flags & PF_DEAD))
-		prev->state = EXIT_DEAD;
-
-	switch_count = &prev->nivcsw;
-	if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) {
+	switch_count = &prev->nvcsw; // TODO: temporary - to see it in vmstat
+	if ((prev->state & ~TASK_RUNNING_MUTEX) &&
+			!(preempt_count() & PREEMPT_ACTIVE)) {
 		switch_count = &prev->nvcsw;
 		if (unlikely((prev->state & TASK_INTERRUPTIBLE) &&
 				unlikely(signal_pending(prev))))
@@ -2915,11 +3323,27 @@ need_resched_nonpreemptible:
 		else {
 			if (prev->state == TASK_UNINTERRUPTIBLE)
 				rq->nr_uninterruptible++;
+			touch_light_softlockup_watchdog();
 			deactivate_task(prev, rq);
 		}
 	}
+	if (preempt_count() & PREEMPT_ACTIVE)
+		sub_preempt_count(PREEMPT_ACTIVE);
+	if (unlikely(prev->flags & PF_DEAD)) {
+		if (prev->state != TASK_RUNNING) {
+			printk("prev->state: %ld != TASK_RUNNING??\n",
+				prev->state);
+			WARN_ON(1);
+		} else
+			deactivate_task(prev, rq);
+		prev->state = EXIT_DEAD;
+	}
+
+#if defined(CONFIG_PREEMPT_RT) && defined(CONFIG_SMP)
+	if (unlikely(atomic_read(&rt_overload)))
+		pull_rt_tasks(rq, cpu);
+#endif
 
-	cpu = smp_processor_id();
 	if (unlikely(!rq->nr_running)) {
 go_idle:
 		idle_balance(cpu, rq);
@@ -2991,6 +3415,7 @@ switch_tasks:
 	prefetch(next);
 	prefetch_stack(next);
 	clear_tsk_need_resched(prev);
+	clear_tsk_need_resched_delayed(prev);
 	rcu_qsctr_inc(task_cpu(prev));
 
 	update_cpu_clock(prev, rq, now);
@@ -3000,6 +3425,8 @@ switch_tasks:
 		prev->sleep_avg = 0;
 	prev->timestamp = prev->last_ran = now;
 
+	trace_all_runnable_tasks(rq);
+
 	sched_info_switch(prev, next);
 	if (likely(prev != next)) {
 		next->timestamp = now;
@@ -3010,26 +3437,81 @@ switch_tasks:
 		prepare_task_switch(rq, next);
 		prev = context_switch(rq, prev, next);
 		barrier();
+		trace_special_pid(prev->pid, prev->prio, current->prio);
 		/*
 		 * this_rq must be evaluated again because prev may have moved
 		 * CPUs since it called schedule(), thus the 'rq' on its stack
 		 * frame will be invalid.
 		 */
 		finish_task_switch(this_rq(), prev);
-	} else
-		spin_unlock_irq(&rq->lock);
+		__preempt_enable_no_resched();
+	} else {
+		__preempt_enable_no_resched();
+		spin_unlock(&rq->lock);
+		trace_stop_sched_switched(next);
+	}
 
-	prev = current;
-	if (unlikely(reacquire_kernel_lock(prev) < 0))
-		goto need_resched_nonpreemptible;
-	preempt_enable_no_resched();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
-		goto need_resched;
+	reacquire_kernel_lock(current);
 }
 
+/*
+ * schedule() is the checked entry point: it loops over __schedule().
+ */
+asmlinkage void __sched schedule(void)
+{
+	WARN_ON(system_state == SYSTEM_BOOTING);
+	/*
+	 * Test if we have interrupts disabled.
+	 */
+	if (unlikely(irqs_disabled() || raw_irqs_disabled())) {
+		stop_trace();
+		printk(KERN_ERR "BUG: scheduling with irqs disabled: "
+			"%s/0x%08x/%d\n",
+				current->comm, preempt_count(), current->pid);
+		print_symbol("caller is %s\n",
+			(long)__builtin_return_address(0));
+		dump_stack();
+	}
+	if (unlikely(current->flags & PF_NOSCHED)) {
+		current->flags &= ~PF_NOSCHED;
+		printk(KERN_ERR "%s:%d userspace BUG: scheduling in user-atomic context!\n", current->comm, current->pid);
+		dump_stack();
+		send_sig(SIGUSR2, current, 1);
+	}
+	do {
+		__schedule();
+	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED)));
+	raw_local_irq_enable(); // TODO: do sti; ret
+}
 EXPORT_SYMBOL(schedule);
 
 #ifdef CONFIG_PREEMPT
+
+int kernel_preemption = 1;
+
+static int __init preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3)) {
+		if (kernel_preemption) {
+			printk("turning off kernel preemption!\n");
+			kernel_preemption = 0;
+		}
+		return 1;
+	}
+	if (!strncmp(str, "on", 2)) {
+		if (!kernel_preemption) {
+			printk("turning on kernel preemption!\n");
+			kernel_preemption = 1;
+		}
+		return 1;
+	}
+	get_option(&str, &kernel_preemption);
+
+	return 1;
+}
+
+__setup("preempt=", preempt_setup);
+
 /*
  * this is is the entry point to schedule() from in-kernel preemption
  * off of preempt_enable.  Kernel preemptions off return from interrupt
@@ -3042,14 +3524,17 @@ asmlinkage void __sched preempt_schedule
 	struct task_struct *task = current;
 	int saved_lock_depth;
 #endif
+	if (!kernel_preemption)
+		return;
 	/*
 	 * If there is a non-zero preempt_count or interrupts are disabled,
 	 * we do not want to preempt the current task.  Just return..
 	 */
-	if (unlikely(ti->preempt_count || irqs_disabled()))
+	if (unlikely(ti->preempt_count || irqs_disabled() || raw_irqs_disabled()))
 		return;
 
 need_resched:
+	raw_local_irq_disable();
 	add_preempt_count(PREEMPT_ACTIVE);
 	/*
 	 * We keep the big kernel semaphore locked, but we
@@ -3060,25 +3545,24 @@ need_resched:
 	saved_lock_depth = task->lock_depth;
 	task->lock_depth = -1;
 #endif
-	schedule();
+	__schedule();
 #ifdef CONFIG_PREEMPT_BKL
 	task->lock_depth = saved_lock_depth;
 #endif
-	sub_preempt_count(PREEMPT_ACTIVE);
-
 	/* we could miss a preemption opportunity between schedule and now */
 	barrier();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED)))
 		goto need_resched;
+	raw_local_irq_enable();
 }
 
 EXPORT_SYMBOL(preempt_schedule);
 
 /*
- * this is is the entry point to schedule() from kernel preemption
- * off of irq context.
- * Note, that this is called and return with irqs disabled. This will
- * protect us against recursive calling from irq.
+ * this is the entry point for the IRQ return path. Called with
+ * interrupts disabled.  To avoid infinite irq-entry recursion problems
+ * with fast-paced IRQ sources we do all of this carefully to never
+ * enable interrupts again.
  */
 asmlinkage void __sched preempt_schedule_irq(void)
 {
@@ -3087,10 +3571,17 @@ asmlinkage void __sched preempt_schedule
 	struct task_struct *task = current;
 	int saved_lock_depth;
 #endif
-	/* Catch callers which need to be fixed*/
-	BUG_ON(ti->preempt_count || !irqs_disabled());
+	if (!kernel_preemption)
+		return;
+	/*
+	 * If there is a non-zero preempt_count then just return.
+	 * (interrupts are disabled)
+	 */
+	if (unlikely(ti->preempt_count))
+		return;
 
 need_resched:
+	raw_local_irq_disable();
 	add_preempt_count(PREEMPT_ACTIVE);
 	/*
 	 * We keep the big kernel semaphore locked, but we
@@ -3101,17 +3592,16 @@ need_resched:
 	saved_lock_depth = task->lock_depth;
 	task->lock_depth = -1;
 #endif
-	local_irq_enable();
-	schedule();
-	local_irq_disable();
+	__schedule();
+
+	raw_local_irq_disable();
+
 #ifdef CONFIG_PREEMPT_BKL
 	task->lock_depth = saved_lock_depth;
 #endif
-	sub_preempt_count(PREEMPT_ACTIVE);
-
 	/* we could miss a preemption opportunity between schedule and now */
 	barrier();
-	if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+	if (unlikely(test_thread_flag(TIF_NEED_RESCHED) || test_thread_flag(TIF_NEED_RESCHED_DELAYED)))
 		goto need_resched;
 }
 
@@ -3121,7 +3611,7 @@ int default_wake_function(wait_queue_t *
 			  void *key)
 {
 	task_t *p = curr->private;
-	return try_to_wake_up(p, mode, sync);
+	return try_to_wake_up(p, mode | TASK_RUNNING_MUTEX, sync, 0);
 }
 
 EXPORT_SYMBOL(default_wake_function);
@@ -3165,8 +3655,9 @@ void fastcall __wake_up(wait_queue_head_
 	unsigned long flags;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, 0, key);
+	__wake_up_common(q, mode, nr_exclusive, 1, key);
 	spin_unlock_irqrestore(&q->lock, flags);
+	preempt_check_resched_delayed();
 }
 
 EXPORT_SYMBOL(__wake_up);
@@ -3217,8 +3708,9 @@ void fastcall complete(struct completion
 	spin_lock_irqsave(&x->wait.lock, flags);
 	x->done++;
 	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
-			 1, 0, NULL);
+			 1, 1, NULL);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
+	preempt_check_resched_delayed();
 }
 EXPORT_SYMBOL(complete);
 
@@ -3229,11 +3721,19 @@ void fastcall complete_all(struct comple
 	spin_lock_irqsave(&x->wait.lock, flags);
 	x->done += UINT_MAX/2;
 	__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
-			 0, 0, NULL);
+			 0, 1, NULL);
 	spin_unlock_irqrestore(&x->wait.lock, flags);
+	preempt_check_resched_delayed();
 }
 EXPORT_SYMBOL(complete_all);
 
+unsigned int fastcall completion_done(struct completion *x)
+{
+	return x->done;	/* read without x->wait.lock, so this is only a snapshot of the counter */
+}
+EXPORT_SYMBOL(completion_done);
+
+
 void fastcall __sched wait_for_completion(struct completion *x)
 {
 	might_sleep();
@@ -3460,7 +3960,7 @@ void set_user_nice(task_t *p, long nice)
 	new_prio = NICE_TO_PRIO(nice);
 	delta = new_prio - old_prio;
 	p->static_prio = NICE_TO_PRIO(nice);
-	p->prio += delta;
+	p->prio = __recalc_task_prio(p);
 
 	if (array) {
 		enqueue_task(p, array);
@@ -3490,6 +3990,53 @@ int can_nice(const task_t *p, const int 
 		capable(CAP_SYS_NICE));
 }
 
+/*
+ * Used by the PREEMPT_RT code to implement
+ * priority inheritance logic:
+ */
+void mutex_setprio(task_t *p, int prio)
+{
+	unsigned long flags;
+	prio_array_t *array;
+	runqueue_t *rq;
+	int oldprio, prev_resched;
+
+	BUG_ON(prio < 0 || prio > MAX_PRIO);
+
+	rq = task_rq_lock(p, &flags);
+
+	oldprio = p->prio;
+	array = p->array;
+	if (array)
+		dequeue_task(p, array);
+	p->prio = prio;
+
+	trace_special_pid(p->pid, oldprio, prio);
+	prev_resched = _need_resched();
+	if (array) {
+		/*
+		 * If changing to an RT priority then queue it
+		 * in the active array!
+		 */
+		if (rt_task(p))
+			array = rq->active;
+		enqueue_task(p, array);
+		/*
+		 * Reschedule if we are currently running on this runqueue and
+		 * our priority decreased, or if we are not currently running on
+		 * this runqueue and our priority is higher than the current's
+		 */
+		if (task_running(rq, p)) {
+			if (p->prio > oldprio)
+				resched_task(rq->curr);
+		} else if (TASK_PREEMPTS_CURR(p, rq))
+			resched_task(rq->curr);
+	}
+	trace_special(prev_resched, _need_resched(), 0);
+
+	task_rq_unlock(rq, &flags);
+}
+
 #ifdef __ARCH_WANT_SYS_NICE
 
 /*
@@ -3591,10 +4138,8 @@ static void __setscheduler(struct task_s
 	BUG_ON(p->array);
 	p->policy = policy;
 	p->rt_priority = prio;
-	if (policy != SCHED_NORMAL)
-		p->prio = MAX_RT_PRIO-1 - p->rt_priority;
-	else
-		p->prio = p->static_prio;
+	__recalc_task_prio(p);
+	p->prio = p->normal_prio;
 }
 
 /**
@@ -3977,11 +4522,11 @@ asmlinkage long sys_sched_yield(void)
 	 * Since we are going to call schedule() anyway, there's
 	 * no need to preempt or enable interrupts:
 	 */
-	__release(rq->lock);
-	_raw_spin_unlock(&rq->lock);
-	preempt_enable_no_resched();
+	spin_unlock_no_resched(&rq->lock);
 
-	schedule();
+	__schedule();
+	raw_local_irq_enable();
+	preempt_check_resched();
 
 	return 0;
 }
@@ -3996,10 +4541,11 @@ static inline void __cond_resched(void)
 	if (unlikely(preempt_count()))
 		return;
 	do {
+		raw_local_irq_disable();
 		add_preempt_count(PREEMPT_ACTIVE);
-		schedule();
-		sub_preempt_count(PREEMPT_ACTIVE);
+		__schedule();
 	} while (need_resched());
+	raw_local_irq_enable();
 }
 
 int __sched cond_resched(void)
@@ -4021,43 +4567,121 @@ EXPORT_SYMBOL(cond_resched);
  * operations here to prevent schedule() from being called twice (once via
  * spin_unlock(), once by hand).
  */
-int cond_resched_lock(spinlock_t *lock)
+int __cond_resched_raw_spinlock(raw_spinlock_t *lock)
 {
 	int ret = 0;
 
-	if (need_lockbreak(lock)) {
+	if (need_lockbreak_raw(lock)) {
 		spin_unlock(lock);
 		cpu_relax();
-		ret = 1;
 		spin_lock(lock);
+		ret = 1;
 	}
 	if (need_resched()) {
-		_raw_spin_unlock(lock);
-		preempt_enable_no_resched();
+		spin_unlock_no_resched(lock);
 		__cond_resched();
-		ret = 1;
 		spin_lock(lock);
+		ret = 1;
 	}
 	return ret;
 }
 
-EXPORT_SYMBOL(cond_resched_lock);
+EXPORT_SYMBOL(__cond_resched_raw_spinlock);
+
+#ifdef CONFIG_PREEMPT_RT
+
+int __cond_resched_spinlock(spinlock_t *lock)
+{
+#if (defined(CONFIG_SMP) && defined(CONFIG_PREEMPT)) || defined(CONFIG_PREEMPT_RT)
+	if (lock->break_lock) {
+		lock->break_lock = 0;
+		_spin_unlock(lock);
+		__cond_resched();
+		_spin_lock(lock);
+		return 1;
+	}
+#endif
+	return 0;
+}
+
+EXPORT_SYMBOL(__cond_resched_spinlock);
 
+#endif
+
+
+/*
+ * Preempt a softirq context if necessary:
+ */
 int __sched cond_resched_softirq(void)
 {
+#ifndef CONFIG_PREEMPT_RT
 	BUG_ON(!in_softirq());
 
-	if (need_resched()) {
+	if (softirq_need_resched()) {
 		__local_bh_enable();
 		__cond_resched();
 		local_bh_disable();
 		return 1;
 	}
+#endif
 	return 0;
 }
 
 EXPORT_SYMBOL(cond_resched_softirq);
 
+/*
+ * Preempt a hardirq context if necessary (returns 1 if it tried to):
+ */
+int cond_resched_hardirq(void)
+{
+	BUG_ON(!in_irq());
+
+	if (hardirq_need_resched()) {
+		irq_exit();	/* step out of hardirq context around the reschedule attempt */
+		__cond_resched();
+		irq_enter();	/* and back in before returning to the caller */
+		return 1;
+	}
+	return 0;
+}
+
+EXPORT_SYMBOL(cond_resched_hardirq);
+
+/*
+ * Preempt any context: dispatch on hardirq_count()/softirq_count().
+ */
+int cond_resched_all(void)
+{
+	if (hardirq_count())	/* hardirq is tested first, so it wins if both counts are set */
+		return cond_resched_hardirq();
+	if (softirq_count())
+		return cond_resched_softirq();
+	return cond_resched();
+}
+
+EXPORT_SYMBOL(cond_resched_all);
+
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+
+int voluntary_preemption = 1;
+
+EXPORT_SYMBOL(voluntary_preemption);
+
+static int __init voluntary_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))
+		voluntary_preemption = 0;
+	else
+		get_option(&str, &voluntary_preemption);
+	if (!voluntary_preemption)
+		printk("turning off voluntary preemption!\n");
+
+	return 1;
+}
+
+__setup("voluntary-preempt=", voluntary_preempt_setup);
+
+#endif
 
 /**
  * yield - yield the current processor to other threads.
@@ -4065,12 +4689,31 @@ EXPORT_SYMBOL(cond_resched_softirq);
  * this is a shortcut for kernel-space yielding - it marks the
  * thread runnable and calls sys_sched_yield().
  */
-void __sched yield(void)
+void __sched __yield(void)
 {
 	set_current_state(TASK_RUNNING);
 	sys_sched_yield();
 }
 
+void __sched yield(void)
+{
+	static int once = 1;
+
+	/*
+	 * it's a bug to rely on yield() with RT priorities. We print
+	 * the first occurrence after bootup ... this will still give
+	 * us an idea about the scope of the problem, without spamming
+	 * the syslog:
+	 */
+	if (once && rt_task(current)) {
+		once = 0;
+		printk(KERN_ERR "BUG: %s:%d RT task yield()-ing!\n",
+			current->comm, current->pid);
+		dump_stack();
+	}
+	__yield();
+}
+
 EXPORT_SYMBOL(yield);
 
 /*
@@ -4209,25 +4852,29 @@ static void show_task(task_t *p)
 	task_t *relative;
 	unsigned state;
 	unsigned long free = 0;
-	static const char *stat_nam[] = { "R", "S", "D", "T", "t", "Z", "X" };
+	static const char *stat_nam[] = { "R", "M", "S", "D", "T", "t", "Z", "X" };
 
-	printk("%-13.13s ", p->comm);
+	printk("%-13.13s [%p]", p->comm, p);
 	state = p->state ? __ffs(p->state) + 1 : 0;
 	if (state < ARRAY_SIZE(stat_nam))
 		printk(stat_nam[state]);
 	else
 		printk("?");
 #if (BITS_PER_LONG == 32)
-	if (state == TASK_RUNNING)
+	if (0 && (state == TASK_RUNNING))
 		printk(" running ");
 	else
 		printk(" %08lX ", thread_saved_pc(p));
 #else
-	if (state == TASK_RUNNING)
+	if (0 && (state == TASK_RUNNING))
 		printk("  running task   ");
 	else
 		printk(" %016lx ", thread_saved_pc(p));
 #endif
+	if (task_curr(p))
+		printk("[curr] ");
+	else if (p->array)
+		printk("[on rq #%d] ", task_cpu(p));
 #ifdef CONFIG_DEBUG_STACK_USAGE
 	{
 		unsigned long *n = (unsigned long *) (p->thread_info+1);
@@ -4254,13 +4901,14 @@ static void show_task(task_t *p)
 	else
 		printk(" (NOTLB)\n");
 
-	if (state != TASK_RUNNING)
+//	if (state != TASK_RUNNING)
 		show_stack(p, NULL);
 }
 
 void show_state(void)
 {
 	task_t *g, *p;
+	int do_unlock = 1;
 
 #if (BITS_PER_LONG == 32)
 	printk("\n"
@@ -4271,7 +4919,16 @@ void show_state(void)
 	       "                                                       sibling\n");
 	printk("  task                 PC          pid father child younger older\n");
 #endif
+#ifdef CONFIG_PREEMPT_RT
+	if (!read_trylock(&tasklist_lock)) {
+		printk("hm, tasklist_lock write-locked.\n");
+		printk("ignoring ...\n");
+		do_unlock = 0;
+	}
+#else
 	read_lock(&tasklist_lock);
+#endif
+
 	do_each_thread(g, p) {
 		/*
 		 * reset the NMI-timeout, listing all files on a slow
@@ -4281,7 +4938,9 @@ void show_state(void)
 		show_task(p);
 	} while_each_thread(g, p);
 
-	read_unlock(&tasklist_lock);
+	if (do_unlock)
+		read_unlock(&tasklist_lock);
+	show_all_locks();
 }
 
 /**
@@ -4299,7 +4958,7 @@ void __devinit init_idle(task_t *idle, i
 
 	idle->sleep_avg = 0;
 	idle->array = NULL;
-	idle->prio = MAX_PRIO;
+	idle->prio = idle->normal_prio = MAX_PRIO;
 	idle->state = TASK_RUNNING;
 	idle->cpus_allowed = cpumask_of_cpu(cpu);
 	set_task_cpu(idle, cpu);
@@ -4312,7 +4971,9 @@ void __devinit init_idle(task_t *idle, i
 	spin_unlock_irqrestore(&rq->lock, flags);
 
 	/* Set the preempt count _outside_ the spinlocks! */
-#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_BKL)
+#if defined(CONFIG_PREEMPT) && \
+	!defined(CONFIG_PREEMPT_BKL) && \
+		!defined(CONFIG_PREEMPT_RT)
 	idle->thread_info->preempt_count = (idle->lock_depth >= 0);
 #else
 	idle->thread_info->preempt_count = 0;
@@ -4396,12 +5057,13 @@ EXPORT_SYMBOL_GPL(set_cpus_allowed);
  * So we race with normal scheduler movements, but that's OK, as long
  * as the task is no longer on this CPU.
  */
-static void __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
+static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu)
 {
 	runqueue_t *rq_dest, *rq_src;
+	int ret = 0;
 
 	if (unlikely(cpu_is_offline(dest_cpu)))
-		return;
+		return 0;
 
 	rq_src = cpu_rq(src_cpu);
 	rq_dest = cpu_rq(dest_cpu);
@@ -4414,7 +5076,9 @@ static void __migrate_task(struct task_s
 	if (!cpu_isset(dest_cpu, p->cpus_allowed))
 		goto out;
 
+	WARN_ON(p == rq_src->curr);
 	set_task_cpu(p, dest_cpu);
+
 	if (p->array) {
 		/*
 		 * Sync timestamp with rq_dest's before activating.
@@ -4428,10 +5092,13 @@ static void __migrate_task(struct task_s
 		activate_task(p, rq_dest, 0);
 		if (TASK_PREEMPTS_CURR(p, rq_dest))
 			resched_task(rq_dest->curr);
+		ret = 1;
 	}
 
 out:
 	double_rq_unlock(rq_src, rq_dest);
+
+	return ret;
 }
 
 /*
@@ -4479,7 +5146,7 @@ static int migration_thread(void *data)
 
 		spin_unlock(&rq->lock);
 		__migrate_task(req->task, cpu, req->dest_cpu);
-		local_irq_enable();
+		raw_local_irq_enable();
 
 		complete(&req->done);
 	}
@@ -4543,12 +5210,12 @@ static void migrate_nr_uninterruptible(r
 	runqueue_t *rq_dest = cpu_rq(any_online_cpu(CPU_MASK_ALL));
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	double_rq_lock(rq_src, rq_dest);
 	rq_dest->nr_uninterruptible += rq_src->nr_uninterruptible;
 	rq_src->nr_uninterruptible = 0;
 	double_rq_unlock(rq_src, rq_dest);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 /* Run through task list and migrate tasks from the dead cpu. */
@@ -4626,9 +5293,10 @@ static void migrate_dead(unsigned int de
 	 * that's OK.  No task can be added to this CPU, so iteration is
 	 * fine.
 	 */
+	/* rq->lock must be released across the migration and re-taken afterwards. */
 	spin_unlock_irq(&rq->lock);
 	move_task_off_dead_cpu(dead_cpu, tsk);
 	spin_lock_irq(&rq->lock);
 
 	put_task_struct(tsk);
 }
@@ -5527,6 +6194,7 @@ void __init sched_init(void)
 
 		for (j = 0; j < 2; j++) {
 			array = rq->arrays + j;
+			array->rq = rq;
 			for (k = 0; k < MAX_PRIO; k++) {
 				INIT_LIST_HEAD(array->queue + k);
 				__clear_bit(k, array->bitmap);
@@ -5542,6 +6210,9 @@ void __init sched_init(void)
 	atomic_inc(&init_mm.mm_count);
 	enter_lazy_tlb(&init_mm, current);
 
+#ifdef CONFIG_PREEMPT_RT
+	printk("Real-Time Preemption Support (C) 2004-2005 Ingo Molnar\n");
+#endif
 	/*
 	 * Make us the idle thread. Technically, schedule() should not be
 	 * called from this thread, however somewhere below it might be,
@@ -5551,21 +6222,25 @@ void __init sched_init(void)
 	init_idle(current, smp_processor_id());
 }
 
-#ifdef CONFIG_DEBUG_SPINLOCK_SLEEP
+#if defined(CONFIG_DEBUG_SPINLOCK_SLEEP) || defined(CONFIG_DEBUG_PREEMPT)
 void __might_sleep(char *file, int line)
 {
 #if defined(in_atomic)
 	static unsigned long prev_jiffy;	/* ratelimiting */
 
-	if ((in_atomic() || irqs_disabled()) &&
+	if ((in_atomic() || irqs_disabled() || raw_irqs_disabled()) &&
 	    system_state == SYSTEM_RUNNING && !oops_in_progress) {
+		if (debug_direct_keyboard && hardirq_count())
+			return;
 		if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
 			return;
 		prev_jiffy = jiffies;
-		printk(KERN_ERR "Debug: sleeping function called from invalid"
-				" context at %s:%d\n", file, line);
-		printk("in_atomic():%d, irqs_disabled():%d\n",
-			in_atomic(), irqs_disabled());
+		stop_trace();
+		printk(KERN_ERR "BUG: sleeping function called from invalid"
+				" context %s(%d) at %s:%d\n",
+				current->comm, current->pid, file, line);
+		printk("in_atomic():%d [%08x], irqs_disabled():%d\n",
+			in_atomic(), preempt_count(), irqs_disabled());
 		dump_stack();
 	}
 #endif
@@ -5647,3 +6322,23 @@ void set_curr_task(int cpu, task_t *p)
 }
 
 #endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+void notrace preempt_enable_no_resched(void)
+{
+	static int once = 1;	/* report only the first occurrence */
+
+	barrier();
+	dec_preempt_count();
+
+	if (once && !preempt_count()) {	/* count just reached zero, and this path does no resched check */
+		once = 0;
+		printk(KERN_ERR "BUG: %s:%d task might have lost a preemption check!\n",
+			current->comm, current->pid);
+		dump_stack();
+	}
+}
+
+EXPORT_SYMBOL(preempt_enable_no_resched);
+#endif
+
Index: linux/kernel/signal.c
===================================================================
--- linux.orig/kernel/signal.c
+++ linux/kernel/signal.c
@@ -330,13 +330,20 @@ void __exit_sighand(struct task_struct *
 	/* Ok, we're done with the signal handlers */
 	tsk->sighand = NULL;
 	if (atomic_dec_and_test(&sighand->count))
-		kmem_cache_free(sighand_cachep, sighand);
+		sighand_free(sighand);
 }
 
 void exit_sighand(struct task_struct *tsk)
 {
 	write_lock_irq(&tasklist_lock);
-	__exit_sighand(tsk);
+	rcu_read_lock();
+	if (tsk->sighand != NULL) {
+		struct sighand_struct *sighand = rcu_dereference(tsk->sighand);
+		spin_lock(&sighand->siglock);
+		__exit_sighand(tsk);
+		spin_unlock(&sighand->siglock);
+	}
+	rcu_read_unlock();
 	write_unlock_irq(&tasklist_lock);
 }
 
@@ -346,12 +353,14 @@ void exit_sighand(struct task_struct *ts
 void __exit_signal(struct task_struct *tsk)
 {
 	struct signal_struct * sig = tsk->signal;
-	struct sighand_struct * sighand = tsk->sighand;
+	struct sighand_struct * sighand;
 
 	if (!sig)
 		BUG();
 	if (!atomic_read(&sig->count))
 		BUG();
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
 	spin_lock(&sighand->siglock);
 	posix_cpu_timers_exit(tsk);
 	if (atomic_dec_and_test(&sig->count)) {
@@ -359,6 +368,7 @@ void __exit_signal(struct task_struct *t
 		if (tsk == sig->curr_target)
 			sig->curr_target = next_thread(tsk);
 		tsk->signal = NULL;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		flush_sigqueue(&sig->shared_pending);
 	} else {
@@ -390,9 +400,11 @@ void __exit_signal(struct task_struct *t
 		sig->nvcsw += tsk->nvcsw;
 		sig->nivcsw += tsk->nivcsw;
 		sig->sched_time += tsk->sched_time;
+		__exit_sighand(tsk);
 		spin_unlock(&sighand->siglock);
 		sig = NULL;	/* Marker for below.  */
 	}
+	rcu_read_unlock();
 	clear_tsk_thread_flag(tsk,TIF_SIGPENDING);
 	flush_sigqueue(&tsk->pending);
 	if (sig) {
@@ -853,9 +865,12 @@ specific_send_sig_info(int sig, struct s
 {
 	int ret = 0;
 
-	if (!irqs_disabled())
-		BUG();
+#ifndef CONFIG_PREEMPT_RT
+	BUG_ON(!irqs_disabled());
+#endif
+#ifdef CONFIG_SMP
 	assert_spin_locked(&t->sighand->siglock);
+#endif
 
 	if (((unsigned long)info > 2) && (info->si_code == SI_TIMER))
 		/*
@@ -1114,18 +1129,29 @@ void zap_other_threads(struct task_struc
 }
 
 /*
- * Must be called with the tasklist_lock held for reading!
+ * Must be called under rcu_read_lock() or with tasklist_lock read-held.
  */
 int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p)
 {
 	unsigned long flags;
+	struct sighand_struct *sp;
 	int ret;
 
+retry:
 	ret = check_kill_permission(sig, info, p);
-	if (!ret && sig && p->sighand) {
-		spin_lock_irqsave(&p->sighand->siglock, flags);
+	if (!ret && sig && (sp = p->sighand)) {
+		if (!get_task_struct_rcu(p)) {
+			return -ESRCH;
+		}
+		spin_lock_irqsave(&sp->siglock, flags);
+		if (p->sighand != sp) {
+			spin_unlock_irqrestore(&sp->siglock, flags);
+			put_task_struct(p);
+			goto retry;
+		}
 		ret = __group_send_sig_info(sig, info, p);
-		spin_unlock_irqrestore(&p->sighand->siglock, flags);
+		spin_unlock_irqrestore(&sp->siglock, flags);
+		put_task_struct(p);
 	}
 
 	return ret;
@@ -1170,14 +1196,21 @@ int
 kill_proc_info(int sig, struct siginfo *info, pid_t pid)
 {
 	int error;
+	int acquired_tasklist_lock = 0;
 	struct task_struct *p;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
+	if (unlikely(sig_kernel_stop(sig) || sig == SIGCONT)) {
+		read_lock(&tasklist_lock);
+		acquired_tasklist_lock = 1;
+	}
 	p = find_task_by_pid(pid);
 	error = -ESRCH;
 	if (p)
 		error = group_send_sig_info(sig, info, p);
-	read_unlock(&tasklist_lock);
+	if (unlikely(acquired_tasklist_lock))
+		read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 	return error;
 }
 
@@ -1385,16 +1418,49 @@ send_sigqueue(int sig, struct sigqueue *
 {
 	unsigned long flags;
 	int ret = 0;
+	struct sighand_struct *sh;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
+
+	/*
+	 * The rcu based delayed sighand destroy makes it possible to
+	 * run this without tasklist lock held. The task struct itself
+	 * cannot go away as create_timer did get_task_struct().
+	 *
+	 * We return -1, when the task is marked exiting, so
+	 * posix_timer_event can redirect it to the group leader
+	 *
+	 */
+	rcu_read_lock();
 
 	if (unlikely(p->flags & PF_EXITING)) {
 		ret = -1;
 		goto out_err;
 	}
 
-	spin_lock_irqsave(&p->sighand->siglock, flags);
+	sh = rcu_dereference(p->sighand);
+
+	spin_lock_irqsave(&sh->siglock, flags);
+
+	/*
+	 * We do the check here again to handle the following scenario:
+	 *
+	 * CPU 0		CPU 1
+	 * send_sigqueue
+	 * check PF_EXITING
+	 * interrupt		exit code running
+	 *			__exit_signal
+	 *			lock sighand->siglock
+	 *			unlock sighand->siglock
+	 * lock sh->siglock
+	 * add(tsk->pending) 	flush_sigqueue(tsk->pending)
+	 *
+	 */
+
+	if (unlikely(p->flags & PF_EXITING)) {
+		ret = -1;
+		goto out;
+	}
 
 	if (unlikely(!list_empty(&q->list))) {
 		/*
@@ -1412,17 +1478,16 @@ send_sigqueue(int sig, struct sigqueue *
 		goto out;
 	}
 
-	q->lock = &p->sighand->siglock;
+	q->lock = &sh->siglock;
 	list_add_tail(&q->list, &p->pending.list);
 	sigaddset(&p->pending.signal, sig);
 	if (!sigismember(&p->blocked, sig))
 		signal_wake_up(p, sig == SIGKILL);
 
 out:
-	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+	spin_unlock_irqrestore(&sh->siglock, flags);
 out_err:
-	read_unlock(&tasklist_lock);
-
+	rcu_read_unlock();
 	return ret;
 }
 
@@ -1433,7 +1498,16 @@ send_group_sigqueue(int sig, struct sigq
 	int ret = 0;
 
 	BUG_ON(!(q->flags & SIGQUEUE_PREALLOC));
-	read_lock(&tasklist_lock);
+
+	while(!read_trylock(&tasklist_lock)) {
+		if (!p->sighand)
+			return -1;
+		cpu_relax();
+	}
+	if (unlikely(!p->sighand)) {
+		ret = -1;
+		goto out_err;
+	}
 	spin_lock_irqsave(&p->sighand->siglock, flags);
 	handle_stop_signal(sig, p);
 
@@ -1467,8 +1541,9 @@ send_group_sigqueue(int sig, struct sigq
 	__group_complete_signal(sig, p);
 out:
 	spin_unlock_irqrestore(&p->sighand->siglock, flags);
+out_err:
 	read_unlock(&tasklist_lock);
-	return(ret);
+	return ret;
 }
 
 /*
@@ -1634,6 +1709,7 @@ static void ptrace_stop(int exit_code, i
 	     !unlikely(current->signal->flags & SIGNAL_GROUP_EXIT))) {
 		do_notify_parent_cldstop(current, 1, CLD_TRAPPED);
 		read_unlock(&tasklist_lock);
+		current->flags &= ~PF_NOSCHED;
 		schedule();
 	} else {
 		/*
@@ -1700,6 +1776,7 @@ finish_stop(int stop_count)
 	read_unlock(&tasklist_lock);
 
 out:
+	current->flags &= ~PF_NOSCHED;
 	schedule();
 	/*
 	 * Now we don't run again until continued.
@@ -1859,6 +1936,9 @@ int get_signal_to_deliver(siginfo_t *inf
 	sigset_t *mask = &current->blocked;
 	int signr = 0;
 
+#ifdef CONFIG_PREEMPT_RT
+	might_sleep();
+#endif
 relock:
 	spin_lock_irq(&current->sighand->siglock);
 	for (;;) {
Index: linux/kernel/softirq.c
===================================================================
--- linux.orig/kernel/softirq.c
+++ linux/kernel/softirq.c
@@ -4,6 +4,9 @@
  *	Copyright (C) 1992 Linus Torvalds
  *
  * Rewritten. Old one was good in 2.2, but in 2.3 it was immoral. --ANK (990903)
+ *
+ *	Softirq-split implementation by
+ *	Copyright (C) 2005 Thomas Gleixner, Ingo Molnar
  */
 
 #include <linux/module.h>
@@ -16,6 +19,9 @@
 #include <linux/cpu.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/kallsyms.h>
+#include <linux/syscalls.h>
+#include <linux/delay.h>
 
 #include <asm/irq.h>
 /*
@@ -43,7 +49,13 @@ EXPORT_SYMBOL(irq_stat);
 
 static struct softirq_action softirq_vec[32] __cacheline_aligned_in_smp;
 
-static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
+struct softirqdata {
+	int			nr;	/* softirq index this thread serves */
+	unsigned long		cpu;	/* CPU recorded by cpu_callback() */
+	struct task_struct	*tsk;	/* the thread; NULLed on CPU_DEAD */
+};
+
+static DEFINE_PER_CPU(struct softirqdata, ksoftirqd[MAX_SOFTIRQ]);
 
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
@@ -51,16 +63,31 @@ static DEFINE_PER_CPU(struct task_struct
  * to the pending events, so lets the scheduler to balance
  * the softirq load for us.
  */
-static inline void wakeup_softirqd(void)
+static void wakeup_softirqd(int softirq)
 {
 	/* Interrupts are disabled: no need to stop preemption */
-	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
+	struct task_struct *tsk = __get_cpu_var(ksoftirqd[softirq].tsk);
 
 	if (tsk && tsk->state != TASK_RUNNING)
 		wake_up_process(tsk);
 }
 
 /*
+ * Wake up the softirq threads which have work
+ */
+static void trigger_softirqs(void)
+{
+	u32 pending = local_softirq_pending();	/* snapshot of the pending mask */
+	int curr = 0;		/* softirq index currently in bit 0 of 'pending' */
+
+	while (pending) {
+		if (pending & 1)
+			wakeup_softirqd(curr);	/* wake the thread for index 'curr' */
+		pending >>= 1;
+		curr++;
+	}
+}
+/*
  * We restart softirq processing MAX_SOFTIRQ_RESTART times,
  * and we fall back to softirqd after that.
  *
@@ -71,7 +98,7 @@ static inline void wakeup_softirqd(void)
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-asmlinkage void __do_softirq(void)
+asmlinkage void ___do_softirq(void)
 {
 	struct softirq_action *h;
 	__u32 pending;
@@ -80,37 +107,106 @@ asmlinkage void __do_softirq(void)
 
 	pending = local_softirq_pending();
 
-	local_bh_disable();
 	cpu = smp_processor_id();
 restart:
 	/* Reset the pending bitmask before enabling irqs */
 	set_softirq_pending(0);
 
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	h = softirq_vec;
 
 	do {
 		if (pending & 1) {
-			h->action(h);
+			{
+				u32 preempt_count = preempt_count();
+				h->action(h);
+				if (preempt_count != preempt_count()) {
+					print_symbol("BUG: softirq exited %s with wrong preemption count!\n", (unsigned long) h->action);
+					printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+					preempt_count() = preempt_count;
+				}
+			}
 			rcu_bh_qsctr_inc(cpu);
+			cond_resched_all();
 		}
 		h++;
 		pending >>= 1;
 	} while (pending);
 
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	pending = local_softirq_pending();
 	if (pending && --max_restart)
 		goto restart;
 
 	if (pending)
-		wakeup_softirqd();
+		trigger_softirqs();
+}
+
+asmlinkage void __do_softirq(void)
+{
+	unsigned long p_flags;
 
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+	/*
+	 * 'preempt harder'. Push all softirq processing off to ksoftirqd.
+	 */
+	if (softirq_preemption) {
+		if (local_softirq_pending())
+			trigger_softirqs();
+		return;
+	}
+#endif
+	/*
+	 * 'immediate' softirq execution:
+	 */
+	local_bh_disable();
+	p_flags = current->flags & PF_HARDIRQ;
+	current->flags &= ~PF_HARDIRQ;
+
+	___do_softirq();
 	__local_bh_enable();
+
+	current->flags |= p_flags;
 }
 
+void do_softirq_from_hardirq(void)
+{
+	unsigned long p_flags;	/* caller's PF_HARDIRQ bit, restored on exit */
+
+	if (!local_softirq_pending())
+		return;
+	/*
+	 * 'immediate' softirq execution (bhs off, PF_HARDIRQ masked):
+	 */
+	local_bh_disable();
+	p_flags = current->flags & PF_HARDIRQ;
+	current->flags &= ~PF_HARDIRQ;
+
+	___do_softirq();
+	__local_bh_enable();
+
+	current->flags |= p_flags;	/* put PF_HARDIRQ back if it was set */
+}
+
+
+/*
+ * 'delayed' softirq execution. Does not disable bhs and thus
+ * makes most of the softirq handlers preemptable - as long as
+ * they are not executed 'directly'.
+ */
+asmlinkage void _do_softirq(void)
+{
+	raw_local_irq_disable();	/* either callee is entered with raw irqs off */
+	if (!softirq_preemption)
+		__do_softirq();		/* 'immediate' path: disables bhs itself */
+	else
+		___do_softirq();	/* no local_bh_disable() on this path */
+	raw_local_irq_enable();
+}
+
+
 #ifndef __ARCH_HAS_DO_SOFTIRQ
 
 asmlinkage void do_softirq(void)
@@ -121,20 +217,22 @@ asmlinkage void do_softirq(void)
 	if (in_interrupt())
 		return;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 
 	pending = local_softirq_pending();
 
 	if (pending)
 		__do_softirq();
 
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(do_softirq);
 
 #endif
 
+#ifndef CONFIG_PREEMPT_RT
+
 void local_bh_enable(void)
 {
 	WARN_ON(irqs_disabled());
@@ -152,6 +250,8 @@ void local_bh_enable(void)
 }
 EXPORT_SYMBOL(local_bh_enable);
 
+#endif
+
 #ifdef __ARCH_IRQ_EXIT_IRQS_DISABLED
 # define invoke_softirq()	__do_softirq()
 #else
@@ -165,9 +265,9 @@ void irq_exit(void)
 {
 	account_system_vtime(current);
 	sub_preempt_count(IRQ_EXIT_OFFSET);
-	if (!in_interrupt() && local_softirq_pending())
-		invoke_softirq();
-	preempt_enable_no_resched();
+	//if (!in_interrupt() && local_softirq_pending())
+	//	invoke_softirq();
+	__preempt_enable_no_resched();
 }
 
 /*
@@ -186,8 +286,9 @@ inline fastcall void raise_softirq_irqof
 	 * Otherwise we wake up ksoftirqd to make sure we
 	 * schedule the softirq soon.
 	 */
-	if (!in_interrupt())
-		wakeup_softirqd();
+	if (!in_interrupt() || (current->flags & PF_HARDIRQ) || hardirq_count())
+		//trigger_softirqs();
+		wakeup_softirqd(nr);
 }
 
 EXPORT_SYMBOL(raise_softirq_irqoff);
@@ -196,9 +297,9 @@ void fastcall raise_softirq(unsigned int
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	raise_softirq_irqoff(nr);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 void open_softirq(int nr, void (*action)(struct softirq_action*), void *data)
@@ -224,11 +325,11 @@ void fastcall __tasklet_schedule(struct 
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	t->next = __get_cpu_var(tasklet_vec).list;
 	__get_cpu_var(tasklet_vec).list = t;
 	raise_softirq_irqoff(TASKLET_SOFTIRQ);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(__tasklet_schedule);
@@ -237,11 +338,11 @@ void fastcall __tasklet_hi_schedule(stru
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	t->next = __get_cpu_var(tasklet_hi_vec).list;
 	__get_cpu_var(tasklet_hi_vec).list = t;
 	raise_softirq_irqoff(HI_SOFTIRQ);
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(__tasklet_hi_schedule);
@@ -250,10 +351,10 @@ static void tasklet_action(struct softir
 {
 	struct tasklet_struct *list;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	list = __get_cpu_var(tasklet_vec).list;
 	__get_cpu_var(tasklet_vec).list = NULL;
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	while (list) {
 		struct tasklet_struct *t = list;
@@ -271,11 +372,11 @@ static void tasklet_action(struct softir
 			tasklet_unlock(t);
 		}
 
-		local_irq_disable();
+		raw_local_irq_disable();
 		t->next = __get_cpu_var(tasklet_vec).list;
 		__get_cpu_var(tasklet_vec).list = t;
 		__raise_softirq_irqoff(TASKLET_SOFTIRQ);
-		local_irq_enable();
+		raw_local_irq_enable();
 	}
 }
 
@@ -283,10 +384,10 @@ static void tasklet_hi_action(struct sof
 {
 	struct tasklet_struct *list;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	list = __get_cpu_var(tasklet_hi_vec).list;
 	__get_cpu_var(tasklet_hi_vec).list = NULL;
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	while (list) {
 		struct tasklet_struct *t = list;
@@ -304,11 +405,11 @@ static void tasklet_hi_action(struct sof
 			tasklet_unlock(t);
 		}
 
-		local_irq_disable();
+		raw_local_irq_disable();
 		t->next = __get_cpu_var(tasklet_hi_vec).list;
 		__get_cpu_var(tasklet_hi_vec).list = t;
 		__raise_softirq_irqoff(HI_SOFTIRQ);
-		local_irq_enable();
+		raw_local_irq_enable();
 	}
 }
 
@@ -332,7 +433,7 @@ void tasklet_kill(struct tasklet_struct 
 
 	while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) {
 		do
-			yield();
+			msleep(1);
 		while (test_bit(TASKLET_STATE_SCHED, &t->state));
 	}
 	tasklet_unlock_wait(t);
@@ -347,31 +448,48 @@ void __init softirq_init(void)
 	open_softirq(HI_SOFTIRQ, tasklet_hi_action, NULL);
 }
 
-static int ksoftirqd(void * __bind_cpu)
+static int ksoftirqd(void * __data)
 {
-	set_user_nice(current, 19);
-	current->flags |= PF_NOFREEZE;
+	struct sched_param param = { .sched_priority = MAX_RT_PRIO/4-1 };
+	struct softirqdata *data = __data;
+	u32 mask = (1 << data->nr);
+	struct softirq_action *h;
+
+	param.sched_priority = 1;
+	sys_sched_setscheduler(current->pid, SCHED_FIFO, &param);
+//	set_user_nice(current, -10);
+	current->flags |= PF_NOFREEZE | PF_SOFTIRQ;
 
 	set_current_state(TASK_INTERRUPTIBLE);
 
 	while (!kthread_should_stop()) {
 		preempt_disable();
-		if (!local_softirq_pending()) {
-			preempt_enable_no_resched();
+		if (!(local_softirq_pending() & mask)) {
+			__preempt_enable_no_resched();
 			schedule();
 			preempt_disable();
 		}
-
 		__set_current_state(TASK_RUNNING);
 
-		while (local_softirq_pending()) {
+		while (local_softirq_pending() & mask) {
 			/* Preempt disable stops cpu going offline.
 			   If already offline, we'll be on wrong CPU:
 			   don't process */
-			if (cpu_is_offline((long)__bind_cpu))
+			if (cpu_is_offline(data->cpu))
 				goto wait_to_die;
-			do_softirq();
-			preempt_enable_no_resched();
+
+			raw_local_irq_disable();
+			__preempt_enable_no_resched();
+			set_softirq_pending(local_softirq_pending() & ~mask);
+			local_bh_disable();
+			raw_local_irq_enable();
+
+			h = &softirq_vec[data->nr];
+			if (h)
+				h->action(h);
+			rcu_bh_qsctr_inc(data->cpu);
+
+			__local_bh_enable();
 			cond_resched();
 			preempt_disable();
 		}
@@ -423,12 +541,12 @@ void tasklet_kill_immediate(struct taskl
 	BUG();
 }
 
-static void takeover_tasklets(unsigned int cpu)
+void takeover_tasklets(unsigned int cpu)
 {
 	struct tasklet_struct **i;
 
 	/* CPU is dead, so no lock needed. */
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Find end, append list for that CPU. */
 	for (i = &__get_cpu_var(tasklet_vec).list; *i; i = &(*i)->next);
@@ -441,40 +559,66 @@ static void takeover_tasklets(unsigned i
 	per_cpu(tasklet_hi_vec, cpu).list = NULL;
 	raise_softirq_irqoff(HI_SOFTIRQ);
 
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 #endif /* CONFIG_HOTPLUG_CPU */
 
+static const char *softirq_names [] =	/* %s part of the "softirq-%s/%d" thread names */
+{
+  [HI_SOFTIRQ]		= "high",
+  [TIMER_SOFTIRQ]	= "timer",
+  [NET_TX_SOFTIRQ]	= "net-tx",
+  [NET_RX_SOFTIRQ]	= "net-rx",
+  [SCSI_SOFTIRQ]	= "scsi",
+  [TASKLET_SOFTIRQ]	= "tasklet",
+#ifdef CONFIG_HIGH_RES_TIMERS
+  [KTIMER_SOFTIRQ]	= "ktimer",
+#endif
+};
+
 static int __devinit cpu_callback(struct notifier_block *nfb,
 				  unsigned long action,
 				  void *hcpu)
 {
-	int hotcpu = (unsigned long)hcpu;
+	int hotcpu = (unsigned long)hcpu, i;
 	struct task_struct *p;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
-		BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
-		BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
-		p = kthread_create(ksoftirqd, hcpu, "ksoftirqd/%d", hotcpu);
-		if (IS_ERR(p)) {
-			printk("ksoftirqd for %i failed\n", hotcpu);
-			return NOTIFY_BAD;
+		/* We may have tasklets already scheduled on
+		   processor 0, so don't check there. */
+		if (hotcpu != 0) {
+			BUG_ON(per_cpu(tasklet_vec, hotcpu).list);
+			BUG_ON(per_cpu(tasklet_hi_vec, hotcpu).list);
+		}
+		for (i = 0; i < MAX_SOFTIRQ; i++) {
+			per_cpu(ksoftirqd[i].nr, hotcpu) = i;
+			per_cpu(ksoftirqd[i].cpu, hotcpu) = hotcpu;
+			p = kthread_create(ksoftirqd, &per_cpu(ksoftirqd[i], hotcpu),
+					   "softirq-%s/%d", softirq_names[i], hotcpu);
+			if (IS_ERR(p)) {
+				printk("ksoftirqd %d for %i failed\n", i, hotcpu);
+				return NOTIFY_BAD;
+			}
+			kthread_bind(p, hotcpu);
+			per_cpu(ksoftirqd[i].tsk, hotcpu) = p;
 		}
-		kthread_bind(p, hotcpu);
-  		per_cpu(ksoftirqd, hotcpu) = p;
  		break;
 	case CPU_ONLINE:
-		wake_up_process(per_cpu(ksoftirqd, hotcpu));
+		for (i = 0; i < MAX_SOFTIRQ; i++)
+			wake_up_process(per_cpu(ksoftirqd[i].tsk, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
 	case CPU_UP_CANCELED:
 		/* Unbind so it can run.  Fall thru. */
-		kthread_bind(per_cpu(ksoftirqd, hotcpu), smp_processor_id());
+		for (i = 0; i < MAX_SOFTIRQ; i++)
+			kthread_bind(per_cpu(ksoftirqd[i], hotcpu).tsk, smp_processor_id());
 	case CPU_DEAD:
-		p = per_cpu(ksoftirqd, hotcpu);
-		per_cpu(ksoftirqd, hotcpu) = NULL;
-		kthread_stop(p);
+		for (i = 0; i < MAX_SOFTIRQ; i++) {
+			p = per_cpu(ksoftirqd[i], hotcpu).tsk;
+			per_cpu(ksoftirqd[i], hotcpu).tsk = NULL;
+			kthread_stop(p);
+		}
 		takeover_tasklets(hotcpu);
 		break;
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -494,3 +638,33 @@ __init int spawn_ksoftirqd(void)
 	register_cpu_notifier(&cpu_nfb);
 	return 0;
 }
+
+#ifdef CONFIG_PREEMPT_SOFTIRQS
+
+int softirq_preemption = 1;
+
+EXPORT_SYMBOL(softirq_preemption);
+
+/*
+ * Real-Time Preemption depends on softirq threading:
+ */
+#ifndef CONFIG_PREEMPT_RT
+
+static int __init softirq_preempt_setup (char *str)
+{
+	if (!strncmp(str, "off", 3))	/* value starts with "off": disable */
+		softirq_preemption = 0;
+	else	/* presumably a numeric value - confirm against get_option() */
+		get_option(&str, &softirq_preemption);
+	if (!softirq_preemption)
+		printk("turning off softirq preemption!\n");
+
+	return 1;
+}
+
+__setup("softirq-preempt=", softirq_preempt_setup);
+
+#endif
+
+#endif
+
Index: linux/kernel/softlockup.c
===================================================================
--- linux.orig/kernel/softlockup.c
+++ linux/kernel/softlockup.c
@@ -3,22 +3,26 @@
  *
  * started by Ingo Molnar, (C) 2005, Red Hat
  *
+ * Steven Rostedt, Kihon Technologies Inc.
+ *   Added light softlockup detection off of what Daniel Walker of
+ *   MontaVista started.
+ *
  * this code detects soft lockups: incidents in where on a CPU
- * the kernel does not reschedule for 10 seconds or more.
+ * the kernel does not reschedule for 20 seconds or more.
  */
 
 #include <linux/mm.h>
 #include <linux/cpu.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/delay.h>
+#include <linux/module.h>
 #include <linux/kthread.h>
 #include <linux/notifier.h>
-#include <linux/module.h>
-
-static DEFINE_SPINLOCK(print_lock);
 
-static DEFINE_PER_CPU(unsigned long, timestamp) = 0;
-static DEFINE_PER_CPU(unsigned long, print_timestamp) = 0;
+static DEFINE_PER_CPU(unsigned long, timeout) = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, timestamp) = INITIAL_JIFFIES;
+static DEFINE_PER_CPU(unsigned long, print_timestamp) = INITIAL_JIFFIES;
 static DEFINE_PER_CPU(struct task_struct *, watchdog_task);
 
 static int did_panic = 0;
@@ -38,17 +42,57 @@ void touch_softlockup_watchdog(void)
 {
 	per_cpu(timestamp, raw_smp_processor_id()) = jiffies;
 }
-EXPORT_SYMBOL(touch_softlockup_watchdog);
+
+EXPORT_SYMBOL_GPL(touch_softlockup_watchdog);
+
+void touch_light_softlockup_watchdog(void)
+{
+	current->softlockup_count = 0;	/* restart the count kept by softlockup_tick() */
+}
+
+static void softlockup_detected(int this_cpu)	/* report path for both checks in softlockup_tick() */
+{
+	stop_trace();
+
+	printk(KERN_ERR "BUG: %s:%d, possible softlockup detected on CPU#%u!\n",
+	       current->comm, current->pid, this_cpu);
+	dump_stack();
+#if defined(__i386__) && defined(CONFIG_SMP)
+	nmi_show_all_regs();
+#endif
+	touch_light_softlockup_watchdog();	/* zero current's count after reporting */
+	print_last_trace();
+}
 
 /*
  * This callback runs from the timer interrupt, and checks
  * whether the watchdog thread has hung or not:
  */
-void softlockup_tick(struct pt_regs *regs)
+void softlockup_tick(void)
 {
 	int this_cpu = smp_processor_id();
+	unsigned long timeout = per_cpu(timeout, this_cpu);
 	unsigned long timestamp = per_cpu(timestamp, this_cpu);
 
+	if (time_after(jiffies, timeout)) {
+		/*
+		 * Do not try to wake up during early bootup:
+		 */
+		if (!per_cpu(watchdog_task, this_cpu))
+			return;
+
+		/*
+		 * Print out a warning upon reaching softlockup_count
+		 * of 20. Print it only once.
+		 */
+		if (current->pid && (++current->softlockup_count == 20))
+			softlockup_detected(this_cpu);
+
+		wake_up_process(per_cpu(watchdog_task, this_cpu));
+		per_cpu(timeout, this_cpu) = jiffies + msecs_to_jiffies(1000);
+	} else
+		touch_light_softlockup_watchdog();
+
 	if (per_cpu(print_timestamp, this_cpu) == timestamp)
 		return;
 
@@ -56,14 +100,9 @@ void softlockup_tick(struct pt_regs *reg
 	if (did_panic)
 		return;
 
-	if (time_after(jiffies, timestamp + 10*HZ)) {
+	if (time_after(jiffies, timestamp + msecs_to_jiffies(20000))) {
 		per_cpu(print_timestamp, this_cpu) = timestamp;
-
-		spin_lock(&print_lock);
-		printk(KERN_ERR "BUG: soft lockup detected on CPU#%d!\n",
-			this_cpu);
-		show_regs(regs);
-		spin_unlock(&print_lock);
+		softlockup_detected(this_cpu);
 	}
 }
 
@@ -84,11 +123,12 @@ static int watchdog(void * __bind_cpu)
 
 	/*
 	 * Run briefly once per second - if this gets delayed for
-	 * more than 10 seconds then the debug-printout triggers
+	 * more than 20 seconds then the debug-printout triggers
 	 * in softlockup_tick():
 	 */
 	while (!kthread_should_stop()) {
-		msleep_interruptible(1000);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule();
 		touch_softlockup_watchdog();
 	}
 	__set_current_state(TASK_RUNNING);
@@ -148,4 +188,3 @@ __init void spawn_softlockup_task(void)
 
 	notifier_chain_register(&panic_notifier_list, &panic_block);
 }
-
Index: linux/kernel/spinlock.c
===================================================================
--- linux.orig/kernel/spinlock.c
+++ linux/kernel/spinlock.c
@@ -20,151 +20,197 @@
  * Generic declaration of the raw read_trylock() function,
  * architectures are supposed to optimize this:
  */
-int __lockfunc generic__raw_read_trylock(raw_rwlock_t *lock)
+int __lockfunc generic_raw_read_trylock(raw_rwlock_t *lock)
 {
-	__raw_read_lock(lock);
+	__raw_read_lock(&lock->raw_lock);
 	return 1;
 }
-EXPORT_SYMBOL(generic__raw_read_trylock);
+EXPORT_SYMBOL(generic_raw_read_trylock);
 
-int __lockfunc _spin_trylock(spinlock_t *lock)
+int __lockfunc _raw_spin_trylock(raw_spinlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_spin_trylock(lock))
+
+	if (__raw_spin_trylock(&lock->raw_lock))
 		return 1;
 	
 	preempt_enable();
+
+	return 0;
+}
+EXPORT_SYMBOL(_raw_spin_trylock);
+
+int __lockfunc _raw_spin_trylock_irq(raw_spinlock_t *lock)
+{
+	raw_local_irq_disable();
+	preempt_disable();
+
+	if (__raw_spin_trylock(&lock->raw_lock))
+		return 1;	/* got it: irqs and preemption stay disabled */
+
+	__preempt_enable_no_resched();	/* failed: undo both, in reverse order ... */
+	raw_local_irq_enable();
+	preempt_check_resched();	/* ... then run the resched check skipped above */
+
+	return 0;
+}
+EXPORT_SYMBOL(_raw_spin_trylock_irq);
+
+int __lockfunc _raw_spin_trylock_irqsave(raw_spinlock_t *lock,
+					 unsigned long *flags)
+{
+	raw_local_irq_save(*flags);
+	preempt_disable();
+
+	if (__raw_spin_trylock(&lock->raw_lock))
+		return 1;
+
+	__preempt_enable_no_resched();
+	raw_local_irq_restore(*flags);
+	preempt_check_resched();
+
 	return 0;
 }
-EXPORT_SYMBOL(_spin_trylock);
+EXPORT_SYMBOL(_raw_spin_trylock_irqsave);
 
-int __lockfunc _read_trylock(rwlock_t *lock)
+int __lockfunc _raw_read_trylock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_read_trylock(lock))
+
+	if (__raw_read_trylock(&lock->raw_lock))
 		return 1;
 
 	preempt_enable();
+
 	return 0;
 }
-EXPORT_SYMBOL(_read_trylock);
+EXPORT_SYMBOL(_raw_read_trylock);
 
-int __lockfunc _write_trylock(rwlock_t *lock)
+int __lockfunc _raw_write_trylock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	if (_raw_write_trylock(lock))
+
+	if (__raw_write_trylock(&lock->raw_lock))
 		return 1;
 
 	preempt_enable();
+
 	return 0;
 }
-EXPORT_SYMBOL(_write_trylock);
+EXPORT_SYMBOL(_raw_write_trylock);
 
-#if !defined(CONFIG_PREEMPT) || !defined(CONFIG_SMP)
+//#ifndef CONFIG_PREEMPT
+#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_RT)
 
-void __lockfunc _read_lock(rwlock_t *lock)
+void __lockfunc _raw_read_lock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	_raw_read_lock(lock);
+	__raw_read_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_read_lock);
+EXPORT_SYMBOL(_raw_read_lock);
 
-unsigned long __lockfunc _spin_lock_irqsave(spinlock_t *lock)
+unsigned long __lockfunc _raw_spin_lock_irqsave(raw_spinlock_t *lock)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	preempt_disable();
-	_raw_spin_lock_flags(lock, &flags);
+
+	__raw_spin_lock_flags(&lock->raw_lock, flags);
+
 	return flags;
 }
-EXPORT_SYMBOL(_spin_lock_irqsave);
+EXPORT_SYMBOL(_raw_spin_lock_irqsave);
 
-void __lockfunc _spin_lock_irq(spinlock_t *lock)
+void __lockfunc _raw_spin_lock_irq(raw_spinlock_t *lock)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	preempt_disable();
-	_raw_spin_lock(lock);
+
+	__raw_spin_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_spin_lock_irq);
+EXPORT_SYMBOL(_raw_spin_lock_irq);
 
-void __lockfunc _spin_lock_bh(spinlock_t *lock)
+void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	_raw_spin_lock(lock);
+
+	__raw_spin_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_spin_lock_bh);
+EXPORT_SYMBOL(_raw_spin_lock_bh);
 
-unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
+unsigned long __lockfunc _raw_read_lock_irqsave(raw_rwlock_t *lock)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	preempt_disable();
-	_raw_read_lock(lock);
+
+	__raw_read_lock(&lock->raw_lock);
+
 	return flags;
 }
-EXPORT_SYMBOL(_read_lock_irqsave);
+EXPORT_SYMBOL(_raw_read_lock_irqsave);
 
-void __lockfunc _read_lock_irq(rwlock_t *lock)
+void __lockfunc _raw_read_lock_irq(raw_rwlock_t *lock)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	preempt_disable();
-	_raw_read_lock(lock);
+
+	__raw_read_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_read_lock_irq);
+EXPORT_SYMBOL(_raw_read_lock_irq);
 
-void __lockfunc _read_lock_bh(rwlock_t *lock)
+void __lockfunc _raw_read_lock_bh(raw_rwlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	_raw_read_lock(lock);
+
+	__raw_read_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_read_lock_bh);
+EXPORT_SYMBOL(_raw_read_lock_bh);
 
-unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
+unsigned long __lockfunc _raw_write_lock_irqsave(raw_rwlock_t *lock)
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(&lock->raw_lock);
 	return flags;
 }
-EXPORT_SYMBOL(_write_lock_irqsave);
+EXPORT_SYMBOL(_raw_write_lock_irqsave);
 
-void __lockfunc _write_lock_irq(rwlock_t *lock)
+void __lockfunc _raw_write_lock_irq(raw_rwlock_t *lock)
 {
-	local_irq_disable();
+	raw_local_irq_disable();
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_write_lock_irq);
+EXPORT_SYMBOL(_raw_write_lock_irq);
 
-void __lockfunc _write_lock_bh(rwlock_t *lock)
+void __lockfunc _raw_write_lock_bh(raw_rwlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(&lock->raw_lock);
 }
-EXPORT_SYMBOL(_write_lock_bh);
+EXPORT_SYMBOL(_raw_write_lock_bh);
 
-void __lockfunc _spin_lock(spinlock_t *lock)
+void __lockfunc _raw_spin_lock(raw_spinlock_t *lock)
 {
 	preempt_disable();
-	_raw_spin_lock(lock);
+	__raw_spin_lock(&lock->raw_lock);
 }
+EXPORT_SYMBOL(_raw_spin_lock);
 
-EXPORT_SYMBOL(_spin_lock);
-
-void __lockfunc _write_lock(rwlock_t *lock)
+void __lockfunc _raw_write_lock(raw_rwlock_t *lock)
 {
 	preempt_disable();
-	_raw_write_lock(lock);
+	__raw_write_lock(&lock->raw_lock);
 }
-
-EXPORT_SYMBOL(_write_lock);
+EXPORT_SYMBOL(_raw_write_lock);
 
 #else /* CONFIG_PREEMPT: */
 
@@ -177,39 +223,41 @@ EXPORT_SYMBOL(_write_lock);
  */
 
 #define BUILD_LOCK_OPS(op, locktype)					\
-void __lockfunc _##op##_lock(locktype##_t *lock)			\
+void __lockfunc _raw_##op##_lock(locktype##_t *lock)			\
 {									\
 	preempt_disable();						\
 	for (;;) {							\
-		if (likely(_raw_##op##_trylock(lock)))			\
+		if (likely(__raw_##op##_trylock(&(lock)->raw_lock)))	\
 			break;						\
 		preempt_enable();					\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
-		while (!op##_can_lock(lock) && (lock)->break_lock)	\
+		while (!__raw_##op##_can_lock(&(lock)->raw_lock) &&	\
+						 (lock)->break_lock)	\
 			cpu_relax();					\
 		preempt_disable();					\
 	}								\
 	(lock)->break_lock = 0;						\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock);						\
+EXPORT_SYMBOL(_raw_##op##_lock);					\
 									\
-unsigned long __lockfunc _##op##_lock_irqsave(locktype##_t *lock)	\
+unsigned long __lockfunc _raw_##op##_lock_irqsave(locktype##_t *lock)	\
 {									\
 	unsigned long flags;						\
 									\
 	preempt_disable();						\
 	for (;;) {							\
-		local_irq_save(flags);					\
-		if (likely(_raw_##op##_trylock(lock)))			\
+		raw_local_irq_save(flags);				\
+		if (likely(__raw_##op##_trylock(&(lock)->raw_lock)))	\
 			break;						\
-		local_irq_restore(flags);				\
+		raw_local_irq_restore(flags);				\
 									\
 		preempt_enable();					\
 		if (!(lock)->break_lock)				\
 			(lock)->break_lock = 1;				\
-		while (!op##_can_lock(lock) && (lock)->break_lock)	\
+		while (!__raw_##op##_can_lock(&(lock)->raw_lock) &&	\
+						 (lock)->break_lock)	\
 			cpu_relax();					\
 		preempt_disable();					\
 	}								\
@@ -217,16 +265,16 @@ unsigned long __lockfunc _##op##_lock_ir
 	return flags;							\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_irqsave);					\
+EXPORT_SYMBOL(_raw_##op##_lock_irqsave);				\
 									\
-void __lockfunc _##op##_lock_irq(locktype##_t *lock)			\
+void __lockfunc _raw_##op##_lock_irq(locktype##_t *lock)		\
 {									\
-	_##op##_lock_irqsave(lock);					\
+	_raw_##op##_lock_irqsave(lock);					\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_irq);					\
+EXPORT_SYMBOL(_raw_##op##_lock_irq);					\
 									\
-void __lockfunc _##op##_lock_bh(locktype##_t *lock)			\
+void __lockfunc _raw_##op##_lock_bh(locktype##_t *lock)			\
 {									\
 	unsigned long flags;						\
 									\
@@ -235,12 +283,12 @@ void __lockfunc _##op##_lock_bh(locktype
 	/* irq-disabling. We use the generic preemption-aware	*/	\
 	/* function:						*/	\
 	/**/								\
-	flags = _##op##_lock_irqsave(lock);				\
+	flags = _raw_##op##_lock_irqsave(lock);				\
 	local_bh_disable();						\
-	local_irq_restore(flags);					\
+	raw_local_irq_restore(flags);					\
 }									\
 									\
-EXPORT_SYMBOL(_##op##_lock_bh)
+EXPORT_SYMBOL(_raw_##op##_lock_bh)
 
 /*
  * Build preemption-friendly versions of the following
@@ -251,119 +299,132 @@ EXPORT_SYMBOL(_##op##_lock_bh)
  *         _[spin|read|write]_lock_irqsave()
  *         _[spin|read|write]_lock_bh()
  */
-BUILD_LOCK_OPS(spin, spinlock);
-BUILD_LOCK_OPS(read, rwlock);
-BUILD_LOCK_OPS(write, rwlock);
+BUILD_LOCK_OPS(spin, raw_spinlock);
+BUILD_LOCK_OPS(read, raw_rwlock);
+BUILD_LOCK_OPS(write, raw_rwlock);
 
 #endif /* CONFIG_PREEMPT */
 
-void __lockfunc _spin_unlock(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock(raw_spinlock_t *lock)
 {
-	_raw_spin_unlock(lock);
+	__raw_spin_unlock(&lock->raw_lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_spin_unlock);
+EXPORT_SYMBOL(_raw_spin_unlock);
 
-void __lockfunc _write_unlock(rwlock_t *lock)
+void __lockfunc _raw_spin_unlock_no_resched(raw_spinlock_t *lock)
 {
-	_raw_write_unlock(lock);
-	preempt_enable();
+	__raw_spin_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
 }
-EXPORT_SYMBOL(_write_unlock);
+/* not exported */
 
-void __lockfunc _read_unlock(rwlock_t *lock)
+void __lockfunc _raw_write_unlock(raw_rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
+	__raw_write_unlock(&lock->raw_lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_read_unlock);
+EXPORT_SYMBOL(_raw_write_unlock);
 
-void __lockfunc _spin_unlock_irqrestore(spinlock_t *lock, unsigned long flags)
+void __lockfunc _raw_read_unlock(raw_rwlock_t *lock)
 {
-	_raw_spin_unlock(lock);
-	local_irq_restore(flags);
+	__raw_read_unlock(&lock->raw_lock);
 	preempt_enable();
 }
-EXPORT_SYMBOL(_spin_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_read_unlock);
 
-void __lockfunc _spin_unlock_irq(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock_irqrestore(raw_spinlock_t *lock, unsigned long flags)
 {
-	_raw_spin_unlock(lock);
-	local_irq_enable();
-	preempt_enable();
+	__raw_spin_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
+	raw_local_irq_restore(flags);
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_spin_unlock_irq);
+EXPORT_SYMBOL(_raw_spin_unlock_irqrestore);
 
-void __lockfunc _spin_unlock_bh(spinlock_t *lock)
+void __lockfunc _raw_spin_unlock_irq(raw_spinlock_t *lock)
 {
-	_raw_spin_unlock(lock);
-	preempt_enable_no_resched();
+	__raw_spin_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
+	raw_local_irq_enable();
+	preempt_check_resched();
+}
+EXPORT_SYMBOL(_raw_spin_unlock_irq);
+
+void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock)
+{
+	__raw_spin_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
 	local_bh_enable();
 }
-EXPORT_SYMBOL(_spin_unlock_bh);
+EXPORT_SYMBOL(_raw_spin_unlock_bh);
 
-void __lockfunc _read_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc _raw_read_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
 {
-	_raw_read_unlock(lock);
-	local_irq_restore(flags);
-	preempt_enable();
+	__raw_read_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
+	raw_local_irq_restore(flags);
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_read_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_read_unlock_irqrestore);
 
-void __lockfunc _read_unlock_irq(rwlock_t *lock)
+void __lockfunc _raw_read_unlock_irq(raw_rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
-	local_irq_enable();
-	preempt_enable();
+	__raw_read_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
+	raw_local_irq_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_read_unlock_irq);
+EXPORT_SYMBOL(_raw_read_unlock_irq);
 
-void __lockfunc _read_unlock_bh(rwlock_t *lock)
+void __lockfunc _raw_read_unlock_bh(raw_rwlock_t *lock)
 {
-	_raw_read_unlock(lock);
-	preempt_enable_no_resched();
+	__raw_read_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
 	local_bh_enable();
 }
-EXPORT_SYMBOL(_read_unlock_bh);
+EXPORT_SYMBOL(_raw_read_unlock_bh);
 
-void __lockfunc _write_unlock_irqrestore(rwlock_t *lock, unsigned long flags)
+void __lockfunc _raw_write_unlock_irqrestore(raw_rwlock_t *lock, unsigned long flags)
 {
-	_raw_write_unlock(lock);
-	local_irq_restore(flags);
-	preempt_enable();
+	__raw_write_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
+	raw_local_irq_restore(flags);
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_write_unlock_irqrestore);
+EXPORT_SYMBOL(_raw_write_unlock_irqrestore);
 
-void __lockfunc _write_unlock_irq(rwlock_t *lock)
+void __lockfunc _raw_write_unlock_irq(raw_rwlock_t *lock)
 {
-	_raw_write_unlock(lock);
-	local_irq_enable();
-	preempt_enable();
+	__raw_write_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
+	raw_local_irq_enable();
+	preempt_check_resched();
 }
-EXPORT_SYMBOL(_write_unlock_irq);
+EXPORT_SYMBOL(_raw_write_unlock_irq);
 
-void __lockfunc _write_unlock_bh(rwlock_t *lock)
+void __lockfunc _raw_write_unlock_bh(raw_rwlock_t *lock)
 {
-	_raw_write_unlock(lock);
-	preempt_enable_no_resched();
+	__raw_write_unlock(&lock->raw_lock);
+	__preempt_enable_no_resched();
 	local_bh_enable();
 }
-EXPORT_SYMBOL(_write_unlock_bh);
+EXPORT_SYMBOL(_raw_write_unlock_bh);
 
-int __lockfunc _spin_trylock_bh(spinlock_t *lock)
+int __lockfunc _raw_spin_trylock_bh(raw_spinlock_t *lock)
 {
 	local_bh_disable();
 	preempt_disable();
-	if (_raw_spin_trylock(lock))
+	if (__raw_spin_trylock(&lock->raw_lock))
 		return 1;
 
-	preempt_enable_no_resched();
+	__preempt_enable_no_resched();
 	local_bh_enable();
 	return 0;
 }
-EXPORT_SYMBOL(_spin_trylock_bh);
+EXPORT_SYMBOL(_raw_spin_trylock_bh);
 
-int in_lock_functions(unsigned long addr)
+int notrace in_lock_functions(unsigned long addr)
 {
 	/* Linker adds these: start and end of __lockfunc functions */
 	extern char __lock_text_start[], __lock_text_end[];
@@ -372,3 +433,14 @@ int in_lock_functions(unsigned long addr
 	&& addr < (unsigned long)__lock_text_end;
 }
 EXPORT_SYMBOL(in_lock_functions);
+
+void notrace __debug_atomic_dec_and_test(atomic_t *v)
+{
+	static int warn_once = 1;
+
+	if (!atomic_read(v) && warn_once) {
+		warn_once = 0;
+		printk("BUG: atomic counter underflow!\n");
+		WARN_ON(1);
+	}
+}
Index: linux/kernel/stop_machine.c
===================================================================
--- linux.orig/kernel/stop_machine.c
+++ linux/kernel/stop_machine.c
@@ -40,7 +40,7 @@ static int stopmachine(void *cpu)
 	while (stopmachine_state != STOPMACHINE_EXIT) {
 		if (stopmachine_state == STOPMACHINE_DISABLE_IRQ 
 		    && !irqs_disabled) {
-			local_irq_disable();
+			raw_local_irq_disable();
 			irqs_disabled = 1;
 			/* Ack: irqs disabled. */
 			smp_mb(); /* Must read state first. */
@@ -56,7 +56,7 @@ static int stopmachine(void *cpu)
 		/* Yield in first stage: migration threads need to
 		 * help our sisters onto their CPUs. */
 		if (!prepared && !irqs_disabled)
-			yield();
+			__yield();
 		else
 			cpu_relax();
 	}
@@ -66,7 +66,7 @@ static int stopmachine(void *cpu)
 	atomic_inc(&stopmachine_thread_ack);
 
 	if (irqs_disabled)
-		local_irq_enable();
+		raw_local_irq_enable();
 	if (prepared)
 		preempt_enable();
 
@@ -110,7 +110,7 @@ static int stop_machine(void)
 
 	/* Wait for them all to come to life. */
 	while (atomic_read(&stopmachine_thread_ack) != stopmachine_num_threads)
-		yield();
+		__yield();
 
 	/* If some failed, kill them all. */
 	if (ret < 0) {
@@ -120,7 +120,7 @@ static int stop_machine(void)
 	}
 
 	/* Don't schedule us away at this point, please. */
-	local_irq_disable();
+	raw_local_irq_disable();
 
 	/* Now they are all started, make them hold the CPUs, ready. */
 	stopmachine_set_state(STOPMACHINE_PREPARE);
@@ -134,7 +134,7 @@ static int stop_machine(void)
 static void restart_machine(void)
 {
 	stopmachine_set_state(STOPMACHINE_EXIT);
-	local_irq_enable();
+	raw_local_irq_enable();
 }
 
 struct stop_machine_data
Index: linux/kernel/sys.c
===================================================================
--- linux.orig/kernel/sys.c
+++ linux/kernel/sys.c
@@ -31,6 +31,7 @@
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
+#include <linux/rt_lock.h>
 
 #include <asm/uaccess.h>
 #include <asm/io.h>
@@ -167,7 +168,7 @@ EXPORT_SYMBOL(notifier_chain_unregister)
  *	of the last notifier function called.
  */
  
-int notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
+int notrace notifier_call_chain(struct notifier_block **n, unsigned long val, void *v)
 {
 	int ret=NOTIFY_DONE;
 	struct notifier_block *nb = *n;
Index: linux/kernel/sysctl.c
===================================================================
--- linux.orig/kernel/sysctl.c
+++ linux/kernel/sysctl.c
@@ -42,6 +42,7 @@
 #include <linux/limits.h>
 #include <linux/dcache.h>
 #include <linux/syscalls.h>
+#include <linux/profile.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -279,6 +280,150 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "prof_pid",
+		.data		= &prof_pid,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#ifdef CONFIG_PREEMPT
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "kernel_preemption",
+		.data		= &kernel_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_PREEMPT_VOLUNTARY
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "voluntary_preemption",
+		.data		= &voluntary_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_PREEMPT_SOFTIRQS) && !defined(CONFIG_PREEMPT_RT)
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "softirq_preemption",
+		.data		= &softirq_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#if defined(CONFIG_PREEMPT_HARDIRQS) && !defined(CONFIG_PREEMPT_RT)
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "hardirq_preemption",
+		.data		= &hardirq_preemption,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_WAKEUP_TIMING
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "wakeup_timing",
+		.data		= &wakeup_timing,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_LATENCY_TRACE
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_enabled",
+		.data		= &trace_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "mcount_enabled",
+		.data		= &mcount_enabled,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_user_triggered",
+		.data		= &trace_user_triggered,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_user_trigger_irq",
+		.data		= &trace_user_trigger_irq,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_freerunning",
+		.data		= &trace_freerunning,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_print_at_crash",
+		.data		= &trace_print_at_crash,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_verbose",
+		.data		= &trace_verbose,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "trace_all_cpus",
+		.data		= &trace_all_cpus,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_DEBUG_RT_LOCKING_MODE
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "preempt_locks",
+		.data		= &preempt_locks_user,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+#ifdef CONFIG_GENERIC_HARDIRQS
+	{
+		.ctl_name	= KERN_PANIC,
+		.procname	= "debug_direct_keyboard",
+		.data		= &debug_direct_keyboard,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec,
+	},
+#endif
+	{
 		.ctl_name	= KERN_CORE_USES_PID,
 		.procname	= "core_uses_pid",
 		.data		= &core_uses_pid,
@@ -482,7 +635,8 @@ static ctl_table kern_table[] = {
 		.proc_handler	= &proc_dointvec,
 	},
 #endif
-#ifdef CONFIG_MAGIC_SYSRQ
+// ignore sysrq-off when debug-printks are enabled:
+#if defined(CONFIG_MAGIC_SYSRQ) && !defined(CONFIG_PRINTK_IGNORE_LOGLEVEL)
 	{
 		.ctl_name	= KERN_SYSRQ,
 		.procname	= "sysrq",
Index: linux/kernel/time.c
===================================================================
--- linux.orig/kernel/time.c
+++ linux/kernel/time.c
@@ -38,6 +38,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
+#include <linux/timeofday.h>
 
 /* 
  * The timezone where the local system is located.  Used as a default by some
@@ -97,8 +98,31 @@ asmlinkage long sys_stime(time_t __user 
 
 #endif /* __ARCH_WANT_SYS_TIME */
 
+int timeofday_API_hacks(void *tv, void *tz)
+{
+#ifdef CONFIG_LATENCY_TRACE
+	if (!tv && ((long)tz == 1))
+		return user_trace_start();
+	if (!tv && !tz)
+		return user_trace_stop();
+#endif
+	if (((long)tv == 1) && ((long)tz == 1)) {
+		current->flags |= PF_NOSCHED;
+		return 0;
+	}
+	if (((long)tv == 1) && ((long)tz == 0)) {
+		current->flags &= ~PF_NOSCHED;
+		return 0;
+	}
+	return 1;
+}
+
 asmlinkage long sys_gettimeofday(struct timeval __user *tv, struct timezone __user *tz)
 {
+	int ret = timeofday_API_hacks(tv, tz);
+	if (ret != 1)
+		return ret;
+
 	if (likely(tv != NULL)) {
 		struct timeval ktv;
 		do_gettimeofday(&ktv);
@@ -128,6 +152,7 @@ asmlinkage long sys_gettimeofday(struct 
  * as real UNIX machines always do it. This avoids all headaches about
  * daylight saving times and warping kernel clocks.
  */
+#ifndef CONFIG_GENERIC_TIME
 static inline void warp_clock(void)
 {
 	write_seqlock_irq(&xtime_lock);
@@ -137,6 +162,18 @@ static inline void warp_clock(void)
 	write_sequnlock_irq(&xtime_lock);
 	clock_was_set();
 }
+#else /* CONFIG_GENERIC_TIME */
+/* XXX - this is somewhat cracked out and should
+         be checked  -johnstul@us.ibm.com
+*/
+static inline void warp_clock(void)
+{
+	struct timespec ts;
+	getnstimeofday(&ts);
+	ts.tv_sec += sys_tz.tz_minuteswest * 60;
+	do_settimeofday(&ts);
+}
+#endif /* !CONFIG_GENERIC_TIME */
 
 /*
  * In case for some reason the CMOS clock has not already been running
@@ -154,6 +191,9 @@ int do_sys_settimeofday(struct timespec 
 	static int firsttime = 1;
 	int error = 0;
 
+	if (!timespec_valid(tv))
+		return -EINVAL;
+
 	error = security_settime(tv, tz);
 	if (error)
 		return error;
@@ -184,6 +224,10 @@ asmlinkage long sys_settimeofday(struct 
 	struct timespec	new_ts;
 	struct timezone new_tz;
 
+	int ret = timeofday_API_hacks(tv, tz);
+	if (ret != 1)
+		return ret;
+
 	if (tv) {
 		if (copy_from_user(&user_tv, tv, sizeof(*tv)))
 			return -EFAULT;
@@ -224,206 +268,246 @@ void __attribute__ ((weak)) notify_arch_
 	return;
 }
 
-/* adjtimex mainly allows reading (and writing, if superuser) of
- * kernel time-keeping variables. used by xntpd.
+static inline int
+process_adj_offset(const struct timex *txc, const struct timespec now,
+		   int result)
+{
+	long ltemp, mtemp;
+
+	/* note: txc values were checked earlier. */
+
+	if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
+		/* adjtime() is independent from ntp_adjtime() */
+		time_next_adjust = txc->offset;
+		if (time_next_adjust == 0)
+			time_adjust = 0;
+		return result;
+	}
+
+	if (!(time_status & (STA_PLL | STA_PPSTIME)))
+		return result;
+
+	if ((time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
+				(STA_PPSTIME | STA_PPSSIGNAL))
+		ltemp = pps_offset;
+	else
+		ltemp = txc->offset;
+
+	/* scale the phase adjustment and clamp to the operating range: */
+	if (ltemp > MAXPHASE)
+		time_offset = MAXPHASE << SHIFT_UPDATE;
+	else {
+		if (ltemp < -MAXPHASE)
+			time_offset = -(MAXPHASE << SHIFT_UPDATE);
+		else
+			time_offset = ltemp << SHIFT_UPDATE;
+	}
+
+	/*
+	 * select whether the frequency is to be controlled
+	 * and in which mode (PLL or FLL). Clamp to the operating
+	 * range. Ugly multiply/divide should be replaced someday.
+	 */
+	if ((time_status & STA_FREQHOLD) || (time_reftime == 0))
+		time_reftime = now.tv_sec;
+
+	mtemp = now.tv_sec - time_reftime;
+	time_reftime = now.tv_sec;
+
+	if (time_status & STA_FLL) { /* FLL mode: */
+		if (mtemp >= MINSEC) {
+			ltemp = (time_offset / mtemp) <<
+						(SHIFT_USEC - SHIFT_UPDATE);
+			time_freq += shift_right(ltemp, SHIFT_KH);
+		} else /* calibration interval too short (p. 12): */
+			result = TIME_ERROR;
+	} else { /* PLL mode: */
+		if (mtemp < MAXSEC) {
+			ltemp *= mtemp;
+			/* TODO: is 2*time_constant correct? --mingo */
+			time_freq += shift_right(ltemp, 2*time_constant +
+						SHIFT_KF - SHIFT_USEC);
+		} else /* calibration interval too long (p. 12) */
+			result = TIME_ERROR;
+	}
+
+	time_freq = min(time_freq, time_tolerance);
+	time_freq = max(time_freq, -time_tolerance);
+
+	return result;
+}
+
+static inline int
+process_input_params(const struct timex *txc, const struct timespec now,
+		     int result)
+{
+	if (txc->modes & ADJ_STATUS)
+		/* only set allowed bits: */
+		time_status = (txc->status & ~STA_RONLY) |
+				(time_status & STA_RONLY);
+
+	if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
+		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ)
+			return -EINVAL;
+		time_freq = txc->freq - pps_freq;
+	}
+
+	if (txc->modes & ADJ_MAXERROR) {
+		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT)
+			return -EINVAL;
+		time_maxerror = txc->maxerror;
+	}
+
+	if (txc->modes & ADJ_ESTERROR) {
+		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT)
+			return -EINVAL;
+		time_esterror = txc->esterror;
+	}
+
+	if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
+		/* NTP v4 uses values > 6 */
+		if (txc->constant < 0)
+			return -EINVAL;
+		time_constant = txc->constant;
+	}
+
+	if (txc->modes & ADJ_OFFSET)
+		result = process_adj_offset(txc, now, result);
+
+	if (txc->modes & ADJ_TICK) {
+		tick_usec = txc->tick;
+		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
+	}
+
+	return result;
+}
+
+/**
+ * do_adjtimex - allows reading (and writing, if superuser) of
+ *		 kernel time-keeping variables. Used by xntpd.
+ * @txc: time-adjustments settings structure
  */
 int do_adjtimex(struct timex *txc)
 {
-        long ltemp, mtemp, save_adjust;
+	unsigned long flags, seq;
+	struct timespec now;
+	long save_adjust;
 	int result;
 
-	/* In order to modify anything, you gotta be super-user! */
+	/* in order to modify anything, you gotta be super-user! */
 	if (txc->modes && !capable(CAP_SYS_TIME))
 		return -EPERM;
 		
-	/* Now we validate the data before disabling interrupts */
-
+	/* now we validate the data before disabling interrupts: */
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
-	  /* singleshot must not be used with any other mode bits */
+		/* singleshot must not be used with any other mode bits: */
 		if (txc->modes != ADJ_OFFSET_SINGLESHOT)
 			return -EINVAL;
 
 	if (txc->modes != ADJ_OFFSET_SINGLESHOT && (txc->modes & ADJ_OFFSET))
-	  /* adjustment Offset limited to +- .512 seconds */
+		/* adjustment offset limited to +- .512 seconds: */
 		if (txc->offset <= - MAXPHASE || txc->offset >= MAXPHASE )
 			return -EINVAL;	
 
 	/* if the quartz is off by more than 10% something is VERY wrong ! */
 	if (txc->modes & ADJ_TICK)
-		if (txc->tick <  900000/USER_HZ ||
-		    txc->tick > 1100000/USER_HZ)
+		if (txc->tick < 900000/USER_HZ || txc->tick > 1100000/USER_HZ)
 			return -EINVAL;
 
-	write_seqlock_irq(&xtime_lock);
-	result = time_state;	/* mostly `TIME_OK' */
+	/*
+	 * TODO: shouldn't we write-lock xtime_lock below, and then
+	 * lock the ntp lock, and do the whole adjustment from under
+	 * the xtime lock and the ntp lock? --mingo
+	 */
 
-	/* Save for later - semantics of adjtime is to return old value */
-	save_adjust = time_next_adjust ? time_next_adjust : time_adjust;
+	/* save current xtime: */
+	do {
+		seq = read_seqbegin(&xtime_lock);
+		now = xtime;
+	} while (read_seqretry(&xtime_lock, seq));
 
-#if 0	/* STA_CLOCKERR is never set yet */
-	time_status &= ~STA_CLOCKERR;		/* reset STA_CLOCKERR */
-#endif
-	/* If there are input parameters, then process them */
-	if (txc->modes)
-	{
-	    if (txc->modes & ADJ_STATUS)	/* only set allowed bits */
-		time_status =  (txc->status & ~STA_RONLY) |
-			      (time_status & STA_RONLY);
-
-	    if (txc->modes & ADJ_FREQUENCY) {	/* p. 22 */
-		if (txc->freq > MAXFREQ || txc->freq < -MAXFREQ) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_freq = txc->freq - pps_freq;
-	    }
+	write_seqlock_irqsave(&ntp_lock, flags);
 
-	    if (txc->modes & ADJ_MAXERROR) {
-		if (txc->maxerror < 0 || txc->maxerror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_maxerror = txc->maxerror;
-	    }
-
-	    if (txc->modes & ADJ_ESTERROR) {
-		if (txc->esterror < 0 || txc->esterror >= NTP_PHASE_LIMIT) {
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_esterror = txc->esterror;
-	    }
+	result = time_state;	/* mostly `TIME_OK' */
 
-	    if (txc->modes & ADJ_TIMECONST) {	/* p. 24 */
-		if (txc->constant < 0) {	/* NTP v4 uses values > 6 */
-		    result = -EINVAL;
-		    goto leave;
-		}
-		time_constant = txc->constant;
-	    }
+	/* save for later - semantics of adjtime is to return old value: */
+	if (time_next_adjust)
+		save_adjust = time_next_adjust;
+	else
+		save_adjust = time_adjust;
 
-	    if (txc->modes & ADJ_OFFSET) {	/* values checked earlier */
-		if (txc->modes == ADJ_OFFSET_SINGLESHOT) {
-		    /* adjtime() is independent from ntp_adjtime() */
-		    if ((time_next_adjust = txc->offset) == 0)
-			 time_adjust = 0;
-		}
-		else if ( time_status & (STA_PLL | STA_PPSTIME) ) {
-		    ltemp = (time_status & (STA_PPSTIME | STA_PPSSIGNAL)) ==
-		            (STA_PPSTIME | STA_PPSSIGNAL) ?
-		            pps_offset : txc->offset;
-
-		    /*
-		     * Scale the phase adjustment and
-		     * clamp to the operating range.
-		     */
-		    if (ltemp > MAXPHASE)
-		        time_offset = MAXPHASE << SHIFT_UPDATE;
-		    else if (ltemp < -MAXPHASE)
-			time_offset = -(MAXPHASE << SHIFT_UPDATE);
-		    else
-		        time_offset = ltemp << SHIFT_UPDATE;
+	/* process input parameters: */
+	if (txc->modes)
+		result = process_input_params(txc, now, result);
 
-		    /*
-		     * Select whether the frequency is to be controlled
-		     * and in which mode (PLL or FLL). Clamp to the operating
-		     * range. Ugly multiply/divide should be replaced someday.
-		     */
-
-		    if (time_status & STA_FREQHOLD || time_reftime == 0)
-		        time_reftime = xtime.tv_sec;
-		    mtemp = xtime.tv_sec - time_reftime;
-		    time_reftime = xtime.tv_sec;
-		    if (time_status & STA_FLL) {
-		        if (mtemp >= MINSEC) {
-			    ltemp = (time_offset / mtemp) << (SHIFT_USEC -
-							      SHIFT_UPDATE);
-			    if (ltemp < 0)
-			        time_freq -= -ltemp >> SHIFT_KH;
-			    else
-			        time_freq += ltemp >> SHIFT_KH;
-			} else /* calibration interval too short (p. 12) */
-				result = TIME_ERROR;
-		    } else {	/* PLL mode */
-		        if (mtemp < MAXSEC) {
-			    ltemp *= mtemp;
-			    if (ltemp < 0)
-			        time_freq -= -ltemp >> (time_constant +
-							time_constant +
-							SHIFT_KF - SHIFT_USEC);
-			    else
-			        time_freq += ltemp >> (time_constant +
-						       time_constant +
-						       SHIFT_KF - SHIFT_USEC);
-			} else /* calibration interval too long (p. 12) */
-				result = TIME_ERROR;
-		    }
-		    if (time_freq > time_tolerance)
-		        time_freq = time_tolerance;
-		    else if (time_freq < -time_tolerance)
-		        time_freq = -time_tolerance;
-		} /* STA_PLL || STA_PPSTIME */
-	    } /* txc->modes & ADJ_OFFSET */
-	    if (txc->modes & ADJ_TICK) {
-		tick_usec = txc->tick;
-		tick_nsec = TICK_USEC_TO_NSEC(tick_usec);
-	    }
-	} /* txc->modes */
-leave:	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
-	    || ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
-		&& (time_status & STA_PPSSIGNAL) == 0)
-	    /* p. 24, (b) */
-	    || ((time_status & (STA_PPSTIME|STA_PPSJITTER))
-		== (STA_PPSTIME|STA_PPSJITTER))
-	    /* p. 24, (c) */
-	    || ((time_status & STA_PPSFREQ) != 0
-		&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
-	    /* p. 24, (d) */
+	if ((time_status & (STA_UNSYNC|STA_CLOCKERR)) != 0
+		|| ((time_status & (STA_PPSFREQ|STA_PPSTIME)) != 0
+			&& (time_status & STA_PPSSIGNAL) == 0)
+		/* p. 24, (b) */
+		|| ((time_status & (STA_PPSTIME|STA_PPSJITTER))
+			== (STA_PPSTIME|STA_PPSJITTER))
+		/* p. 24, (c) */
+		|| ((time_status & STA_PPSFREQ) != 0
+			&& (time_status & (STA_PPSWANDER|STA_PPSERROR)) != 0))
+		/* p. 24, (d) */
 		result = TIME_ERROR;
 	
 	if ((txc->modes & ADJ_OFFSET_SINGLESHOT) == ADJ_OFFSET_SINGLESHOT)
-	    txc->offset	   = save_adjust;
-	else {
-	    if (time_offset < 0)
-		txc->offset = -(-time_offset >> SHIFT_UPDATE);
-	    else
-		txc->offset = time_offset >> SHIFT_UPDATE;
-	}
-	txc->freq	   = time_freq + pps_freq;
-	txc->maxerror	   = time_maxerror;
-	txc->esterror	   = time_esterror;
-	txc->status	   = time_status;
-	txc->constant	   = time_constant;
-	txc->precision	   = time_precision;
-	txc->tolerance	   = time_tolerance;
-	txc->tick	   = tick_usec;
-	txc->ppsfreq	   = pps_freq;
-	txc->jitter	   = pps_jitter >> PPS_AVG;
-	txc->shift	   = pps_shift;
-	txc->stabil	   = pps_stabil;
-	txc->jitcnt	   = pps_jitcnt;
-	txc->calcnt	   = pps_calcnt;
-	txc->errcnt	   = pps_errcnt;
-	txc->stbcnt	   = pps_stbcnt;
-	write_sequnlock_irq(&xtime_lock);
+		txc->offset = save_adjust;
+	else
+		txc->offset = shift_right(time_offset, SHIFT_UPDATE);
+
+	txc->freq	= time_freq + pps_freq;
+	txc->maxerror	= time_maxerror;
+	txc->esterror	= time_esterror;
+	txc->status	= time_status;
+	txc->constant	= time_constant;
+	txc->precision	= time_precision;
+	txc->tolerance	= time_tolerance;
+	/*
+	 * TODO: shouldn't txc->time be filled in here, within ntp_lock and
+	 * xtime_lock, to get an atomic snapshot of time state? --mingo
+	 */
+	txc->tick	= tick_usec;
+	txc->ppsfreq	= pps_freq;
+	txc->jitter	= pps_jitter >> PPS_AVG;
+	txc->shift	= pps_shift;
+	txc->stabil	= pps_stabil;
+	txc->jitcnt	= pps_jitcnt;
+	txc->calcnt	= pps_calcnt;
+	txc->errcnt	= pps_errcnt;
+	txc->stbcnt	= pps_stbcnt;
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+
 	do_gettimeofday(&txc->time);
+
 	notify_arch_cmos_timer();
-	return(result);
+
+	return result;
 }
 
 asmlinkage long sys_adjtimex(struct timex __user *txc_p)
 {
-	struct timex txc;		/* Local copy of parameter */
+	struct timex txc;
 	int ret;
 
-	/* Copy the user data space into the kernel copy
-	 * structure. But bear in mind that the structures
-	 * may change
+	/*
+	 * copy the user data space into the kernel copy
+	 * structure:
 	 */
-	if(copy_from_user(&txc, txc_p, sizeof(struct timex)))
+	if (copy_from_user(&txc, txc_p, sizeof(struct timex)))
 		return -EFAULT;
+
 	ret = do_adjtimex(&txc);
-	return copy_to_user(txc_p, &txc, sizeof(struct timex)) ? -EFAULT : ret;
+
+	/*
+	 * copy the results back to userspace (even if there was an error):
+	 */
+	if (copy_to_user(txc_p, &txc, sizeof(struct timex)))
+		ret = -EFAULT;
+
+	return ret;
 }
 
 inline struct timespec current_kernel_time(void)
@@ -486,6 +570,7 @@ struct timespec timespec_trunc(struct ti
 }
 EXPORT_SYMBOL(timespec_trunc);
 
+#ifndef CONFIG_GENERIC_TIME
 #ifdef CONFIG_TIME_INTERPOLATION
 void getnstimeofday (struct timespec *tv)
 {
@@ -522,10 +607,7 @@ int do_settimeofday (struct timespec *tv
 		set_normalized_timespec(&xtime, sec, nsec);
 		set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec);
 
-		time_adjust = 0;		/* stop active adjtime() */
-		time_status |= STA_UNSYNC;
-		time_maxerror = NTP_PHASE_LIMIT;
-		time_esterror = NTP_PHASE_LIMIT;
+		ntp_clear();
 		time_interpolator_reset();
 	}
 	write_sequnlock_irq(&xtime_lock);
@@ -573,6 +655,106 @@ void getnstimeofday(struct timespec *tv)
 EXPORT_SYMBOL_GPL(getnstimeofday);
 #endif
 
+#endif /* !CONFIG_GENERIC_TIME */
+
+/* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
+ * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
+ * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
+ *
+ * [For the Julian calendar (which was used in Russia before 1917,
+ * Britain & colonies before 1752, anywhere else before 1582,
+ * and is still in use by some communities) leave out the
+ * -year/100+year/400 terms, and add 10.]
+ *
+ * This algorithm was first published by Gauss (I think).
+ *
+ * WARNING: this function will overflow on 2106-02-07 06:28:16 on
+ * machines where long is 32-bit! (However, as time_t is signed, we
+ * will already get problems at other places on 2038-01-19 03:14:08)
+ */
+unsigned long
+mktime (unsigned int year, unsigned int mon,
+	unsigned int day, unsigned int hour,
+	unsigned int min, unsigned int sec)
+{
+	if (0 >= (int) (mon -= 2)) {	/* 1..12 -> 11,12,1..10 */
+		mon += 12;		/* Puts Feb last since it has leap day */
+		year -= 1;
+	}
+
+	return ((((unsigned long)
+		  (year/4 - year/100 + year/400 + 367*mon/12 + day) +
+		  year*365 - 719499
+	    )*24 + hour /* now have hours */
+	  )*60 + min /* now have minutes */
+	)*60 + sec; /* finally seconds */
+}
+EXPORT_SYMBOL(mktime);
+
+/**
+ * set_normalized_timespec - set timespec sec and nsec parts and normalize
+ *
+ * @ts:		pointer to timespec variable to be set
+ * @sec:	seconds to set
+ * @nsec:	nanoseconds to set
+ *
+ * Set seconds and nanoseconds field of a timespec variable and
+ * normalize to the timespec storage format
+ *
+ * Note: The tv_nsec part is always in the range of
+ * 	0 <= tv_nsec < NSEC_PER_SEC
+ * For negative values only the tv_sec field is negative !
+ */
+void set_normalized_timespec (struct timespec *ts, time_t sec, long nsec)
+{
+	while (nsec >= NSEC_PER_SEC) {
+		nsec -= NSEC_PER_SEC;
+		++sec;
+	}
+	while (nsec < 0) {
+		nsec += NSEC_PER_SEC;
+		--sec;
+	}
+	ts->tv_sec = sec;
+	ts->tv_nsec = nsec;
+}
+
+/**
+ * ns_to_timespec - Convert nanoseconds to timespec
+ *
+ * @ts:		pointer to timespec variable to store result
+ * @nsec:	nanoseconds value to be converted
+ *
+ * Stores the timespec representation of the nanoseconds value in
+ * the timespec variable pointed to by @ts
+ */
+void ns_to_timespec(struct timespec *ts, nsec_t nsec)
+{
+	if (nsec)
+		ts->tv_sec = div_long_long_rem_signed(nsec, NSEC_PER_SEC,
+							&ts->tv_nsec);
+	else
+		ts->tv_sec = ts->tv_nsec = 0;
+}
+
+/**
+ * ns_to_timeval - Convert nanoseconds to timeval
+ *
+ * @tv:		pointer to timeval variable to store result
+ * @nsec:	nanoseconds value to be converted
+ *
+ * Stores the timeval representation of the nanoseconds value in
+ * the timeval variable pointed to by @tv
+ */
+void ns_to_timeval(struct timeval *tv, nsec_t nsec)
+{
+	struct timespec ts;
+
+	ns_to_timespec(&ts, nsec);
+	tv->tv_sec = ts.tv_sec;
+	tv->tv_usec = (suseconds_t) ts.tv_nsec / 1000;
+}
+
 #if (BITS_PER_LONG < 64)
 u64 get_jiffies_64(void)
 {
Index: linux/kernel/time/Kconfig
===================================================================
--- /dev/null
+++ linux/kernel/time/Kconfig
@@ -0,0 +1,36 @@
+#
+# Timer subsystem related configuration options
+#
+
+config KTIME_SCALAR
+	bool "Ktimers 64bit scalar representation"
+	depends on !64BIT
+	default n
+	help
+	 (You don't want to change this unless you want to hack the
+	 timer code. Just keep it disabled.)
+
+	 This option enables the 64bit based scalar representation
+	 of the ktimer internal variables on 32bit systems. On i386
+	 this results in denser code and slightly better overall
+	 performance.
+
+config HIGH_RES_TIMERS
+	bool "High Resolution Timer Support"
+	depends on GENERIC_TIME
+	help
+	  This option enables high resolution timer support. If your
+	  hardware is not capable then this option only increases
+	  the size of the kernel image.
+
+config HIGH_RES_RESOLUTION
+	int "High Resolution Timer resolution (nanoseconds)"
+	depends on HIGH_RES_TIMERS
+	default 1000
+	help
+	  This sets the resolution in nanoseconds of the high resolution
+	  timers. Too fine a resolution (too small a number) will usually
+	  not be observable due to normal system latencies.  For an
+	  800 MHz processor about 10,000 (10 microseconds) is recommended as
+	  the finest resolution.  If you don't need that sort of resolution,
+	  larger values may generate less overhead.
Index: linux/kernel/time/Makefile
===================================================================
--- /dev/null
+++ linux/kernel/time/Makefile
@@ -0,0 +1 @@
+obj-y	= clocksource.o jiffies.o clockevents.o timeofday.o
Index: linux/kernel/time/clockevents.c
===================================================================
--- /dev/null
+++ linux/kernel/time/clockevents.c
@@ -0,0 +1,608 @@
+/*
+ * linux/kernel/time/clockevents.c
+ *
+ * This file contains functions which manage clock event drivers.
+ *
+ * Copyright(C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ *
+ * Kudos to Ingo Molnar for review, criticism, ideas
+ *
+ * We have two types of clock event devices:
+ * - global events (one device per system)
+ * - local events (one device per cpu)
+ *
+ * We assign the various time(r) related interrupts to those devices
+ *
+ * - global tick
+ * - profiling (per cpu)
+ * - next timer events (per cpu)
+ *
+ * TODO:
+ * - implement variable frequency profiling
+ */
+
+#include <linux/clockchips.h>
+#include <linux/cpu.h>
+#include <linux/init.h>
+#include <linux/notifier.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sysdev.h>
+#include <linux/ktimer.h>
+
+#define MAX_CLOCK_EVENTS	4
+
+struct event_descr {
+	struct clock_event *event;	/* device stored by setup_event() */
+	unsigned int mode;		/* CLOCK_EVT_STARTUP after setup_event() */
+	unsigned int real_caps;		/* capability bits currently served */
+	struct irqaction action;	/* irq action passed to setup_irq() */
+};
+
+struct local_events {
+	int installed;				/* used slots in events[] */
+	struct event_descr events[MAX_CLOCK_EVENTS];
+	struct clock_event *nextevt;		/* next event device, if any */
+};
+
+/* Variables related to the global event source */
+static struct event_descr global_eventsource;
+
+/* Variables related to the per cpu local event sources */
+static DEFINE_PER_CPU(struct local_events, local_eventsources);
+
+#ifdef CONFIG_SMP
+# define recalc_global_event(e) do { } while(0)
+#else	/* UP. NOTE(review): recalc_events() drops the caps returned here */
+# define recalc_global_event(c) recalc_active_event(&global_eventsource, c)
+#endif	/* CONFIG_SMP */
+
+/*
+ * Math helper: (latch << shift) / mult, clamped to the ns limits below
+ */
+unsigned long clockevent_delta2ns(unsigned long latch, struct clock_event *evt)
+{
+	u64 ns = (u64) latch;
+
+	ns <<= evt->shift;
+	do_div(ns, evt->mult);
+	if (ns < KTIME_MONOTONIC_RES)
+		ns = KTIME_MONOTONIC_RES;
+	if (ns > 0x7FFFFFFF)
+		ns = 0x7FFFFFFF;
+	return (unsigned long) ns;
+}
+
+/*
+ * Generic timer interrupt handler; @dev_id is the clock event device
+ */
+static irqreturn_t timer_interrupt(int irq, void *dev_id, struct pt_regs *regs)
+{
+	struct clock_event *dev = dev_id;
+
+	/* optional device hook before the event handler runs */
+	if (dev->start_event)
+		dev->start_event(dev->priv);
+	dev->event_handler(regs);
+	/* optional device hook after the event handler ran */
+	if (dev->end_event)
+		dev->end_event(dev->priv);
+
+	return IRQ_HANDLED;
+}
+
+/*
+ * Handle tick: call do_timer() while holding xtime_lock for writing
+ */
+static void handle_tick(struct pt_regs *regs)
+{
+	write_seqlock(&xtime_lock);
+	do_timer(regs);
+	write_sequnlock(&xtime_lock);
+}
+
+/*
+ * Handle tick and update: tick first, then the process times
+ */
+static void handle_tick_update(struct pt_regs *regs)
+{
+	/* do_timer() under xtime_lock, exactly as the tick only handler */
+	handle_tick(regs);
+
+	/* runs outside of xtime_lock */
+	update_process_times(user_mode(regs));
+}
+
+/*
+ * Handle tick, update and profiling, in that order
+ */
+static void handle_tick_update_profile(struct pt_regs *regs)
+{
+	/* do_timer() under xtime_lock, exactly as the tick only handler */
+	handle_tick(regs);
+
+	/* both run outside of xtime_lock */
+	update_process_times(user_mode(regs));
+	profile_tick(CPU_PROFILING, regs);
+}
+
+/*
+ * Handle update: pass user_mode(regs) to update_process_times()
+ */
+static void handle_update(struct pt_regs *regs)
+{
+	update_process_times(user_mode(regs));
+}
+
+/*
+ * Handle update and profile: process times first, then profile_tick()
+ */
+static void handle_update_profile(struct pt_regs *regs)
+{
+	update_process_times(user_mode(regs));
+	profile_tick(CPU_PROFILING, regs);
+}
+
+/*
+ * Handle profile: call profile_tick() for CPU_PROFILING
+ */
+static void handle_profile(struct pt_regs *regs)
+{
+	profile_tick(CPU_PROFILING, regs);
+}
+
+/*
+ * Handle next event: run ktimer_interrupt(), its result is not used
+ */
+static void handle_nextevent(struct pt_regs *regs)
+{
+	ktimer_interrupt();
+}
+
+/*
+ * Handle next event, tick
+ */
+static void handle_nextevent_tick(struct pt_regs *regs)
+{
+	int pending;
+
+	/* handle_tick() runs once per count ktimer_interrupt() returned */
+	for (pending = ktimer_interrupt(); pending > 0; pending--)
+		handle_tick(regs);
+}
+
+/*
+ * Handle next event, update (only if ktimer_interrupt() returns > 0)
+ */
+static void handle_nextevent_update(struct pt_regs *regs)
+{
+	if (ktimer_interrupt() > 0)
+		handle_update(regs);
+}
+
+/*
+ * Handle next event, tick, update (nothing is done on a 0 result)
+ */
+static void handle_nextevent_tick_update(struct pt_regs *regs)
+{
+	int pending = ktimer_interrupt();
+
+	if (pending == 0)
+		return;
+
+	for (; pending > 0; pending--)
+		handle_tick(regs);
+
+	handle_update(regs);
+}
+
+/*
+ * Handle next event, profile (only if ktimer_interrupt() returns > 0)
+ */
+static void handle_nextevent_profile(struct pt_regs *regs)
+{
+	if (ktimer_interrupt() > 0)
+		handle_profile(regs);
+}
+
+/*
+ * Handle next event, update, profile (only on a result > 0)
+ */
+static void handle_nextevent_update_profile(struct pt_regs *regs)
+{
+	if (ktimer_interrupt() > 0)
+		handle_update_profile(regs);
+}
+
+/*
+ * Next event + tick + update + profile (nothing is done on a 0 result)
+ */
+static void handle_nextevent_all(struct pt_regs *regs)
+{
+	int pending = ktimer_interrupt();
+
+	if (pending == 0)
+		return;
+
+	for (; pending > 0; pending--)
+		handle_tick(regs);
+
+	handle_update_profile(regs);
+}
+
+/*
+ * Lookup table for event assignment, indexed by the capability bits
+ */
+static void *event_handlers[] = {
+	NULL,				/* 0: No capability selected */
+	handle_tick,			/* 1: Tick only	*/
+	handle_nextevent,		/* 2: Next event only */
+	handle_nextevent_tick,		/* 3: Next event + tick */
+	handle_update,			/* 4: Update process times */
+	handle_tick_update,		/* 5: Tick + update process times */
+	handle_nextevent_update,	/* 6: Next event +
+					      update process times */
+	handle_nextevent_tick_update,	/* 7: Next event + tick +
+					      update process times */
+	handle_profile,			/* 8: Profiling int */
+	NULL,				/* 9: Tick + profiling: no handler */
+	handle_nextevent_profile,	/* A: Next event + profiling */
+	NULL,				/* B: Next event + tick + profiling: none */
+	handle_update_profile,		/* C: Update process times +
+					      profiling */
+	handle_tick_update_profile,	/* D: Tick + update process times +
+					      profiling */
+	handle_nextevent_update_profile,/* E: Next event +
+					      update process times +
+					      profiling */
+	handle_nextevent_all,		/* F: Next event + tick +
+					      update process times +
+					      profiling */
+};
+
+/*
+ * The selection model makes following assumptions:
+ *
+ * There is only one global event source set up. Global event sources
+ * are unique devices in a system (UP/SMP) Usually they are setup
+ * early in the bootup phase to provide the basic tick environment to
+ * bring up hardware. Such a device can be capable of providing all in
+ * one functionality including next event scheduling.
+ *
+ * When a system has dedicated event sources which can be used for
+ * particular purposes then we assume that there are no devices setup
+ * which provide "competing" functionality. i.e. the developer has to
+ * decide which device should be used for a particular functionality
+ * rather than letting the management code guess about the best
+ * fit. The code manages the cases, where the number of event sources
+ * is unknown during compile time, but the functionality of the event
+ * source is assigned to the respective event source by a human best
+ * fit decision.
+ *
+ * The purpose of the management code is to provide handling code for
+ * the various possible combinations and the necessary infrastructure
+ * to handle next event (e.g. high resolution) scheduling with a
+ * single event source, which makes a periodic rescheduling of the
+ * tick interrupt necessary. This is done to avoid the #ifdef mess all
+ * over the architecture dependent timer and event interrupt code for
+ * the various possible use case combinations and allows clean non
+ * intrusive implementation of configurable extensions to the time
+ * related event system e.g. dynamic ticks, high resolution
+ * timers.
+ *
+ * Some architectures can use a NMI based profiling mechanism. If this
+ * is used, then profiling is excluded from the event assignments.
+ *
+ * SMP systems which have no unique global event source should not
+ * setup a global event source. The correct way is setting up one
+ * event source (usually local to CPU0 or the bootcpu in hotplug
+ * systems) which has the CLOCK_CAP_TICK flag set, so the management
+ * code assigns exactly one tick source for the complete system.
+ *
+ * A special case are pseudo event sources (IPI mechanisms) on SMP
+ * systems. They can be used for populating tick events from one event
+ * source across multiple CPUs.
+ *
+ */
+static int setup_event(struct event_descr *descr, struct clock_event *evt,
+		       unsigned int caps, cpumask_t cpumask)
+{
+	void *handler = event_handlers[caps];
+
+	if (!handler) {
+		printk(KERN_ERR "Unsupported event source %s\n", evt->name);
+		return -EINVAL;
+	}
+
+	/* Store the event handler selected for this capability set */
+	evt->event_handler = handler;
+
+	/* Save the event descriptor reference */
+	descr->event = evt;
+	/* Devices without their own irq handler get timer_interrupt() */
+	if (!(evt->capabilities & CLOCK_HAS_IRQHANDLER)) {
+		descr->action.name = evt->name;
+		descr->action.handler = timer_interrupt;
+		descr->action.flags = SA_INTERRUPT | SA_NODELAY;
+		descr->action.mask = cpumask;
+		descr->action.dev_id = evt;
+		setup_irq(evt->irq, &descr->action);
+	}
+	/* NOTE(review): the setup_irq() result above is not checked */
+	descr->real_caps = caps;
+	descr->mode = CLOCK_EVT_STARTUP;
+	if (evt->set_mode)
+		evt->set_mode(CLOCK_EVT_STARTUP);
+	printk(KERN_INFO "Event source %s installed with caps set: %02x\n",
+	       descr->event->name, descr->real_caps);
+
+	return 0;
+}
+
+/*
+ * Hand the overlap of @caps and @descr's capabilities to the new event
+ * source, reassign @descr's handler and return the caps the new one gets.
+ */
+static unsigned int recalc_active_event(struct event_descr *descr,
+					unsigned int caps)
+{
+	unsigned int gcaps;
+
+	if (!descr->event)
+		return caps;
+
+	/* Find out the overlapping bits */
+	gcaps = descr->real_caps & caps;
+
+	/*
+	 * Be careful here. We don't know in which order the event
+	 * sources are set up, so the new source might cover everything
+	 * the old one does. In that case the old source keeps its lowest
+	 * capability bit and the new source loses that bit instead.
+	 * Might need more thoughts though.
+	 */
+	if (gcaps == descr->real_caps) {
+		int i;
+
+		i = ffs(gcaps) - 1;
+		gcaps &= ~(1 << i);
+		caps &= ~(1 << i);
+	}
+	if (!gcaps)
+		return caps;
+
+	/* Mask the bits which are now covered by the new event */
+	descr->real_caps &= ~gcaps;
+
+	/* NOTE(review): event_handlers[] has NULL slots (9, B) - confirm */
+	descr->event->event_handler = event_handlers[descr->real_caps];
+	printk(KERN_INFO "Event source %s new caps set: %02x\n" ,
+	       descr->event->name, descr->real_caps);
+
+	return caps;
+}
+
+/*
+ * Recalc the events and reassign the handlers. Returns 0 or -errno.
+ */
+static int recalc_events(struct local_events *sources, struct clock_event *evt,
+			 cpumask_t cpumask)
+{
+	unsigned int caps = evt->capabilities & CLOCK_CAP_MASK;
+	int i, res;
+
+	if (sources->installed == MAX_CLOCK_EVENTS)
+		return -ENOSPC;
+	if (!event_handlers[caps])
+		return -EINVAL;
+	recalc_global_event(caps);
+	for (i = 0; i < sources->installed; i++)
+		caps = recalc_active_event(&sources->events[i], caps);
+
+	/* A failed setup must neither use up a slot nor become nextevt */
+	res = setup_event(&sources->events[sources->installed], evt, caps,
+			  cpumask);
+	if (res)
+		return res;
+	sources->installed++;
+	if (evt->capabilities & CLOCK_CAP_NEXTEVT) {
+		sources->nextevt = evt;
+		ktimer_clock_notify();
+	}
+	return 0;
+}
+
+/**
+ * setup_local_clockevent - Set up a cpu local clock event device
+ *
+ * @evtdev:	event device to be registered
+ * @cpumask:	cpumask for the irq setup
+ */
+int setup_local_clockevent(struct clock_event *evtdev, cpumask_t cpumask)
+{
+	struct local_events *sources = &__get_cpu_var(local_eventsources);
+	unsigned long flags;
+	int res;
+
+	/* Recalc with irqs off; the recalc_events() result is returned */
+	raw_local_irq_save(flags);
+	res = recalc_events(sources, evtdev, cpumask);
+	raw_local_irq_restore(flags);
+
+	return res;
+}
+EXPORT_SYMBOL(setup_local_clockevent);
+
+/**
+ * setup_global_clockevent - Set the device generating global clock events
+ * @evt:	The device which generates global clock events (ticks)
+ * @cpumask:	cpumask for the irq setup
+ */
+int __init setup_global_clockevent(struct clock_event *evt, cpumask_t cpumask)
+{
+	int res;
+
+	res = setup_event(&global_eventsource, evt,
+			   evt->capabilities & CLOCK_CAP_MASK, cpumask);
+#ifndef CONFIG_SMP
+	/*
+	 * The "global" event source on UP systems can serve as
+	 * next event source !
+	 */
+	if (!res && (evt->capabilities & CLOCK_CAP_NEXTEVT))
+		per_cpu(local_eventsources, 0).nextevt = evt;
+#endif
+	return res;
+}
+
+/**
+ * clockevents_next_event_available - Check for an installed next event source
+ */
+int clockevents_next_event_available(void)
+{
+	struct local_events *sources = &__get_cpu_var(local_eventsources);
+	struct clock_event *next = sources->nextevt;
+	int i;
+
+	if (!next)
+		return 0;
+#ifndef CONFIG_SMP
+	if (next == global_eventsource.event)
+		return CLOCK_EVT_SCHEDTICK;
+#endif
+	/*
+	 * Look up the descriptor of the next event source. If real_caps
+	 * still holds other capabilities (some other source might have
+	 * switched flags off), periodic tick functionality is needed.
+	 */
+	for (i = 0; i < sources->installed; i++) {
+		struct event_descr *descr = &sources->events[i];
+
+		if (descr->event != next)
+			continue;
+		if (descr->real_caps & ~CLOCK_CAP_NEXTEVT)
+			return CLOCK_EVT_SCHEDTICK;
+		break;
+	}
+	return CLOCK_EVT_NOTICK;
+}
+
+int clockevents_init_next_event(void)
+{
+	struct clock_event *next = __get_cpu_var(local_eventsources).nextevt;
+
+	if (!next)
+		return 0;
+	/* request one shot mode if the device has a set_mode hook */
+	if (next->set_mode)
+		next->set_mode(CLOCK_EVT_ONESHOT);
+
+	return 1;
+}
+
+int clockevents_set_next_event(ktime_t expires, ktime_t now)
+{
+	struct clock_event *dev = __get_cpu_var(local_eventsources).nextevt;
+	nsec_t delta = ktime_to_ns(ktime_sub(expires, now));
+	unsigned long clc;
+
+	/* an expiry which is not after @now is refused */
+	if (delta <= 0)
+		return -ETIME;
+	/* clamp to max_delta_ns first, then to min_delta_ns */
+	if (delta > dev->max_delta_ns)
+		delta = dev->max_delta_ns;
+	if (delta < dev->min_delta_ns)
+		delta = dev->min_delta_ns;
+
+	clc = mpy_sc32((unsigned long) delta, dev->mult);
+	dev->set_next_event(clc);
+	ktimer_trace(expires, clc);
+	return 0;
+}
+
+void clockevents_trigger_next_event(void)
+{
+}
+
+#ifdef CONFIG_PM
+static int
+global_eventsource_suspend(struct sys_device *dev, pm_message_t state)
+{
+	/* The sysdev is always registered, a global event source is not */
+	if (global_eventsource.event && global_eventsource.event->suspend)
+		global_eventsource.event->suspend();
+	return 0;
+}
+
+static int global_eventsource_resume(struct sys_device *dev)
+{
+	/* The sysdev is always registered, a global event source is not */
+	if (global_eventsource.event && global_eventsource.event->resume)
+		global_eventsource.event->resume();
+	return 0;
+}
+#else
+# define global_eventsource_resume	NULL
+# define global_eventsource_suspend	NULL
+#endif
+
+static struct sysdev_class global_clock_event_sysclass = {
+	.resume = global_eventsource_resume,
+	.suspend = global_eventsource_suspend,
+	set_kset_name("global_clock_event"),
+};
+
+static struct sys_device device_global_clock_event = {
+	.id	= 0,
+	.cls	= &global_clock_event_sysclass,
+};
+
+static int __init global_clock_event_devinit(void)
+{
+	int error;
+
+	error = sysdev_class_register(&global_clock_event_sysclass);
+	if (error)
+		return error;
+	return sysdev_register(&device_global_clock_event);
+}
+
+device_initcall(global_clock_event_devinit);
+
+/*
+ * Functions related to initialization
+ */
+static void __devinit init_clockevents_cpu(int cpu)
+{
+}
+
+static int __devinit clockevents_cpu_notify(struct notifier_block *self,
+					    unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;	/* cpu number travels in the pointer value */
+
+	switch(action) {
+	case CPU_UP_PREPARE:
+		init_clockevents_cpu(cpu);
+		break;
+#ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DEAD:		/* no work is done for a dead cpu */
+		break;
+#endif
+	default:
+		break;
+	}
+	return NOTIFY_OK;	/* returned for every action */
+}
+
+static struct notifier_block __devinitdata clockevents_nb = {
+	.notifier_call	= clockevents_cpu_notify,
+};
+
+void __init init_clockevents(void)
+{
+	clockevents_cpu_notify(&clockevents_nb, (unsigned long)CPU_UP_PREPARE,
+				(void *)(long)smp_processor_id());
+	register_cpu_notifier(&clockevents_nb);
+}
Index: linux/kernel/time/clocksource.c
===================================================================
--- /dev/null
+++ linux/kernel/time/clocksource.c
@@ -0,0 +1,314 @@
+/*
+ * linux/kernel/time/clocksource.c
+ *
+ * This file contains the functions which manage clocksource drivers.
+ *
+ * Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ *   o Allow clocksource drivers to be unregistered
+ *   o get rid of clocksource_jiffies extern
+ */
+
+#include <linux/clocksource.h>
+#include <linux/sysdev.h>
+#include <linux/init.h>
+#include <linux/module.h>
+
+/* XXX - Would like a better way for initializing curr_clocksource */
+extern struct clocksource clocksource_jiffies;
+
+/*[Clocksource internal variables]---------
+ * curr_clocksource:
+ *	currently selected clocksource. Initialized to clocksource_jiffies.
+ * next_clocksource:
+ *	pending next selected clocksource.
+ * clocksource_list:
+ *	linked list with the registered clocksources
+ * clocksource_lock:
+ *	protects manipulations to curr_clocksource and next_clocksource
+ *	and the clocksource_list
+ * override_name:
+ *	Name of the user-specified clocksource.
+ */
+static struct clocksource *curr_clocksource = &clocksource_jiffies;
+static struct clocksource *next_clocksource;
+static LIST_HEAD(clocksource_list);
+/* TODO: why a seqlock? It's only write-locked, so should be a spinlock. */
+static DECLARE_RAW_SEQLOCK(clocksource_lock);
+
+static char override_name[32];
+
+/**
+ * get_next_clocksource - Activate a pending clocksource, return current one
+ */
+struct clocksource *get_next_clocksource(void)
+{
+	write_seqlock(&clocksource_lock);
+	if (next_clocksource) {
+		curr_clocksource = next_clocksource;
+		next_clocksource = NULL;
+	}
+	write_sequnlock(&clocksource_lock);
+
+	return curr_clocksource;
+}
+
+/**
+ * select_clocksource - Finds the best registered clocksource.
+ *
+ * Returns the entry of clocksource_list whose name matches
+ * override_name or, without such a match, the first entry with the
+ * highest rating. Returns NULL when the list is empty.
+ *
+ * Private function. Must have a writelock on clocksource_lock
+ * when called.
+ */
+static struct clocksource *select_clocksource(void)
+{
+	struct clocksource *pick = NULL;
+	struct list_head *pos;
+
+	list_for_each(pos, &clocksource_list) {
+		struct clocksource *cs;
+
+		cs = list_entry(pos, struct clocksource, list);
+
+		/* a user supplied override name beats any rating */
+		if (!strcmp(cs->name, override_name))
+			return cs;
+
+		/* otherwise remember the first entry with the top rating */
+		if (!pick || cs->rating > pick->rating)
+			pick = cs;
+	}
+
+	return pick;
+}
+
+/**
+ * is_registered_source - Checks if clocksource is registered
+ * @c:		pointer to a clocksource
+ *
+ * Private helper function, should not be used externally.
+ *
+ * Returns one if a clocksource with the same name is on the list, else zero.
+ */
+static inline int is_registered_source(struct clocksource *c)
+{
+	struct list_head *pos;
+
+	list_for_each(pos, &clocksource_list) {
+		struct clocksource *cs;
+
+		cs = list_entry(pos, struct clocksource, list);
+		/* strcmp() only reports equality for equally long names */
+		if (strcmp(cs->name, c->name) == 0)
+			return 1;
+	}
+
+	return 0;
+}
+
+/**
+ * register_clocksource - Used to install new clocksources
+ * @c:		clocksource to be registered
+ */
+void register_clocksource(struct clocksource *c)
+{
+	write_seqlock(&clocksource_lock);
+
+	/* check if clocksource is already registered */
+	if (is_registered_source(c)) {
+		printk("register_clocksource: Cannot register %s. Already registered!\n",
+		       c->name);
+	} else {
+		list_add(&c->list, &clocksource_list);
+		/* select next clocksource */
+		next_clocksource = select_clocksource();
+	}
+	write_sequnlock(&clocksource_lock);
+}
+
+EXPORT_SYMBOL(register_clocksource);
+
+/**
+ * reselect_clocksource - Rescan list for next clocksource
+ *
+ * A quick helper function to be used if a clocksource changes its
+ * rating. Forces the clocksource list to be re-scanned for the best
+ * clocksource, which is stored as the pending next_clocksource.
+ */
+void reselect_clocksource(void)
+{
+	write_seqlock(&clocksource_lock);
+	next_clocksource = select_clocksource();
+	write_sequnlock(&clocksource_lock);
+}
+
+/**
+ * sysfs_show_current_clocksources - sysfs interface for current clocksource
+ * @dev:	unused
+ * @buf:	char buffer to be filled with the current clocksource name
+ *
+ * Writes the name, a space and a newline; returns the bytes written.
+ */
+static ssize_t
+sysfs_show_current_clocksources(struct sys_device *dev, char *buf)
+{
+	ssize_t len;
+
+	write_seqlock(&clocksource_lock);
+	len = sprintf(buf, "%s ", curr_clocksource->name);
+	write_sequnlock(&clocksource_lock);
+
+	len += sprintf(buf + len, "\n");
+
+	return len;
+}
+
+/**
+ * sysfs_override_clocksource - interface for manually overriding clocksource
+ * @dev:	unused
+ * @buf:	name of override clocksource
+ * @count:	length of buffer
+ *
+ * Takes input from sysfs interface for manually overriding the default
+ * clocksource selection. Returns @count as passed in, or -EINVAL.
+ */
+static ssize_t sysfs_override_clocksource(struct sys_device *dev,
+					  const char *buf, size_t count)
+{
+	size_t ret = count;	/* a stripped '\n' counts as consumed too */
+
+	/* strings from sysfs write are not 0 terminated! */
+	if (count >= sizeof(override_name))
+		return -EINVAL;
+
+	/* strip off \n, but never look in front of an empty buffer: */
+	if (count > 0 && buf[count-1] == '\n')
+		count--;
+	if (count < 1)
+		return -EINVAL;
+
+	write_seqlock(&clocksource_lock);
+	/* copy the name given: */
+	memcpy(override_name, buf, count);
+	override_name[count] = 0;
+
+	/* try to select it: */
+	next_clocksource = select_clocksource();
+	write_sequnlock(&clocksource_lock);
+
+	return ret;
+}
+
+/**
+ * sysfs_show_available_clocksources - sysfs interface for listing clocksource
+ * @dev:	unused
+ * @buf:	char buffer to be filled with clocksource list
+ *
+ * Writes every registered name followed by a space, then a newline
+ */
+static ssize_t
+sysfs_show_available_clocksources(struct sys_device *dev, char *buf)
+{
+	struct list_head *pos;
+	ssize_t len = 0;
+
+	write_seqlock(&clocksource_lock);
+	list_for_each(pos, &clocksource_list) {
+		struct clocksource *cs;
+
+		cs = list_entry(pos, struct clocksource, list);
+		len += sprintf(buf + len, "%s ", cs->name);
+	}
+	write_sequnlock(&clocksource_lock);
+
+	len += sprintf(buf + len, "\n");
+
+	return len;
+}
+
+/*
+ * Sysfs setup bits:
+ */
+static SYSDEV_ATTR(current_clocksource, 0600, sysfs_show_current_clocksources,
+			sysfs_override_clocksource);
+
+static SYSDEV_ATTR(available_clocksource, 0600,
+			sysfs_show_available_clocksources, NULL);
+
+static struct sysdev_class clocksource_sysclass = {
+	set_kset_name("clocksource"),
+};
+
+static struct sys_device device_clocksource = {
+	.id	= 0,
+	.cls	= &clocksource_sysclass,
+};
+
+static int init_clocksource_sysfs(void)
+{
+	int error = sysdev_class_register(&clocksource_sysclass);
+
+	if (error)
+		return error;
+	error = sysdev_register(&device_clocksource);
+	if (error)
+		return error;
+	error = sysdev_create_file(&device_clocksource,
+				   &attr_current_clocksource);
+	if (error)
+		return error;
+	return sysdev_create_file(&device_clocksource,
+				  &attr_available_clocksource);
+}
+
+device_initcall(init_clocksource_sysfs);
+
+/**
+ * boot_override_clocksource - boot clock override
+ * @str:	override name
+ *
+ * Takes a clocksource= boot argument and copies it (truncated to fit)
+ * into the clocksource override name. Always returns 1.
+ */
+static int __init boot_override_clocksource(char* str)
+{
+	if (str)
+		strlcpy(override_name, str, sizeof(override_name));
+	return 1;
+}
+
+__setup("clocksource=", boot_override_clocksource);
+
+/**
+ * boot_override_clock - Compatibility layer for deprecated boot option
+ * @str:	override name
+ *
+ * DEPRECATED! Prints a warning, then treats the clock= boot argument
+ * exactly like clocksource= by calling boot_override_clocksource()
+ */
+static int __init boot_override_clock(char* str)
+{
+	printk("Warning! clock= boot option is deprecated.\n");
+
+	return boot_override_clocksource(str);
+}
+
+__setup("clock=", boot_override_clock);
Index: linux/kernel/time/jiffies.c
===================================================================
--- /dev/null
+++ linux/kernel/time/jiffies.c
@@ -0,0 +1,75 @@
+/***********************************************************************
+* linux/kernel/time/jiffies.c
+*
+* This file contains the jiffies based clocksource.
+*
+* Copyright (C) 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+*
+* This program is free software; you can redistribute it and/or modify
+* it under the terms of the GNU General Public License as published by
+* the Free Software Foundation; either version 2 of the License, or
+* (at your option) any later version.
+*
+* This program is distributed in the hope that it will be useful,
+* but WITHOUT ANY WARRANTY; without even the implied warranty of
+* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+* GNU General Public License for more details.
+*
+* You should have received a copy of the GNU General Public License
+* along with this program; if not, write to the Free Software
+* Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+*
+************************************************************************/
+#include <linux/clocksource.h>
+#include <linux/jiffies.h>
+#include <linux/init.h>
+
+/* The Jiffies based clocksource is the lowest common
+ * denominator clock source which should function on
+ * all systems. It has the same coarse resolution as
+ * the timer interrupt frequency HZ and it suffers
+ * inaccuracies caused by missed or lost timer
+ * interrupts and the inability for the timer
+ * interrupt hardware to accurately tick at the
+ * requested HZ value. It is also not recommended
+ * for "tick-less" systems.
+ */
+#define NSEC_PER_JIFFY	((u32)((((u64)NSEC_PER_SEC)<<8)/ACTHZ))
+
+/* Since jiffies uses a simple NSEC_PER_JIFFY multiplier
+ * conversion, the .shift value could be zero. However
+ * this would make NTP adjustments impossible as they are
+ * in units of 1/2^.shift. Thus we use JIFFIES_SHIFT to
+ * shift both the numerator and denominator the same
+ * amount, and give ntp adjustments in units of 1/2^8
+ *
+ * The value 8 is somewhat carefully chosen, as anything
+ * larger can result in overflows. NSEC_PER_JIFFY grows as
+ * HZ shrinks, so values greater than 8 overflow 32bits when
+ * HZ=100.
+ */
+#define JIFFIES_SHIFT	8
+
+static cycle_t jiffies_read(void)
+{
+	return (cycle_t) get_jiffies_64();
+}
+
+struct clocksource clocksource_jiffies = {
+	.name		= "jiffies",
+	.rating		= 0, /* lowest rating*/
+	.read		= jiffies_read,
+	.mask		= (cycle_t)-1,
+	.mult		= NSEC_PER_JIFFY << JIFFIES_SHIFT, /* details above */
+	.shift		= JIFFIES_SHIFT,
+	.is_continuous	= 0, /* tick based, not free running */
+};
+
+static int __init init_jiffies_clocksource(void)
+{
+	register_clocksource(&clocksource_jiffies);
+
+	return 0;
+}
+
+module_init(init_jiffies_clocksource);
Index: linux/kernel/time/timeofday.c
===================================================================
--- /dev/null
+++ linux/kernel/time/timeofday.c
@@ -0,0 +1,810 @@
+/*
+ * linux/kernel/time/timeofday.c
+ *
+ * This file contains the functions which access and manage
+ * the system's time of day functionality.
+ *
+ * Copyright (C) 2003, 2004, 2005 IBM, John Stultz (johnstul@us.ibm.com)
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ *
+ * TODO WishList:
+ *   o See XXX's below.
+ */
+
+#include <linux/clocksource.h>
+#include <linux/timeofday.h>
+#include <linux/jiffies.h>
+#include <linux/sysdev.h>
+#include <linux/ktimer.h>
+#include <linux/module.h>
+#include <linux/ktime.h>
+#include <linux/timex.h>
+#include <linux/sched.h>
+
+#include <asm/timeofday.h>
+
+/* Periodic hook interval */
+#define PERIODIC_INTERVAL_MS 50
+
+/* [ktime_t based variables]
+ * system_time:
+ *	Monotonically increasing counter of the number of nanoseconds
+ *	since boot.
+ * wall_time_offset:
+ *	Offset added to system_time to provide accurate time-of-day
+ */
+static ktime_t system_time;
+static ktime_t wall_time_offset;
+
+/* [timespec based variables]
+ * These variables mirror the ktime_t based variables to avoid
+ * performance issues in the userspace syscall paths.
+ *
+ * wall_time_ts:
+ *	timespec holding the current wall time.
+ * mono_time_ts:
+ *	timespec holding the current monotonic time.
+ * monotonic_time_offset_ts:
+ *	timespec holding the difference between wall and monotonic time.
+ */
+static struct timespec wall_time_ts;
+static struct timespec mono_time_ts;
+static struct timespec monotonic_time_offset_ts;
+
+/* [cycle based variables]
+ * cycle_last:
+ *	Value of the clocksource at the last timeofday_periodic_hook()
+ *	(adjusted only minorly to account for rounded off cycles)
+ */
+static cycle_t cycle_last;
+
+/* [clocksource_interval variables]
+ * ts_interval:
+ *	This clocksource_interval is used in the fixed interval
+ *	cycles to nanosecond calculation.
+ * INTERVAL_LEN:
+ *	This constant is the requested fixed interval period
+ *	in nanoseconds.
+ */
+struct clocksource_interval ts_interval;
+#define INTERVAL_LEN ((PERIODIC_INTERVAL_MS-1)*1000000)
+
+/* [clocksource data]
+ * clock:
+ *	current clocksource pointer
+ */
+static struct clocksource *clock;
+
+/* [NTP adjustment]
+ * ntp_adj:
+ *	value of the current ntp adjustment, stored in
+ *	clocksource multiplier units.
+ */
+int ntp_adj;
+
+/* [locks]
+ * system_time_lock:
+ *	generic lock for all locally scoped time values
+ */
+static DECLARE_RAW_SEQLOCK(system_time_lock);
+
+
+/* [suspend/resume info]
+ * time_suspend_state:
+ *	variable that keeps track of suspend state
+ * suspend_start:
+ *	start of the suspend call
+ */
+static enum {
+	TIME_RUNNING,
+	TIME_SUSPENDED
+} time_suspend_state = TIME_RUNNING;
+
+static nsec_t suspend_start;
+
+/* [Soft-Timers]
+ * timeofday_timer:
+ *	soft-timer used to call timeofday_periodic_hook()
+ */
+struct ktimer timeofday_timer;
+
+
+#ifdef CONFIG_PARANOID_GENERIC_TIME
+/* This will hurt performance! */
+static DEFINE_RAW_SPINLOCK(check_monotonic_lock);
+static ktime_t last_monotonic_ktime;
+
+static ktime_t get_check_value(void)
+{
+	unsigned long flags;
+	ktime_t ret;
+
+	spin_lock_irqsave(&check_monotonic_lock, flags);
+	ret = last_monotonic_ktime;	/* value stored by check_monotonic_clock() */
+	spin_unlock_irqrestore(&check_monotonic_lock, flags);
+
+	return ret;
+}
+
+static void check_monotonic_clock(ktime_t prev, ktime_t now)
+{
+	unsigned long flags;
+
+	/* warn (only once) if @now is earlier than @prev */
+	if (ktime_cmp(now, <, prev)) {
+		static int warn = 1;
+
+		if (warn) {
+			warn = 0;
+
+			printk("check_monotonic_clock: monotonic inconsistency"
+					" detected!\n");
+			printk("	from %16Lx (%llu) to %16Lx (%llu).\n",
+					ktime_to_ns(prev),
+					ktime_to_ns(prev),
+					ktime_to_ns(now),
+					ktime_to_ns(now));
+			WARN_ON(1);
+		}
+	}
+	spin_lock_irqsave(&check_monotonic_lock, flags);
+	last_monotonic_ktime = now;	/* read back via get_check_value() */
+	spin_unlock_irqrestore(&check_monotonic_lock, flags);
+}
+
+/* timespec version: converts @now to ktime_t before checking */
+#define check_monotonic_clock_ts(prev, now) \
+	check_monotonic_clock(prev, timespec_to_ktime(now))
+
+/* Call holding at least a read lock on system_time_lock */
+void verify_timekeeping_state(void)
+{
+	/* ensure all the timespec and ktime values are consistent: */
+	WARN_ON_ONCE(ktime_cmp(system_time, !=,
+			timespec_to_ktime(mono_time_ts)));
+	WARN_ON_ONCE(ktime_cmp(ktime_add(system_time, wall_time_offset), !=,
+			timespec_to_ktime(wall_time_ts)));
+	WARN_ON_ONCE(ktime_cmp(wall_time_offset, !=,
+			timespec_to_ktime(monotonic_time_offset_ts)));
+}
+
+static void check_periodic_interval(cycle_t now)
+{
+	static cycle_t last;
+
+	cycle_t delta;
+	nsec_t ns_offset;
+
+	if (last != 0 && now != 0) {	/* a 0 sample skips the check */
+		delta = (now - last)& clock->mask;
+
+		ns_offset = cyc2ns(clock, ntp_adj, delta);
+
+		if (ns_offset > (nsec_t)2*PERIODIC_INTERVAL_MS *1000000) {	/* over 2 periods */
+			static int warn_count = 1;
+			if (warn_count > 0) {
+				warn_count--;
+				printk("check_periodic_interval: Long interval! %llu.\n",
+								ns_offset);
+				printk("		Something may be blocking interrupts.\n");
+			}
+		}
+		if (ns_offset < (nsec_t)PERIODIC_INTERVAL_MS *1000000) {	/* under 1 period */
+			static int warn_count = 1;
+			if (warn_count > 0) {
+				warn_count--;
+				printk("check_periodic_interval: short interval! %llu.\n",
+								ns_offset);
+				printk("		bad calibration or ktimers may be broken.\n");
+			}
+		}
+	}
+	last = now;
+}
+
+#else /* CONFIG_PARANOID_GENERIC_TIME */
+  /* Stubs used when the checks are off. XXX can we optimize this out? */
+# define get_check_value(void)		ktime_set(0,0)
+# define check_monotonic_clock(x,y)	do { } while (0)
+# define check_monotonic_clock_ts(x,ts)	do { } while (0)
+# define verify_timekeeping_state()	do { } while (0)
+# define check_periodic_interval(x)	do { } while (0)
+#endif /* CONFIG_PARANOID_GENERIC_TIME */
+
+/**
+ * update_legacy_time_values - sync legacy time values
+ *
+ * Mirrors wall_time_ts and the negated wall/monotonic offset into the
+ * legacy xtime/wall_to_monotonic (under xtime_lock) and notifies the
+ * vsyscall code. Can go away once all legacy users are converted.
+ *
+ * system_time_lock must be held by the caller
+ */
+static void update_legacy_time_values(void)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&xtime_lock, flags);
+
+	xtime = wall_time_ts;
+	set_normalized_timespec(&wall_to_monotonic,
+		-monotonic_time_offset_ts.tv_sec,
+		-monotonic_time_offset_ts.tv_nsec);
+
+	write_sequnlock_irqrestore(&xtime_lock, flags);
+
+	/* since time state has changed, notify vsyscall code */
+	arch_update_vsyscall_gtod(wall_time_ts, cycle_last, clock, ntp_adj);
+}
+
+/**
+ * __get_nsec_offset - Returns nanoseconds since cycle_last was updated
+ *
+ * Private function, system_time_lock must be held when calling.
+ * Returns the nanoseconds elapsed since cycle_last was last updated
+ * (scaled by ntp_adj), plus the arch-specific arch_getoffset().
+ */
+static inline nsec_t __get_nsec_offset(void)
+{
+	cycle_t cycle_now, cycle_delta;
+	nsec_t ns_offset;
+
+	/* read clocksource: */
+	cycle_now = read_clocksource(clock);
+
+	/* calculate the delta since cycle_last, wrapped by the clock mask: */
+	cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+	/* convert to nanoseconds: */
+	ns_offset = cyc2ns(clock, ntp_adj, cycle_delta);
+
+	/*
+	 * special case for jiffies tick/offset based systems,
+	 * add arch-specific offset:
+	 */
+	ns_offset += arch_getoffset();
+
+	return ns_offset;
+}
+
+/**
+ * __get_monotonic_clock - Returns monotonically increasing nanoseconds
+ *
+ * Private function, system_time_lock must be held when calling.
+ * Returns system_time plus the current __get_nsec_offset(); with
+ * CONFIG_PARANOID_GENERIC_TIME the result is checked for going backwards.
+ */
+static inline ktime_t __get_monotonic_clock(void)
+{
+	nsec_t offset = __get_nsec_offset();
+#ifdef CONFIG_PARANOID_GENERIC_TIME
+	ktime_t check = get_check_value();
+#endif
+	ktime_t ret;
+	
+	ret = ktime_add_ns(system_time, offset);
+	check_monotonic_clock(check,ret);
+
+	return ret;
+}
+
+/**
+ * get_monotonic_clock - Returns monotonic time in ktime_t format
+ *
+ * Returns the monotonically increasing number of nanoseconds
+ * since the system booted via __get_monotonic_clock()
+ */
+ktime_t get_monotonic_clock(void)
+{
+	unsigned long seq;
+	ktime_t ret;
+
+	/* retry until __get_monotonic_clock() was read without a writer: */
+	do {
+		seq = read_seqbegin(&system_time_lock);
+
+		ret = __get_monotonic_clock();
+
+	} while (read_seqretry(&system_time_lock, seq));
+
+	return ret;
+}
+
+EXPORT_SYMBOL_GPL(get_monotonic_clock);
+
+/**
+ * get_realtime_clock - Returns the timeofday in ktime_t format
+ *
+ * Returns the wall time in ktime_t format: the monotonic clock
+ * plus wall_time_offset. The resolution is nanoseconds
+ */
+ktime_t get_realtime_clock(void)
+{
+	unsigned long seq;
+	ktime_t ret;
+
+	/* atomically read the monotonic clock and add wall_time_offset */
+	do {
+		seq = read_seqbegin(&system_time_lock);
+
+		ret = __get_monotonic_clock();
+		ret = ktime_add(ret, wall_time_offset);
+
+	} while (read_seqretry(&system_time_lock, seq));
+
+	return ret;
+}
+
+/**
+ * get_realtime_offset - Returns the offset of realtime clock
+ *
+ * Returns the number of nanoseconds in ktime_t storage format which
+ * represents the offset of the realtime clock to the monotonic clock
+ */
+ktime_t get_realtime_offset(void)
+{
+	unsigned long seq;
+	ktime_t ret;
+
+	/* atomically read wall_time_offset */
+	do {
+		seq = read_seqbegin(&system_time_lock);
+
+		ret = wall_time_offset;
+
+	} while (read_seqretry(&system_time_lock, seq));
+
+	return ret;
+}
+
+/**
+ * get_monotonic_clock_ts - Returns monotonic time in timespec format
+ *
+ * @ts:		pointer to the timespec to be set
+ *
+ * Stores mono_time_ts plus the current nanosecond offset in the
+ * timespec variable pointed to by @ts; there is no return value
+ */
+void get_monotonic_clock_ts(struct timespec *ts)
+{
+#ifdef CONFIG_PARANOID_GENERIC_TIME
+	ktime_t check = get_check_value();
+#endif
+	unsigned long seq;
+	nsec_t offset;
+
+	do {
+		seq = read_seqbegin(&system_time_lock);
+
+		*ts = mono_time_ts;
+		offset = __get_nsec_offset();
+	} while (read_seqretry(&system_time_lock, seq));
+
+	timespec_add_ns(ts, offset);
+	check_monotonic_clock_ts(check, *ts);
+}
+
+/**
+ * __get_realtime_clock_ts - Returns the time of day in a timespec
+ *
+ * @ts:		pointer to the timespec to be set
+ *
+ * Stores wall_time_ts plus the current nanosecond offset in @ts.
+ * Used by do_gettimeofday() and get_realtime_clock_ts().
+ *
+ */
+static inline void __get_realtime_clock_ts(struct timespec *ts)
+{
+	unsigned long seq;
+	nsec_t nsecs;
+
+	do {
+		seq = read_seqbegin(&system_time_lock);
+
+		*ts = wall_time_ts;
+		nsecs = __get_nsec_offset();
+
+	} while (read_seqretry(&system_time_lock, seq));
+
+	timespec_add_ns(ts, nsecs);
+}
+
+/**
+ * get_realtime_clock_ts - Returns the time of day in a timespec
+ * @ts:		pointer to the timespec to be set
+ *
+ * Exported wrapper around __get_realtime_clock_ts(); fills in @ts.
+ */
+void get_realtime_clock_ts(struct timespec *ts)
+{
+	__get_realtime_clock_ts(ts);
+}
+
+EXPORT_SYMBOL(get_realtime_clock_ts);
+
+
+/**
+ * do_gettimeofday - Returns the time of day in a timeval
+ * @tv:		pointer to the timeval to be set
+ *
+ * NOTE: Users should be converted to using get_realtime_clock_ts()
+ */
+void do_gettimeofday(struct timeval *tv)
+{
+	struct timespec now;
+
+	__get_realtime_clock_ts(&now);
+	tv->tv_sec = now.tv_sec;
+	tv->tv_usec = now.tv_nsec/1000;	/* nanoseconds -> microseconds */
+}
+
+EXPORT_SYMBOL(do_gettimeofday);
+
+/**
+ * do_settimeofday - Sets the time of day
+ * @tv:		pointer to the timespec variable containing the new time
+ * NOTE(review): @tv->tv_nsec is not range-checked here - confirm callers do.
+ * Sets the new wall time, clears NTP state, notifies ktimers; returns 0.
+ */
+int do_settimeofday(struct timespec *tv)
+{
+	unsigned long flags;
+	ktime_t newtime;
+
+	newtime = timespec_to_ktime(*tv);
+
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	/* calculate the new offset from the monotonic clock */
+	wall_time_offset = ktime_sub(newtime, __get_monotonic_clock());
+
+	/* update the internal timespec variables */
+	ktime_to_timespec(&wall_time_ts,
+				ktime_add(system_time, wall_time_offset));
+	ktime_to_timespec(&monotonic_time_offset_ts, wall_time_offset);
+
+	ntp_clear();
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+#ifdef CONFIG_PARANOID_GENERIC_TIME
+	printk("do_settimeofday() was called!\n");
+#endif
+	/* signal ktimers about time change */
+	clock_was_set();
+
+	return 0;
+}
+
+EXPORT_SYMBOL(do_settimeofday);
+
+/**
+ * __increment_system_time - Increments system time
+ * @delta:	nanosecond delta to add to the time variables
+ *
+ * Private helper that adds @delta to system_time and to the
+ * wall_time_ts and mono_time_ts timespec mirrors.
+ */
+static inline void __increment_system_time(nsec_t delta)
+{
+	system_time = ktime_add_ns(system_time, delta);
+	timespec_add_ns(&wall_time_ts, delta);
+	timespec_add_ns(&mono_time_ts, delta);
+}
+
+/**
+ * timeofday_suspend_hook - allows the timeofday subsystem to be shut down
+ * @dev:	unused
+ * @state:	unused
+ *
+ * Records the persistent clock value and folds the pending offset into
+ * system time before suspend/hibernate. Always returns 0.
+ */
+static int timeofday_suspend_hook(struct sys_device *dev, pm_message_t state)
+{
+	unsigned long flags;
+
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	BUG_ON(time_suspend_state != TIME_RUNNING);
+
+	/*
+	 * First off, save suspend start time
+	 * then quickly accumulate the current nsec offset.
+	 * These two calls hopefully occur quickly
+	 * because the difference between reads will
+	 * accumulate as time drift on resume.
+	 */
+	suspend_start = read_persistent_clock();
+	__increment_system_time(__get_nsec_offset());
+
+	time_suspend_state = TIME_SUSPENDED;
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	return 0;
+}
+
+/**
+ * timeofday_resume_hook - Resumes the timeofday subsystem.
+ * @dev:	unused
+ *
+ * Adds the time spent suspended (per the persistent clock) to system
+ * time after a previous timeofday_suspend_hook(). Always returns 0.
+ */
+static int timeofday_resume_hook(struct sys_device *dev)
+{
+	nsec_t suspend_end, suspend_time;
+	unsigned long flags;
+
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	BUG_ON(time_suspend_state != TIME_SUSPENDED);
+
+	/*
+	 * Read persistent clock to mark the end of
+	 * the suspend interval then rebase the
+	 * cycle_last to current clocksource value.
+	 * Again, time between these two calls will
+	 * not be accounted for and will show up as
+	 * time drift.
+	 */
+	suspend_end = read_persistent_clock();
+	cycle_last = read_clocksource(clock);
+
+	/* calculate suspend time and add it to system time: */
+	suspend_time = suspend_end - suspend_start;
+	__increment_system_time(suspend_time);
+
+	ntp_clear();
+
+	time_suspend_state = TIME_RUNNING;
+
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	/* inform ktimers about time change: */
+	clock_was_set();
+
+	return 0;
+}
+
+/* sysdev class and device that route suspend/resume to the hooks above */
+static struct sysdev_class timeofday_sysclass = {
+	.resume		= timeofday_resume_hook,
+	.suspend	= timeofday_suspend_hook,
+	set_kset_name("timeofday"),
+};
+
+static struct sys_device device_timer = {
+	.id		= 0,
+	.cls		= &timeofday_sysclass,
+};
+
+static int timeofday_init_device(void)
+{
+	int error = sysdev_class_register(&timeofday_sysclass);
+
+	if (!error)	/* register the device only if the class registered */
+		error = sysdev_register(&device_timer);
+
+	return error;
+}
+
+device_initcall(timeofday_init_device);
+
+
+/**
+ * timeofday_periodic_hook - Does periodic update of timekeeping values.
+ * @unused:	unused value
+ *
+ * Called via timeofday_timer. Accumulates the cycles elapsed since the
+ * last call into system time, advances the NTP state, switches
+ * clocksources and recomputes the NTP multiplier adjustment when needed,
+ * then re-arms timeofday_timer for the next interval.
+ */
+static void timeofday_periodic_hook(void* unused)
+{
+	unsigned long flags;
+
+	cycle_t cycle_now, cycle_delta;
+	nsec_t delta_nsec;
+	static u64 remainder;
+
+	long leapsecond;
+	struct clocksource* next;
+
+	int ppm;
+	static int ppm_last;
+
+	int something_changed = 0, clocksource_changed = 0;
+	struct clocksource old_clock;
+	static nsec_t second_check;
+	ktime_t expire_time;
+
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	/* read time source & calc time since last call: */
+	cycle_now = read_clocksource(clock);
+	check_periodic_interval(cycle_now);
+	cycle_delta = (cycle_now - cycle_last) & clock->mask;
+
+	delta_nsec = cyc2ns_fixed_rem(ts_interval, &cycle_delta, &remainder);
+	cycle_last = (cycle_now - cycle_delta)&clock->mask;
+
+	/* update system_time:  */
+	__increment_system_time(delta_nsec);
+
+	/* advance the ntp state machine by ns interval: */
+	ntp_advance(delta_nsec);
+
+	/* only do leap second and persistent clock sync once a sec: */
+	second_check += delta_nsec;
+	if (second_check >= NSEC_PER_SEC) {
+		/* do ntp leap second processing: */
+		leapsecond = ntp_leapsecond(wall_time_ts);
+		if (leapsecond) {
+			wall_time_offset = ktime_add_ns(wall_time_offset,
+						leapsecond * NSEC_PER_SEC);
+			wall_time_ts.tv_sec += leapsecond;
+			monotonic_time_offset_ts.tv_sec += leapsecond;
+		}
+		/* sync the persistent clock: */
+		if (ntp_synced())
+			sync_persistent_clock(wall_time_ts);
+		second_check -= NSEC_PER_SEC;
+	}
+
+	/* if necessary, switch clocksources: */
+	next = get_next_clocksource();
+	if (next != clock) {
+		/* immediately set new cycle_last: */
+		cycle_last = read_clocksource(next);
+		/* update cycle_now to avoid problems in accumulation later: */
+		cycle_now = cycle_last;
+		/* swap clocksources, keeping a copy of the old one: */
+		old_clock = *clock;
+		clock = next;
+		printk(KERN_INFO "Time: %s clocksource has been installed.\n",
+					clock->name);
+		ntp_clear();
+		ntp_adj = 0;
+		remainder = 0;
+		something_changed = 1;
+		clocksource_changed = 1;
+		check_periodic_interval(0);
+	}
+
+	/*
+	 * now is a safe time, so allow clocksource to adjust
+	 * itself (for example: to make cpufreq changes):
+	 */
+	if (clock->update_callback) {
+		/* since clocksource state might change,
+		 * keep a copy, but only if we've not
+		 * already changed timesources:
+		 */
+		if (!something_changed)
+			old_clock = *clock;
+		if (clock->update_callback()) {
+			remainder = 0;
+			something_changed = 1;
+		}
+	}
+
+	/* check for new PPM adjustment: */
+	ppm = ntp_get_ppm_adjustment();
+	if (ppm_last != ppm) {
+		/* make sure old_clock is set: */
+		if (!something_changed)
+			old_clock = *clock;
+		something_changed = 1;
+	}
+
+	/* if something changed, recalculate the ntp adjustment value: */
+	if (something_changed) {
+		/* accumulate leftover cycles using old_clock and current ntp_adj: */
+		if (cycle_delta) {
+			delta_nsec = cyc2ns_rem(&old_clock, ntp_adj,
+						cycle_delta, &remainder);
+			cycle_last = cycle_now;
+			__increment_system_time(delta_nsec);
+			ntp_advance(delta_nsec);
+		}
+
+		/* recalculate the ntp adjustment and fixed interval values: */
+		ppm_last = ppm;
+		ntp_adj = ppm_to_mult_adj(clock, ppm);
+		ts_interval = calculate_clocksource_interval(clock, ntp_adj,
+					INTERVAL_LEN);
+	}
+
+	update_legacy_time_values();
+
+	verify_timekeeping_state();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	if (clocksource_changed)
+		ktimer_clock_notify();
+
+	/* set us up to go off on the next interval: */
+	expire_time = ktime_set(0, PERIODIC_INTERVAL_MS*1000000);
+	ktimer_start(&timeofday_timer, &expire_time, KTIMER_REL);
+}
+
+/**
+ * timeofday_is_continuous - returns the current clocksource's is_continuous
+ */
+int timeofday_is_continuous(void)
+{
+	unsigned long seq;
+	int ret;
+
+	do {
+		seq = read_seqbegin(&system_time_lock);
+
+		ret = clock->is_continuous;
+
+	} while (read_seqretry(&system_time_lock, seq));
+
+	return ret;
+}
+
+/**
+ * timeofday_init - Initializes time variables and starts timeofday_timer
+ */
+void __init timeofday_init(void)
+{
+	unsigned long flags;
+	ktime_t expire_time;
+
+	write_seqlock_irqsave(&system_time_lock, flags);
+
+	/* initialize the clock variable: */
+	clock = get_next_clocksource();
+
+	/* initialize cycle_last offset base: */
+	cycle_last = read_clocksource(clock);
+
+	/* initialize wall_time_offset to now: */
+	/* XXX - this should be something like ns_to_ktime() */
+	wall_time_offset = ktime_add_ns(wall_time_offset,
+					read_persistent_clock());
+
+	/* initialize timespec values: */
+	ktime_to_timespec(&wall_time_ts,
+				ktime_add(system_time, wall_time_offset));
+	ktime_to_timespec(&monotonic_time_offset_ts, wall_time_offset);
+
+	/* clear NTP scaling factor & state machine: */
+	ntp_adj = 0;
+	ntp_clear();
+	ts_interval = calculate_clocksource_interval(clock, ntp_adj,
+				INTERVAL_LEN);
+
+	/* initialize legacy time values: */
+	update_legacy_time_values();
+
+	write_sequnlock_irqrestore(&system_time_lock, flags);
+
+	/* install timeofday_periodic_hook timer and arm its first expiry: */
+	ktimer_init(&timeofday_timer);
+	expire_time = ktime_set(0, PERIODIC_INTERVAL_MS*1000000);
+	timeofday_timer.function = timeofday_periodic_hook;
+	ktimer_start(&timeofday_timer, &expire_time, KTIMER_REL);
+}
Index: linux/kernel/timer.c
===================================================================
--- linux.orig/kernel/timer.c
+++ linux/kernel/timer.c
@@ -28,11 +28,12 @@
 #include <linux/swap.h>
 #include <linux/notifier.h>
 #include <linux/thread_info.h>
-#include <linux/time.h>
+#include <linux/timeofday.h>
 #include <linux/jiffies.h>
 #include <linux/posix-timers.h>
 #include <linux/cpu.h>
 #include <linux/syscalls.h>
+#include <linux/kallsyms.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -46,6 +47,10 @@ static void time_interpolator_update(lon
 #define time_interpolator_update(x)
 #endif
 
+u64 jiffies_64 __cacheline_aligned_in_smp = INITIAL_JIFFIES;
+
+EXPORT_SYMBOL(jiffies_64);
+
 /*
  * per-CPU timer vector definitions:
  */
@@ -60,6 +65,7 @@ static void time_interpolator_update(lon
 struct timer_base_s {
 	spinlock_t lock;
 	struct timer_list *running_timer;
+	wait_queue_head_t wait_for_running_timer;
 };
 
 typedef struct tvec_s {
@@ -86,7 +92,7 @@ static DEFINE_PER_CPU(tvec_base_t, tvec_
 static inline void set_running_timer(tvec_base_t *base,
 					struct timer_list *timer)
 {
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
 	base->t_base.running_timer = timer;
 #endif
 }
@@ -162,8 +168,7 @@ typedef struct timer_base_s timer_base_t
  * Used by TIMER_INITIALIZER, we can't use per_cpu(tvec_bases)
  * at compile time, and we need timer->base to lock the timer.
  */
-timer_base_t __init_timer_base
-	____cacheline_aligned_in_smp = { .lock = SPIN_LOCK_UNLOCKED };
+timer_base_t __init_timer_base ____cacheline_aligned_in_smp;
 EXPORT_SYMBOL(__init_timer_base);
 
 /***
@@ -227,7 +232,7 @@ int __mod_timer(struct timer_list *timer
 	timer_base_t *base;
 	tvec_base_t *new_base;
 	unsigned long flags;
-	int ret = 0;
+	int ret = 0, cpu;
 
 	BUG_ON(!timer->function);
 	check_timer(timer);
@@ -238,8 +243,8 @@ int __mod_timer(struct timer_list *timer
 		detach_timer(timer, 0);
 		ret = 1;
 	}
-
-	new_base = &__get_cpu_var(tvec_bases);
+	cpu = raw_smp_processor_id();
+	new_base = &per_cpu(tvec_bases, cpu);
 
 	if (base != &new_base->t_base) {
 		/*
@@ -292,6 +297,20 @@ void add_timer_on(struct timer_list *tim
 	spin_unlock_irqrestore(&base->t_base.lock, flags);
 }
 
+/*
+ * Wait until @timer is no longer the running timer of its base
+ */
+void wait_for_running_timer(struct timer_list *timer)
+{
+	timer_base_t *base;
+	check_timer(timer);
+
+	base = timer->base;
+	if (base->running_timer == timer) {
+		wait_event(base->wait_for_running_timer,
+			   base->running_timer != timer);
+	}
+}
 
 /***
  * mod_timer - modify a timer's timeout
@@ -364,7 +383,34 @@ int del_timer(struct timer_list *timer)
 
 EXPORT_SYMBOL(del_timer);
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_SOFTIRQS)
+/*
+ * This function checks whether a timer is pending, provided it is not
+ * the running timer of its base. Returns -1 if the timer is currently
+ * running, otherwise 1 if the timer is pending and 0 if it is not.
+ *
+ * It must not be called from interrupt contexts.
+ */
+int timer_pending_sync(struct timer_list *timer)
+{
+	timer_base_t *base;
+	unsigned long flags;
+	int ret = -1;
+
+	base = lock_timer_base(timer, &flags);
+
+	if (base->running_timer == timer)
+		goto out;
+
+	ret = 0;
+	if (timer_pending(timer))
+		ret = 1;
+out:
+	spin_unlock_irqrestore(&base->lock, flags);
+
+	return ret;
+}
+
 /*
  * This function tries to deactivate a timer. Upon successful (ret >= 0)
  * exit the timer is not queued and the handler is not running on any CPU.
@@ -393,6 +439,7 @@ out:
 	return ret;
 }
 
+
 /***
  * del_timer_sync - deactivate a timer and wait for the handler to finish.
  * @timer: the timer to be deactivated
@@ -418,6 +465,7 @@ int del_timer_sync(struct timer_list *ti
 		int ret = try_to_del_timer_sync(timer);
 		if (ret >= 0)
 			return ret;
+		wait_for_running_timer(timer);
 	}
 }
 
@@ -465,8 +513,21 @@ static inline void __run_timers(tvec_bas
 	while (time_after_eq(jiffies, base->timer_jiffies)) {
 		struct list_head work_list = LIST_HEAD_INIT(work_list);
 		struct list_head *head = &work_list;
- 		int index = base->timer_jiffies & TVR_MASK;
- 
+		int index = base->timer_jiffies & TVR_MASK;
+
+		if (softirq_need_resched()) {
+			spin_unlock_irq(&base->t_base.lock);
+			wake_up(&base->t_base.wait_for_running_timer);
+			cond_resched_all();
+			cpu_relax();
+			spin_lock_irq(&base->t_base.lock);
+			/*
+			 * We can simply continue after preemption, nobody
+			 * else can touch timer_jiffies so 'index' is still
+			 * valid. Any new jiffy will be taken care of in
+			 * subsequent loops:
+			 */
+		}
 		/*
 		 * Cascade timers:
 		 */
@@ -492,19 +553,18 @@ static inline void __run_timers(tvec_bas
 				int preempt_count = preempt_count();
 				fn(data);
 				if (preempt_count != preempt_count()) {
-					printk(KERN_WARNING "huh, entered %p "
-					       "with preempt_count %08x, exited"
-					       " with %08x?\n",
-					       fn, preempt_count,
-					       preempt_count());
-					BUG();
+					print_symbol("BUG: unbalanced timer-handler preempt count in %s!\n", (unsigned long) fn);
+					printk("entered with %08x, exited with %08x.\n", preempt_count, preempt_count());
+					preempt_count() = preempt_count;
 				}
 			}
+			set_running_timer(base, NULL);
+			cond_resched_all();
 			spin_lock_irq(&base->t_base.lock);
 		}
 	}
-	set_running_timer(base, NULL);
 	spin_unlock_irq(&base->t_base.lock);
+	wake_up(&base->t_base.wait_for_running_timer);
 }
 
 #ifdef CONFIG_NO_IDLE_HZ
@@ -613,13 +673,96 @@ long time_tolerance = MAXFREQ;		/* frequ
 long time_precision = 1;		/* clock precision (us)		*/
 long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
 long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-static long time_phase;			/* phase offset (scaled us)	*/
 long time_freq = (((NSEC_PER_SEC + HZ/2) % HZ - HZ/2) << SHIFT_USEC) / NSEC_PER_USEC;
 					/* frequency offset (scaled ppm)*/
 static long time_adj;			/* tick adjust (scaled 1 / HZ)	*/
 long time_reftime;			/* time at last adjustment (s)	*/
 long time_adjust;
 long time_next_adjust;
+long time_adjust_step;			/* per tick time_adjust step */
+
+long total_sppm;			/* shifted ppm sum of all adjustments */
+long offset_adj_ppm;
+long tick_adj_ppm;
+long singleshot_adj_ppm;
+
+#define MAX_SINGLESHOT_ADJ	500	/* (ppm) */
+#define SEC_PER_DAY		86400
+#define END_OF_DAY(x)		((x) + SEC_PER_DAY - ((x) % SEC_PER_DAY) - 1)
+
+/* NTP lock, protects NTP state machine */
+DECLARE_RAW_SEQLOCK(ntp_lock);
+
+/**
+ * ntp_leapsecond - NTP leapsecond processing code.
+ * @now:	the current time
+ *
+ * Returns the number of seconds (-1, 0, or 1) that
+ * should be added to the current time to properly
+ * adjust for leapseconds.
+ */
+int ntp_leapsecond(struct timespec now)
+{
+	/*
+	 * Leap second processing. If in leap-insert state at
+	 * the end of the day, the system clock is set back one
+	 * second; if in leap-delete state, the system clock is
+	 * set ahead one second.
+	 */
+	static time_t leaptime = 0;
+
+	unsigned long flags;
+	int ret = 0;
+
+	write_seqlock_irqsave(&ntp_lock, flags);
+
+	switch (time_state) {
+
+	case TIME_OK:
+		if (time_status & STA_INS) {
+			time_state = TIME_INS;
+			leaptime = END_OF_DAY(now.tv_sec);
+		} else if (time_status & STA_DEL) {
+			time_state = TIME_DEL;
+			leaptime = END_OF_DAY(now.tv_sec);
+		}
+		break;
+
+	case TIME_INS:
+		/* Once we are at (or past) leaptime, insert the second */
+		if (now.tv_sec >= leaptime) {
+			time_state = TIME_OOP;
+			printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
+			ret = -1;
+		}
+		break;
+
+	case TIME_DEL:
+		/* Once we are at (or past) leaptime, delete the second */
+		if (now.tv_sec >= leaptime) {
+			time_state = TIME_WAIT;
+			printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
+			ret = 1;
+		}
+		break;
+
+	case TIME_OOP:
+		/* The leap second was inserted by the previous call; the
+		 * out-of-phase state always ends on the next call.
+		 */
+		time_state = TIME_WAIT;
+		break;
+
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+			time_state = TIME_OK;
+		break;
+	}
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+
+	return ret;
+}
 
 /*
  * this routine handles the overflow of the microsecond field
@@ -632,184 +775,250 @@ long time_next_adjust;
  */
 static void second_overflow(void)
 {
-    long ltemp;
+	long ltemp;
 
-    /* Bump the maxerror field */
-    time_maxerror += time_tolerance >> SHIFT_USEC;
-    if ( time_maxerror > NTP_PHASE_LIMIT ) {
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_status |= STA_UNSYNC;
-    }
-
-    /*
-     * Leap second processing. If in leap-insert state at
-     * the end of the day, the system clock is set back one
-     * second; if in leap-delete state, the system clock is
-     * set ahead one second. The microtime() routine or
-     * external clock driver will insure that reported time
-     * is always monotonic. The ugly divides should be
-     * replaced.
-     */
-    switch (time_state) {
-
-    case TIME_OK:
-	if (time_status & STA_INS)
-	    time_state = TIME_INS;
-	else if (time_status & STA_DEL)
-	    time_state = TIME_DEL;
-	break;
-
-    case TIME_INS:
-	if (xtime.tv_sec % 86400 == 0) {
-	    xtime.tv_sec--;
-	    wall_to_monotonic.tv_sec++;
-	    /* The timer interpolator will make time change gradually instead
-	     * of an immediate jump by one second.
-	     */
-	    time_interpolator_update(-NSEC_PER_SEC);
-	    time_state = TIME_OOP;
-	    clock_was_set();
-	    printk(KERN_NOTICE "Clock: inserting leap second 23:59:60 UTC\n");
-	}
-	break;
-
-    case TIME_DEL:
-	if ((xtime.tv_sec + 1) % 86400 == 0) {
-	    xtime.tv_sec++;
-	    wall_to_monotonic.tv_sec--;
-	    /* Use of time interpolator for a gradual change of time */
-	    time_interpolator_update(NSEC_PER_SEC);
-	    time_state = TIME_WAIT;
-	    clock_was_set();
-	    printk(KERN_NOTICE "Clock: deleting leap second 23:59:59 UTC\n");
-	}
-	break;
-
-    case TIME_OOP:
-	time_state = TIME_WAIT;
-	break;
-
-    case TIME_WAIT:
-	if (!(time_status & (STA_INS | STA_DEL)))
-	    time_state = TIME_OK;
-    }
-
-    /*
-     * Compute the phase adjustment for the next second. In
-     * PLL mode, the offset is reduced by a fixed factor
-     * times the time constant. In FLL mode the offset is
-     * used directly. In either mode, the maximum phase
-     * adjustment for each second is clamped so as to spread
-     * the adjustment over not more than the number of
-     * seconds between updates.
-     */
-    if (time_offset < 0) {
-	ltemp = -time_offset;
-	if (!(time_status & STA_FLL))
-	    ltemp >>= SHIFT_KG + time_constant;
-	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
-	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
-	time_offset += ltemp;
-	time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-    } else {
+	/* Bump the maxerror field (exactly once per elapsed second) */
+	time_maxerror += time_tolerance >> SHIFT_USEC;
+	if (time_maxerror > NTP_PHASE_LIMIT) {
+		time_maxerror = NTP_PHASE_LIMIT;
+		time_status |= STA_UNSYNC;
+	}
+
+	/*
+	 * Leap second processing. If in leap-insert state at the end of the
+	 * day, the system clock is set back one second; if in leap-delete
+	 * state, the system clock is set ahead one second. The microtime()
+	 * routine or external clock driver will insure that reported time is
+	 * always monotonic. The ugly divides should be replaced.
+	 */
+	switch (time_state) {
+	case TIME_OK:
+		if (time_status & STA_INS)
+			time_state = TIME_INS;
+		else if (time_status & STA_DEL)
+			time_state = TIME_DEL;
+		break;
+	case TIME_INS:
+		if (xtime.tv_sec % 86400 == 0) {
+			xtime.tv_sec--;
+			wall_to_monotonic.tv_sec++;
+			/*
+			 * The timer interpolator will make time change
+			 * gradually instead of an immediate jump by one second
+			 */
+			time_interpolator_update(-NSEC_PER_SEC);
+			time_state = TIME_OOP;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: inserting leap second "
+					"23:59:60 UTC\n");
+		}
+		break;
+	case TIME_DEL:
+		if ((xtime.tv_sec + 1) % 86400 == 0) {
+			xtime.tv_sec++;
+			wall_to_monotonic.tv_sec--;
+			/*
+			 * Use of time interpolator for a gradual change of
+			 * time
+			 */
+			time_interpolator_update(NSEC_PER_SEC);
+			time_state = TIME_WAIT;
+			clock_was_set();
+			printk(KERN_NOTICE "Clock: deleting leap second "
+					"23:59:59 UTC\n");
+		}
+		break;
+	case TIME_OOP:
+		time_state = TIME_WAIT;
+		break;
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+			time_state = TIME_OK;
+	}
+
+	/*
+	 * Compute the phase adjustment for the next second. In PLL mode, the
+	 * offset is reduced by a fixed factor times the time constant. In FLL
+	 * mode the offset is used directly. In either mode, the maximum phase
+	 * adjustment for each second is clamped so as to spread the adjustment
+	 * over not more than the number of seconds between updates.
+	 */
 	ltemp = time_offset;
 	if (!(time_status & STA_FLL))
-	    ltemp >>= SHIFT_KG + time_constant;
-	if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
-	    ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE;
+		ltemp = shift_right(ltemp, SHIFT_KG + time_constant);
+	ltemp = min(ltemp, (MAXPHASE / MINSEC) << SHIFT_UPDATE);
+	ltemp = max(ltemp, -(MAXPHASE / MINSEC) << SHIFT_UPDATE);
 	time_offset -= ltemp;
 	time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - SHIFT_UPDATE);
-    }
 
-    /*
-     * Compute the frequency estimate and additional phase
-     * adjustment due to frequency error for the next
-     * second. When the PPS signal is engaged, gnaw on the
-     * watchdog counter and update the frequency computed by
-     * the pll and the PPS signal.
-     */
-    pps_valid++;
-    if (pps_valid == PPS_VALID) {	/* PPS signal lost */
-	pps_jitter = MAXTIME;
-	pps_stabil = MAXFREQ;
-	time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
-			 STA_PPSWANDER | STA_PPSERROR);
-    }
-    ltemp = time_freq + pps_freq;
-    if (ltemp < 0)
-	time_adj -= -ltemp >>
-	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
-    else
-	time_adj += ltemp >>
-	    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+	offset_adj_ppm = shift_right(ltemp, SHIFT_UPDATE); /* ppm */
+
+	/* first calculate usec/user_tick offset: */
+	tick_adj_ppm = ((USEC_PER_SEC + USER_HZ/2)/USER_HZ) - tick_usec;
+	/* multiply by user_hz to get usec/sec => ppm: */
+	tick_adj_ppm *= USER_HZ;
+
+	/*
+	 * Compute the frequency estimate and additional phase adjustment due
+	 * to frequency error for the next second. When the PPS signal is
+	 * engaged, gnaw on the watchdog counter and update the frequency
+	 * computed by the pll and the PPS signal.
+	 */
+	pps_valid++;
+	if (pps_valid == PPS_VALID) {	/* PPS signal lost */
+		pps_jitter = MAXTIME;
+		pps_stabil = MAXFREQ;
+		time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+				STA_PPSWANDER | STA_PPSERROR);
+	}
+	ltemp = time_freq + pps_freq;
+	time_adj += shift_right(ltemp,(SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE));
 
 #if HZ == 100
-    /* Compensate for (HZ==100) != (1 << SHIFT_HZ).
-     * Add 25% and 3.125% to get 128.125; => only 0.125% error (p. 14)
-     */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 2) + (-time_adj >> 5);
-    else
-	time_adj += (time_adj >> 2) + (time_adj >> 5);
+	/*
+	 * Compensate for (HZ==100) != (1 << SHIFT_HZ).  Add 25% and 3.125% to
+	 * get 128.125; => only 0.125% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 2) + shift_right(time_adj, 5);
+#endif
+#if HZ == 250
+	/*
+	 * Compensate for (HZ==250) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 255.85938; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 #if HZ == 1000
-    /* Compensate for (HZ==1000) != (1 << SHIFT_HZ).
-     * Add 1.5625% and 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
-     */
-    if (time_adj < 0)
-	time_adj -= (-time_adj >> 6) + (-time_adj >> 7);
-    else
-	time_adj += (time_adj >> 6) + (time_adj >> 7);
+	/*
+	 * Compensate for (HZ==1000) != (1 << SHIFT_HZ).  Add 1.5625% and
+	 * 0.78125% to get 1023.4375; => only 0.05% error (p. 14)
+	 */
+	time_adj += shift_right(time_adj, 6) + shift_right(time_adj, 7);
 #endif
 }
 
-/* in the NTP reference this is called "hardclock()" */
-static void update_wall_time_one_tick(void)
+
+/**
+ * ntp_get_ppm_adjustment - return shifted PPM adjustment
+ */
+long ntp_get_ppm_adjustment(void)
+{
+	return total_sppm;
+}
+
+/**
+ * ntp_advance - increments the NTP state machine
+ * @interval_ns: interval, in nanoseconds
+ */
+void ntp_advance(unsigned long interval_ns)
 {
-	long time_adjust_step, delta_nsec;
+	static unsigned long interval_sum;
 
-	if ( (time_adjust_step = time_adjust) != 0 ) {
-	    /* We are doing an adjtime thing. 
-	     *
-	     * Prepare time_adjust_step to be within bounds.
-	     * Note that a positive time_adjust means we want the clock
-	     * to run faster.
-	     *
-	     * Limit the amount of the step to be in the range
-	     * -tickadj .. +tickadj
-	     */
-	     if (time_adjust > tickadj)
-		time_adjust_step = tickadj;
-	     else if (time_adjust < -tickadj)
-		time_adjust_step = -tickadj;
+	unsigned long flags;
 
-	    /* Reduce by this step the amount of time left  */
-	    time_adjust -= time_adjust_step;
-	}
-	delta_nsec = tick_nsec + time_adjust_step * 1000;
-	/*
-	 * Advance the phase, once it gets to one microsecond, then
-	 * advance the tick more.
-	 */
-	time_phase += time_adj;
-	if (time_phase <= -FINENSEC) {
-		long ltemp = -time_phase >> (SHIFT_SCALE - 10);
-		time_phase += ltemp << (SHIFT_SCALE - 10);
-		delta_nsec -= ltemp;
-	}
-	else if (time_phase >= FINENSEC) {
-		long ltemp = time_phase >> (SHIFT_SCALE - 10);
-		time_phase -= ltemp << (SHIFT_SCALE - 10);
-		delta_nsec += ltemp;
+	write_seqlock_irqsave(&ntp_lock, flags);
+
+	/* increment the interval sum: */
+	interval_sum += interval_ns;
+
+	/* calculate the per tick singleshot adjtime adjustment step: */
+	while (interval_ns >= tick_nsec) {
+		time_adjust_step = time_adjust;
+		if (time_adjust_step) {
+			/*
+			 * We are doing an adjtime thing.
+			 *
+			 * Prepare time_adjust_step to be within bounds.
+			 * Note that a positive time_adjust means we want
+			 * the clock to run faster.
+			 *
+			 * Limit the amount of the step to be in the range
+			 * -tickadj .. +tickadj:
+			 */
+			time_adjust_step = min(time_adjust_step, (long)tickadj);
+			time_adjust_step = max(time_adjust_step,
+							 (long)-tickadj);
+
+			/* Reduce by this step the amount of time left: */
+			time_adjust -= time_adjust_step;
+		}
+		interval_ns -= tick_nsec;
 	}
-	xtime.tv_nsec += delta_nsec;
-	time_interpolator_update(delta_nsec);
+	/* usec/tick * ticks/sec => usec/sec == ppm: */
+	singleshot_adj_ppm = time_adjust_step * HZ;
 
 	/* Changes by adjtime() do not take effect till next tick. */
 	if (time_next_adjust != 0) {
 		time_adjust = time_next_adjust;
 		time_next_adjust = 0;
 	}
+
+	while (interval_sum >= NSEC_PER_SEC) {
+		interval_sum -= NSEC_PER_SEC;
+		second_overflow();
+	}
+
+	/* calculate the total continuous ppm adjustment: */
+	total_sppm = time_freq; /* already shifted by SHIFT_USEC */
+	total_sppm += offset_adj_ppm << SHIFT_USEC;
+	total_sppm += tick_adj_ppm << SHIFT_USEC;
+	total_sppm += singleshot_adj_ppm << SHIFT_USEC;
+
+	write_sequnlock_irqrestore(&ntp_lock, flags);
+}
+
+#ifdef CONFIG_GENERIC_TIME
+# define update_wall_time(x) do { } while (0)
+#else
+
+/**
+ * phase_advance - advance the phase
+ * @time_adj: adjustment in nsecs
+ *
+ * advance the phase, once it gets to one microsecond advance the tick more.
+ */
+static inline long phase_advance(long time_adj)
+{
+	long delta = 0;
+
+	time_phase += time_adj;
+
+	if ((time_phase >= FINENSEC) || (time_phase <= -FINENSEC)) {
+		delta = shift_right(time_phase, (SHIFT_SCALE - 10));
+		time_phase -= delta << (SHIFT_SCALE - 10);
+	}
+
+	return delta;
+}
+
+/**
+ * xtime_advance - advance xtime
+ * @delta_nsec: adjustment in nsecs
+ */
+static inline void xtime_advance(long delta_nsec)
+{
+	int leapsecond;
+
+	xtime.tv_nsec += delta_nsec;
+	if (likely(xtime.tv_nsec < NSEC_PER_SEC))
+		return;
+
+	xtime.tv_nsec -= NSEC_PER_SEC;
+	xtime.tv_sec++;
+
+	/* process leapsecond: */
+	leapsecond = ntp_leapsecond(xtime);
+	if (likely(!leapsecond))
+		return;
+
+	xtime.tv_sec += leapsecond;
+	wall_to_monotonic.tv_sec -= leapsecond;
+	/*
+	 * Use of time interpolator for a gradual
+	 * change of time:
+	 */
+	time_interpolator_update(leapsecond*NSEC_PER_SEC);
+	clock_was_set();
 }
 
 /*
@@ -817,20 +1033,29 @@ static void update_wall_time_one_tick(vo
  * usually just one (we shouldn't be losing ticks,
  * we're doing this this way mainly for interrupt
  * latency reasons, not because we think we'll
- * have lots of lost timer ticks
+ * have lots of lost timer ticks)
  */
 static void update_wall_time(unsigned long ticks)
 {
+	static long time_phase; /* phase offset (scaled us) */
+
 	do {
-		ticks--;
-		update_wall_time_one_tick();
-		if (xtime.tv_nsec >= 1000000000) {
-			xtime.tv_nsec -= 1000000000;
-			xtime.tv_sec++;
-			second_overflow();
-		}
-	} while (ticks);
+		/*
+		 * Calculate the nsec delta using the precomputed NTP
+		 * adjustments:
+		 *     tick_nsec, time_adjust_step, time_adj
+		 */
+		long delta_nsec = tick_nsec + time_adjust_step * 1000;
+
+		delta_nsec += phase_advance(time_adj);
+
+		xtime_advance(delta_nsec);
+		ntp_advance(tick_nsec);
+		time_interpolator_update(delta_nsec);
+
+	} while (--ticks);
 }
+#endif /* !CONFIG_GENERIC_TIME */
 
 /*
  * Called from the timer interrupt handler to charge one tick to the current 
@@ -838,8 +1063,8 @@ static void update_wall_time(unsigned lo
  */
 void update_process_times(int user_tick)
 {
-	struct task_struct *p = current;
 	int cpu = smp_processor_id();
+	struct task_struct *p = current;
 
 	/* Note: this timer irq context must be accounted for as well. */
 	if (user_tick)
@@ -850,7 +1075,10 @@ void update_process_times(int user_tick)
 	if (rcu_pending(cpu))
 		rcu_check_callbacks(cpu, user_tick);
 	scheduler_tick();
+#ifndef CONFIG_PREEMPT_RT
  	run_posix_cpu_timers(p);
+#endif
+	softlockup_tick();
 }
 
 /*
@@ -858,7 +1086,15 @@ void update_process_times(int user_tick)
  */
 static unsigned long count_active_tasks(void)
 {
+	/*
+	 * On PREEMPT_RT, we are running in the timer softirq thread,
+	 * so consider 1 less running tasks:
+	 */
+#ifdef CONFIG_PREEMPT_RT
+	return (nr_running()-1 + nr_uninterruptible()) * FIXED_1;
+#else
 	return (nr_running() + nr_uninterruptible()) * FIXED_1;
+#endif
 }
 
 /*
@@ -900,23 +1136,12 @@ unsigned long wall_jiffies = INITIAL_JIF
  * playing with xtime and avenrun.
  */
 #ifndef ARCH_HAVE_XTIME_LOCK
-seqlock_t xtime_lock __cacheline_aligned_in_smp = SEQLOCK_UNLOCKED;
+DECLARE_RAW_SEQLOCK(xtime_lock);
 
 EXPORT_SYMBOL(xtime_lock);
 #endif
 
 /*
- * This function runs timers and the timer-tq in bottom half context.
- */
-static void run_timer_softirq(struct softirq_action *h)
-{
-	tvec_base_t *base = &__get_cpu_var(tvec_bases);
-
-	if (time_after_eq(jiffies, base->timer_jiffies))
-		__run_timers(base);
-}
-
-/*
  * Called by the local, per-CPU timer interrupt on SMP.
  */
 void run_local_timers(void)
@@ -925,22 +1150,49 @@ void run_local_timers(void)
 }
 
 /*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
+ * Time of day handling:
  */
 static inline void update_times(void)
 {
-	unsigned long ticks;
+	unsigned long ticks = 0;
+	/*
+	 * First test outside the lock for performance reasons:
+	 */
+	if (jiffies != wall_jiffies) {
+		unsigned long flags;
 
-	ticks = jiffies - wall_jiffies;
-	if (ticks) {
-		wall_jiffies += ticks;
-		update_wall_time(ticks);
+		write_seqlock_irqsave(&xtime_lock, flags);
+		while (jiffies != wall_jiffies) {
+			wall_jiffies++;
+			ticks++;
+			update_wall_time(1);
+			/*
+			 * Unlock unconditionally, to make sure
+			 * we dont keep irqs off for a long time!
+			 */
+			write_sequnlock_irqrestore(&xtime_lock, flags);
+			cond_resched_softirq();
+			write_seqlock_irqsave(&xtime_lock, flags);
+		}
+		calc_load(ticks);
+		write_sequnlock_irqrestore(&xtime_lock, flags);
 	}
-	calc_load(ticks);
 }
   
 /*
+ * This function runs timers and the timer-tq in bottom half context.
+ */
+static void run_timer_softirq(struct softirq_action *h)
+{
+	tvec_base_t *base = &__get_cpu_var(tvec_bases);
+
+	update_times();
+	ktimer_run_queues();
+	if (time_after_eq(jiffies, base->timer_jiffies))
+		__run_timers(base);
+}
+
+/*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
  * jiffies is defined in the linker script...
@@ -949,8 +1201,6 @@ static inline void update_times(void)
 void do_timer(struct pt_regs *regs)
 {
 	jiffies_64++;
-	update_times();
-	softlockup_tick(regs);
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1128,8 +1378,8 @@ fastcall signed long __sched schedule_ti
 		if (timeout < 0)
 		{
 			printk(KERN_ERR "schedule_timeout: wrong timeout "
-			       "value %lx from %p\n", timeout,
-			       __builtin_return_address(0));
+				"value %lx from %p\n", timeout,
+				__builtin_return_address(0));
 			current->state = TASK_RUNNING;
 			goto out;
 		}
@@ -1159,15 +1409,15 @@ EXPORT_SYMBOL(schedule_timeout);
  */
 signed long __sched schedule_timeout_interruptible(signed long timeout)
 {
-       __set_current_state(TASK_INTERRUPTIBLE);
-       return schedule_timeout(timeout);
+	__set_current_state(TASK_INTERRUPTIBLE);
+	return schedule_timeout(timeout);
 }
 EXPORT_SYMBOL(schedule_timeout_interruptible);
 
 signed long __sched schedule_timeout_uninterruptible(signed long timeout)
 {
-       __set_current_state(TASK_UNINTERRUPTIBLE);
-       return schedule_timeout(timeout);
+	__set_current_state(TASK_UNINTERRUPTIBLE);
+	return schedule_timeout(timeout);
 }
 EXPORT_SYMBOL(schedule_timeout_uninterruptible);
 
@@ -1177,62 +1427,6 @@ asmlinkage long sys_gettid(void)
 	return current->pid;
 }
 
-static long __sched nanosleep_restart(struct restart_block *restart)
-{
-	unsigned long expire = restart->arg0, now = jiffies;
-	struct timespec __user *rmtp = (struct timespec __user *) restart->arg1;
-	long ret;
-
-	/* Did it expire while we handled signals? */
-	if (!time_after(expire, now))
-		return 0;
-
-	expire = schedule_timeout_interruptible(expire - now);
-
-	ret = 0;
-	if (expire) {
-		struct timespec t;
-		jiffies_to_timespec(expire, &t);
-
-		ret = -ERESTART_RESTARTBLOCK;
-		if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
-			ret = -EFAULT;
-		/* The 'restart' block is already filled in */
-	}
-	return ret;
-}
-
-asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp)
-{
-	struct timespec t;
-	unsigned long expire;
-	long ret;
-
-	if (copy_from_user(&t, rqtp, sizeof(t)))
-		return -EFAULT;
-
-	if ((t.tv_nsec >= 1000000000L) || (t.tv_nsec < 0) || (t.tv_sec < 0))
-		return -EINVAL;
-
-	expire = timespec_to_jiffies(&t) + (t.tv_sec || t.tv_nsec);
-	expire = schedule_timeout_interruptible(expire);
-
-	ret = 0;
-	if (expire) {
-		struct restart_block *restart;
-		jiffies_to_timespec(expire, &t);
-		if (rmtp && copy_to_user(rmtp, &t, sizeof(t)))
-			return -EFAULT;
-
-		restart = &current_thread_info()->restart_block;
-		restart->fn = nanosleep_restart;
-		restart->arg0 = jiffies + expire;
-		restart->arg1 = (unsigned long) rmtp;
-		ret = -ERESTART_RESTARTBLOCK;
-	}
-	return ret;
-}
-
 /*
  * sys_sysinfo - fill in sysinfo struct
  */ 
@@ -1329,6 +1523,7 @@ static void __devinit init_timers_cpu(in
 
 	base = &per_cpu(tvec_bases, cpu);
 	spin_lock_init(&base->t_base.lock);
+	init_waitqueue_head(&base->t_base.wait_for_running_timer);
 	for (j = 0; j < TVN_SIZE; j++) {
 		INIT_LIST_HEAD(base->tv5.vec + j);
 		INIT_LIST_HEAD(base->tv4.vec + j);
@@ -1364,8 +1559,7 @@ static void __devinit migrate_timers(int
 	old_base = &per_cpu(tvec_bases, cpu);
 	new_base = &get_cpu_var(tvec_bases);
 
-	local_irq_disable();
-	spin_lock(&new_base->t_base.lock);
+	spin_lock_irq(&new_base->t_base.lock);
 	spin_lock(&old_base->t_base.lock);
 
 	if (old_base->t_base.running_timer)
@@ -1380,8 +1574,7 @@ static void __devinit migrate_timers(int
 	}
 
 	spin_unlock(&old_base->t_base.lock);
-	spin_unlock(&new_base->t_base.lock);
-	local_irq_enable();
+	spin_unlock_irq(&new_base->t_base.lock);
 	put_cpu_var(tvec_bases);
 }
 #endif /* CONFIG_HOTPLUG_CPU */
@@ -1412,6 +1605,7 @@ static struct notifier_block __devinitda
 
 void __init init_timers(void)
 {
+	spin_lock_init(&__init_timer_base.lock);
 	timer_cpu_notify(&timers_nb, (unsigned long)CPU_UP_PREPARE,
 				(void *)(long)smp_processor_id());
 	register_cpu_notifier(&timers_nb);
@@ -1507,16 +1701,18 @@ static void time_interpolator_update(lon
 	if (!time_interpolator)
 		return;
 
-	/* The interpolator compensates for late ticks by accumulating
-         * the late time in time_interpolator->offset. A tick earlier than
-	 * expected will lead to a reset of the offset and a corresponding
-	 * jump of the clock forward. Again this only works if the
-	 * interpolator clock is running slightly slower than the regular clock
-	 * and the tuning logic insures that.
-         */
+	/*
+	 * The interpolator compensates for late ticks by accumulating the late
+	 * time in time_interpolator->offset. A tick earlier than expected will
+	 * lead to a reset of the offset and a corresponding jump of the clock
+	 * forward. Again this only works if the interpolator clock is running
+	 * slightly slower than the regular clock and the tuning logic insures
+	 * that.
+	 */
 
 	counter = time_interpolator_get_counter(1);
-	offset = time_interpolator->offset + GET_TI_NSECS(counter, time_interpolator);
+	offset = time_interpolator->offset +
+			GET_TI_NSECS(counter, time_interpolator);
 
 	if (delta_nsec < 0 || (unsigned long) delta_nsec < offset)
 		time_interpolator->offset = offset - delta_nsec;
Index: linux/kernel/workqueue.c
===================================================================
--- linux.orig/kernel/workqueue.c
+++ linux/kernel/workqueue.c
@@ -25,6 +25,7 @@
 #include <linux/cpu.h>
 #include <linux/notifier.h>
 #include <linux/kthread.h>
+#include <linux/syscalls.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use cpu 0's).
@@ -93,10 +94,12 @@ static void __queue_work(struct cpu_work
  *
  * We queue the work to the CPU it was submitted, but there is no
  * guarantee that it will be processed by that CPU.
+ *
+ * Especially no such guarantee on PREEMPT_RT.
  */
 int fastcall queue_work(struct workqueue_struct *wq, struct work_struct *work)
 {
-	int ret = 0, cpu = get_cpu();
+	int ret = 0, cpu = raw_smp_processor_id();
 
 	if (!test_and_set_bit(0, &work->pending)) {
 		if (unlikely(is_single_threaded(wq)))
@@ -105,7 +108,6 @@ int fastcall queue_work(struct workqueue
 		__queue_work(wq->cpu_wq + cpu, work);
 		ret = 1;
 	}
-	put_cpu();
 	return ret;
 }
 
@@ -362,6 +364,39 @@ static void cleanup_workqueue_thread(str
 		kthread_stop(p);
 }
 
+void set_workqueue_thread_prio(struct workqueue_struct *wq, int cpu,
+				int policy, int rt_priority, int nice)
+{
+	struct task_struct *p = wq->cpu_wq[cpu].thread;
+	struct sched_param param = { .sched_priority = rt_priority };
+	int ret;
+
+	set_user_nice(p, nice);
+	ret = sys_sched_setscheduler(p->pid, policy, &param);
+	if (ret)
+		printk("BUG: wq(%s) setscheduler() returned: %d.\n",
+			wq->name, ret);
+
+}
+
+void set_workqueue_prio(struct workqueue_struct *wq, int policy,
+			int rt_priority, int nice)
+{
+	int cpu;
+
+	/* We don't need the distraction of CPUs appearing and vanishing. */
+	lock_cpu_hotplug();
+	if (is_single_threaded(wq))
+		set_workqueue_thread_prio(wq, 0, policy, rt_priority, nice);
+	else {
+		for_each_online_cpu(cpu)
+			set_workqueue_thread_prio(wq, cpu, policy,
+						  rt_priority, nice);
+	}
+	unlock_cpu_hotplug();
+}
+
+
 void destroy_workqueue(struct workqueue_struct *wq)
 {
 	int cpu;
@@ -539,6 +574,7 @@ void init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+	set_workqueue_prio(keventd_wq, SCHED_FIFO, 1, -20);
 }
 
 EXPORT_SYMBOL_GPL(__create_workqueue);
Index: linux/lib/Kconfig.debug
===================================================================
--- linux.orig/lib/Kconfig.debug
+++ linux/lib/Kconfig.debug
@@ -8,6 +8,22 @@ config PRINTK_TIME
 	  operations.  This is useful for identifying long delays
 	  in kernel startup.
 
+config PRINTK_IGNORE_LOGLEVEL
+	bool "Ignore loglevel on printks"
+	default n
+	help
+	  Selecting this option causes all printk messages to go
+	  to the console.  This allows you to serial-log kernel
+	  messages, no matter what userspace does. (e.g. some
+	  distributions disable kernel log messages during
+	  certain phases of system startup.)
+
+	  NOTE: this option also makes printk non-preemptible,
+	  which might improve the output of debugging info or
+	  crash info, but it might also cause latencies if your
+	  kernel is printk-ing a lot.
+
+	  Normally you don't need or want this option.
 
 config DEBUG_KERNEL
 	bool "Kernel debugging"
@@ -46,6 +62,11 @@ config LOG_BUF_SHIFT
 		     13 =>  8 KB
 		     12 =>  4 KB
 
+config PARANOID_GENERIC_TIME
+	default y
+	depends on GENERIC_TIME
+	bool "Paranoid Timekeeping Checks"
+
 config DETECT_SOFTLOCKUP
 	bool "Detect Soft Lockups"
 	depends on DEBUG_KERNEL
@@ -93,11 +114,19 @@ config DEBUG_PREEMPT
 	  If you say Y here then the kernel will use a debug variant of the
 	  commonly used smp_processor_id() function and will print warnings
 	  if kernel code uses it in a preemption-unsafe way. Also, the kernel
-	  will detect preemption count underflows.
+	  will detect preemption count underflows and will track critical
+	  section entries and print that info when an illegal sleep happens.
+
+config DEBUG_IRQ_FLAGS
+	bool
+	default y
+	depends on DEBUG_PREEMPT
 
+# broken by PREEMPT_RT, disable for now
 config DEBUG_SPINLOCK
 	bool "Spinlock debugging"
-	depends on DEBUG_KERNEL
+	depends on 0 && DEBUG_KERNEL
+	default n
 	help
 	  Say Y here and build SMP to catch missing spinlock initialization
 	  and certain other kinds of spinlock errors commonly made.  This is
@@ -106,11 +135,187 @@ config DEBUG_SPINLOCK
 
 config DEBUG_SPINLOCK_SLEEP
 	bool "Sleep-inside-spinlock checking"
-	depends on DEBUG_KERNEL
+	depends on DEBUG_KERNEL && !DEBUG_PREEMPT
 	help
 	  If you say Y here, various routines which may sleep will become very
 	  noisy if they are called with a spinlock held.
 
+config WAKEUP_TIMING
+	bool "Wakeup latency timing"
+	default y
+	help
+	  This option measures the time spent from a highprio thread being
+	  woken up to it getting scheduled on a CPU, with microsecond
+	  accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+config WAKEUP_LATENCY_HIST
+	bool "wakeup latency histogram"
+	default n
+	depends on WAKEUP_TIMING
+	help
+	  This option logs all wakeup latency timings into a large
+	  histogram; at the same time it also suppresses the printk output
+	  produced by the wakeup latency timing.
+
+	  The wakeup latency timing histogram can be viewed via:
+
+	      cat /proc/latency_hist/wakeup_latency/CPU*
+
+	  (Note: * stands for the CPU ID.)
+
+config PREEMPT_TRACE
+	bool
+	default y
+	depends on DEBUG_PREEMPT
+
+config CRITICAL_PREEMPT_TIMING
+	bool "Non-preemptible critical section latency timing"
+	default n
+	depends on PREEMPT
+	help
+	  This option measures the time spent in preempt-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the irqs-off timing option can be
+	  used together or separately.)
+
+config PREEMPT_OFF_HIST
+        bool "non-preemptible critical section latency histogram"
+        default n
+        depends on CRITICAL_PREEMPT_TIMING
+        help
+          This option logs all non-preemptible critical section latency
+	  timings into a large histogram; at the same time it also
+	  suppresses the printk output produced by the non-preemptible
+	  critical section latency timing.
+
+          The non-preemptible critical section latency timing histogram can
+	  be viewed via:
+
+              cat /proc/latency_hist/preempt_off_latency/CPU*
+
+          (Note: * stands for the CPU ID.)
+
+config CRITICAL_IRQSOFF_TIMING
+	bool "Interrupts-off critical section latency timing"
+	default n
+	help
+	  This option measures the time spent in irqs-off critical
+	  sections, with microsecond accuracy.
+
+	  The default measurement method is a maximum search, which is
+	  disabled by default and can be runtime (re-)started via:
+
+	      echo 0 > /proc/sys/kernel/preempt_max_latency
+
+	  (Note that kernel size and overhead increases with this option
+	  enabled. This option and the preempt-off timing option can be
+	  used together or separately.)
+
+config INTERRUPT_OFF_HIST
+        bool "interrupts-off critical section latency histogram"
+        default n
+        depends on CRITICAL_IRQSOFF_TIMING
+        help
+          This option logs all interrupts-off critical section latency
+          timings into a large histogram; at the same time it also
+          suppresses the printk output produced by the interrupts-off
+          critical section latency timing.
+
+          The interrupts-off critical section latency timing histogram can
+          be viewed via:
+
+              cat /proc/latency_hist/interrupt_off_latency/CPU*
+
+          (Note: * stands for the CPU ID.)
+
+config CRITICAL_TIMING
+	bool
+	default y
+	depends on CRITICAL_PREEMPT_TIMING || CRITICAL_IRQSOFF_TIMING
+
+config LATENCY_TIMING
+	bool
+	default y
+	depends on WAKEUP_TIMING || CRITICAL_TIMING
+	select SYSCTL
+
+config CRITICAL_LATENCY_HIST
+	bool
+	default y
+	depends on PREEMPT_OFF_HIST || INTERRUPT_OFF_HIST
+
+config LATENCY_HIST
+	bool
+	default y
+	depends on WAKEUP_LATENCY_HIST || CRITICAL_LATENCY_HIST
+
+config LATENCY_TRACE
+	bool "Latency tracing"
+	default n
+	depends on LATENCY_TIMING
+	help
+	  This option enables a kernel tracing mechanism that will track
+	  precise function-call granularity kernel execution during
+	  wakeup paths or critical sections.  When this option is enabled
+	  then the last maximum latency timing event's full trace can be
+	  found in /proc/latency_trace, in a human-readable (or rather as
+	  some would say, in a kernel-developer-readable) form.
+
+	  (Note that kernel size and overhead increases noticeably
+	  with this option enabled.)
+
+config MCOUNT
+	bool
+	depends on LATENCY_TRACE
+	default y
+
+config DEBUG_DEADLOCKS
+	bool "Automatic spinlock/rwlock/mutex/rwsem deadlock detection"
+	depends on PREEMPT_RT
+	default y
+	help
+	  This allows semaphores, rw-semaphores, and spinlock/rwlock
+	  related deadlocks (lockups) to be detected and reported
+	  automatically.
+
+config DEBUG_RT_LOCKING_MODE
+	bool "Runtime switching between spinlocks and mutexes"
+	depends on PREEMPT_RT
+	default n
+	help
+	  This debugging option makes RT mutexes non-preemptible again,
+	  thus simulating the locking and scheduling properties of a
+	  spin-lock based kernel. The /proc/sys/kernel/preempt_locks
+	  flag can be used to switch between the locking modes at runtime.
+
+	  This option is useful to developers only, it helps measuring
+	  the impact of the PREEMPT_RT framework. As a user you don't
+	  want to enable this, as with lock preemption turned off you'll
+	  get high scheduling latencies again! Also, this feature has
+	  some runtime overhead, even if preempt_locks is turned on.
+
+          (NOTE: the preempt_locks flag is not directly changed but is
+	   indirectly propagated by the idle thread. [this is a safe
+	   method of changing the locking mode.] But this also means
+	   that if you are e.g. scripting the mode-change you should
+	   make sure the idle thread has scheduled at least once.
+	   You can achieve this via inserting "sleep 1" into the script
+	   for example.)
+
+	  Say N if you are unsure.
+
 config DEBUG_KOBJECT
 	bool "kobject debugging"
 	depends on DEBUG_KERNEL
@@ -168,9 +373,31 @@ config DEBUG_FS
 
 	  If unsure, say N.
 
-config FRAME_POINTER
+config DEBUG_VM
+	bool "Debug VM"
+	depends on DEBUG_KERNEL
+	help
+	  Enable this to debug the virtual-memory system.
+
+	  If unsure, say N.
+
+config RCU_TORTURE_TEST
+	tristate "torture tests for RCU"
+	depends on DEBUG_KERNEL
+	default n
+	help
+	  This option provides a kernel module that runs torture tests
+	  on the RCU infrastructure.  The kernel module may be built
+	  after the fact on the running kernel to be tested, if desired.
+
+	  Say Y here if you want RCU torture tests to start automatically
+	  at boot time (you probably don't).
+	  Say M if you want the RCU torture tests to build as a module.
+	  Say N if you are unsure.
+
+config USE_FRAME_POINTER
 	bool "Compile the kernel with frame pointers"
-	depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML)
+	depends on DEBUG_KERNEL && !MCOUNT && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML)
 	default y if DEBUG_INFO && UML
 	help
 	  If you say Y here the resulting kernel image will be slightly larger
@@ -178,3 +405,8 @@ config FRAME_POINTER
 	  on some architectures or you use external debuggers.
 	  If you don't debug the kernel, you can say N.
 
+config FRAME_POINTER
+	bool
+	depends on USE_FRAME_POINTER || MCOUNT
+	default y
+
Index: linux/lib/Makefile
===================================================================
--- linux.orig/lib/Makefile
+++ linux/lib/Makefile
@@ -5,7 +5,7 @@
 lib-y := errno.o ctype.o string.o vsprintf.o cmdline.o \
 	 bust_spinlocks.o rbtree.o radix-tree.o dump_stack.o \
 	 idr.o div64.o int_sqrt.o bitmap.o extable.o prio_tree.o \
-	 sha1.o
+	 sha1.o plist.o
 
 lib-y	+= kobject.o kref.o kobject_uevent.o klist.o
 
@@ -17,7 +17,8 @@ CFLAGS_kobject_uevent.o += -DDEBUG
 endif
 
 obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o
-lib-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
+obj-$(CONFIG_PREEMPT_RT) += plist.o
+obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
 lib-$(CONFIG_SEMAPHORE_SLEEPERS) += semaphore-sleepers.o
 lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
Index: linux/lib/dec_and_lock.c
===================================================================
--- linux.orig/lib/dec_and_lock.c
+++ linux/lib/dec_and_lock.c
@@ -13,7 +13,7 @@
  * the value of the atomic (i.e. the high bits aren't used
  * for a lock or anything like that).
  */
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock)
 {
 	int counter;
 	int newcount;
@@ -29,10 +29,10 @@ int _atomic_dec_and_lock(atomic_t *atomi
 			return 0;
 	}
 
-	spin_lock(lock);
+	_raw_spin_lock(lock);
 	if (atomic_dec_and_test(atomic))
 		return 1;
-	spin_unlock(lock);
+	_raw_spin_unlock(lock);
 	return 0;
 }
 #else
@@ -59,14 +59,14 @@ int _atomic_dec_and_lock(atomic_t *atomi
  * this is trivially done efficiently using a load-locked
  * store-conditional approach, for example.
  */
-int _atomic_dec_and_lock(atomic_t *atomic, spinlock_t *lock)
+int _atomic_dec_and_raw_spin_lock(atomic_t *atomic, raw_spinlock_t *lock)
 {
-	spin_lock(lock);
+	_raw_spin_lock(lock);
 	if (atomic_dec_and_test(atomic))
 		return 1;
-	spin_unlock(lock);
+	_raw_spin_unlock(lock);
 	return 0;
 }
 #endif
 
-EXPORT_SYMBOL(_atomic_dec_and_lock);
+EXPORT_SYMBOL(_atomic_dec_and_raw_spin_lock);
Index: linux/lib/inflate.c
===================================================================
--- linux.orig/lib/inflate.c
+++ linux/lib/inflate.c
@@ -141,6 +141,25 @@ struct huft {
   } v;
 };
 
+/*
+ * turn off the inflate_lock for the bootloader code, it is
+ * single-threaded and has no need for (nor access to) the
+ * kernel's locking primitives:
+ */
+#ifdef ZLIB_INFLATE_NO_INFLATE_LOCK
+# undef DEFINE_SPINLOCK
+# undef spin_lock
+# undef spin_unlock
+# define DEFINE_SPINLOCK(x)	int x
+# define spin_lock(x)		(void)(x)
+# define spin_unlock(x)		(void)(x)
+#endif
+
+/*
+ * lock protecting static variables of huft_build() and other inflate
+ * functions, to reduce their insane stack footprint.
+ */
+static DEFINE_SPINLOCK(inflate_lock);
 
 /* Function prototypes */
 STATIC int INIT huft_build OF((unsigned *, unsigned, unsigned, 
@@ -304,7 +323,7 @@ STATIC int INIT huft_build(
   register struct huft *q;      /* points to current table */
   struct huft r;                /* table entry for structure assignment */
   struct huft *u[BMAX];         /* table stack */
-  unsigned v[N_MAX];            /* values in order of bit length */
+  static unsigned v[N_MAX];     /* values in order of bit length */
   register int w;               /* bits before this table == (l * h) */
   unsigned x[BMAX+1];           /* bit offsets, then code stack */
   unsigned *xp;                 /* pointer into x */
@@ -705,7 +724,7 @@ STATIC int noinline INIT inflate_fixed(v
   struct huft *td;      /* distance code table */
   int bl;               /* lookup bits for tl */
   int bd;               /* lookup bits for td */
-  unsigned l[288];      /* length list for huft_build */
+  static unsigned l[288];      /* length list for huft_build */
 
 DEBG("<fix");
 
@@ -767,9 +786,9 @@ STATIC int noinline INIT inflate_dynamic
   unsigned nl;          /* number of literal/length codes */
   unsigned nd;          /* number of distance codes */
 #ifdef PKZIP_BUG_WORKAROUND
-  unsigned ll[288+32];  /* literal/length and distance code lengths */
+  static unsigned ll[288+32];  /* literal/length and distance code lengths */
 #else
-  unsigned ll[286+30];  /* literal/length and distance code lengths */
+  static unsigned ll[286+30];  /* literal/length and distance code lengths */
 #endif
   register ulg b;       /* bit buffer */
   register unsigned k;  /* number of bits in bit buffer */
@@ -940,6 +959,7 @@ STATIC int INIT inflate_block(
   unsigned t;           /* block type */
   register ulg b;       /* bit buffer */
   register unsigned k;  /* number of bits in bit buffer */
+  unsigned ret;         /* return code */
 
   DEBG("<blk");
 
@@ -965,17 +985,19 @@ STATIC int INIT inflate_block(
   bk = k;
 
   /* inflate that block type */
-  if (t == 2)
-    return inflate_dynamic();
-  if (t == 0)
-    return inflate_stored();
-  if (t == 1)
-    return inflate_fixed();
+  ret = 2;
+  spin_lock(&inflate_lock);
+  switch (t) {
+	case 2: ret = inflate_dynamic(); break;
+	case 0: ret = inflate_stored();  break;
+	case 1: ret = inflate_fixed();   break;
+  }
+  spin_unlock(&inflate_lock);
 
   DEBG(">");
 
   /* bad block type */
-  return 2;
+  return ret;
 
  underrun:
   return 4;			/* Input underrun */
Index: linux/lib/kernel_lock.c
===================================================================
--- linux.orig/lib/kernel_lock.c
+++ linux/lib/kernel_lock.c
@@ -24,7 +24,7 @@
  *
  * Don't use in new code.
  */
-static DECLARE_MUTEX(kernel_sem);
+DECLARE_MUTEX(kernel_sem);
 
 /*
  * Re-acquire the kernel semaphore.
@@ -35,22 +35,25 @@ static DECLARE_MUTEX(kernel_sem);
  * about recursion, both due to the down() and due to the enabling of
  * preemption. schedule() will re-check the preemption flag after
  * reacquiring the semaphore.
+ *
+ * Called with interrupts disabled.
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
 	struct task_struct *task = current;
 	int saved_lock_depth = task->lock_depth;
 
+	raw_local_irq_enable();
 	BUG_ON(saved_lock_depth < 0);
 
 	task->lock_depth = -1;
-	preempt_enable_no_resched();
 
 	down(&kernel_sem);
 
-	preempt_disable();
 	task->lock_depth = saved_lock_depth;
 
+	raw_local_irq_disable();
+
 	return 0;
 }
 
@@ -67,11 +70,15 @@ void __lockfunc lock_kernel(void)
 	struct task_struct *task = current;
 	int depth = task->lock_depth + 1;
 
-	if (likely(!depth))
+	if (likely(!depth)) {
 		/*
 		 * No recursion worries - we set up lock_depth _after_
 		 */
 		down(&kernel_sem);
+#ifdef CONFIG_DEBUG_DEADLOCKS
+		current->last_kernel_lock = __builtin_return_address(0);
+#endif
+	}
 
 	task->lock_depth = depth;
 }
@@ -82,8 +89,12 @@ void __lockfunc unlock_kernel(void)
 
 	BUG_ON(task->lock_depth < 0);
 
-	if (likely(--task->lock_depth < 0))
+	if (likely(--task->lock_depth == -1)) {
+#ifdef CONFIG_DEBUG_DEADLOCKS
+		current->last_kernel_lock = NULL;
+#endif
 		up(&kernel_sem);
+	}
 }
 
 #else
@@ -116,38 +127,40 @@ static  __cacheline_aligned_in_smp DEFIN
  */
 int __lockfunc __reacquire_kernel_lock(void)
 {
-	while (!_raw_spin_trylock(&kernel_flag)) {
+	raw_local_irq_enable();
+	while (!__raw_spin_trylock(&kernel_flag.raw_lock)) {
 		if (test_thread_flag(TIF_NEED_RESCHED))
 			return -EAGAIN;
 		cpu_relax();
 	}
+	raw_local_irq_disable();
 	preempt_disable();
 	return 0;
 }
 
 void __lockfunc __release_kernel_lock(void)
 {
-	_raw_spin_unlock(&kernel_flag);
+	__raw_spin_unlock(&kernel_flag.raw_lock);
 	preempt_enable_no_resched();
 }
 
 /*
  * These are the BKL spinlocks - we try to be polite about preemption. 
  * If SMP is not on (ie UP preemption), this all goes away because the
- * _raw_spin_trylock() will always succeed.
+ * __raw_spin_trylock() will always succeed.
  */
 #ifdef CONFIG_PREEMPT
 static inline void __lock_kernel(void)
 {
 	preempt_disable();
-	if (unlikely(!_raw_spin_trylock(&kernel_flag))) {
+	if (unlikely(!__raw_spin_trylock(&kernel_flag.raw_lock))) {
 		/*
 		 * If preemption was disabled even before this
 		 * was called, there's nothing we can be polite
 		 * about - just spin.
 		 */
 		if (preempt_count() > 1) {
-			_raw_spin_lock(&kernel_flag);
+			__raw_spin_lock(&kernel_flag.raw_lock);
 			return;
 		}
 
@@ -160,7 +173,7 @@ static inline void __lock_kernel(void)
 			while (spin_is_locked(&kernel_flag))
 				cpu_relax();
 			preempt_disable();
-		} while (!_raw_spin_trylock(&kernel_flag));
+		} while (!__raw_spin_trylock(&kernel_flag.raw_lock));
 	}
 }
 
@@ -171,7 +184,7 @@ static inline void __lock_kernel(void)
  */
 static inline void __lock_kernel(void)
 {
-	_raw_spin_lock(&kernel_flag);
+	__raw_spin_lock(&kernel_flag.raw_lock);
 }
 #endif
 
Index: linux/lib/plist.c
===================================================================
--- /dev/null
+++ linux/lib/plist.c
@@ -0,0 +1,175 @@
+/*
+ * lib/plist.c
+ *
+ * Descending-priority-sorted double-linked list
+ *
+ * (C) 2002-2003 Intel Corp
+ * Inaky Perez-Gonzalez <inaky.perez-gonzalez@intel.com>.
+ *
+ * 2001-2005 (c) MontaVista Software, Inc.
+ * Daniel Walker <dwalker@mvista.com>
+ *
+ * (C) 2005 Thomas Gleixner <tglx@linutronix.de>
+ * Tested and made it functional.
+ *
+ * Licensed under the FSF's GNU Public License v2 or later.
+ *
+ * Based on simple lists (include/linux/list.h).
+ */
+
+#include <linux/sched.h>
+#include <linux/rt_lock.h>
+
+
+/* Set up @pl with priority @prio; both of its list heads start out empty. */
+void plist_init(struct plist *pl, int prio)
+{
+#ifdef CONFIG_PREEMPT_DEBUG
+	WARN_ON(!(preempt_count() || raw_irqs_disabled()));
+#endif
+	INIT_LIST_HEAD(&pl->sp_node);
+	INIT_LIST_HEAD(&pl->dp_node);
+	pl->prio = prio;
+}
+
+/*
+ * Refresh the cached priority of @plist from its first entry, which
+ * must exist: plist_first() is dereferenced unconditionally.
+ * @returns !0 if the plist prio changed, 0 otherwise.
+ */
+static inline unsigned __plist_update_prio(struct plist *plist)
+{
+	int first_prio = plist_first(plist)->prio;
+	unsigned changed = (plist->prio != first_prio);
+
+	if (changed)
+		plist->prio = first_prio;
+	return changed;
+}
+
+unsigned plist_update_prio(struct plist *plist)
+{
+	int prev = plist->prio;
+
+	/* An empty plist falls back to the lowest priority, INT_MAX. */
+	plist->prio = !plist_empty(plist) ? plist_first(plist)->prio : INT_MAX;
+	return prev != plist->prio;
+}
+
+/* Add a node to the plist [internal]
+ *
+ * pl->prio == INT_MAX is a special case: it means lowest priority, so
+ * the node goes to the end of the plist. Note that we want FIFO
+ * behaviour among nodes of the same priority.
+ */
+static inline void __plist_add_sorted(struct plist *plist, struct plist *pl)
+{
+	struct list_head *itr;
+	struct plist *itr_pl, *itr_pl2;
+
+	if (pl->prio < INT_MAX) {
+		list_for_each(itr, &plist->dp_node) {
+			itr_pl = list_entry(itr, struct plist, dp_node);
+			if (pl->prio == itr_pl->prio)
+				goto existing_sp_head;
+			else if (pl->prio < itr_pl->prio)
+				goto new_sp_head;
+		}
+		itr_pl = plist;	/* no equal or larger prio found: add at the tail */
+		goto new_sp_head;
+	}
+	/* prio INT_MAX: append to the end, reusing an INT_MAX tail entry if any */
+	itr_pl = container_of(plist->dp_node.prev, struct plist, dp_node);
+	if (!list_empty(&plist->dp_node) && itr_pl->prio == INT_MAX)
+		goto existing_sp_head;
+	itr_pl = plist;
+
+new_sp_head:	/* link pl in front of itr_pl on both the dp and sp lists */
+	list_add_tail(&pl->dp_node, &itr_pl->dp_node);
+	list_add_tail(&pl->sp_node, &itr_pl->sp_node);
+	return;
+existing_sp_head:	/* sp list only: link pl in front of itr_pl's dp successor */
+	itr_pl2 = container_of(itr_pl->dp_node.next, struct plist, dp_node);
+	list_add_tail(&pl->sp_node, &itr_pl2->sp_node);
+	return;
+}
+
+/**
+ * Insert @pl into @plist at its sorted position.
+ * @returns !0 if the plist prio changed, 0 otherwise.
+ */
+unsigned plist_add(struct plist *pl, struct plist *plist)
+{
+	__plist_add_sorted(plist, pl);
+
+	/* A numerically smaller prio becomes the plist's new prio. */
+	if (pl->prio >= plist->prio)
+		return 0;
+	plist->prio = pl->prio;
+	return !0;
+}
+
+/* Do the real removal work: unlink @pl from its dp and sp lists. */
+static inline
+void  __plist_del(struct plist *pl)
+{
+	if (!list_empty(&pl->dp_node)) {
+		struct plist *pl_new = container_of(pl->sp_node.next,
+						    struct plist, sp_node);
+
+		if (pl->dp_node.next == &pl_new->dp_node) {
+			/* pl's sp successor is already its dp successor */
+			list_del_init(&pl->dp_node);
+		} else {	/* otherwise pl_new takes over pl's dp slot */
+			list_replace_rcu(&pl->dp_node, &pl_new->dp_node);
+			INIT_LIST_HEAD(&pl->dp_node);
+		}
+	}
+	list_del_init(&pl->sp_node);
+}
+
+/**
+ * Unlink node @pl, then recompute the prio of @plist.
+ * @returns !0 if the plist prio changed, 0 otherwise.
+ */
+unsigned plist_del(struct plist *pl, struct plist *plist)
+{
+	__plist_del(pl);
+	return plist_update_prio(plist);
+}
+
+/**
+ * Delete node @pl from @plist, then re-initialize @pl with prio
+ * INT_MAX (the return value of plist_del() is dropped).
+ */
+void plist_del_init(struct plist *pl, struct plist *plist)
+{
+	plist_del(pl, plist);
+	plist_init(pl, INT_MAX);
+}
+
+/* Set @pl's prio to @new_prio; its position in the plist is left untouched */
+static inline
+void __plist_chprio(struct plist *pl, int new_prio)
+{
+	pl->prio = new_prio;
+}
+
+/**
+ * Move node @pl of @plist to the position matching @new_prio and
+ * refresh the plist's cached priority. Nothing is done if @pl
+ * already has that priority.
+ * @returns !0 if the plist's maximum priority changes
+ */
+unsigned plist_chprio(struct plist *plist, struct plist *pl, int new_prio)
+{
+	if (pl->prio != new_prio) {
+		/* Unlink, retag, and re-insert at the sorted position. */
+		__plist_del(pl);
+		__plist_chprio(pl, new_prio);
+		__plist_add_sorted(plist, pl);
+		return __plist_update_prio(plist);
+	}
+	return 0;
+}
+
Index: linux/lib/radix-tree.c
===================================================================
--- linux.orig/lib/radix-tree.c
+++ linux/lib/radix-tree.c
@@ -104,6 +104,8 @@ radix_tree_node_free(struct radix_tree_n
 	kmem_cache_free(radix_tree_node_cachep, node);
 }
 
+#ifndef CONFIG_PREEMPT_RT
+
 /*
  * Load up this CPU's radix_tree_node buffer with sufficient objects to
  * ensure that the addition of a single element in the tree cannot fail.  On
@@ -135,6 +137,8 @@ out:
 	return ret;
 }
 
+#endif
+
 static inline void tag_set(struct radix_tree_node *node, int tag, int offset)
 {
 	if (!test_bit(offset, &node->tags[tag][0]))
Index: linux/lib/rwsem-spinlock.c
===================================================================
--- linux.orig/lib/rwsem-spinlock.c
+++ linux/lib/rwsem-spinlock.c
@@ -18,7 +18,7 @@ struct rwsem_waiter {
 };
 
 #if RWSEM_DEBUG
-void rwsemtrace(struct rw_semaphore *sem, const char *str)
+void rwsemtrace(struct compat_rw_semaphore *sem, const char *str)
 {
 	if (sem->debug)
 		printk("[%d] %s({%d,%d})\n",
@@ -30,7 +30,7 @@ void rwsemtrace(struct rw_semaphore *sem
 /*
  * initialise the semaphore
  */
-void fastcall init_rwsem(struct rw_semaphore *sem)
+void fastcall compat_init_rwsem(struct compat_rw_semaphore *sem)
 {
 	sem->activity = 0;
 	spin_lock_init(&sem->wait_lock);
@@ -49,8 +49,8 @@ void fastcall init_rwsem(struct rw_semap
  * - woken process blocks are discarded from the list after having task zeroed
  * - writers are only woken if wakewrite is non-zero
  */
-static inline struct rw_semaphore *
-__rwsem_do_wake(struct rw_semaphore *sem, int wakewrite)
+static inline struct compat_rw_semaphore *
+__rwsem_do_wake(struct compat_rw_semaphore *sem, int wakewrite)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
@@ -111,8 +111,8 @@ __rwsem_do_wake(struct rw_semaphore *sem
 /*
  * wake a single writer
  */
-static inline struct rw_semaphore *
-__rwsem_wake_one_writer(struct rw_semaphore *sem)
+static inline struct compat_rw_semaphore *
+__rwsem_wake_one_writer(struct compat_rw_semaphore *sem)
 {
 	struct rwsem_waiter *waiter;
 	struct task_struct *tsk;
@@ -133,7 +133,7 @@ __rwsem_wake_one_writer(struct rw_semaph
 /*
  * get a read lock on the semaphore
  */
-void fastcall __sched __down_read(struct rw_semaphore *sem)
+void fastcall __sched __down_read(struct compat_rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
@@ -179,7 +179,7 @@ void fastcall __sched __down_read(struct
 /*
  * trylock for reading -- returns 1 if successful, 0 if contention
  */
-int fastcall __down_read_trylock(struct rw_semaphore *sem)
+int fastcall __down_read_trylock(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -204,7 +204,7 @@ int fastcall __down_read_trylock(struct 
  * get a write lock on the semaphore
  * - we increment the waiting count anyway to indicate an exclusive lock
  */
-void fastcall __sched __down_write(struct rw_semaphore *sem)
+void fastcall __sched __down_write(struct compat_rw_semaphore *sem)
 {
 	struct rwsem_waiter waiter;
 	struct task_struct *tsk;
@@ -250,7 +250,7 @@ void fastcall __sched __down_write(struc
 /*
  * trylock for writing -- returns 1 if successful, 0 if contention
  */
-int fastcall __down_write_trylock(struct rw_semaphore *sem)
+int fastcall __down_write_trylock(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 	int ret = 0;
@@ -274,7 +274,7 @@ int fastcall __down_write_trylock(struct
 /*
  * release a read lock on the semaphore
  */
-void fastcall __up_read(struct rw_semaphore *sem)
+void fastcall __up_read(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -293,7 +293,7 @@ void fastcall __up_read(struct rw_semaph
 /*
  * release a write lock on the semaphore
  */
-void fastcall __up_write(struct rw_semaphore *sem)
+void fastcall __up_write(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -314,7 +314,7 @@ void fastcall __up_write(struct rw_semap
  * downgrade a write lock into a read lock
  * - just wake up any readers at the front of the queue
  */
-void fastcall __downgrade_write(struct rw_semaphore *sem)
+void fastcall __downgrade_write(struct compat_rw_semaphore *sem)
 {
 	unsigned long flags;
 
@@ -331,7 +331,7 @@ void fastcall __downgrade_write(struct r
 	rwsemtrace(sem, "Leaving __downgrade_write");
 }
 
-EXPORT_SYMBOL(init_rwsem);
+EXPORT_SYMBOL(compat_init_rwsem);
 EXPORT_SYMBOL(__down_read);
 EXPORT_SYMBOL(__down_read_trylock);
 EXPORT_SYMBOL(__down_write);
Index: linux/lib/semaphore-sleepers.c
===================================================================
--- linux.orig/lib/semaphore-sleepers.c
+++ linux/lib/semaphore-sleepers.c
@@ -16,6 +16,7 @@
 #include <linux/sched.h>
 #include <linux/err.h>
 #include <linux/init.h>
+#include <linux/module.h>
 #include <asm/semaphore.h>
 
 /*
@@ -49,12 +50,12 @@
  *    we cannot lose wakeup events.
  */
 
-fastcall void __up(struct semaphore *sem)
+fastcall void __compat_up(struct compat_semaphore *sem)
 {
 	wake_up(&sem->wait);
 }
 
-fastcall void __sched __down(struct semaphore * sem)
+fastcall void __sched __compat_down(struct compat_semaphore * sem)
 {
 	struct task_struct *tsk = current;
 	DECLARE_WAITQUEUE(wait, tsk);
@@ -91,7 +92,7 @@ fastcall void __sched __down(struct sema
 	tsk->state = TASK_RUNNING;
 }
 
-fastcall int __sched __down_interruptible(struct semaphore * sem)
+fastcall int __sched __compat_down_interruptible(struct compat_semaphore * sem)
 {
 	int retval = 0;
 	struct task_struct *tsk = current;
@@ -154,7 +155,7 @@ fastcall int __sched __down_interruptibl
  * single "cmpxchg" without failure cases,
  * but then it wouldn't work on a 386.
  */
-fastcall int __down_trylock(struct semaphore * sem)
+fastcall int __compat_down_trylock(struct compat_semaphore * sem)
 {
 	int sleepers;
 	unsigned long flags;
@@ -175,3 +176,10 @@ fastcall int __down_trylock(struct semap
 	spin_unlock_irqrestore(&sem->wait.lock, flags);
 	return 1;
 }
+
+int fastcall compat_sem_is_locked(struct compat_semaphore *sem)
+{
+	return (int) atomic_read(&sem->count) < 0;
+}
+
+EXPORT_SYMBOL(compat_sem_is_locked);
Index: linux/lib/smp_processor_id.c
===================================================================
--- linux.orig/lib/smp_processor_id.c
+++ linux/lib/smp_processor_id.c
@@ -6,7 +6,7 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 
-unsigned int debug_smp_processor_id(void)
+unsigned int notrace debug_smp_processor_id(void)
 {
 	unsigned long preempt_count = preempt_count();
 	int this_cpu = raw_smp_processor_id();
@@ -15,7 +15,7 @@ unsigned int debug_smp_processor_id(void
 	if (likely(preempt_count))
 		goto out;
 
-	if (irqs_disabled())
+	if (irqs_disabled() || raw_irqs_disabled())
 		goto out;
 
 	/*
@@ -41,7 +41,7 @@ unsigned int debug_smp_processor_id(void
 	if (!printk_ratelimit())
 		goto out_enable;
 
-	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count(), current->comm, current->pid);
+	printk(KERN_ERR "BUG: using smp_processor_id() in preemptible [%08x] code: %s/%d\n", preempt_count()-1, current->comm, current->pid);
 	print_symbol("caller is %s\n", (long)__builtin_return_address(0));
 	dump_stack();
 
Index: linux/lib/zlib_inflate/inftrees.c
===================================================================
--- linux.orig/lib/zlib_inflate/inftrees.c
+++ linux/lib/zlib_inflate/inftrees.c
@@ -4,11 +4,19 @@
  */
 
 #include <linux/zutil.h>
+#include <linux/spinlock.h>
 #include "inftrees.h"
 #include "infutil.h"
 
 static const char inflate_copyright[] __attribute_used__ =
    " inflate 1.1.3 Copyright 1995-1998 Mark Adler ";
+
+/*
+ * lock protecting static variables of huft_build() and other inflate
+ * functions, to reduce their insane stack footprint.
+ */
+static DEFINE_SPINLOCK(inflate_lock);
+
 /*
   If you use the zlib library in a product, an acknowledgment is welcome
   in the documentation of your product. If for some reason you cannot
@@ -107,7 +115,7 @@ static int huft_build(
 {
 
   uInt a;                       /* counter for codes of length k */
-  uInt c[BMAX+1];               /* bit length count table */
+  static uInt c[BMAX+1];        /* bit length count table */
   uInt f;                       /* i repeats in table every f entries */
   int g;                        /* maximum code length */
   int h;                        /* table level */
@@ -118,10 +126,10 @@ static int huft_build(
   uInt mask;                    /* (1 << w) - 1, to avoid cc -O bug on HP */
   register uInt *p;             /* pointer into c[], b[], or v[] */
   inflate_huft *q;              /* points to current table */
-  struct inflate_huft_s r;      /* table entry for structure assignment */
-  inflate_huft *u[BMAX];        /* table stack */
+  static struct inflate_huft_s r; /* table entry for structure assignment */
+  static inflate_huft *u[BMAX]; /* table stack */
   register int w;               /* bits before this table == (l * h) */
-  uInt x[BMAX+1];               /* bit offsets, then code stack */
+  static uInt x[BMAX+1];        /* bit offsets, then code stack */
   uInt *xp;                     /* pointer into x */
   int y;                        /* number of dummy codes added */
   uInt z;                       /* number of entries in current table */
@@ -300,9 +308,13 @@ int zlib_inflate_trees_bits(
   int r;
   uInt hn = 0;          /* hufts used in space */
   uInt *v;              /* work area for huft_build */
-  
+
   v = WS(z)->tree_work_area_1;
+
+  spin_lock(&inflate_lock);
   r = huft_build(c, 19, 19, NULL, NULL, tb, bb, hp, &hn, v);
+  spin_unlock(&inflate_lock);
+
   if (r == Z_DATA_ERROR)
     z->msg = (char*)"oversubscribed dynamic bit lengths tree";
   else if (r == Z_BUF_ERROR || *bb == 0)
@@ -333,7 +345,10 @@ int zlib_inflate_trees_dynamic(
   v = WS(z)->tree_work_area_2;
 
   /* build literal/length tree */
+  spin_lock(&inflate_lock);
   r = huft_build(c, nl, 257, cplens, cplext, tl, bl, hp, &hn, v);
+  spin_unlock(&inflate_lock);
+
   if (r != Z_OK || *bl == 0)
   {
     if (r == Z_DATA_ERROR)
@@ -347,7 +362,10 @@ int zlib_inflate_trees_dynamic(
   }
 
   /* build distance tree */
+  spin_lock(&inflate_lock);
   r = huft_build(c + nl, nd, 0, cpdist, cpdext, td, bd, hp, &hn, v);
+  spin_unlock(&inflate_lock);
+
   if (r != Z_OK || (*bd == 0 && nl > 257))
   {
     if (r == Z_DATA_ERROR)
@@ -383,9 +401,11 @@ int zlib_inflate_trees_fixed(
 	z_streamp z              /* for memory allocation */
 )
 {
-  int i;                /* temporary variable */
-  unsigned l[288];      /* length list for huft_build */
-  uInt *v;              /* work area for huft_build */
+  int i;                       /* temporary variable */
+  static unsigned l[288];      /* length list for huft_build */
+  uInt *v;                     /* work area for huft_build */
+
+  spin_lock(&inflate_lock);
 
   /* set up literal table */
   for (i = 0; i < 144; i++)
@@ -398,15 +418,20 @@ int zlib_inflate_trees_fixed(
     l[i] = 8;
   *bl = 9;
   v = WS(z)->tree_work_area_1;
-  if ((i = huft_build(l, 288, 257, cplens, cplext, tl, bl, hp,  &i, v)) != 0)
+  if ((i = huft_build(l, 288, 257, cplens, cplext, tl, bl, hp,  &i, v)) != 0) {
+    spin_unlock(&inflate_lock);
     return i;
+  }
 
   /* set up distance table */
   for (i = 0; i < 30; i++)      /* make an incomplete code set */
     l[i] = 5;
   *bd = 5;
-  if ((i = huft_build(l, 30, 0, cpdist, cpdext, td, bd, hp, &i, v)) > 1)
+  if ((i = huft_build(l, 30, 0, cpdist, cpdext, td, bd, hp, &i, v)) > 1) {
+    spin_unlock(&inflate_lock);
     return i;
+  }
 
+  spin_unlock(&inflate_lock);
   return Z_OK;
 }
Index: linux/mm/highmem.c
===================================================================
--- linux.orig/mm/highmem.c
+++ linux/mm/highmem.c
@@ -242,11 +242,11 @@ static void bounce_copy_vec(struct bio_v
 	unsigned long flags;
 	unsigned char *vto;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	vto = kmap_atomic(to->bv_page, KM_BOUNCE_READ);
 	memcpy(vto + to->bv_offset, vfrom, to->bv_len);
 	kunmap_atomic(vto, KM_BOUNCE_READ);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 
 #else /* CONFIG_HIGHMEM */
Index: linux/mm/memory.c
===================================================================
--- linux.orig/mm/memory.c
+++ linux/mm/memory.c
@@ -650,10 +650,13 @@ static void unmap_page_range(struct mmu_
 	tlb_end_vma(tlb, vma);
 }
 
-#ifdef CONFIG_PREEMPT
+#if defined(CONFIG_PREEMPT) && !defined(CONFIG_PREEMPT_RT)
 # define ZAP_BLOCK_SIZE	(8 * PAGE_SIZE)
 #else
-/* No preempt: go for improved straight-line efficiency */
+/*
+ * No preempt: go for improved straight-line efficiency.
+ * On PREEMPT_RT this is not a critical latency path.
+ */
 # define ZAP_BLOCK_SIZE	(1024 * PAGE_SIZE)
 #endif
 
Index: linux/mm/mmap.c
===================================================================
--- linux.orig/mm/mmap.c
+++ linux/mm/mmap.c
@@ -1821,11 +1821,17 @@ asmlinkage long sys_munmap(unsigned long
 
 static inline void verify_mm_writelocked(struct mm_struct *mm)
 {
-#ifdef CONFIG_DEBUG_KERNEL
-	if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+#ifdef CONFIG_DEBUG_VM
+# ifdef CONFIG_PREEMPT_RT
+	if (unlikely(!rt_rwsem_is_locked(&mm->mmap_sem))) {
 		WARN_ON(1);
-		up_read(&mm->mmap_sem);
 	}
+# else
+        if (unlikely(down_read_trylock(&mm->mmap_sem))) {
+		WARN_ON(1);
+		up_read(&mm->mmap_sem);
+        }
+# endif
 #endif
 }
 
Index: linux/mm/oom_kill.c
===================================================================
--- linux.orig/mm/oom_kill.c
+++ linux/mm/oom_kill.c
@@ -266,12 +266,16 @@ static struct mm_struct *oom_kill_proces
 void out_of_memory(gfp_t gfp_mask, int order)
 {
 	struct mm_struct *mm = NULL;
+	int print = 0;
 	task_t * p;
 
 	if (printk_ratelimit()) {
 		printk("oom-killer: gfp_mask=0x%x, order=%d\n",
 			gfp_mask, order);
 		show_mem();
+		printk("current: %s/%d.\n", current->comm, current->pid);
+		dump_stack();
+		print = 1;
 	}
 
 	read_lock(&tasklist_lock);
@@ -281,6 +285,11 @@ retry:
 	if (PTR_ERR(p) == -1UL)
 		goto out;
 
+	if (print && p) {	/* p is NULL if no victim was found */
+		printk("victim: %s/%d.\n", p->comm, p->pid);
+		show_stack(p, NULL);
+	}
+
 	/* Found nothing?!?! Either we hang forever, or we panic. */
 	if (!p) {
 		read_unlock(&tasklist_lock);
Index: linux/mm/page_alloc.c
===================================================================
--- linux.orig/mm/page_alloc.c
+++ linux/mm/page_alloc.c
@@ -372,6 +372,9 @@ void __free_pages_ok(struct page *page, 
 	int i;
 
 	arch_free_page(page, order);
+	if (!PageHighMem(page))
+		check_no_locks_freed(page_address(page),
+			page_address(page+(1<<order)));
 
 	mod_page_state(pgfree, 1 << order);
 
@@ -492,6 +495,7 @@ static struct page *__rmqueue(struct zon
 	return NULL;
 }
 
+#ifndef CONFIG_PREEMPT_RT
 /* 
  * Obtain a specified number of elements from the buddy allocator, all under
  * a single hold of the lock, for efficiency.  Add them to the supplied list.
@@ -516,6 +520,7 @@ static int rmqueue_bulk(struct zone *zon
 	spin_unlock_irqrestore(&zone->lock, flags);
 	return allocated;
 }
+#endif
 
 #ifdef CONFIG_NUMA
 /* Called from the slab reaper to drain remote pagesets */
@@ -634,6 +639,7 @@ static void zone_statistics(struct zonel
 #endif
 }
 
+#ifndef CONFIG_PREEMPT_RT
 /*
  * Free a 0-order page
  */
@@ -660,15 +666,32 @@ static void fastcall free_hot_cold_page(
 	local_irq_restore(flags);
 	put_cpu();
 }
+#endif
 
+/*
+ * On PREEMPT_RT we use a simple solution for the time being,
+ * per-CPU allocation is disabled.
+ */
 void fastcall free_hot_page(struct page *page)
 {
+#ifdef CONFIG_PREEMPT_RT
+	if (PageAnon(page))
+		page->mapping = NULL;
+	__free_pages_ok(page, 0);
+#else
 	free_hot_cold_page(page, 0);
+#endif
 }
 	
 void fastcall free_cold_page(struct page *page)
 {
+#ifdef CONFIG_PREEMPT_RT
+	if (PageAnon(page))
+		page->mapping = NULL;
+	__free_pages_ok(page, 0);
+#else
 	free_hot_cold_page(page, 1);
+#endif
 }
 
 static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
@@ -690,6 +713,7 @@ buffered_rmqueue(struct zone *zone, int 
 {
 	unsigned long flags;
 	struct page *page = NULL;
+#ifndef CONFIG_PREEMPT_RT
 	int cold = !!(gfp_flags & __GFP_COLD);
 
 	if (order == 0) {
@@ -708,6 +732,7 @@ buffered_rmqueue(struct zone *zone, int 
 		local_irq_restore(flags);
 		put_cpu();
 	}
+#endif
 
 	if (page == NULL) {
 		spin_lock_irqsave(&zone->lock, flags);
@@ -1010,8 +1035,15 @@ void __pagevec_free(struct pagevec *pvec
 {
 	int i = pagevec_count(pvec);
 
-	while (--i >= 0)
+	while (--i >= 0) {
+#ifdef CONFIG_PREEMPT_RT
+		if (PageAnon(pvec->pages[i]))
+			pvec->pages[i]->mapping = NULL;
+		__free_pages_ok(pvec->pages[i], 0);
+#else
 		free_hot_cold_page(pvec->pages[i], pvec->cold);
+#endif
+	}
 }
 
 fastcall void __free_pages(struct page *page, unsigned int order)
@@ -1207,10 +1239,10 @@ void __mod_page_state(unsigned long offs
 	unsigned long flags;
 	void* ptr;
 
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	ptr = &__get_cpu_var(page_states);
 	*(unsigned long*)(ptr + offset) += delta;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 }
 
 EXPORT_SYMBOL(__mod_page_state);
Index: linux/mm/slab.c
===================================================================
--- linux.orig/mm/slab.c
+++ linux/mm/slab.c
@@ -75,15 +75,6 @@
  *
  *	At present, each engine can be growing a cache.  This should be blocked.
  *
- * 15 March 2005. NUMA slab allocator.
- *	Shai Fultheim <shai@scalex86.org>.
- *	Shobhit Dayal <shobhit@calsoftinc.com>
- *	Alok N Kataria <alokk@calsoftinc.com>
- *	Christoph Lameter <christoph@lameter.com>
- *
- *	Modified the slab allocator to be node aware on NUMA systems.
- *	Each node has its own list of partial, free and full slabs.
- *	All object allocations for a node occur from node specific slab lists.
  */
 
 #include	<linux/config.h>
@@ -102,7 +93,6 @@
 #include	<linux/module.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
-#include	<linux/nodemask.h>
 
 #include	<asm/uaccess.h>
 #include	<asm/cacheflush.h>
@@ -222,7 +212,6 @@ struct slab {
 	void			*s_mem;		/* including colour offset */
 	unsigned int		inuse;		/* num of objs active in slab */
 	kmem_bufctl_t		free;
-	unsigned short          nodeid;
 };
 
 /*
@@ -250,6 +239,7 @@ struct slab_rcu {
 /*
  * struct array_cache
  *
+ * Per cpu structures
  * Purpose:
  * - LIFO ordering, to hand out cache-warm objects from _alloc
  * - reduce the number of linked list operations
@@ -264,13 +254,6 @@ struct array_cache {
 	unsigned int limit;
 	unsigned int batchcount;
 	unsigned int touched;
-	spinlock_t lock;
-	void *entry[0];		/*
-				 * Must have this definition in here for the proper
-				 * alignment of array_cache. Also simplifies accessing
-				 * the entries.
-				 * [0] is for gcc 2.95. It should really be [].
-				 */
 };
 
 /* bootstrap: The caches do not work without cpuarrays anymore,
@@ -283,84 +266,34 @@ struct arraycache_init {
 };
 
 /*
- * The slab lists for all objects.
+ * The slab lists of all objects.
+ * Hopefully reduce the internal fragmentation
+ * NUMA: The spinlock could be moved from the kmem_cache_t
+ * into this structure, too. Figure out what causes
+ * fewer cross-node spinlock operations.
  */
 struct kmem_list3 {
 	struct list_head	slabs_partial;	/* partial list first, better asm code */
 	struct list_head	slabs_full;
 	struct list_head	slabs_free;
 	unsigned long	free_objects;
-	unsigned long	next_reap;
 	int		free_touched;
-	unsigned int 	free_limit;
-	spinlock_t      list_lock;
-	struct array_cache	*shared;	/* shared per node */
-	struct array_cache	**alien;	/* on other nodes */
+	unsigned long	next_reap;
+	struct array_cache	*shared;
 };
 
-/*
- * Need this for bootstrapping a per node allocator.
- */
-#define NUM_INIT_LISTS (2 * MAX_NUMNODES + 1)
-struct kmem_list3 __initdata initkmem_list3[NUM_INIT_LISTS];
-#define	CACHE_CACHE 0
-#define	SIZE_AC 1
-#define	SIZE_L3 (1 + MAX_NUMNODES)
-
-/*
- * This function must be completely optimized away if
- * a constant is passed to it. Mostly the same as
- * what is in linux/slab.h except it returns an
- * index.
- */
-static __always_inline int index_of(const size_t size)
-{
-	if (__builtin_constant_p(size)) {
-		int i = 0;
-
-#define CACHE(x) \
-	if (size <=x) \
-		return i; \
-	else \
-		i++;
-#include "linux/kmalloc_sizes.h"
-#undef CACHE
-		{
-			extern void __bad_size(void);
-			__bad_size();
-		}
-	} else
-		BUG();
-	return 0;
-}
-
-#define INDEX_AC index_of(sizeof(struct arraycache_init))
-#define INDEX_L3 index_of(sizeof(struct kmem_list3))
-
-static inline void kmem_list3_init(struct kmem_list3 *parent)
-{
-	INIT_LIST_HEAD(&parent->slabs_full);
-	INIT_LIST_HEAD(&parent->slabs_partial);
-	INIT_LIST_HEAD(&parent->slabs_free);
-	parent->shared = NULL;
-	parent->alien = NULL;
-	spin_lock_init(&parent->list_lock);
-	parent->free_objects = 0;
-	parent->free_touched = 0;
-}
-
-#define MAKE_LIST(cachep, listp, slab, nodeid)	\
-	do {	\
-		INIT_LIST_HEAD(listp);		\
-		list_splice(&(cachep->nodelists[nodeid]->slab), listp); \
-	} while (0)
-
-#define	MAKE_ALL_LISTS(cachep, ptr, nodeid)			\
-	do {					\
-	MAKE_LIST((cachep), (&(ptr)->slabs_full), slabs_full, nodeid);	\
-	MAKE_LIST((cachep), (&(ptr)->slabs_partial), slabs_partial, nodeid); \
-	MAKE_LIST((cachep), (&(ptr)->slabs_free), slabs_free, nodeid);	\
-	} while (0)
+#define LIST3_INIT(parent) \
+	{ \
+		.slabs_full	= LIST_HEAD_INIT(parent.slabs_full), \
+		.slabs_partial	= LIST_HEAD_INIT(parent.slabs_partial), \
+		.slabs_free	= LIST_HEAD_INIT(parent.slabs_free) \
+	}
+#define list3_data(cachep) \
+	(&(cachep)->lists)
+
+/* NUMA: per-node */
+#define list3_data_ptr(cachep, ptr) \
+		list3_data(cachep)
 
 /*
  * kmem_cache_t
@@ -373,12 +306,13 @@ struct kmem_cache_s {
 	struct array_cache	*array[NR_CPUS];
 	unsigned int		batchcount;
 	unsigned int		limit;
-	unsigned int 		shared;
-	unsigned int		objsize;
 /* 2) touched by every alloc & free from the backend */
-	struct kmem_list3	*nodelists[MAX_NUMNODES];
+	struct kmem_list3	lists;
+	/* NUMA: struct kmem_list3 *nodelists[MAX_NUMNODES] */
+	unsigned int		objsize;
 	unsigned int	 	flags;	/* constant flags */
 	unsigned int		num;	/* # of objs per slab */
+	unsigned int		free_limit; /* upper limit of objects in the lists */
 	spinlock_t		spinlock;
 
 /* 3) cache_grow/shrink */
@@ -415,7 +349,6 @@ struct kmem_cache_s {
 	unsigned long 		errors;
 	unsigned long		max_freeable;
 	unsigned long		node_allocs;
-	unsigned long		node_frees;
 	atomic_t		allochit;
 	atomic_t		allocmiss;
 	atomic_t		freehit;
@@ -451,7 +384,6 @@ struct kmem_cache_s {
 				} while (0)
 #define	STATS_INC_ERR(x)	((x)->errors++)
 #define	STATS_INC_NODEALLOCS(x)	((x)->node_allocs++)
-#define	STATS_INC_NODEFREES(x)	((x)->node_frees++)
 #define	STATS_SET_FREEABLE(x, i) \
 				do { if ((x)->max_freeable < i) \
 					(x)->max_freeable = i; \
@@ -470,7 +402,6 @@ struct kmem_cache_s {
 #define	STATS_SET_HIGH(x)	do { } while (0)
 #define	STATS_INC_ERR(x)	do { } while (0)
 #define	STATS_INC_NODEALLOCS(x)	do { } while (0)
-#define	STATS_INC_NODEFREES(x)	do { } while (0)
 #define	STATS_SET_FREEABLE(x, i) \
 				do { } while (0)
 
@@ -603,12 +534,12 @@ static struct arraycache_init initarray_
 
 /* internal cache of cache description objs */
 static kmem_cache_t cache_cache = {
+	.lists		= LIST3_INIT(cache_cache.lists),
 	.batchcount	= 1,
 	.limit		= BOOT_CPUCACHE_ENTRIES,
-	.shared		= 1,
 	.objsize	= sizeof(kmem_cache_t),
 	.flags		= SLAB_NO_REAP,
-	.spinlock	= SPIN_LOCK_UNLOCKED,
+	.spinlock	= SPIN_LOCK_UNLOCKED(cache_cache.spinlock),
 	.name		= "kmem_cache",
 #if DEBUG
 	.reallen	= sizeof(kmem_cache_t),
@@ -626,6 +557,7 @@ static struct list_head cache_chain;
  * SLAB_RECLAIM_ACCOUNT turns this on per-slab
  */
 atomic_t slab_reclaim_pages;
+EXPORT_SYMBOL(slab_reclaim_pages);
 
 /*
  * chicken and egg problem: delay the per-cpu array allocation
@@ -633,24 +565,28 @@ atomic_t slab_reclaim_pages;
  */
 static enum {
 	NONE,
-	PARTIAL_AC,
-	PARTIAL_L3,
+	PARTIAL,
 	FULL
 } g_cpucache_up;
 
 static DEFINE_PER_CPU(struct work_struct, reap_work);
 
-static void free_block(kmem_cache_t* cachep, void** objpp, int len, int node);
+static void free_block(kmem_cache_t* cachep, void** objpp, int len);
 static void enable_cpucache (kmem_cache_t *cachep);
 static void cache_reap (void *unused);
-static int __node_shrink(kmem_cache_t *cachep, int node);
 
-static inline struct array_cache *ac_data(kmem_cache_t *cachep)
+static inline void **ac_entry(struct array_cache *ac)
 {
-	return cachep->array[smp_processor_id()];
+	return (void**)(ac+1);
 }
 
-static inline kmem_cache_t *__find_general_cachep(size_t size, gfp_t gfpflags)
+static inline struct array_cache *ac_data(kmem_cache_t *cachep, int cpu)
+{
+	return cachep->array[cpu];
+}
+
+static inline kmem_cache_t *__find_general_cachep(size_t size,
+						unsigned int __nocast gfpflags)
 {
 	struct cache_sizes *csizep = malloc_sizes;
 
@@ -659,13 +595,13 @@ static inline kmem_cache_t *__find_gener
  	* kmem_cache_create(), or __kmalloc(), before
  	* the generic caches are initialized.
  	*/
-	BUG_ON(malloc_sizes[INDEX_AC].cs_cachep == NULL);
+	BUG_ON(csizep->cs_cachep == NULL);
 #endif
 	while (size > csizep->cs_size)
 		csizep++;
 
 	/*
-	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
+	 * Really subtle: The last entry with cs->cs_size==ULONG_MAX
 	 * has cs_{dma,}cachep==NULL. Thus no special case
 	 * for large kmalloc calls required.
 	 */
@@ -674,7 +610,8 @@ static inline kmem_cache_t *__find_gener
 	return csizep->cs_cachep;
 }
 
-kmem_cache_t *kmem_find_general_cachep(size_t size, gfp_t gfpflags)
+kmem_cache_t *kmem_find_general_cachep(size_t size,
+		unsigned int __nocast gfpflags)
 {
 	return __find_general_cachep(size, gfpflags);
 }
@@ -739,160 +676,48 @@ static void __devinit start_cpu_timer(in
 	}
 }
 
-static struct array_cache *alloc_arraycache(int node, int entries,
+static struct array_cache *alloc_arraycache(int cpu, int entries,
 						int batchcount)
 {
 	int memsize = sizeof(void*)*entries+sizeof(struct array_cache);
 	struct array_cache *nc = NULL;
 
-	nc = kmalloc_node(memsize, GFP_KERNEL, node);
+	if (cpu == -1)
+		nc = kmalloc(memsize, GFP_KERNEL);
+	else
+		nc = kmalloc_node(memsize, GFP_KERNEL, cpu_to_node(cpu));
+
 	if (nc) {
 		nc->avail = 0;
 		nc->limit = entries;
 		nc->batchcount = batchcount;
 		nc->touched = 0;
-		spin_lock_init(&nc->lock);
 	}
 	return nc;
 }
 
-#ifdef CONFIG_NUMA
-static inline struct array_cache **alloc_alien_cache(int node, int limit)
-{
-	struct array_cache **ac_ptr;
-	int memsize = sizeof(void*)*MAX_NUMNODES;
-	int i;
-
-	if (limit > 1)
-		limit = 12;
-	ac_ptr = kmalloc_node(memsize, GFP_KERNEL, node);
-	if (ac_ptr) {
-		for_each_node(i) {
-			if (i == node || !node_online(i)) {
-				ac_ptr[i] = NULL;
-				continue;
-			}
-			ac_ptr[i] = alloc_arraycache(node, limit, 0xbaadf00d);
-			if (!ac_ptr[i]) {
-				for (i--; i <=0; i--)
-					kfree(ac_ptr[i]);
-				kfree(ac_ptr);
-				return NULL;
-			}
-		}
-	}
-	return ac_ptr;
-}
-
-static inline void free_alien_cache(struct array_cache **ac_ptr)
-{
-	int i;
-
-	if (!ac_ptr)
-		return;
-
-	for_each_node(i)
-		kfree(ac_ptr[i]);
-
-	kfree(ac_ptr);
-}
-
-static inline void __drain_alien_cache(kmem_cache_t *cachep, struct array_cache *ac, int node)
-{
-	struct kmem_list3 *rl3 = cachep->nodelists[node];
-
-	if (ac->avail) {
-		spin_lock(&rl3->list_lock);
-		free_block(cachep, ac->entry, ac->avail, node);
-		ac->avail = 0;
-		spin_unlock(&rl3->list_lock);
-	}
-}
-
-static void drain_alien_cache(kmem_cache_t *cachep, struct kmem_list3 *l3)
-{
-	int i=0;
-	struct array_cache *ac;
-	unsigned long flags;
-
-	for_each_online_node(i) {
-		ac = l3->alien[i];
-		if (ac) {
-			spin_lock_irqsave(&ac->lock, flags);
-			__drain_alien_cache(cachep, ac, i);
-			spin_unlock_irqrestore(&ac->lock, flags);
-		}
-	}
-}
-#else
-#define alloc_alien_cache(node, limit) do { } while (0)
-#define free_alien_cache(ac_ptr) do { } while (0)
-#define drain_alien_cache(cachep, l3) do { } while (0)
-#endif
-
 static int __devinit cpuup_callback(struct notifier_block *nfb,
 				  unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 	kmem_cache_t* cachep;
-	struct kmem_list3 *l3 = NULL;
-	int node = cpu_to_node(cpu);
-	int memsize = sizeof(struct kmem_list3);
-	struct array_cache *nc = NULL;
 
 	switch (action) {
 	case CPU_UP_PREPARE:
 		down(&cache_chain_sem);
-		/* we need to do this right in the beginning since
-		 * alloc_arraycache's are going to use this list.
-		 * kmalloc_node allows us to add the slab to the right
-		 * kmem_list3 and not this cpu's kmem_list3
-		 */
-
 		list_for_each_entry(cachep, &cache_chain, next) {
-			/* setup the size64 kmemlist for cpu before we can
-			 * begin anything. Make sure some other cpu on this
-			 * node has not already allocated this
-			 */
-			if (!cachep->nodelists[node]) {
-				if (!(l3 = kmalloc_node(memsize,
-						GFP_KERNEL, node)))
-					goto bad;
-				kmem_list3_init(l3);
-				l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-				  ((unsigned long)cachep)%REAPTIMEOUT_LIST3;
-
-				cachep->nodelists[node] = l3;
-			}
-
-			spin_lock_irq(&cachep->nodelists[node]->list_lock);
-			cachep->nodelists[node]->free_limit =
-				(1 + nr_cpus_node(node)) *
-				cachep->batchcount + cachep->num;
-			spin_unlock_irq(&cachep->nodelists[node]->list_lock);
-		}
+			struct array_cache *nc;
 
-		/* Now we can go ahead with allocating the shared array's
-		  & array cache's */
-		list_for_each_entry(cachep, &cache_chain, next) {
-			nc = alloc_arraycache(node, cachep->limit,
-					cachep->batchcount);
+			nc = alloc_arraycache(cpu, cachep->limit, cachep->batchcount);
 			if (!nc)
 				goto bad;
+
+			spin_lock_irq(&cachep->spinlock);
 			cachep->array[cpu] = nc;
+			cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
+						+ cachep->num;
+			spin_unlock_irq(&cachep->spinlock);
 
-			l3 = cachep->nodelists[node];
-			BUG_ON(!l3);
-			if (!l3->shared) {
-				if (!(nc = alloc_arraycache(node,
-					cachep->shared*cachep->batchcount,
-					0xbaadf00d)))
-					goto  bad;
-
-				/* we are serialised from CPU_DEAD or
-				  CPU_UP_CANCELLED by the cpucontrol lock */
-				l3->shared = nc;
-			}
 		}
 		up(&cache_chain_sem);
 		break;
@@ -907,51 +732,13 @@ static int __devinit cpuup_callback(stru
 
 		list_for_each_entry(cachep, &cache_chain, next) {
 			struct array_cache *nc;
-			cpumask_t mask;
 
-			mask = node_to_cpumask(node);
 			spin_lock_irq(&cachep->spinlock);
 			/* cpu is dead; no one can alloc from it. */
 			nc = cachep->array[cpu];
 			cachep->array[cpu] = NULL;
-			l3 = cachep->nodelists[node];
-
-			if (!l3)
-				goto unlock_cache;
-
-			spin_lock(&l3->list_lock);
-
-			/* Free limit for this kmem_list3 */
-			l3->free_limit -= cachep->batchcount;
-			if (nc)
-				free_block(cachep, nc->entry, nc->avail, node);
-
-			if (!cpus_empty(mask)) {
-                                spin_unlock(&l3->list_lock);
-                                goto unlock_cache;
-                        }
-
-			if (l3->shared) {
-				free_block(cachep, l3->shared->entry,
-						l3->shared->avail, node);
-				kfree(l3->shared);
-				l3->shared = NULL;
-			}
-			if (l3->alien) {
-				drain_alien_cache(cachep, l3);
-				free_alien_cache(l3->alien);
-				l3->alien = NULL;
-			}
-
-			/* free slabs belonging to this node */
-			if (__node_shrink(cachep, node)) {
-				cachep->nodelists[node] = NULL;
-				spin_unlock(&l3->list_lock);
-				kfree(l3);
-			} else {
-				spin_unlock(&l3->list_lock);
-			}
-unlock_cache:
+			cachep->free_limit -= cachep->batchcount;
+			free_block(cachep, ac_entry(nc), nc->avail);
 			spin_unlock_irq(&cachep->spinlock);
 			kfree(nc);
 		}
@@ -967,25 +754,6 @@ bad:
 
 static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 };
 
-/*
- * swap the static kmem_list3 with kmalloced memory
- */
-static void init_list(kmem_cache_t *cachep, struct kmem_list3 *list,
-		int nodeid)
-{
-	struct kmem_list3 *ptr;
-
-	BUG_ON(cachep->nodelists[nodeid] != list);
-	ptr = kmalloc_node(sizeof(struct kmem_list3), GFP_KERNEL, nodeid);
-	BUG_ON(!ptr);
-
-	local_irq_disable();
-	memcpy(ptr, list, sizeof(struct kmem_list3));
-	MAKE_ALL_LISTS(cachep, ptr, nodeid);
-	cachep->nodelists[nodeid] = ptr;
-	local_irq_enable();
-}
-
 /* Initialisation.
  * Called after the gfp() functions have been enabled, and before smp_init().
  */
@@ -994,13 +762,6 @@ void __init kmem_cache_init(void)
 	size_t left_over;
 	struct cache_sizes *sizes;
 	struct cache_names *names;
-	int i;
-
-	for (i = 0; i < NUM_INIT_LISTS; i++) {
-		kmem_list3_init(&initkmem_list3[i]);
-		if (i < MAX_NUMNODES)
-			cache_cache.nodelists[i] = NULL;
-	}
 
 	/*
 	 * Fragmentation resistance on low memory - only use bigger
@@ -1009,24 +770,21 @@ void __init kmem_cache_init(void)
 	if (num_physpages > (32 << 20) >> PAGE_SHIFT)
 		slab_break_gfp_order = BREAK_GFP_ORDER_HI;
 
+
 	/* Bootstrap is tricky, because several objects are allocated
 	 * from caches that do not exist yet:
 	 * 1) initialize the cache_cache cache: it contains the kmem_cache_t
 	 *    structures of all caches, except cache_cache itself: cache_cache
 	 *    is statically allocated.
-	 *    Initially an __init data area is used for the head array and the
-	 *    kmem_list3 structures, it's replaced with a kmalloc allocated
-	 *    array at the end of the bootstrap.
+	 *    Initially an __init data area is used for the head array, it's
+	 *    replaced with a kmalloc allocated array at the end of the bootstrap.
 	 * 2) Create the first kmalloc cache.
-	 *    The kmem_cache_t for the new cache is allocated normally.
-	 *    An __init data area is used for the head array.
-	 * 3) Create the remaining kmalloc caches, with minimally sized
-	 *    head arrays.
+	 *    The kmem_cache_t for the new cache is allocated normally. An __init
+	 *    data area is used for the head array.
+	 * 3) Create the remaining kmalloc caches, with minimally sized head arrays.
 	 * 4) Replace the __init data head arrays for cache_cache and the first
 	 *    kmalloc cache with kmalloc allocated arrays.
-	 * 5) Replace the __init data for kmem_list3 for cache_cache and
-	 *    the other cache's with kmalloc allocated memory.
-	 * 6) Resize the head arrays of the kmalloc caches to their final sizes.
+	 * 5) Resize the head arrays of the kmalloc caches to their final sizes.
 	 */
 
 	/* 1) create the cache_cache */
@@ -1035,7 +793,6 @@ void __init kmem_cache_init(void)
 	list_add(&cache_cache.next, &cache_chain);
 	cache_cache.colour_off = cache_line_size();
 	cache_cache.array[smp_processor_id()] = &initarray_cache.cache;
-	cache_cache.nodelists[numa_node_id()] = &initkmem_list3[CACHE_CACHE];
 
 	cache_cache.objsize = ALIGN(cache_cache.objsize, cache_line_size());
 
@@ -1053,33 +810,15 @@ void __init kmem_cache_init(void)
 	sizes = malloc_sizes;
 	names = cache_names;
 
-	/* Initialize the caches that provide memory for the array cache
-	 * and the kmem_list3 structures first.
-	 * Without this, further allocations will bug
-	 */
-
-	sizes[INDEX_AC].cs_cachep = kmem_cache_create(names[INDEX_AC].name,
-				sizes[INDEX_AC].cs_size, ARCH_KMALLOC_MINALIGN,
-				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
-
-	if (INDEX_AC != INDEX_L3)
-		sizes[INDEX_L3].cs_cachep =
-			kmem_cache_create(names[INDEX_L3].name,
-				sizes[INDEX_L3].cs_size, ARCH_KMALLOC_MINALIGN,
-				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
-
 	while (sizes->cs_size != ULONG_MAX) {
-		/*
-		 * For performance, all the general caches are L1 aligned.
+		/* For performance, all the general caches are L1 aligned.
 		 * This should be particularly beneficial on SMP boxes, as it
 		 * eliminates "false sharing".
 		 * Note for systems short on memory removing the alignment will
-		 * allow tighter packing of the smaller caches.
-		 */
-		if(!sizes->cs_cachep)
-			sizes->cs_cachep = kmem_cache_create(names->name,
-				sizes->cs_size, ARCH_KMALLOC_MINALIGN,
-				(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
+		 * allow tighter packing of the smaller caches. */
+		sizes->cs_cachep = kmem_cache_create(names->name,
+			sizes->cs_size, ARCH_KMALLOC_MINALIGN,
+			(ARCH_KMALLOC_FLAGS | SLAB_PANIC), NULL, NULL);
 
 		/* Inc off-slab bufctl limit until the ceiling is hit. */
 		if (!(OFF_SLAB(sizes->cs_cachep))) {
@@ -1098,47 +837,25 @@ void __init kmem_cache_init(void)
 	/* 4) Replace the bootstrap head arrays */
 	{
 		void * ptr;
+		int cpu = smp_processor_id();
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-
-		local_irq_disable();
-		BUG_ON(ac_data(&cache_cache) != &initarray_cache.cache);
-		memcpy(ptr, ac_data(&cache_cache),
-				sizeof(struct arraycache_init));
-		cache_cache.array[smp_processor_id()] = ptr;
-		local_irq_enable();
+		local_irq_disable_nort();
+		BUG_ON(ac_data(&cache_cache, cpu) != &initarray_cache.cache);
+		memcpy(ptr, ac_data(&cache_cache, cpu), sizeof(struct arraycache_init));
+		cache_cache.array[cpu] = ptr;
+		local_irq_enable_nort();
 
 		ptr = kmalloc(sizeof(struct arraycache_init), GFP_KERNEL);
-
-		local_irq_disable();
-		BUG_ON(ac_data(malloc_sizes[INDEX_AC].cs_cachep)
-				!= &initarray_generic.cache);
-		memcpy(ptr, ac_data(malloc_sizes[INDEX_AC].cs_cachep),
+		local_irq_disable_nort();
+		BUG_ON(ac_data(malloc_sizes[0].cs_cachep, cpu) != &initarray_generic.cache);
+		memcpy(ptr, ac_data(malloc_sizes[0].cs_cachep, cpu),
 				sizeof(struct arraycache_init));
-		malloc_sizes[INDEX_AC].cs_cachep->array[smp_processor_id()] =
-						ptr;
-		local_irq_enable();
-	}
-	/* 5) Replace the bootstrap kmem_list3's */
-	{
-		int node;
-		/* Replace the static kmem_list3 structures for the boot cpu */
-		init_list(&cache_cache, &initkmem_list3[CACHE_CACHE],
-				numa_node_id());
-
-		for_each_online_node(node) {
-			init_list(malloc_sizes[INDEX_AC].cs_cachep,
-					&initkmem_list3[SIZE_AC+node], node);
-
-			if (INDEX_AC != INDEX_L3) {
-				init_list(malloc_sizes[INDEX_L3].cs_cachep,
-						&initkmem_list3[SIZE_L3+node],
-						node);
-			}
-		}
+		malloc_sizes[0].cs_cachep->array[cpu] = ptr;
+		local_irq_enable_nort();
 	}
 
-	/* 6) resize the head arrays to their final sizes */
+	/* 5) resize the head arrays to their final sizes */
 	{
 		kmem_cache_t *cachep;
 		down(&cache_chain_sem);
@@ -1155,6 +872,7 @@ void __init kmem_cache_init(void)
 	 */
 	register_cpu_notifier(&cpucache_notifier);
 
+
 	/* The reap timers are started later, with a module init call:
 	 * That part of the kernel is not yet operational.
 	 */
@@ -1168,8 +886,10 @@ static int __init cpucache_init(void)
 	 * Register the timers that return unneeded
 	 * pages to gfp.
 	 */
-	for_each_online_cpu(cpu)
-		start_cpu_timer(cpu);
+	for (cpu = 0; cpu < NR_CPUS; cpu++) {
+		if (cpu_online(cpu))
+			start_cpu_timer(cpu);
+	}
 
 	return 0;
 }
@@ -1183,7 +903,7 @@ __initcall(cpucache_init);
  * did not request dmaable memory, we might get it, but that
  * would be relatively rare and ignorable.
  */
-static void *kmem_getpages(kmem_cache_t *cachep, gfp_t flags, int nodeid)
+static void *kmem_getpages(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
 {
 	struct page *page;
 	void *addr;
@@ -1257,7 +977,7 @@ static void store_stackinfo(kmem_cache_t
 
 	*addr++=0x12345678;
 	*addr++=caller;
-	*addr++=smp_processor_id();
+	*addr++=raw_smp_processor_id();
 	size -= 3*sizeof(unsigned long);
 	{
 		unsigned long *sptr = &caller;
@@ -1448,20 +1168,6 @@ static void slab_destroy (kmem_cache_t *
 	}
 }
 
-/* For setting up all the kmem_list3s for cache whose objsize is same
-   as size of kmem_list3. */
-static inline void set_up_list3s(kmem_cache_t *cachep, int index)
-{
-	int node;
-
-	for_each_online_node(node) {
-		cachep->nodelists[node] = &initkmem_list3[index+node];
-		cachep->nodelists[node]->next_reap = jiffies +
-			REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
-	}
-}
-
 /**
  * kmem_cache_create - Create a cache.
  * @name: A string which is used in /proc/slabinfo to identify this cache.
@@ -1502,6 +1208,7 @@ kmem_cache_create (const char *name, siz
 {
 	size_t left_over, slab_size, ralign;
 	kmem_cache_t *cachep = NULL;
+	int cpu = raw_smp_processor_id();
 
 	/*
 	 * Sanity checks... these are all serious usage bugs.
@@ -1615,7 +1322,7 @@ kmem_cache_create (const char *name, siz
 		size += BYTES_PER_WORD;
 	}
 #if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
-	if (size >= malloc_sizes[INDEX_L3+1].cs_size && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
+	if (size > 128 && cachep->reallen > cache_line_size() && size < PAGE_SIZE) {
 		cachep->dbghead += PAGE_SIZE - size;
 		size = PAGE_SIZE;
 	}
@@ -1717,9 +1424,13 @@ next:
 		cachep->gfpflags |= GFP_DMA;
 	spin_lock_init(&cachep->spinlock);
 	cachep->objsize = size;
+	/* NUMA */
+	INIT_LIST_HEAD(&cachep->lists.slabs_full);
+	INIT_LIST_HEAD(&cachep->lists.slabs_partial);
+	INIT_LIST_HEAD(&cachep->lists.slabs_free);
 
 	if (flags & CFLGS_OFF_SLAB)
-		cachep->slabp_cache = kmem_find_general_cachep(slab_size, 0u);
+		cachep->slabp_cache = kmem_find_general_cachep(slab_size,0);
 	cachep->ctor = ctor;
 	cachep->dtor = dtor;
 	cachep->name = name;
@@ -1735,52 +1446,25 @@ next:
 			 * the cache that's used by kmalloc(24), otherwise
 			 * the creation of further caches will BUG().
 			 */
-			cachep->array[smp_processor_id()] =
-				&initarray_generic.cache;
-
-			/* If the cache that's used by
-			 * kmalloc(sizeof(kmem_list3)) is the first cache,
-			 * then we need to set up all its list3s, otherwise
-			 * the creation of further caches will BUG().
-			 */
-			set_up_list3s(cachep, SIZE_AC);
-			if (INDEX_AC == INDEX_L3)
-				g_cpucache_up = PARTIAL_L3;
-			else
-				g_cpucache_up = PARTIAL_AC;
+			cachep->array[cpu] = &initarray_generic.cache;
+			g_cpucache_up = PARTIAL;
 		} else {
-			cachep->array[smp_processor_id()] =
-				kmalloc(sizeof(struct arraycache_init),
-						GFP_KERNEL);
-
-			if (g_cpucache_up == PARTIAL_AC) {
-				set_up_list3s(cachep, SIZE_L3);
-				g_cpucache_up = PARTIAL_L3;
-			} else {
-				int node;
-				for_each_online_node(node) {
-
-					cachep->nodelists[node] =
-						kmalloc_node(sizeof(struct kmem_list3),
-								GFP_KERNEL, node);
-					BUG_ON(!cachep->nodelists[node]);
-					kmem_list3_init(cachep->nodelists[node]);
-				}
-			}
+			cachep->array[cpu] = kmalloc(sizeof(struct arraycache_init),GFP_KERNEL);
 		}
-		cachep->nodelists[numa_node_id()]->next_reap =
-			jiffies + REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
-
-		BUG_ON(!ac_data(cachep));
-		ac_data(cachep)->avail = 0;
-		ac_data(cachep)->limit = BOOT_CPUCACHE_ENTRIES;
-		ac_data(cachep)->batchcount = 1;
-		ac_data(cachep)->touched = 0;
+		BUG_ON(!ac_data(cachep, cpu));
+		ac_data(cachep, cpu)->avail = 0;
+		ac_data(cachep, cpu)->limit = BOOT_CPUCACHE_ENTRIES;
+		ac_data(cachep, cpu)->batchcount = 1;
+		ac_data(cachep, cpu)->touched = 0;
 		cachep->batchcount = 1;
 		cachep->limit = BOOT_CPUCACHE_ENTRIES;
+		cachep->free_limit = (1+num_online_cpus())*cachep->batchcount
+					+ cachep->num;
 	} 
 
+	cachep->lists.next_reap = jiffies + REAPTIMEOUT_LIST3 +
+					((unsigned long)cachep)%REAPTIMEOUT_LIST3;
+
 	/* Need the semaphore to access the chain. */
 	down(&cache_chain_sem);
 	{
@@ -1825,35 +1509,27 @@ EXPORT_SYMBOL(kmem_cache_create);
 #if DEBUG
 static void check_irq_off(void)
 {
-	BUG_ON(!irqs_disabled());
+#ifndef CONFIG_PREEMPT_RT
+	BUG_ON(!raw_irqs_disabled());
+#endif
 }
 
 static void check_irq_on(void)
 {
-	BUG_ON(irqs_disabled());
+	BUG_ON(raw_irqs_disabled());
 }
 
 static void check_spinlock_acquired(kmem_cache_t *cachep)
 {
 #ifdef CONFIG_SMP
 	check_irq_off();
-	assert_spin_locked(&cachep->nodelists[numa_node_id()]->list_lock);
-#endif
-}
-
-static inline void check_spinlock_acquired_node(kmem_cache_t *cachep, int node)
-{
-#ifdef CONFIG_SMP
-	check_irq_off();
-	assert_spin_locked(&cachep->nodelists[node]->list_lock);
+	BUG_ON(spin_trylock(&cachep->spinlock));
 #endif
 }
-
 #else
 #define check_irq_off()	do { } while(0)
 #define check_irq_on()	do { } while(0)
 #define check_spinlock_acquired(x) do { } while(0)
-#define check_spinlock_acquired_node(x, y) do { } while(0)
 #endif
 
 /*
@@ -1864,9 +1540,9 @@ static void smp_call_function_all_cpus(v
 	check_irq_on();
 	preempt_disable();
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	func(arg);
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	if (smp_call_function(func, arg, 1, 1))
 		BUG();
@@ -1875,92 +1551,85 @@ static void smp_call_function_all_cpus(v
 }
 
 static void drain_array_locked(kmem_cache_t* cachep,
-				struct array_cache *ac, int force, int node);
+				struct array_cache *ac, int force);
 
-static void do_drain(void *arg)
+static void do_drain_cpu(kmem_cache_t *cachep, int cpu)
 {
-	kmem_cache_t *cachep = (kmem_cache_t*)arg;
 	struct array_cache *ac;
-	int node = numa_node_id();
 
 	check_irq_off();
-	ac = ac_data(cachep);
-	spin_lock(&cachep->nodelists[node]->list_lock);
-	free_block(cachep, ac->entry, ac->avail, node);
-	spin_unlock(&cachep->nodelists[node]->list_lock);
+
+	spin_lock(&cachep->spinlock);
+	ac = ac_data(cachep, cpu);
+	free_block(cachep, &ac_entry(ac)[0], ac->avail);
 	ac->avail = 0;
+	spin_unlock(&cachep->spinlock);
 }
 
-static void drain_cpu_caches(kmem_cache_t *cachep)
+#ifndef CONFIG_PREEMPT_RT
+/*
+ * Executes in an IRQ context:
+ */
+static void do_drain(void *arg)
 {
-	struct kmem_list3 *l3;
-	int node;
+	do_drain_cpu((kmem_cache_t*)arg, smp_processor_id());
+}
+#endif
 
+static void drain_cpu_caches(kmem_cache_t *cachep)
+{
+#ifndef CONFIG_PREEMPT_RT
 	smp_call_function_all_cpus(do_drain, cachep);
+#else
+	int cpu;
+
+	for_each_online_cpu(cpu)
+		do_drain_cpu(cachep, cpu);
+#endif
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
-	for_each_online_node(node)  {
-		l3 = cachep->nodelists[node];
-		if (l3) {
-			spin_lock(&l3->list_lock);
-			drain_array_locked(cachep, l3->shared, 1, node);
-			spin_unlock(&l3->list_lock);
-			if (l3->alien)
-				drain_alien_cache(cachep, l3);
-		}
-	}
+	if (cachep->lists.shared)
+		drain_array_locked(cachep, cachep->lists.shared, 1);
 	spin_unlock_irq(&cachep->spinlock);
 }
 
-static int __node_shrink(kmem_cache_t *cachep, int node)
+
+/* NUMA: shrink all list3s */
+static int __cache_shrink(kmem_cache_t *cachep)
 {
 	struct slab *slabp;
-	struct kmem_list3 *l3 = cachep->nodelists[node];
 	int ret;
 
-	for (;;) {
+	drain_cpu_caches(cachep);
+
+	check_irq_on();
+	spin_lock_irq(&cachep->spinlock);
+
+	for(;;) {
 		struct list_head *p;
 
-		p = l3->slabs_free.prev;
-		if (p == &l3->slabs_free)
+		p = cachep->lists.slabs_free.prev;
+		if (p == &cachep->lists.slabs_free)
 			break;
 
-		slabp = list_entry(l3->slabs_free.prev, struct slab, list);
+		slabp = list_entry(cachep->lists.slabs_free.prev, struct slab, list);
 #if DEBUG
 		if (slabp->inuse)
 			BUG();
 #endif
 		list_del(&slabp->list);
 
-		l3->free_objects -= cachep->num;
-		spin_unlock_irq(&l3->list_lock);
+		cachep->lists.free_objects -= cachep->num;
+		spin_unlock_irq(&cachep->spinlock);
 		slab_destroy(cachep, slabp);
-		spin_lock_irq(&l3->list_lock);
+		spin_lock_irq(&cachep->spinlock);
 	}
-	ret = !list_empty(&l3->slabs_full) ||
-		!list_empty(&l3->slabs_partial);
+	ret = !list_empty(&cachep->lists.slabs_full) ||
+		!list_empty(&cachep->lists.slabs_partial);
+	spin_unlock_irq(&cachep->spinlock);
 	return ret;
 }
 
-static int __cache_shrink(kmem_cache_t *cachep)
-{
-	int ret = 0, i = 0;
-	struct kmem_list3 *l3;
-
-	drain_cpu_caches(cachep);
-
-	check_irq_on();
-	for_each_online_node(i) {
-		l3 = cachep->nodelists[i];
-		if (l3) {
-			spin_lock_irq(&l3->list_lock);
-			ret += __node_shrink(cachep, i);
-			spin_unlock_irq(&l3->list_lock);
-		}
-	}
-	return (ret ? 1 : 0);
-}
-
 /**
  * kmem_cache_shrink - Shrink a cache.
  * @cachep: The cache to shrink.
@@ -1997,7 +1666,6 @@ EXPORT_SYMBOL(kmem_cache_shrink);
 int kmem_cache_destroy(kmem_cache_t * cachep)
 {
 	int i;
-	struct kmem_list3 *l3;
 
 	if (!cachep || in_interrupt())
 		BUG();
@@ -2025,17 +1693,15 @@ int kmem_cache_destroy(kmem_cache_t * ca
 	if (unlikely(cachep->flags & SLAB_DESTROY_BY_RCU))
 		synchronize_rcu();
 
-	for_each_online_cpu(i)
+	/* No cpu_online check is required here: the per-CPU array entry
+	 * is freed and set to NULL when a CPU goes offline.
+	 */
+	for (i = 0; i < NR_CPUS; i++)
 		kfree(cachep->array[i]);
 
 	/* NUMA: free the list3 structures */
-	for_each_online_node(i) {
-		if ((l3 = cachep->nodelists[i])) {
-			kfree(l3->shared);
-			free_alien_cache(l3->alien);
-			kfree(l3);
-		}
-	}
+	kfree(cachep->lists.shared);
+	cachep->lists.shared = NULL;
 	kmem_cache_free(&cache_cache, cachep);
 
 	unlock_cpu_hotplug();
@@ -2045,8 +1711,8 @@ int kmem_cache_destroy(kmem_cache_t * ca
 EXPORT_SYMBOL(kmem_cache_destroy);
 
 /* Get the memory for a slab management obj. */
-static struct slab* alloc_slabmgmt(kmem_cache_t *cachep, void *objp,
-			int colour_off, gfp_t local_flags)
+static struct slab* alloc_slabmgmt(kmem_cache_t *cachep,
+			void *objp, int colour_off, unsigned int __nocast local_flags)
 {
 	struct slab *slabp;
 	
@@ -2077,7 +1743,7 @@ static void cache_init_objs(kmem_cache_t
 	int i;
 
 	for (i = 0; i < cachep->num; i++) {
-		void *objp = slabp->s_mem+cachep->objsize*i;
+		void* objp = slabp->s_mem+cachep->objsize*i;
 #if DEBUG
 		/* need to poison the objs? */
 		if (cachep->flags & SLAB_POISON)
@@ -2147,14 +1813,13 @@ static void set_slab_attr(kmem_cache_t *
  * Grow (by 1) the number of slabs within a cache.  This is called by
  * kmem_cache_alloc() when there are no active objs left in a cache.
  */
-static int cache_grow(kmem_cache_t *cachep, gfp_t flags, int nodeid)
+static int cache_grow(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
 {
 	struct slab	*slabp;
 	void		*objp;
 	size_t		 offset;
 	unsigned int	 local_flags;
 	unsigned long	 ctor_flags;
-	struct kmem_list3 *l3;
 
 	/* Be lazy and only check for valid flags here,
  	 * keeping it out of the critical path in kmem_cache_alloc().
@@ -2186,9 +1851,8 @@ static int cache_grow(kmem_cache_t *cach
 
 	spin_unlock(&cachep->spinlock);
 
-	check_irq_off();
 	if (local_flags & __GFP_WAIT)
-		local_irq_enable();
+		local_irq_enable_nort();
 
 	/*
 	 * The test for missing atomic flag is performed here, rather than
@@ -2198,9 +1862,8 @@ static int cache_grow(kmem_cache_t *cach
 	 */
 	kmem_flagcheck(cachep, flags);
 
-	/* Get mem for the objs.
-	 * Attempt to allocate a physical page from 'nodeid',
-	 */
+
+	/* Get mem for the objs. */
 	if (!(objp = kmem_getpages(cachep, flags, nodeid)))
 		goto failed;
 
@@ -2208,28 +1871,26 @@ static int cache_grow(kmem_cache_t *cach
 	if (!(slabp = alloc_slabmgmt(cachep, objp, offset, local_flags)))
 		goto opps1;
 
-	slabp->nodeid = nodeid;
 	set_slab_attr(cachep, slabp, objp);
 
 	cache_init_objs(cachep, slabp, ctor_flags);
 
 	if (local_flags & __GFP_WAIT)
-		local_irq_disable();
+		local_irq_disable_nort();
 	check_irq_off();
-	l3 = cachep->nodelists[nodeid];
-	spin_lock(&l3->list_lock);
+	spin_lock(&cachep->spinlock);
 
 	/* Make slab active. */
-	list_add_tail(&slabp->list, &(l3->slabs_free));
+	list_add_tail(&slabp->list, &(list3_data(cachep)->slabs_free));
 	STATS_INC_GROWN(cachep);
-	l3->free_objects += cachep->num;
-	spin_unlock(&l3->list_lock);
+	list3_data(cachep)->free_objects += cachep->num;
+	spin_unlock(&cachep->spinlock);
 	return 1;
 opps1:
 	kmem_freepages(cachep, objp);
 failed:
 	if (local_flags & __GFP_WAIT)
-		local_irq_disable();
+		local_irq_disable_nort();
 	return 0;
 }
 
@@ -2329,6 +1990,7 @@ static void check_slabp(kmem_cache_t *ca
 	kmem_bufctl_t i;
 	int entries = 0;
 	
+	check_spinlock_acquired(cachep);
 	/* Check slab's freelist to see if this obj is there. */
 	for (i = slabp->free; i != BUFCTL_END; i = slab_bufctl(slabp)[i]) {
 		entries++;
@@ -2354,14 +2016,14 @@ bad:
 #define check_slabp(x,y) do { } while(0)
 #endif
 
-static void *cache_alloc_refill(kmem_cache_t *cachep, gfp_t flags)
+static void *cache_alloc_refill(kmem_cache_t *cachep, unsigned int __nocast flags, int cpu)
 {
 	int batchcount;
 	struct kmem_list3 *l3;
 	struct array_cache *ac;
 
 	check_irq_off();
-	ac = ac_data(cachep);
+	ac = ac_data(cachep, cpu);
 retry:
 	batchcount = ac->batchcount;
 	if (!ac->touched && batchcount > BATCHREFILL_LIMIT) {
@@ -2371,11 +2033,10 @@ retry:
 		 */
 		batchcount = BATCHREFILL_LIMIT;
 	}
-	l3 = cachep->nodelists[numa_node_id()];
-
-	BUG_ON(ac->avail > 0 || !l3);
-	spin_lock(&l3->list_lock);
+	l3 = list3_data(cachep);
 
+	BUG_ON(ac->avail > 0);
+	spin_lock_nort(&cachep->spinlock);
 	if (l3->shared) {
 		struct array_cache *shared_array = l3->shared;
 		if (shared_array->avail) {
@@ -2383,9 +2044,8 @@ retry:
 				batchcount = shared_array->avail;
 			shared_array->avail -= batchcount;
 			ac->avail = batchcount;
-			memcpy(ac->entry,
-				&(shared_array->entry[shared_array->avail]),
-				sizeof(void*)*batchcount);
+			memcpy(ac_entry(ac), &ac_entry(shared_array)[shared_array->avail],
+					sizeof(void*)*batchcount);
 			shared_array->touched = 1;
 			goto alloc_done;
 		}
@@ -2412,8 +2072,7 @@ retry:
 			STATS_SET_HIGH(cachep);
 
 			/* get obj pointer */
-			ac->entry[ac->avail++] = slabp->s_mem +
-				slabp->free*cachep->objsize;
+			ac_entry(ac)[ac->avail++] = slabp->s_mem + slabp->free*cachep->objsize;
 
 			slabp->inuse++;
 			next = slab_bufctl(slabp)[slabp->free];
@@ -2435,14 +2094,17 @@ retry:
 must_grow:
 	l3->free_objects -= ac->avail;
 alloc_done:
-	spin_unlock(&l3->list_lock);
+	spin_unlock_nort(&cachep->spinlock);
 
 	if (unlikely(!ac->avail)) {
 		int x;
-		x = cache_grow(cachep, flags, numa_node_id());
+		spin_unlock_rt(&cachep->spinlock);
+		x = cache_grow(cachep, flags, -1);
 
+		spin_lock_rt(&cachep->spinlock);
 		// cache_grow can reenable interrupts, then ac could change.
-		ac = ac_data(cachep);
+		cpu = smp_processor_id_rt(cpu);
+		ac = ac_data(cachep, cpu);
 		if (!x && ac->avail == 0)	// no objects in sight? abort
 			return NULL;
 
@@ -2450,11 +2112,11 @@ alloc_done:
 			goto retry;
 	}
 	ac->touched = 1;
-	return ac->entry[--ac->avail];
+	return ac_entry(ac)[--ac->avail];
 }
 
 static inline void
-cache_alloc_debugcheck_before(kmem_cache_t *cachep, gfp_t flags)
+cache_alloc_debugcheck_before(kmem_cache_t *cachep, unsigned int __nocast flags)
 {
 	might_sleep_if(flags & __GFP_WAIT);
 #if DEBUG
@@ -2465,7 +2127,7 @@ cache_alloc_debugcheck_before(kmem_cache
 #if DEBUG
 static void *
 cache_alloc_debugcheck_after(kmem_cache_t *cachep,
-			gfp_t flags, void *objp, void *caller)
+			unsigned int __nocast flags, void *objp, void *caller)
 {
 	if (!objp)	
 		return objp;
@@ -2508,118 +2170,47 @@ cache_alloc_debugcheck_after(kmem_cache_
 #define cache_alloc_debugcheck_after(a,b,objp,d) (objp)
 #endif
 
-static inline void *____cache_alloc(kmem_cache_t *cachep, gfp_t flags)
+
+static inline void *__cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
 {
+	int cpu;
+	unsigned long save_flags;
 	void* objp;
 	struct array_cache *ac;
 
-	check_irq_off();
-	ac = ac_data(cachep);
+	cache_alloc_debugcheck_before(cachep, flags);
+
+	local_irq_save_nort(save_flags);
+	spin_lock_rt(&cachep->spinlock);
+	cpu = raw_smp_processor_id();
+	ac = ac_data(cachep, cpu);
 	if (likely(ac->avail)) {
 		STATS_INC_ALLOCHIT(cachep);
 		ac->touched = 1;
-		objp = ac->entry[--ac->avail];
+		objp = ac_entry(ac)[--ac->avail];
 	} else {
 		STATS_INC_ALLOCMISS(cachep);
-		objp = cache_alloc_refill(cachep, flags);
+		objp = cache_alloc_refill(cachep, flags, cpu);
 	}
+	spin_unlock_rt(&cachep->spinlock);
+	local_irq_restore_nort(save_flags);
+	objp = cache_alloc_debugcheck_after(cachep, flags, objp, __builtin_return_address(0));
 	return objp;
 }
 
-static inline void *__cache_alloc(kmem_cache_t *cachep, gfp_t flags)
-{
-	unsigned long save_flags;
-	void* objp;
-
-	cache_alloc_debugcheck_before(cachep, flags);
-
-	local_irq_save(save_flags);
-	objp = ____cache_alloc(cachep, flags);
-	local_irq_restore(save_flags);
-	objp = cache_alloc_debugcheck_after(cachep, flags, objp,
-					__builtin_return_address(0));
-	prefetchw(objp);
-	return objp;
-}
-
-#ifdef CONFIG_NUMA
 /*
- * A interface to enable slab creation on nodeid
+ * NUMA: different approach needed if the spinlock is moved into
+ * the l3 structure
  */
-static void *__cache_alloc_node(kmem_cache_t *cachep, int flags, int nodeid)
-{
-	struct list_head *entry;
- 	struct slab *slabp;
- 	struct kmem_list3 *l3;
- 	void *obj;
- 	kmem_bufctl_t next;
- 	int x;
 
- 	l3 = cachep->nodelists[nodeid];
- 	BUG_ON(!l3);
-
-retry:
- 	spin_lock(&l3->list_lock);
- 	entry = l3->slabs_partial.next;
- 	if (entry == &l3->slabs_partial) {
- 		l3->free_touched = 1;
- 		entry = l3->slabs_free.next;
- 		if (entry == &l3->slabs_free)
- 			goto must_grow;
- 	}
-
- 	slabp = list_entry(entry, struct slab, list);
- 	check_spinlock_acquired_node(cachep, nodeid);
- 	check_slabp(cachep, slabp);
-
- 	STATS_INC_NODEALLOCS(cachep);
- 	STATS_INC_ACTIVE(cachep);
- 	STATS_SET_HIGH(cachep);
-
- 	BUG_ON(slabp->inuse == cachep->num);
-
- 	/* get obj pointer */
- 	obj =  slabp->s_mem + slabp->free*cachep->objsize;
- 	slabp->inuse++;
- 	next = slab_bufctl(slabp)[slabp->free];
-#if DEBUG
- 	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
-#endif
- 	slabp->free = next;
- 	check_slabp(cachep, slabp);
- 	l3->free_objects--;
- 	/* move slabp to correct slabp list: */
- 	list_del(&slabp->list);
-
- 	if (slabp->free == BUFCTL_END) {
- 		list_add(&slabp->list, &l3->slabs_full);
- 	} else {
- 		list_add(&slabp->list, &l3->slabs_partial);
- 	}
-
- 	spin_unlock(&l3->list_lock);
- 	goto done;
-
-must_grow:
- 	spin_unlock(&l3->list_lock);
- 	x = cache_grow(cachep, flags, nodeid);
-
- 	if (!x)
- 		return NULL;
-
- 	goto retry;
-done:
- 	return obj;
-}
-#endif
-
-/*
- * Caller needs to acquire correct kmem_list's list_lock
- */
-static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects, int node)
+static void free_block(kmem_cache_t *cachep, void **objpp, int nr_objects)
 {
 	int i;
-	struct kmem_list3 *l3;
+
+	check_spinlock_acquired(cachep);
+
+	/* NUMA: move add into loop */
+	cachep->lists.free_objects += nr_objects;
 
 	for (i = 0; i < nr_objects; i++) {
 		void *objp = objpp[i];
@@ -2627,17 +2218,13 @@ static void free_block(kmem_cache_t *cac
 		unsigned int objnr;
 
 		slabp = GET_PAGE_SLAB(virt_to_page(objp));
-		l3 = cachep->nodelists[node];
 		list_del(&slabp->list);
 		objnr = (objp - slabp->s_mem) / cachep->objsize;
-		check_spinlock_acquired_node(cachep, node);
 		check_slabp(cachep, slabp);
-
-
 #if DEBUG
 		if (slab_bufctl(slabp)[objnr] != BUFCTL_FREE) {
-			printk(KERN_ERR "slab: double free detected in cache "
-					"'%s', objp %p\n", cachep->name, objp);
+			printk(KERN_ERR "slab: double free detected in cache '%s', objp %p.\n",
+						cachep->name, objp);
 			BUG();
 		}
 #endif
@@ -2645,23 +2232,24 @@ static void free_block(kmem_cache_t *cac
 		slabp->free = objnr;
 		STATS_DEC_ACTIVE(cachep);
 		slabp->inuse--;
-		l3->free_objects++;
 		check_slabp(cachep, slabp);
 
 		/* fixup slab chains */
 		if (slabp->inuse == 0) {
-			if (l3->free_objects > l3->free_limit) {
-				l3->free_objects -= cachep->num;
+			if (cachep->lists.free_objects > cachep->free_limit) {
+				cachep->lists.free_objects -= cachep->num;
 				slab_destroy(cachep, slabp);
 			} else {
-				list_add(&slabp->list, &l3->slabs_free);
+				list_add(&slabp->list,
+				&list3_data_ptr(cachep, objp)->slabs_free);
 			}
 		} else {
 			/* Unconditionally move a slab to the end of the
 			 * partial list on free - maximum time for the
 			 * other objects to be freed, too.
 			 */
-			list_add_tail(&slabp->list, &l3->slabs_partial);
+			list_add_tail(&slabp->list,
+				&list3_data_ptr(cachep, objp)->slabs_partial);
 		}
 	}
 }
@@ -2669,39 +2257,36 @@ static void free_block(kmem_cache_t *cac
 static void cache_flusharray(kmem_cache_t *cachep, struct array_cache *ac)
 {
 	int batchcount;
-	struct kmem_list3 *l3;
-	int node = numa_node_id();
 
 	batchcount = ac->batchcount;
 #if DEBUG
 	BUG_ON(!batchcount || batchcount > ac->avail);
 #endif
 	check_irq_off();
-	l3 = cachep->nodelists[node];
-	spin_lock(&l3->list_lock);
-	if (l3->shared) {
-		struct array_cache *shared_array = l3->shared;
+	spin_lock_nort(&cachep->spinlock);
+	if (cachep->lists.shared) {
+		struct array_cache *shared_array = cachep->lists.shared;
 		int max = shared_array->limit-shared_array->avail;
 		if (max) {
 			if (batchcount > max)
 				batchcount = max;
-			memcpy(&(shared_array->entry[shared_array->avail]),
-					ac->entry,
+			memcpy(&ac_entry(shared_array)[shared_array->avail],
+					&ac_entry(ac)[0],
 					sizeof(void*)*batchcount);
 			shared_array->avail += batchcount;
 			goto free_done;
 		}
 	}
 
-	free_block(cachep, ac->entry, batchcount, node);
+	free_block(cachep, &ac_entry(ac)[0], batchcount);
 free_done:
 #if STATS
 	{
 		int i = 0;
 		struct list_head *p;
 
-		p = l3->slabs_free.next;
-		while (p != &(l3->slabs_free)) {
+		p = list3_data(cachep)->slabs_free.next;
+		while (p != &(list3_data(cachep)->slabs_free)) {
 			struct slab *slabp;
 
 			slabp = list_entry(p, struct slab, list);
@@ -2713,13 +2298,12 @@ free_done:
 		STATS_SET_FREEABLE(cachep, i);
 	}
 #endif
-	spin_unlock(&l3->list_lock);
+	spin_unlock_nort(&cachep->spinlock);
 	ac->avail -= batchcount;
-	memmove(ac->entry, &(ac->entry[batchcount]),
+	memmove(&ac_entry(ac)[0], &ac_entry(ac)[batchcount],
 			sizeof(void*)*ac->avail);
 }
 
-
 /*
  * __cache_free
  * Release an obj back to its cache. If the obj has a constructed
@@ -2729,52 +2313,24 @@ free_done:
  */
 static inline void __cache_free(kmem_cache_t *cachep, void *objp)
 {
-	struct array_cache *ac = ac_data(cachep);
+	int cpu;
+	struct array_cache *ac;
 
 	check_irq_off();
 	objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
-	/* Make sure we are not freeing a object from another
-	 * node to the array cache on this cpu.
-	 */
-#ifdef CONFIG_NUMA
-	{
-		struct slab *slabp;
-		slabp = GET_PAGE_SLAB(virt_to_page(objp));
-		if (unlikely(slabp->nodeid != numa_node_id())) {
-			struct array_cache *alien = NULL;
-			int nodeid = slabp->nodeid;
-			struct kmem_list3 *l3 = cachep->nodelists[numa_node_id()];
-
-			STATS_INC_NODEFREES(cachep);
-			if (l3->alien && l3->alien[nodeid]) {
-				alien = l3->alien[nodeid];
-				spin_lock(&alien->lock);
-				if (unlikely(alien->avail == alien->limit))
-					__drain_alien_cache(cachep,
-							alien, nodeid);
-				alien->entry[alien->avail++] = objp;
-				spin_unlock(&alien->lock);
-			} else {
-				spin_lock(&(cachep->nodelists[nodeid])->
-						list_lock);
-				free_block(cachep, &objp, 1, nodeid);
-				spin_unlock(&(cachep->nodelists[nodeid])->
-						list_lock);
-			}
-			return;
-		}
-	}
-#endif
+	spin_lock_rt(&cachep->spinlock);
+	cpu = raw_smp_processor_id();
+	ac = ac_data(cachep, cpu);
 	if (likely(ac->avail < ac->limit)) {
 		STATS_INC_FREEHIT(cachep);
-		ac->entry[ac->avail++] = objp;
-		return;
+		ac_entry(ac)[ac->avail++] = objp;
 	} else {
 		STATS_INC_FREEMISS(cachep);
 		cache_flusharray(cachep, ac);
-		ac->entry[ac->avail++] = objp;
+		ac_entry(ac)[ac->avail++] = objp;
 	}
+	spin_unlock_rt(&cachep->spinlock);
 }
 
 /**
@@ -2785,7 +2341,7 @@ static inline void __cache_free(kmem_cac
  * Allocate an object from this cache.  The flags are only relevant
  * if the cache has no available objects.
  */
-void *kmem_cache_alloc(kmem_cache_t *cachep, gfp_t flags)
+void *kmem_cache_alloc(kmem_cache_t *cachep, unsigned int __nocast flags)
 {
 	return __cache_alloc(cachep, flags);
 }
@@ -2843,37 +2399,85 @@ out:
  * Identical to kmem_cache_alloc, except that this function is slow
  * and can sleep. And it will allocate memory on the given node, which
  * can improve the performance for cpu bound structures.
- * New and improved: it will now make sure that the object gets
- * put on the correct node list so that there is no false sharing.
  */
-void *kmem_cache_alloc_node(kmem_cache_t *cachep, gfp_t flags, int nodeid)
+void *kmem_cache_alloc_node(kmem_cache_t *cachep, unsigned int __nocast flags, int nodeid)
 {
-	unsigned long save_flags;
-	void *ptr;
+	int loop;
+	void *objp;
+	struct slab *slabp;
+	kmem_bufctl_t next;
 
 	if (nodeid == -1)
-		return __cache_alloc(cachep, flags);
+		return kmem_cache_alloc(cachep, flags);
+
+	for (loop = 0;;loop++) {
+		struct list_head *q;
+
+		objp = NULL;
+		check_irq_on();
+		spin_lock_irq(&cachep->spinlock);
+		/* Walk through all partial and free slabs and find one from
+		 * the right node (any node is accepted once loop > 2). */
+		list_for_each(q,&cachep->lists.slabs_partial) {
+			slabp = list_entry(q, struct slab, list);
+
+			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
+					loop > 2)
+				goto got_slabp;
+		}
+		list_for_each(q, &cachep->lists.slabs_free) {
+			slabp = list_entry(q, struct slab, list);
 
-	if (unlikely(!cachep->nodelists[nodeid])) {
-		/* Fall back to __cache_alloc if we run into trouble */
-		printk(KERN_WARNING "slab: not allocating in inactive node %d for cache %s\n", nodeid, cachep->name);
-		return __cache_alloc(cachep,flags);
+			if (page_to_nid(virt_to_page(slabp->s_mem)) == nodeid ||
+					loop > 2)
+				goto got_slabp;
+		}
+		spin_unlock_irq(&cachep->spinlock);
+
+		local_irq_disable_nort();
+		if (!cache_grow(cachep, flags, nodeid)) {
+			local_irq_enable_nort();
+			return NULL;
+		}
+		local_irq_enable_nort();
 	}
+got_slabp:
+	/* found one: allocate object */
+	check_slabp(cachep, slabp);
+	check_spinlock_acquired(cachep);
 
-	cache_alloc_debugcheck_before(cachep, flags);
-	local_irq_save(save_flags);
-	if (nodeid == numa_node_id())
-		ptr = ____cache_alloc(cachep, flags);
+	STATS_INC_ALLOCED(cachep);
+	STATS_INC_ACTIVE(cachep);
+	STATS_SET_HIGH(cachep);
+	STATS_INC_NODEALLOCS(cachep);
+
+	objp = slabp->s_mem + slabp->free*cachep->objsize;
+
+	slabp->inuse++;
+	next = slab_bufctl(slabp)[slabp->free];
+#if DEBUG
+	slab_bufctl(slabp)[slabp->free] = BUFCTL_FREE;
+#endif
+	slabp->free = next;
+	check_slabp(cachep, slabp);
+
+	/* move slabp to correct slabp list: */
+	list_del(&slabp->list);
+	if (slabp->free == BUFCTL_END)
+		list_add(&slabp->list, &cachep->lists.slabs_full);
 	else
-		ptr = __cache_alloc_node(cachep, flags, nodeid);
-	local_irq_restore(save_flags);
-	ptr = cache_alloc_debugcheck_after(cachep, flags, ptr, __builtin_return_address(0));
+		list_add(&slabp->list, &cachep->lists.slabs_partial);
 
-	return ptr;
+	list3_data(cachep)->free_objects--;
+	spin_unlock_irq(&cachep->spinlock);
+
+	objp = cache_alloc_debugcheck_after(cachep, GFP_KERNEL, objp,
+					__builtin_return_address(0));
+	return objp;
 }
 EXPORT_SYMBOL(kmem_cache_alloc_node);
 
-void *kmalloc_node(size_t size, gfp_t flags, int node)
+void *kmalloc_node(size_t size, unsigned int __nocast flags, int node)
 {
 	kmem_cache_t *cachep;
 
@@ -2906,7 +2510,7 @@ EXPORT_SYMBOL(kmalloc_node);
  * platforms.  For example, on i386, it means that the memory must come
  * from the first 16MB.
  */
-void *__kmalloc(size_t size, gfp_t flags)
+void *__kmalloc(size_t size, unsigned int __nocast flags)
 {
 	kmem_cache_t *cachep;
 
@@ -2939,18 +2543,11 @@ void *__alloc_percpu(size_t size, size_t
 	if (!pdata)
 		return NULL;
 
-	/*
-	 * Cannot use for_each_online_cpu since a cpu may come online
-	 * and we have no way of figuring out how to fix the array
-	 * that we have allocated then....
-	 */
-	for_each_cpu(i) {
-		int node = cpu_to_node(i);
-
-		if (node_online(node))
-			pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL, node);
-		else
-			pdata->ptrs[i] = kmalloc(size, GFP_KERNEL);
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
+		pdata->ptrs[i] = kmalloc_node(size, GFP_KERNEL,
+						cpu_to_node(i));
 
 		if (!pdata->ptrs[i])
 			goto unwind_oom;
@@ -2984,9 +2581,9 @@ void kmem_cache_free(kmem_cache_t *cache
 {
 	unsigned long flags;
 
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	__cache_free(cachep, objp);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
@@ -2995,7 +2592,7 @@ EXPORT_SYMBOL(kmem_cache_free);
  * @size: how many bytes of memory are required.
  * @flags: the type of memory to allocate.
  */
-void *kzalloc(size_t size, gfp_t flags)
+void *kzalloc(size_t size, unsigned int __nocast flags)
 {
 	void *ret = kmalloc(size, flags);
 	if (ret)
@@ -3004,12 +2601,25 @@ void *kzalloc(size_t size, gfp_t flags)
 }
 EXPORT_SYMBOL(kzalloc);
 
+#ifdef CONFIG_DEBUG_DEADLOCKS
+static size_t cache_size(kmem_cache_t *c)
+{
+	struct cache_sizes *csizep = malloc_sizes;
+
+	for ( ; csizep->cs_size; csizep++) {
+		if (csizep->cs_cachep == c)
+			return csizep->cs_size;
+		if (csizep->cs_dmacachep == c)
+			return csizep->cs_size;
+	}
+	return 0;
+}
+#endif
+
 /**
  * kfree - free previously allocated memory
  * @objp: pointer returned by kmalloc.
  *
- * If @objp is NULL, no operation is performed.
- *
  * Don't free memory not originally allocated by kmalloc()
  * or you will run into trouble.
  */
@@ -3020,11 +2630,16 @@ void kfree(const void *objp)
 
 	if (unlikely(!objp))
 		return;
-	local_irq_save(flags);
+	local_irq_save_nort(flags);
 	kfree_debugcheck(objp);
 	c = GET_PAGE_CACHE(virt_to_page(objp));
+#ifdef CONFIG_DEBUG_DEADLOCKS
+	if (check_no_locks_freed(objp, objp+cache_size(c)))
+		printk("slab %s[%p] (%d), obj: %p\n",
+			c->name, c, c->objsize, objp);
+#endif
 	__cache_free(c, (void*)objp);
-	local_irq_restore(flags);
+	local_irq_restore_nort(flags);
 }
 EXPORT_SYMBOL(kfree);
 
@@ -3042,11 +2657,11 @@ free_percpu(const void *objp)
 	int i;
 	struct percpu_data *p = (struct percpu_data *) (~(unsigned long) objp);
 
-	/*
-	 * We allocate for all cpus so we cannot use for online cpu here.
-	 */
-	for_each_cpu(i)
+	for (i = 0; i < NR_CPUS; i++) {
+		if (!cpu_possible(i))
+			continue;
 		kfree(p->ptrs[i]);
+	}
 	kfree(p);
 }
 EXPORT_SYMBOL(free_percpu);
@@ -3064,76 +2679,21 @@ const char *kmem_cache_name(kmem_cache_t
 }
 EXPORT_SYMBOL_GPL(kmem_cache_name);
 
-/*
- * This initializes kmem_list3 for all nodes.
- */
-static int alloc_kmemlist(kmem_cache_t *cachep)
-{
-	int node;
-	struct kmem_list3 *l3;
-	int err = 0;
-
-	for_each_online_node(node) {
-		struct array_cache *nc = NULL, *new;
-		struct array_cache **new_alien = NULL;
-#ifdef CONFIG_NUMA
-		if (!(new_alien = alloc_alien_cache(node, cachep->limit)))
-			goto fail;
-#endif
-		if (!(new = alloc_arraycache(node, (cachep->shared*
-				cachep->batchcount), 0xbaadf00d)))
-			goto fail;
-		if ((l3 = cachep->nodelists[node])) {
-
-			spin_lock_irq(&l3->list_lock);
-
-			if ((nc = cachep->nodelists[node]->shared))
-				free_block(cachep, nc->entry,
-							nc->avail, node);
-
-			l3->shared = new;
-			if (!cachep->nodelists[node]->alien) {
-				l3->alien = new_alien;
-				new_alien = NULL;
-			}
-			l3->free_limit = (1 + nr_cpus_node(node))*
-				cachep->batchcount + cachep->num;
-			spin_unlock_irq(&l3->list_lock);
-			kfree(nc);
-			free_alien_cache(new_alien);
-			continue;
-		}
-		if (!(l3 = kmalloc_node(sizeof(struct kmem_list3),
-						GFP_KERNEL, node)))
-			goto fail;
-
-		kmem_list3_init(l3);
-		l3->next_reap = jiffies + REAPTIMEOUT_LIST3 +
-			((unsigned long)cachep)%REAPTIMEOUT_LIST3;
-		l3->shared = new;
-		l3->alien = new_alien;
-		l3->free_limit = (1 + nr_cpus_node(node))*
-			cachep->batchcount + cachep->num;
-		cachep->nodelists[node] = l3;
-	}
-	return err;
-fail:
-	err = -ENOMEM;
-	return err;
-}
-
 struct ccupdate_struct {
 	kmem_cache_t *cachep;
 	struct array_cache *new[NR_CPUS];
 };
 
+/*
+ * Executes in IRQ context:
+ */
 static void do_ccupdate_local(void *info)
 {
 	struct ccupdate_struct *new = (struct ccupdate_struct *)info;
 	struct array_cache *old;
 
 	check_irq_off();
-	old = ac_data(new->cachep);
+	old = ac_data(new->cachep, smp_processor_id());
 
 	new->cachep->array[smp_processor_id()] = new->new[smp_processor_id()];
 	new->new[smp_processor_id()] = old;
@@ -3144,14 +2704,19 @@ static int do_tune_cpucache(kmem_cache_t
 				int shared)
 {
 	struct ccupdate_struct new;
-	int i, err;
+	struct array_cache *new_shared;
+	int i;
 
 	memset(&new.new,0,sizeof(new.new));
-	for_each_online_cpu(i) {
-		new.new[i] = alloc_arraycache(cpu_to_node(i), limit, batchcount);
-		if (!new.new[i]) {
-			for (i--; i >= 0; i--) kfree(new.new[i]);
-			return -ENOMEM;
+	for (i = 0; i < NR_CPUS; i++) {
+		if (cpu_online(i)) {
+			new.new[i] = alloc_arraycache(i, limit, batchcount);
+			if (!new.new[i]) {
+				for (i--; i >= 0; i--) kfree(new.new[i]);
+				return -ENOMEM;
+			}
+		} else {
+			new.new[i] = NULL;
 		}
 	}
 	new.cachep = cachep;
@@ -3162,25 +2727,31 @@ static int do_tune_cpucache(kmem_cache_t
 	spin_lock_irq(&cachep->spinlock);
 	cachep->batchcount = batchcount;
 	cachep->limit = limit;
-	cachep->shared = shared;
+	cachep->free_limit = (1+num_online_cpus())*cachep->batchcount + cachep->num;
 	spin_unlock_irq(&cachep->spinlock);
 
-	for_each_online_cpu(i) {
+	for (i = 0; i < NR_CPUS; i++) {
 		struct array_cache *ccold = new.new[i];
 		if (!ccold)
 			continue;
-		spin_lock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
-		free_block(cachep, ccold->entry, ccold->avail, cpu_to_node(i));
-		spin_unlock_irq(&cachep->nodelists[cpu_to_node(i)]->list_lock);
+		spin_lock_irq(&cachep->spinlock);
+		free_block(cachep, ac_entry(ccold), ccold->avail);
+		spin_unlock_irq(&cachep->spinlock);
 		kfree(ccold);
 	}
-
-	err = alloc_kmemlist(cachep);
-	if (err) {
-		printk(KERN_ERR "alloc_kmemlist failed for %s, error %d.\n",
-				cachep->name, -err);
-		BUG();
+	new_shared = alloc_arraycache(-1, batchcount*shared, 0xbaadf00d);
+	if (new_shared) {
+		struct array_cache *old;
+
+		spin_lock_irq(&cachep->spinlock);
+		old = cachep->lists.shared;
+		cachep->lists.shared = new_shared;
+		if (old)
+			free_block(cachep, ac_entry(old), old->avail);
+		spin_unlock_irq(&cachep->spinlock);
+		kfree(old);
 	}
+
 	return 0;
 }
 
@@ -3231,6 +2802,10 @@ static void enable_cpucache(kmem_cache_t
 	if (limit > 32)
 		limit = 32;
 #endif
+#ifdef CONFIG_PREEMPT
+	if (limit > 16)
+		limit = 16;
+#endif
 	err = do_tune_cpucache(cachep, limit, (limit+1)/2, shared);
 	if (err)
 		printk(KERN_ERR "enable_cpucache failed for %s, error %d.\n",
@@ -3238,11 +2813,11 @@ static void enable_cpucache(kmem_cache_t
 }
 
 static void drain_array_locked(kmem_cache_t *cachep,
-				struct array_cache *ac, int force, int node)
+				struct array_cache *ac, int force)
 {
 	int tofree;
 
-	check_spinlock_acquired_node(cachep, node);
+	check_spinlock_acquired(cachep);
 	if (ac->touched && !force) {
 		ac->touched = 0;
 	} else if (ac->avail) {
@@ -3250,9 +2825,9 @@ static void drain_array_locked(kmem_cach
 		if (tofree > ac->avail) {
 			tofree = (ac->avail+1)/2;
 		}
-		free_block(cachep, ac->entry, tofree, node);
+		free_block(cachep, ac_entry(ac), tofree);
 		ac->avail -= tofree;
-		memmove(ac->entry, &(ac->entry[tofree]),
+		memmove(&ac_entry(ac)[0], &ac_entry(ac)[tofree],
 					sizeof(void*)*ac->avail);
 	}
 }
@@ -3270,12 +2845,14 @@ static void drain_array_locked(kmem_cach
  */
 static void cache_reap(void *unused)
 {
+	int cpu;
 	struct list_head *walk;
-	struct kmem_list3 *l3;
 
 	if (down_trylock(&cache_chain_sem)) {
 		/* Give up. Setup the next iteration. */
-		schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+next_iteration:
+		cpu = raw_smp_processor_id();
+		schedule_delayed_work(&per_cpu(reap_work, cpu), REAPTIMEOUT_CPUC + cpu);
 		return;
 	}
 
@@ -3292,32 +2869,28 @@ static void cache_reap(void *unused)
 
 		check_irq_on();
 
-		l3 = searchp->nodelists[numa_node_id()];
-		if (l3->alien)
-			drain_alien_cache(searchp, l3);
-		spin_lock_irq(&l3->list_lock);
+		spin_lock_irq(&searchp->spinlock);
+		cpu = raw_smp_processor_id();
 
-		drain_array_locked(searchp, ac_data(searchp), 0,
-				numa_node_id());
+		drain_array_locked(searchp, ac_data(searchp, cpu), 0);
 
-		if (time_after(l3->next_reap, jiffies))
+		if(time_after(searchp->lists.next_reap, jiffies))
 			goto next_unlock;
 
-		l3->next_reap = jiffies + REAPTIMEOUT_LIST3;
+		searchp->lists.next_reap = jiffies + REAPTIMEOUT_LIST3;
 
-		if (l3->shared)
-			drain_array_locked(searchp, l3->shared, 0,
-				numa_node_id());
+		if (searchp->lists.shared)
+			drain_array_locked(searchp, searchp->lists.shared, 0);
 
-		if (l3->free_touched) {
-			l3->free_touched = 0;
+		if (searchp->lists.free_touched) {
+			searchp->lists.free_touched = 0;
 			goto next_unlock;
 		}
 
-		tofree = (l3->free_limit+5*searchp->num-1)/(5*searchp->num);
+		tofree = (searchp->free_limit+5*searchp->num-1)/(5*searchp->num);
 		do {
-			p = l3->slabs_free.next;
-			if (p == &(l3->slabs_free))
+			p = list3_data(searchp)->slabs_free.next;
+			if (p == &(list3_data(searchp)->slabs_free))
 				break;
 
 			slabp = list_entry(p, struct slab, list);
@@ -3330,13 +2903,13 @@ static void cache_reap(void *unused)
 			 * searchp cannot disappear, we hold
 			 * cache_chain_lock
 			 */
-			l3->free_objects -= searchp->num;
-			spin_unlock_irq(&l3->list_lock);
+			searchp->lists.free_objects -= searchp->num;
+			spin_unlock_irq(&searchp->spinlock);
 			slab_destroy(searchp, slabp);
-			spin_lock_irq(&l3->list_lock);
+			spin_lock_irq(&searchp->spinlock);
 		} while(--tofree > 0);
 next_unlock:
-		spin_unlock_irq(&l3->list_lock);
+		spin_unlock_irq(&searchp->spinlock);
 next:
 		cond_resched();
 	}
@@ -3344,7 +2917,7 @@ next:
 	up(&cache_chain_sem);
 	drain_remote_pages();
 	/* Setup the next iteration */
-	schedule_delayed_work(&__get_cpu_var(reap_work), REAPTIMEOUT_CPUC + smp_processor_id());
+	goto next_iteration;
 }
 
 #ifdef CONFIG_PROC_FS
@@ -3370,7 +2943,7 @@ static void *s_start(struct seq_file *m,
 		seq_puts(m, " : slabdata <active_slabs> <num_slabs> <sharedavail>");
 #if STATS
 		seq_puts(m, " : globalstat <listallocs> <maxobjs> <grown> <reaped>"
-				" <error> <maxfreeable> <nodeallocs> <remotefrees>");
+				" <error> <maxfreeable> <freelimit> <nodeallocs>");
 		seq_puts(m, " : cpustat <allochit> <allocmiss> <freehit> <freemiss>");
 #endif
 		seq_putc(m, '\n');
@@ -3405,53 +2978,39 @@ static int s_show(struct seq_file *m, vo
 	unsigned long	active_objs;
 	unsigned long	num_objs;
 	unsigned long	active_slabs = 0;
-	unsigned long	num_slabs, free_objects = 0, shared_avail = 0;
+	unsigned long	num_slabs;
 	const char *name;
 	char *error = NULL;
-	int node;
-	struct kmem_list3 *l3;
 
 	check_irq_on();
 	spin_lock_irq(&cachep->spinlock);
 	active_objs = 0;
 	num_slabs = 0;
-	for_each_online_node(node) {
-		l3 = cachep->nodelists[node];
-		if (!l3)
-			continue;
-
-		spin_lock(&l3->list_lock);
-
-		list_for_each(q,&l3->slabs_full) {
-			slabp = list_entry(q, struct slab, list);
-			if (slabp->inuse != cachep->num && !error)
-				error = "slabs_full accounting error";
-			active_objs += cachep->num;
-			active_slabs++;
-		}
-		list_for_each(q,&l3->slabs_partial) {
-			slabp = list_entry(q, struct slab, list);
-			if (slabp->inuse == cachep->num && !error)
-				error = "slabs_partial inuse accounting error";
-			if (!slabp->inuse && !error)
-				error = "slabs_partial/inuse accounting error";
-			active_objs += slabp->inuse;
-			active_slabs++;
-		}
-		list_for_each(q,&l3->slabs_free) {
-			slabp = list_entry(q, struct slab, list);
-			if (slabp->inuse && !error)
-				error = "slabs_free/inuse accounting error";
-			num_slabs++;
-		}
-		free_objects += l3->free_objects;
-		shared_avail += l3->shared->avail;
-
-		spin_unlock(&l3->list_lock);
+	list_for_each(q,&cachep->lists.slabs_full) {
+		slabp = list_entry(q, struct slab, list);
+		if (slabp->inuse != cachep->num && !error)
+			error = "slabs_full accounting error";
+		active_objs += cachep->num;
+		active_slabs++;
+	}
+	list_for_each(q,&cachep->lists.slabs_partial) {
+		slabp = list_entry(q, struct slab, list);
+		if (slabp->inuse == cachep->num && !error)
+			error = "slabs_partial inuse accounting error";
+		if (!slabp->inuse && !error)
+			error = "slabs_partial/inuse accounting error";
+		active_objs += slabp->inuse;
+		active_slabs++;
+	}
+	list_for_each(q,&cachep->lists.slabs_free) {
+		slabp = list_entry(q, struct slab, list);
+		if (slabp->inuse && !error)
+			error = "slabs_free/inuse accounting error";
+		num_slabs++;
 	}
 	num_slabs+=active_slabs;
 	num_objs = num_slabs*cachep->num;
-	if (num_objs - active_objs != free_objects && !error)
+	if (num_objs - active_objs != cachep->lists.free_objects && !error)
 		error = "free_objects accounting error";
 
 	name = cachep->name; 
@@ -3463,9 +3022,9 @@ static int s_show(struct seq_file *m, vo
 		cachep->num, (1<<cachep->gfporder));
 	seq_printf(m, " : tunables %4u %4u %4u",
 			cachep->limit, cachep->batchcount,
-			cachep->shared);
-	seq_printf(m, " : slabdata %6lu %6lu %6lu",
-			active_slabs, num_slabs, shared_avail);
+			cachep->lists.shared->limit/cachep->batchcount);
+	seq_printf(m, " : slabdata %6lu %6lu %6u",
+			active_slabs, num_slabs, cachep->lists.shared->avail);
 #if STATS
 	{	/* list3 stats */
 		unsigned long high = cachep->high_mark;
@@ -3474,13 +3033,12 @@ static int s_show(struct seq_file *m, vo
 		unsigned long reaped = cachep->reaped;
 		unsigned long errors = cachep->errors;
 		unsigned long max_freeable = cachep->max_freeable;
+		unsigned long free_limit = cachep->free_limit;
 		unsigned long node_allocs = cachep->node_allocs;
-		unsigned long node_frees = cachep->node_frees;
 
-		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu \
-				%4lu %4lu %4lu %4lu",
+		seq_printf(m, " : globalstat %7lu %6lu %5lu %4lu %4lu %4lu %4lu %4lu",
 				allocs, high, grown, reaped, errors,
-				max_freeable, node_allocs, node_frees);
+				max_freeable, free_limit, node_allocs);
 	}
 	/* cpu stats */
 	{
@@ -3559,10 +3117,9 @@ ssize_t slabinfo_write(struct file *file
 			    batchcount < 1 ||
 			    batchcount > limit ||
 			    shared < 0) {
-				res = 0;
+				res = -EINVAL;
 			} else {
-				res = do_tune_cpucache(cachep, limit,
-							batchcount, shared);
+				res = do_tune_cpucache(cachep, limit, batchcount, shared);
 			}
 			break;
 		}
@@ -3574,24 +3131,20 @@ ssize_t slabinfo_write(struct file *file
 }
 #endif
 
-/**
- * ksize - get the actual amount of memory allocated for a given object
- * @objp: Pointer to the object
- *
- * kmalloc may internally round up allocations and return more memory
- * than requested. ksize() can be used to determine the actual amount of
- * memory allocated. The caller may use this additional memory, even though
- * a smaller amount of memory was initially specified with the kmalloc call.
- * The caller must guarantee that objp points to a valid object previously
- * allocated with either kmalloc() or kmem_cache_alloc(). The object
- * must not be freed during the duration of the call.
- */
 unsigned int ksize(const void *objp)
 {
-	if (unlikely(objp == NULL))
-		return 0;
+	kmem_cache_t *c;
+	unsigned long flags;
+	unsigned int size = 0;
+
+	if (likely(objp != NULL)) {
+		local_irq_save_nort(flags);
+		c = GET_PAGE_CACHE(virt_to_page(objp));
+		size = kmem_cache_size(c);
+		local_irq_restore_nort(flags);
+	}
 
-	return obj_reallen(GET_PAGE_CACHE(virt_to_page(objp)));
+	return size;
 }
 
 
@@ -3601,7 +3154,7 @@ unsigned int ksize(const void *objp)
  * @s: the string to duplicate
  * @gfp: the GFP mask used in the kmalloc() call when allocating memory
  */
-char *kstrdup(const char *s, gfp_t gfp)
+char *kstrdup(const char *s, unsigned int __nocast gfp)
 {
 	size_t len;
 	char *buf;
Index: linux/mm/sparse.c
===================================================================
--- linux.orig/mm/sparse.c
+++ linux/mm/sparse.c
@@ -40,7 +40,7 @@ static struct mem_section *sparse_index_
 
 static int sparse_index_init(unsigned long section_nr, int nid)
 {
-	static spinlock_t index_init_lock = SPIN_LOCK_UNLOCKED;
+	static DEFINE_SPINLOCK(index_init_lock);
 	unsigned long root = SECTION_NR_TO_ROOT(section_nr);
 	struct mem_section *section;
 	int ret = 0;
Index: linux/mm/swap.c
===================================================================
--- linux.orig/mm/swap.c
+++ linux/mm/swap.c
@@ -136,39 +136,45 @@ EXPORT_SYMBOL(mark_page_accessed);
  * lru_cache_add: add a page to the page lists
  * @page: the page to add
  */
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs) = { 0, };
-static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs) = { 0, };
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_pvecs) = { 0, };
+static DEFINE_PER_CPU_LOCKED(struct pagevec, lru_add_active_pvecs) = { 0, };
 
 void fastcall lru_cache_add(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	int cpu;
+	struct pagevec *pvec = &get_cpu_var_locked(lru_add_pvecs, &cpu);
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		__pagevec_lru_add(pvec);
-	put_cpu_var(lru_add_pvecs);
+	put_cpu_var_locked(lru_add_pvecs, cpu);
 }
 
 void fastcall lru_cache_add_active(struct page *page)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs);
+	int cpu;
+	struct pagevec *pvec = &get_cpu_var_locked(lru_add_active_pvecs, &cpu);
 
 	page_cache_get(page);
 	if (!pagevec_add(pvec, page))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_active_pvecs);
+	put_cpu_var_locked(lru_add_active_pvecs, cpu);
 }
 
 void lru_add_drain(void)
 {
-	struct pagevec *pvec = &get_cpu_var(lru_add_pvecs);
+	struct pagevec *pvec;
+	int cpu;
 
+	pvec = &get_cpu_var_locked(lru_add_pvecs, &cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &__get_cpu_var(lru_add_active_pvecs);
+	put_cpu_var_locked(lru_add_pvecs, cpu);
+
+	pvec = &get_cpu_var_locked(lru_add_active_pvecs, &cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
-	put_cpu_var(lru_add_pvecs);
+	put_cpu_var_locked(lru_add_active_pvecs, cpu);
 }
 
 /*
@@ -416,12 +422,13 @@ EXPORT_SYMBOL(vm_acct_memory);
 #ifdef CONFIG_HOTPLUG_CPU
 static void lru_drain_cache(unsigned int cpu)
 {
-	struct pagevec *pvec = &per_cpu(lru_add_pvecs, cpu);
+	struct pagevec *pvec = &__get_cpu_var_locked(lru_add_pvecs, cpu);
 
 	/* CPU is dead, so no locking needed. */
 	if (pagevec_count(pvec))
 		__pagevec_lru_add(pvec);
-	pvec = &per_cpu(lru_add_active_pvecs, cpu);
+
+	pvec = &__get_cpu_var_locked(lru_add_active_pvecs, cpu);
 	if (pagevec_count(pvec))
 		__pagevec_lru_add_active(pvec);
 }
Index: linux/mm/swap_state.c
===================================================================
--- linux.orig/mm/swap_state.c
+++ linux/mm/swap_state.c
@@ -35,7 +35,7 @@ static struct backing_dev_info swap_back
 
 struct address_space swapper_space = {
 	.page_tree	= RADIX_TREE_INIT(GFP_ATOMIC|__GFP_NOWARN),
-	.tree_lock	= RW_LOCK_UNLOCKED,
+	.tree_lock	= RW_LOCK_UNLOCKED(swapper_space.tree_lock),
 	.a_ops		= &swap_aops,
 	.i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear),
 	.backing_dev_info = &swap_backing_dev_info,
Index: linux/net/atm/clip.c
===================================================================
--- linux.orig/net/atm/clip.c
+++ linux/net/atm/clip.c
@@ -707,7 +707,7 @@ static struct atm_dev atmarpd_dev = {
 	.ops =			&atmarpd_dev_ops,
 	.type =			"arpd",
 	.number = 		999,
-	.lock =			SPIN_LOCK_UNLOCKED
+	.lock =			SPIN_LOCK_UNLOCKED(atmarpd_dev.lock)
 };
 
 
Index: linux/net/atm/lec.c
===================================================================
--- linux.orig/net/atm/lec.c
+++ linux/net/atm/lec.c
@@ -597,7 +597,7 @@ static struct atm_dev lecatm_dev = {
 	.ops	= &lecdev_ops,
 	.type	= "lec",
 	.number	= 999,	/* dummy device number */
-	.lock	= SPIN_LOCK_UNLOCKED
+	.lock	= SPIN_LOCK_UNLOCKED(lecatm_dev.lock)
 };
 
 /*
Index: linux/net/atm/mpc.c
===================================================================
--- linux.orig/net/atm/mpc.c
+++ linux/net/atm/mpc.c
@@ -749,7 +749,7 @@ static struct atm_dev mpc_dev = {
 	.ops	= &mpc_ops,
 	.type	= "mpc",
 	.number	= 42,
-	.lock	= SPIN_LOCK_UNLOCKED
+	.lock	= SPIN_LOCK_UNLOCKED(mpc_dev.lock)
 	/* members not explicitly initialised will be 0 */
 };
 
Index: linux/net/atm/signaling.c
===================================================================
--- linux.orig/net/atm/signaling.c
+++ linux/net/atm/signaling.c
@@ -261,7 +261,7 @@ static struct atm_dev sigd_dev = {
 	.ops =		&sigd_dev_ops,
 	.type =		"sig",
 	.number =	999,
-	.lock =		SPIN_LOCK_UNLOCKED
+	.lock =		SPIN_LOCK_UNLOCKED(sigd_dev.lock)
 };
 
 
Index: linux/net/bluetooth/hci_sock.c
===================================================================
--- linux.orig/net/bluetooth/hci_sock.c
+++ linux/net/bluetooth/hci_sock.c
@@ -84,7 +84,7 @@ static struct hci_sec_filter hci_sec_fil
 };
 
 static struct bt_sock_list hci_sk_list = {
-	.lock = RW_LOCK_UNLOCKED
+	.lock = RW_LOCK_UNLOCKED(hci_sk_list.lock)
 };
 
 /* Send frame to RAW socket */
Index: linux/net/bluetooth/l2cap.c
===================================================================
--- linux.orig/net/bluetooth/l2cap.c
+++ linux/net/bluetooth/l2cap.c
@@ -61,7 +61,7 @@
 static struct proto_ops l2cap_sock_ops;
 
 static struct bt_sock_list l2cap_sk_list = {
-	.lock = RW_LOCK_UNLOCKED
+	.lock = RW_LOCK_UNLOCKED(l2cap_sk_list.lock)
 };
 
 static int l2cap_conn_del(struct hci_conn *conn, int err);
Index: linux/net/bluetooth/rfcomm/sock.c
===================================================================
--- linux.orig/net/bluetooth/rfcomm/sock.c
+++ linux/net/bluetooth/rfcomm/sock.c
@@ -62,7 +62,7 @@
 static struct proto_ops rfcomm_sock_ops;
 
 static struct bt_sock_list rfcomm_sk_list = {
-	.lock = RW_LOCK_UNLOCKED
+	.lock = RW_LOCK_UNLOCKED(rfcomm_sk_list.lock)
 };
 
 static void rfcomm_sock_close(struct sock *sk);
Index: linux/net/bluetooth/sco.c
===================================================================
--- linux.orig/net/bluetooth/sco.c
+++ linux/net/bluetooth/sco.c
@@ -60,7 +60,7 @@
 static struct proto_ops sco_sock_ops;
 
 static struct bt_sock_list sco_sk_list = {
-	.lock = RW_LOCK_UNLOCKED
+	.lock = RW_LOCK_UNLOCKED(sco_sk_list.lock)
 };
 
 static void __sco_chan_add(struct sco_conn *conn, struct sock *sk, struct sock *parent);
Index: linux/net/bridge/netfilter/ebtable_broute.c
===================================================================
--- linux.orig/net/bridge/netfilter/ebtable_broute.c
+++ linux/net/bridge/netfilter/ebtable_broute.c
@@ -46,7 +46,7 @@ static struct ebt_table broute_table =
 	.name		= "broute",
 	.table		= &initial_table,
 	.valid_hooks	= 1 << NF_BR_BROUTING,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(broute_table.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
Index: linux/net/bridge/netfilter/ebtable_filter.c
===================================================================
--- linux.orig/net/bridge/netfilter/ebtable_filter.c
+++ linux/net/bridge/netfilter/ebtable_filter.c
@@ -55,7 +55,7 @@ static struct ebt_table frame_filter =
 	.name		= "filter",
 	.table		= &initial_table,
 	.valid_hooks	= FILTER_VALID_HOOKS, 
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(frame_filter.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
Index: linux/net/bridge/netfilter/ebtable_nat.c
===================================================================
--- linux.orig/net/bridge/netfilter/ebtable_nat.c
+++ linux/net/bridge/netfilter/ebtable_nat.c
@@ -55,7 +55,7 @@ static struct ebt_table frame_nat =
 	.name		= "nat",
 	.table		= &initial_table,
 	.valid_hooks	= NAT_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(frame_nat.lock),
 	.check		= check,
 	.me		= THIS_MODULE,
 };
Index: linux/net/core/dev.c
===================================================================
--- linux.orig/net/core/dev.c
+++ linux/net/core/dev.c
@@ -1308,10 +1308,16 @@ int dev_queue_xmit(struct sk_buff *skb)
 	   Either shot noqueue qdisc, it is even simpler 8)
 	 */
 	if (dev->flags & IFF_UP) {
-		int cpu = smp_processor_id(); /* ok because BHs are off */
+		int cpu = raw_smp_processor_id(); /* ok because BHs are off */
 
+		/*
+		 * No need to check for recursion with threaded interrupts:
+		 */
+#ifdef CONFIG_PREEMPT_RT
+		if (1) {
+#else
 		if (dev->xmit_lock_owner != cpu) {
-
+#endif
 			HARD_TX_LOCK(dev, cpu);
 
 			if (!netif_queue_stopped(dev)) {
@@ -1394,7 +1400,7 @@ int netif_rx(struct sk_buff *skb)
 	 * The code is rearranged so that the path is the most
 	 * short when CPU is congested, but is still operating.
 	 */
-	local_irq_save(flags);
+	raw_local_irq_save(flags);
 	queue = &__get_cpu_var(softnet_data);
 
 	__get_cpu_var(netdev_rx_stat).total++;
@@ -1403,7 +1409,7 @@ int netif_rx(struct sk_buff *skb)
 enqueue:
 			dev_hold(skb->dev);
 			__skb_queue_tail(&queue->input_pkt_queue, skb);
-			local_irq_restore(flags);
+			raw_local_irq_restore(flags);
 			return NET_RX_SUCCESS;
 		}
 
@@ -1412,7 +1418,7 @@ enqueue:
 	}
 
 	__get_cpu_var(netdev_rx_stat).dropped++;
-	local_irq_restore(flags);
+	raw_local_irq_restore(flags);
 
 	kfree_skb(skb);
 	return NET_RX_DROP;
@@ -1450,10 +1456,10 @@ static void net_tx_action(struct softirq
 	if (sd->completion_queue) {
 		struct sk_buff *clist;
 
-		local_irq_disable();
+		raw_local_irq_disable();
 		clist = sd->completion_queue;
 		sd->completion_queue = NULL;
-		local_irq_enable();
+		raw_local_irq_enable();
 
 		while (clist) {
 			struct sk_buff *skb = clist;
@@ -1461,16 +1467,21 @@ static void net_tx_action(struct softirq
 
 			BUG_TRAP(!atomic_read(&skb->users));
 			__kfree_skb(skb);
+			/*
+			 * Safe to reschedule - the list is private
+			 * at this point.
+			 */
+			cond_resched_all();
 		}
 	}
 
 	if (sd->output_queue) {
 		struct net_device *head;
 
-		local_irq_disable();
+		raw_local_irq_disable();
 		head = sd->output_queue;
 		sd->output_queue = NULL;
-		local_irq_enable();
+		raw_local_irq_enable();
 
 		while (head) {
 			struct net_device *dev = head;
@@ -1483,10 +1494,20 @@ static void net_tx_action(struct softirq
 				qdisc_run(dev);
 				spin_unlock(&dev->queue_lock);
 			} else {
+				/*
+				 * Don't re-kick the queue here, it will cause
+				 * excessive scheduling of ksoftirqd due
+				 * to retry. When the queue is released
+				 * it will be completed anyway.
+				 */
+//#warning checkme!
+#ifndef CONFIG_PREEMPT_SOFTIRQS
 				netif_schedule(dev);
+#endif
 			}
 		}
 	}
+
 }
 
 static __inline__ int deliver_skb(struct sk_buff *skb,
@@ -1667,11 +1688,11 @@ static int process_backlog(struct net_de
 		struct sk_buff *skb;
 		struct net_device *dev;
 
-		local_irq_disable();
+		raw_local_irq_disable();
 		skb = __skb_dequeue(&queue->input_pkt_queue);
 		if (!skb)
 			goto job_done;
-		local_irq_enable();
+		raw_local_irq_enable();
 
 		dev = skb->dev;
 
@@ -1698,18 +1719,19 @@ job_done:
 	smp_mb__before_clear_bit();
 	netif_poll_enable(backlog_dev);
 
-	local_irq_enable();
+	raw_local_irq_enable();
 	return 0;
 }
 
 static void net_rx_action(struct softirq_action *h)
 {
-	struct softnet_data *queue = &__get_cpu_var(softnet_data);
+	struct softnet_data *queue;
 	unsigned long start_time = jiffies;
 	int budget = netdev_budget;
 	void *have;
 
-	local_irq_disable();
+	raw_local_irq_disable();
+	queue = &__get_cpu_var(softnet_data);
 
 	while (!list_empty(&queue->poll_list)) {
 		struct net_device *dev;
@@ -1717,7 +1739,11 @@ static void net_rx_action(struct softirq
 		if (budget <= 0 || jiffies - start_time > 1)
 			goto softnet_break;
 
-		local_irq_enable();
+		raw_local_irq_enable();
+		if (unlikely(cond_resched_all())) {
+			raw_local_irq_disable();
+			continue;
+		}
 
 		dev = list_entry(queue->poll_list.next,
 				 struct net_device, poll_list);
@@ -1725,7 +1751,7 @@ static void net_rx_action(struct softirq
 
 		if (dev->quota <= 0 || dev->poll(dev, &budget)) {
 			netpoll_poll_unlock(have);
-			local_irq_disable();
+			raw_local_irq_disable();
 			list_del(&dev->poll_list);
 			list_add_tail(&dev->poll_list, &queue->poll_list);
 			if (dev->quota < 0)
@@ -1735,16 +1761,18 @@ static void net_rx_action(struct softirq
 		} else {
 			netpoll_poll_unlock(have);
 			dev_put(dev);
-			local_irq_disable();
+			raw_local_irq_disable();
 		}
 	}
 out:
-	local_irq_enable();
+	raw_local_irq_enable();
 	return;
 
 softnet_break:
+	preempt_disable();
 	__get_cpu_var(netdev_rx_stat).time_squeeze++;
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
+	preempt_enable();
 	goto out;
 }
 
@@ -3139,7 +3167,7 @@ static int dev_cpu_callback(struct notif
 	if (action != CPU_DEAD)
 		return NOTIFY_OK;
 
-	local_irq_disable();
+	raw_local_irq_disable();
 	cpu = smp_processor_id();
 	sd = &per_cpu(softnet_data, cpu);
 	oldsd = &per_cpu(softnet_data, oldcpu);
@@ -3161,7 +3189,7 @@ static int dev_cpu_callback(struct notif
 	oldsd->output_queue = NULL;
 
 	raise_softirq_irqoff(NET_TX_SOFTIRQ);
-	local_irq_enable();
+	raw_local_irq_enable();
 
 	/* Process offline CPU's input_pkt_queue */
 	while ((skb = __skb_dequeue(&oldsd->input_pkt_queue)))
Index: linux/net/core/flow.c
===================================================================
--- linux.orig/net/core/flow.c
+++ linux/net/core/flow.c
@@ -38,6 +38,8 @@ atomic_t flow_cache_genid = ATOMIC_INIT(
 
 static u32 flow_hash_shift;
 #define flow_hash_size	(1 << flow_hash_shift)
+
+// #warning FIXME: this code is PREEMPT_RT-unsafe
 static DEFINE_PER_CPU(struct flow_cache_entry **, flow_tables) = { NULL };
 
 #define flow_table(cpu) (per_cpu(flow_tables, cpu))
Index: linux/net/core/netpoll.c
===================================================================
--- linux.orig/net/core/netpoll.c
+++ linux/net/core/netpoll.c
@@ -135,7 +135,7 @@ static void poll_napi(struct netpoll *np
 	int budget = 16;
 
 	if (test_bit(__LINK_STATE_RX_SCHED, &np->dev->state) &&
-	    npinfo->poll_owner != smp_processor_id() &&
+	    npinfo->poll_owner != raw_smp_processor_id() &&
 	    spin_trylock(&npinfo->poll_lock)) {
 		npinfo->rx_flags |= NETPOLL_RX_DROP;
 		atomic_inc(&trapped);
@@ -154,7 +154,9 @@ void netpoll_poll(struct netpoll *np)
 		return;
 
 	/* Process pending work on NIC */
+	WARN_ON_RT(irqs_disabled());
 	np->dev->poll_controller(np->dev);
+	WARN_ON_RT(irqs_disabled());
 	if (np->dev->poll)
 		poll_napi(np);
 
@@ -181,28 +183,31 @@ static void refill_skbs(void)
 
 static void zap_completion_queue(void)
 {
-	unsigned long flags;
 	struct softnet_data *sd = &get_cpu_var(softnet_data);
+	struct sk_buff *clist = NULL;
+	unsigned long flags;
 
 	if (sd->completion_queue) {
-		struct sk_buff *clist;
-
-		local_irq_save(flags);
+		raw_local_irq_save(flags);
 		clist = sd->completion_queue;
 		sd->completion_queue = NULL;
-		local_irq_restore(flags);
-
-		while (clist != NULL) {
-			struct sk_buff *skb = clist;
-			clist = clist->next;
-			if(skb->destructor)
-				dev_kfree_skb_any(skb); /* put this one back */
-			else
-				__kfree_skb(skb);
-		}
+		raw_local_irq_restore(flags);
 	}
 
+	/*
+	 * Took the list private, can drop our softnet
+	 * reference:
+	 */
 	put_cpu_var(softnet_data);
+
+	while (clist != NULL) {
+		struct sk_buff *skb = clist;
+		clist = clist->next;
+		if(skb->destructor)
+			dev_kfree_skb_any(skb); /* put this one back */
+		else
+			__kfree_skb(skb);
+	}
 }
 
 static struct sk_buff * find_skb(struct netpoll *np, int len, int reserve)
@@ -257,8 +262,8 @@ static void netpoll_send_skb(struct netp
 	npinfo = np->dev->npinfo;
 
 	/* avoid recursion */
-	if (npinfo->poll_owner == smp_processor_id() ||
-	    np->dev->xmit_lock_owner == smp_processor_id()) {
+	if (npinfo->poll_owner == raw_smp_processor_id() ||
+	    np->dev->xmit_lock_owner == raw_smp_processor_id()) {
 		if (np->drop)
 			np->drop(skb);
 		else
@@ -269,7 +274,7 @@ static void netpoll_send_skb(struct netp
 	do {
 		npinfo->tries--;
 		spin_lock(&np->dev->xmit_lock);
-		np->dev->xmit_lock_owner = smp_processor_id();
+		np->dev->xmit_lock_owner = raw_smp_processor_id();
 
 		/*
 		 * network drivers do not expect to be called if the queue is
Index: linux/net/core/sock.c
===================================================================
--- linux.orig/net/core/sock.c
+++ linux/net/core/sock.c
@@ -1216,7 +1216,7 @@ static void sock_def_readable(struct soc
 {
 	read_lock(&sk->sk_callback_lock);
 	if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-		wake_up_interruptible(sk->sk_sleep);
+		wake_up_interruptible_sync(sk->sk_sleep);
 	sk_wake_async(sk,1,POLL_IN);
 	read_unlock(&sk->sk_callback_lock);
 }
Index: linux/net/dccp/ipv4.c
===================================================================
--- linux.orig/net/dccp/ipv4.c
+++ linux/net/dccp/ipv4.c
@@ -28,10 +28,10 @@
 #include "dccp.h"
 
 struct inet_hashinfo __cacheline_aligned dccp_hashinfo = {
-	.lhash_lock	= RW_LOCK_UNLOCKED,
+	.lhash_lock	= RW_LOCK_UNLOCKED(dccp_hashinfo.lhash_lock),
 	.lhash_users	= ATOMIC_INIT(0),
 	.lhash_wait = __WAIT_QUEUE_HEAD_INITIALIZER(dccp_hashinfo.lhash_wait),
-	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
+	.portalloc_lock	= SPIN_LOCK_UNLOCKED(dccp_hashinfo.portalloc_lock),
 	.port_rover	= 1024 - 1,
 };
 
Index: linux/net/dccp/minisocks.c
===================================================================
--- linux.orig/net/dccp/minisocks.c
+++ linux/net/dccp/minisocks.c
@@ -26,7 +26,7 @@
 struct inet_timewait_death_row dccp_death_row = {
 	.sysctl_max_tw_buckets = NR_FILE * 2,
 	.period		= DCCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-	.death_lock	= SPIN_LOCK_UNLOCKED,
+	.death_lock	= SPIN_LOCK_UNLOCKED(dccp_death_row.death_lock),
 	.hashinfo	= &dccp_hashinfo,
 	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
 					    (unsigned long)&dccp_death_row),
Index: linux/net/decnet/dn_dev.c
===================================================================
--- linux.orig/net/decnet/dn_dev.c
+++ linux/net/decnet/dn_dev.c
@@ -87,9 +87,9 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"ethernet",
 	.ctl_name =	NET_DECNET_CONF_ETHER,
-	.up =		dn_eth_up,
-	.down = 	dn_eth_down,
-	.timer3 =	dn_send_brd_hello,
+	.dn_up =		dn_eth_up,
+	.dn_down = 	dn_eth_down,
+	.dn_timer3 =	dn_send_brd_hello,
 },
 {
 	.type =		ARPHRD_IPGRE, /* DECnet tunneled over GRE in IP */
@@ -99,7 +99,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"ipgre",
 	.ctl_name =	NET_DECNET_CONF_GRE,
-	.timer3 =	dn_send_brd_hello,
+	.dn_timer3 =	dn_send_brd_hello,
 },
 #if 0
 {
@@ -110,7 +110,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		120,
 	.name =		"x25",
 	.ctl_name =	NET_DECNET_CONF_X25,
-	.timer3 =	dn_send_ptp_hello,
+	.dn_timer3 =	dn_send_ptp_hello,
 },
 #endif
 #if 0
@@ -122,7 +122,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"ppp",
 	.ctl_name =	NET_DECNET_CONF_PPP,
-	.timer3 =	dn_send_brd_hello,
+	.dn_timer3 =	dn_send_brd_hello,
 },
 #endif
 {
@@ -133,7 +133,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		120,
 	.name =		"ddcmp",
 	.ctl_name =	NET_DECNET_CONF_DDCMP,
-	.timer3 =	dn_send_ptp_hello,
+	.dn_timer3 =	dn_send_ptp_hello,
 },
 {
 	.type =		ARPHRD_LOOPBACK, /* Loopback interface - always last */
@@ -143,7 +143,7 @@ static struct dn_dev_parms dn_dev_list[]
 	.t3 =		10,
 	.name =		"loopback",
 	.ctl_name =	NET_DECNET_CONF_LOOPBACK,
-	.timer3 =	dn_send_brd_hello,
+	.dn_timer3 =	dn_send_brd_hello,
 }
 };
 
@@ -332,11 +332,11 @@ static int dn_forwarding_proc(ctl_table 
 		 */
 		tmp = dn_db->parms.forwarding;
 		dn_db->parms.forwarding = old;
-		if (dn_db->parms.down)
-			dn_db->parms.down(dev);
+		if (dn_db->parms.dn_down)
+			dn_db->parms.dn_down(dev);
 		dn_db->parms.forwarding = tmp;
-		if (dn_db->parms.up)
-			dn_db->parms.up(dev);
+		if (dn_db->parms.dn_up)
+			dn_db->parms.dn_up(dev);
 	}
 
 	return err;
@@ -371,11 +371,11 @@ static int dn_forwarding_sysctl(ctl_tabl
 		if (value > 2)
 			return -EINVAL;
 
-		if (dn_db->parms.down)
-			dn_db->parms.down(dev);
+		if (dn_db->parms.dn_down)
+			dn_db->parms.dn_down(dev);
 		dn_db->parms.forwarding = value;
-		if (dn_db->parms.up)
-			dn_db->parms.up(dev);
+		if (dn_db->parms.dn_up)
+			dn_db->parms.dn_up(dev);
 	}
 
 	return 0;
@@ -1061,10 +1061,10 @@ static void dn_dev_timer_func(unsigned l
 	struct dn_ifaddr *ifa;
 
 	if (dn_db->t3 <= dn_db->parms.t2) {
-		if (dn_db->parms.timer3) {
+		if (dn_db->parms.dn_timer3) {
 			for(ifa = dn_db->ifa_list; ifa; ifa = ifa->ifa_next) {
 				if (!(ifa->ifa_flags & IFA_F_SECONDARY))
-					dn_db->parms.timer3(dev, ifa);
+					dn_db->parms.dn_timer3(dev, ifa);
 			}
 		}
 		dn_db->t3 = dn_db->parms.t3;
@@ -1116,8 +1116,8 @@ struct dn_dev *dn_dev_create(struct net_
 	init_timer(&dn_db->timer);
 
 	dn_db->uptime = jiffies;
-	if (dn_db->parms.up) {
-		if (dn_db->parms.up(dev) < 0) {
+	if (dn_db->parms.dn_up) {
+		if (dn_db->parms.dn_up(dev) < 0) {
 			dev->dn_ptr = NULL;
 			kfree(dn_db);
 			return NULL;
@@ -1212,8 +1212,8 @@ static void dn_dev_delete(struct net_dev
 	dn_dev_check_default(dev);
 	neigh_ifdown(&dn_neigh_table, dev);
 
-	if (dn_db->parms.down)
-		dn_db->parms.down(dev);
+	if (dn_db->parms.dn_down)
+		dn_db->parms.dn_down(dev);
 
 	dev->dn_ptr = NULL;
 
Index: linux/net/ipv4/netfilter/arptable_filter.c
===================================================================
--- linux.orig/net/ipv4/netfilter/arptable_filter.c
+++ linux/net/ipv4/netfilter/arptable_filter.c
@@ -142,7 +142,7 @@ static struct
 static struct arpt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(packet_filter.lock),
 	.private	= NULL,
 	.me		= THIS_MODULE,
 };
Index: linux/net/ipv4/netfilter/ip_conntrack_core.c
===================================================================
--- linux.orig/net/ipv4/netfilter/ip_conntrack_core.c
+++ linux/net/ipv4/netfilter/ip_conntrack_core.c
@@ -83,7 +83,7 @@ static unsigned int ip_conntrack_expect_
 struct notifier_block *ip_conntrack_chain;
 struct notifier_block *ip_conntrack_expect_chain;
 
-DEFINE_PER_CPU(struct ip_conntrack_ecache, ip_conntrack_ecache);
+DEFINE_PER_CPU_LOCKED(struct ip_conntrack_ecache, ip_conntrack_ecache);
 
 /* deliver cached events and clear cache entry - must be called with locally
  * disabled softirqs */
@@ -104,20 +104,23 @@ __ip_ct_deliver_cached_events(struct ip_
 void ip_ct_deliver_cached_events(const struct ip_conntrack *ct)
 {
 	struct ip_conntrack_ecache *ecache;
-	
+	int cpu;
+
 	local_bh_disable();
-	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	ecache = &get_cpu_var_locked(ip_conntrack_ecache, &cpu);
 	if (ecache->ct == ct)
 		__ip_ct_deliver_cached_events(ecache);
+	put_cpu_var_locked(ip_conntrack_ecache, cpu);
 	local_bh_enable();
 }
 
 void __ip_ct_event_cache_init(struct ip_conntrack *ct)
 {
 	struct ip_conntrack_ecache *ecache;
+	int cpu = raw_smp_processor_id();
 
 	/* take care of delivering potentially old events */
-	ecache = &__get_cpu_var(ip_conntrack_ecache);
+	ecache = &__get_cpu_var_locked(ip_conntrack_ecache, cpu);
 	BUG_ON(ecache->ct == ct);
 	if (ecache->ct)
 		__ip_ct_deliver_cached_events(ecache);
@@ -133,8 +136,11 @@ static void ip_ct_event_cache_flush(void
 	struct ip_conntrack_ecache *ecache;
 	int cpu;
 
+	/*
+	 * Drop each CPU's cached ct (NOTE(review): unlocked accessor, no locks taken).
+	 */
 	for_each_cpu(cpu) {
-		ecache = &per_cpu(ip_conntrack_ecache, cpu);
+		ecache = &__get_cpu_var_locked(ip_conntrack_ecache, cpu);
 		if (ecache->ct)
 			ip_conntrack_put(ecache->ct);
 	}
Index: linux/net/ipv4/netfilter/ip_conntrack_standalone.c
===================================================================
--- linux.orig/net/ipv4/netfilter/ip_conntrack_standalone.c
+++ linux/net/ipv4/netfilter/ip_conntrack_standalone.c
@@ -977,7 +977,7 @@ EXPORT_SYMBOL_GPL(ip_conntrack_expect_ch
 EXPORT_SYMBOL_GPL(ip_conntrack_register_notifier);
 EXPORT_SYMBOL_GPL(ip_conntrack_unregister_notifier);
 EXPORT_SYMBOL_GPL(__ip_ct_event_cache_init);
-EXPORT_PER_CPU_SYMBOL_GPL(ip_conntrack_ecache);
+EXPORT_PER_CPU_LOCKED_SYMBOL_GPL(ip_conntrack_ecache);
 #endif
 EXPORT_SYMBOL(ip_conntrack_protocol_register);
 EXPORT_SYMBOL(ip_conntrack_protocol_unregister);
Index: linux/net/ipv4/netfilter/ip_nat_rule.c
===================================================================
--- linux.orig/net/ipv4/netfilter/ip_nat_rule.c
+++ linux/net/ipv4/netfilter/ip_nat_rule.c
@@ -93,7 +93,7 @@ static struct
 static struct ipt_table nat_table = {
 	.name		= "nat",
 	.valid_hooks	= NAT_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(nat_table.lock),
 	.me		= THIS_MODULE,
 };
 
Index: linux/net/ipv4/netfilter/ip_tables.c
===================================================================
--- linux.orig/net/ipv4/netfilter/ip_tables.c
+++ linux/net/ipv4/netfilter/ip_tables.c
@@ -111,7 +111,11 @@ struct ipt_table_info
 static LIST_HEAD(ipt_target);
 static LIST_HEAD(ipt_match);
 static LIST_HEAD(ipt_tables);
-#define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0)
+/*
+ * Use atomic add because on PREEMPT_RT the same table might be used on
+ * two CPUs at once. NOTE(review): casts assume atomic_t-compatible fields.
+ */
+#define ADD_COUNTER(c,b,p) do { atomic_add((b), (atomic_t *)(&(c).bcnt)); atomic_add((p), (atomic_t *)(&(c).pcnt)); } while(0)
 
 #ifdef CONFIG_SMP
 #define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p))
@@ -290,8 +294,17 @@ ipt_do_table(struct sk_buff **pskb,
 
 	read_lock_bh(&table->lock);
 	IP_NF_ASSERT(table->valid_hooks & (1 << hook));
+	/*
+	 * on a PREEMPT_RT kernel the task could schedule
+	 * off and smp_processor_id() is not safe. So we take
+	 * the current value of the CPU and use that table. We
+	 * only update the counters while read-locking the table
+	 * and don't change the rules so the possibility of the
+	 * same table being used by two tasks at once is not a
+	 * problem.
+	 */
 	table_base = (void *)table->private->entries
-		+ TABLE_OFFSET(table->private, smp_processor_id());
+		+ TABLE_OFFSET(table->private, raw_smp_processor_id());
 	e = get_entry(table_base, table->private->hook_entry[hook]);
 
 #ifdef CONFIG_NETFILTER_DEBUG
@@ -299,7 +312,7 @@ ipt_do_table(struct sk_buff **pskb,
 	if (((struct ipt_entry *)table_base)->comefrom != 0xdead57ac
 	    && ((struct ipt_entry *)table_base)->comefrom != 0xeeeeeeec) {
 		printk("ASSERT: CPU #%u, %s comefrom(%p) = %X\n",
-		       smp_processor_id(),
+		       raw_smp_processor_id(),
 		       table->name,
 		       &((struct ipt_entry *)table_base)->comefrom,
 		       ((struct ipt_entry *)table_base)->comefrom);
Index: linux/net/ipv4/netfilter/iptable_filter.c
===================================================================
--- linux.orig/net/ipv4/netfilter/iptable_filter.c
+++ linux/net/ipv4/netfilter/iptable_filter.c
@@ -77,7 +77,7 @@ static struct
 static struct ipt_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(packet_filter.lock),
 	.me		= THIS_MODULE
 };
 
Index: linux/net/ipv4/netfilter/iptable_mangle.c
===================================================================
--- linux.orig/net/ipv4/netfilter/iptable_mangle.c
+++ linux/net/ipv4/netfilter/iptable_mangle.c
@@ -107,7 +107,7 @@ static struct
 static struct ipt_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(packet_mangler.lock),
 	.me		= THIS_MODULE,
 };
 
Index: linux/net/ipv4/netfilter/iptable_raw.c
===================================================================
--- linux.orig/net/ipv4/netfilter/iptable_raw.c
+++ linux/net/ipv4/netfilter/iptable_raw.c
@@ -82,7 +82,7 @@ static struct
 static struct ipt_table packet_raw = { 
 	.name = "raw", 
 	.valid_hooks =  RAW_VALID_HOOKS, 
-	.lock = RW_LOCK_UNLOCKED, 
+	.lock = RW_LOCK_UNLOCKED(packet_raw.lock),
 	.me = THIS_MODULE
 };
 
Index: linux/net/ipv4/route.c
===================================================================
--- linux.orig/net/ipv4/route.c
+++ linux/net/ipv4/route.c
@@ -204,14 +204,14 @@ __u8 ip_tos2prio[16] = {
 struct rt_hash_bucket {
 	struct rtable	*chain;
 };
-#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK)
+#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) || defined(CONFIG_PREEMPT_RT)
 /*
  * Instead of using one spinlock for each rt_hash_bucket, we use a table of spinlocks
  * The size of this table is a power of two and depends on the number of CPUS.
  */
-#if NR_CPUS >= 32
+#if NR_CPUS >= 32 && !defined(CONFIG_PREEMPT_RT)
 #define RT_HASH_LOCK_SZ	4096
-#elif NR_CPUS >= 16
+#elif NR_CPUS >= 16 && !defined(CONFIG_PREEMPT_RT)
 #define RT_HASH_LOCK_SZ	2048
 #elif NR_CPUS >= 8
 #define RT_HASH_LOCK_SZ	1024
@@ -231,7 +231,7 @@ static spinlock_t	*rt_hash_locks;
 			spin_lock_init(&rt_hash_locks[i]); \
 		}
 #else
-# define rt_hash_lock_addr(slot) NULL
+# define rt_hash_lock_addr(slot) ((spinlock_t *)NULL)
 # define rt_hash_lock_init()
 #endif
 
Index: linux/net/ipv4/tcp_ipv4.c
===================================================================
--- linux.orig/net/ipv4/tcp_ipv4.c
+++ linux/net/ipv4/tcp_ipv4.c
@@ -90,10 +90,10 @@ void tcp_v4_send_check(struct sock *sk, 
 		       struct sk_buff *skb);
 
 struct inet_hashinfo __cacheline_aligned tcp_hashinfo = {
-	.lhash_lock	= RW_LOCK_UNLOCKED,
+	.lhash_lock	= RW_LOCK_UNLOCKED(tcp_hashinfo.lhash_lock),
 	.lhash_users	= ATOMIC_INIT(0),
 	.lhash_wait	= __WAIT_QUEUE_HEAD_INITIALIZER(tcp_hashinfo.lhash_wait),
-	.portalloc_lock	= SPIN_LOCK_UNLOCKED,
+	.portalloc_lock	= SPIN_LOCK_UNLOCKED(tcp_hashinfo.portalloc_lock),
 	.port_rover	= 1024 - 1,
 };
 
Index: linux/net/ipv4/tcp_minisocks.c
===================================================================
--- linux.orig/net/ipv4/tcp_minisocks.c
+++ linux/net/ipv4/tcp_minisocks.c
@@ -41,7 +41,7 @@ int sysctl_tcp_abort_on_overflow;
 struct inet_timewait_death_row tcp_death_row = {
 	.sysctl_max_tw_buckets = NR_FILE * 2,
 	.period		= TCP_TIMEWAIT_LEN / INET_TWDR_TWKILL_SLOTS,
-	.death_lock	= SPIN_LOCK_UNLOCKED,
+	.death_lock	= SPIN_LOCK_UNLOCKED(tcp_death_row.death_lock),
 	.hashinfo	= &tcp_hashinfo,
 	.tw_timer	= TIMER_INITIALIZER(inet_twdr_hangman, 0,
 					    (unsigned long)&tcp_death_row),
Index: linux/net/ipv4/xfrm4_policy.c
===================================================================
--- linux.orig/net/ipv4/xfrm4_policy.c
+++ linux/net/ipv4/xfrm4_policy.c
@@ -18,7 +18,7 @@
 static struct dst_ops xfrm4_dst_ops;
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo;
 
-static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED };
+static struct xfrm_type_map xfrm4_type_map = { .lock = RW_LOCK_UNLOCKED(xfrm4_type_map.lock) };
 
 static int xfrm4_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
@@ -296,7 +296,7 @@ static struct dst_ops xfrm4_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
 	.family = 		AF_INET,
-	.lock = 		RW_LOCK_UNLOCKED,
+	.lock = 		RW_LOCK_UNLOCKED(xfrm4_policy_afinfo.lock),
 	.type_map = 		&xfrm4_type_map,
 	.dst_ops =		&xfrm4_dst_ops,
 	.dst_lookup =		xfrm4_dst_lookup,
Index: linux/net/ipv4/xfrm4_state.c
===================================================================
--- linux.orig/net/ipv4/xfrm4_state.c
+++ linux/net/ipv4/xfrm4_state.c
@@ -116,7 +116,7 @@ __xfrm4_find_acq(u8 mode, u32 reqid, u8 
 
 static struct xfrm_state_afinfo xfrm4_state_afinfo = {
 	.family			= AF_INET,
-	.lock			= RW_LOCK_UNLOCKED,
+	.lock			= RW_LOCK_UNLOCKED(xfrm4_state_afinfo.lock),
 	.init_flags		= xfrm4_init_flags,
 	.init_tempsel		= __xfrm4_init_tempsel,
 	.state_lookup		= __xfrm4_state_lookup,
Index: linux/net/ipv6/netfilter/ip6table_filter.c
===================================================================
--- linux.orig/net/ipv6/netfilter/ip6table_filter.c
+++ linux/net/ipv6/netfilter/ip6table_filter.c
@@ -95,7 +95,7 @@ static struct
 static struct ip6t_table packet_filter = {
 	.name		= "filter",
 	.valid_hooks	= FILTER_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(packet_filter.lock),
 	.me		= THIS_MODULE,
 };
 
Index: linux/net/ipv6/netfilter/ip6table_mangle.c
===================================================================
--- linux.orig/net/ipv6/netfilter/ip6table_mangle.c
+++ linux/net/ipv6/netfilter/ip6table_mangle.c
@@ -125,7 +125,7 @@ static struct
 static struct ip6t_table packet_mangler = {
 	.name		= "mangle",
 	.valid_hooks	= MANGLE_VALID_HOOKS,
-	.lock		= RW_LOCK_UNLOCKED,
+	.lock		= RW_LOCK_UNLOCKED(packet_mangler.lock),
 	.me		= THIS_MODULE,
 };
 
Index: linux/net/ipv6/netfilter/ip6table_raw.c
===================================================================
--- linux.orig/net/ipv6/netfilter/ip6table_raw.c
+++ linux/net/ipv6/netfilter/ip6table_raw.c
@@ -109,7 +109,7 @@ static struct
 static struct ip6t_table packet_raw = { 
 	.name = "raw", 
 	.valid_hooks = RAW_VALID_HOOKS, 
-	.lock = RW_LOCK_UNLOCKED, 
+	.lock = RW_LOCK_UNLOCKED(packet_raw.lock),
 	.me = THIS_MODULE
 };
 
Index: linux/net/ipv6/xfrm6_policy.c
===================================================================
--- linux.orig/net/ipv6/xfrm6_policy.c
+++ linux/net/ipv6/xfrm6_policy.c
@@ -24,7 +24,7 @@
 static struct dst_ops xfrm6_dst_ops;
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo;
 
-static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED };
+static struct xfrm_type_map xfrm6_type_map = { .lock = RW_LOCK_UNLOCKED(xfrm6_type_map.lock) };
 
 static int xfrm6_dst_lookup(struct xfrm_dst **dst, struct flowi *fl)
 {
@@ -309,7 +309,7 @@ static struct dst_ops xfrm6_dst_ops = {
 
 static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
 	.family =		AF_INET6,
-	.lock = 		RW_LOCK_UNLOCKED,
+	.lock = 		RW_LOCK_UNLOCKED(xfrm6_policy_afinfo.lock),
 	.type_map = 		&xfrm6_type_map,
 	.dst_ops =		&xfrm6_dst_ops,
 	.dst_lookup =		xfrm6_dst_lookup,
Index: linux/net/ipv6/xfrm6_state.c
===================================================================
--- linux.orig/net/ipv6/xfrm6_state.c
+++ linux/net/ipv6/xfrm6_state.c
@@ -118,7 +118,7 @@ __xfrm6_find_acq(u8 mode, u32 reqid, u8 
 
 static struct xfrm_state_afinfo xfrm6_state_afinfo = {
 	.family			= AF_INET6,
-	.lock			= RW_LOCK_UNLOCKED,
+	.lock			= RW_LOCK_UNLOCKED(xfrm6_state_afinfo.lock),
 	.init_tempsel		= __xfrm6_init_tempsel,
 	.state_lookup		= __xfrm6_state_lookup,
 	.find_acq		= __xfrm6_find_acq,
Index: linux/net/netfilter/nfnetlink_log.c
===================================================================
--- linux.orig/net/netfilter/nfnetlink_log.c
+++ linux/net/netfilter/nfnetlink_log.c
@@ -152,7 +152,7 @@ instance_create(u_int16_t group_num, int
 
 	memset(inst, 0, sizeof(*inst));
 	INIT_HLIST_NODE(&inst->hlist);
-	inst->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&inst->lock);
 	/* needs to be two, since we _put() after creation */
 	atomic_set(&inst->use, 2);
 
Index: linux/net/netfilter/nfnetlink_queue.c
===================================================================
--- linux.orig/net/netfilter/nfnetlink_queue.c
+++ linux/net/netfilter/nfnetlink_queue.c
@@ -149,7 +149,7 @@ instance_create(u_int16_t queue_num, int
 	atomic_set(&inst->id_sequence, 0);
 	/* needs to be two, since we _put() after creation */
 	atomic_set(&inst->use, 2);
-	inst->lock = SPIN_LOCK_UNLOCKED;
+	spin_lock_init(&inst->lock);
 	INIT_LIST_HEAD(&inst->queue_list);
 
 	if (!try_module_get(THIS_MODULE))
Index: linux/net/sched/sch_generic.c
===================================================================
--- linux.orig/net/sched/sch_generic.c
+++ linux/net/sched/sch_generic.c
@@ -14,6 +14,7 @@
 #include <asm/uaccess.h>
 #include <asm/system.h>
 #include <linux/bitops.h>
+#include <linux/kallsyms.h>
 #include <linux/config.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -32,6 +33,7 @@
 #include <linux/init.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
+#include <linux/delay.h>
 #include <net/sock.h>
 #include <net/pkt_sched.h>
 
@@ -108,8 +110,11 @@ int qdisc_restart(struct net_device *dev
 		 * will be requeued.
 		 */
 		if (!nolock) {
+#ifdef CONFIG_PREEMPT_RT
+			spin_lock(&dev->xmit_lock);
+			dev->xmit_lock_owner = raw_smp_processor_id();
+#else
 			if (!spin_trylock(&dev->xmit_lock)) {
-			collision:
 				/* So, someone grabbed the driver. */
 				
 				/* It may be transient configuration error,
@@ -117,7 +122,7 @@ int qdisc_restart(struct net_device *dev
 				   it by checking xmit owner and drop the
 				   packet when deadloop is detected.
 				*/
-				if (dev->xmit_lock_owner == smp_processor_id()) {
+				if (dev->xmit_lock_owner == raw_smp_processor_id()) {
 					kfree_skb(skb);
 					if (net_ratelimit())
 						printk(KERN_DEBUG "Dead loop on netdevice %s, fix it urgently!\n", dev->name);
@@ -127,7 +132,8 @@ int qdisc_restart(struct net_device *dev
 				goto requeue;
 			}
 			/* Remember that the driver is grabbed by us. */
-			dev->xmit_lock_owner = smp_processor_id();
+			dev->xmit_lock_owner = raw_smp_processor_id();
+#endif
 		}
 		
 		{
@@ -139,7 +145,20 @@ int qdisc_restart(struct net_device *dev
 				if (netdev_nit)
 					dev_queue_xmit_nit(skb, dev);
 
+				WARN_ON_RT(irqs_disabled());
 				ret = dev->hard_start_xmit(skb, dev);
+#ifdef CONFIG_PREEMPT_RT
+				if (irqs_disabled()) {
+					if (printk_ratelimit())
+						print_symbol("network driver disabled interrupts: %s\n", (unsigned long)dev->hard_start_xmit);
+					local_irq_enable();
+				}
+				if (raw_irqs_disabled()) {
+					if (printk_ratelimit())
+						print_symbol("network driver disabled raw interrupts: %s\n", (unsigned long)dev->hard_start_xmit);
+					raw_local_irq_enable();
+				}
+#endif
 				if (ret == NETDEV_TX_OK) { 
 					if (!nolock) {
 						dev->xmit_lock_owner = -1;
@@ -150,7 +169,10 @@ int qdisc_restart(struct net_device *dev
 				}
 				if (ret == NETDEV_TX_LOCKED && nolock) {
 					spin_lock(&dev->queue_lock);
-					goto collision; 
+					preempt_disable();
+					__get_cpu_var(netdev_rx_stat).cpu_collision++;
+					preempt_enable();
+					goto requeue;
 				}
 			}
 
@@ -578,7 +600,7 @@ void dev_deactivate(struct net_device *d
 	dev_watchdog_down(dev);
 
 	while (test_bit(__LINK_STATE_SCHED, &dev->state))
-		yield();
+		msleep(1);
 
 	spin_unlock_wait(&dev->xmit_lock);
 }
Index: linux/net/unix/af_unix.c
===================================================================
--- linux.orig/net/unix/af_unix.c
+++ linux/net/unix/af_unix.c
@@ -290,10 +290,11 @@ static void unix_write_space(struct sock
 	read_lock(&sk->sk_callback_lock);
 	if (unix_writable(sk)) {
 		if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
-			wake_up_interruptible(sk->sk_sleep);
+			wake_up_interruptible_sync(sk->sk_sleep);
 		sk_wake_async(sk, 2, POLL_OUT);
 	}
 	read_unlock(&sk->sk_callback_lock);
+	preempt_check_resched_delayed();
 }
 
 /* When dgram socket disconnects (or changes its peer), we clear its receive
Index: linux/scripts/Makefile
===================================================================
--- linux.orig/scripts/Makefile
+++ linux/scripts/Makefile
@@ -12,6 +12,9 @@ hostprogs-$(CONFIG_LOGO)         += pnmt
 hostprogs-$(CONFIG_VT)           += conmakehash
 hostprogs-$(CONFIG_PROM_CONSOLE) += conmakehash
 hostprogs-$(CONFIG_IKCONFIG)     += bin2c
+ifdef CONFIG_LPPTEST
+hostprogs-y      += testlpp
+endif
 
 always		:= $(hostprogs-y)
 
Index: linux/scripts/testlpp.c
===================================================================
--- /dev/null
+++ linux/scripts/testlpp.c
@@ -0,0 +1,159 @@
+/*
+ * testlpp.c: use the /dev/lpptest device to test IRQ handling
+ *            latencies over parallel port
+ *
+ *      Copyright (C) 2005 Thomas Gleixner
+ *
+ * licensed under the GPL
+ */
+#include <unistd.h>
+#include <stdio.h>
+#include <string.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <sys/io.h>
+#include <sys/ioctl.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+
+#define LPPTEST_CHAR_MAJOR 245
+#define LPPTEST_DEVICE_NAME "lpptest"
+/* ioctl requests issued on the /dev/lpptest fd; each is passed &tim */
+#define LPPTEST_TEST    _IOR (LPPTEST_CHAR_MAJOR, 1, unsigned long long)
+#define LPPTEST_DISABLE _IOR (LPPTEST_CHAR_MAJOR, 2, unsigned long long)
+#define LPPTEST_ENABLE  _IOR (LPPTEST_CHAR_MAJOR, 3, unsigned long long)
+/* one bucket per usec; hist_hit() puts everything larger in the last bucket */
+#define HIST_SIZE 10000
+
+static int hist_total;
+static unsigned long hist[HIST_SIZE];
+
+static void hist_hit(unsigned long usecs)
+{
+	/* latencies past the end of the table all land in the last bucket */
+	unsigned long slot = usecs < HIST_SIZE-1 ? usecs : HIST_SIZE-1;
+
+	hist_total++;
+	hist[slot]++;
+}
+
+static void print_hist(void)
+{
+	int i;
+
+	printf("LPP latency histogram:\n");
+	/* only non-empty buckets are listed; hist[] is unsigned long -> %lu */
+	for (i = 0; i < HIST_SIZE; i++) {
+		if (hist[i])
+			printf("%3d usecs: %9lu\n", i, hist[i]);
+	}
+}
+
+static inline unsigned long long int rdtsc(void)
+{	/* returns a reading of the "rdtsc" instruction taken right after another one */
+	unsigned long long int x, y;	/* NOTE(review): "=A" is the edx:eax pair only on 32-bit x86 - confirm before building for x86-64 */
+	for (;;) {	/* two back-to-back reads; retry unless y - x < 1000 */
+		__asm__ volatile ("rdtsc" : "=A" (x));
+		__asm__ volatile ("rdtsc" : "=A" (y));
+		if (y - x < 1000)
+			return y;	/* the second reading */
+	}
+}
+
+static unsigned long long calibrate_loop(void)
+{
+	unsigned long long start, elapsed;
+
+	/* count cycles across a 0.5s sleep, then scale to one second */
+	start = rdtsc();
+	usleep(500000);
+	elapsed = rdtsc() - start;
+	return elapsed * 2;
+}
+
+#define time_to_usecs(time) ((double)time*1000000.0/(double)cycles_per_sec)
+/* integer variant; main() uses it to pick the histogram bucket */
+#define time_to_usecs_l(time) (long)(time*1000000/cycles_per_sec)
+/* tim receives each ioctl result (cycles); min_tim starts at the maximum value */
+int fd, total;
+unsigned long long tim, sum_tim, min_tim = -1ULL, max_tim, cycles_per_sec;
+
+/* Print statistics and exit; with no responses report 0.00, not NaN/-1ULL. */
+void cleanup(int sig)
+{
+	ioctl (fd, LPPTEST_ENABLE, &tim);
+	if (sig)
+		printf("[ interrupted - exiting ]\n");
+	printf("\ntotal number of responses: %d\n", total);
+	printf("average reponse latency:   %.2lf usecs\n",
+		total ? time_to_usecs(sum_tim/total) : 0.0);
+	printf("minimum latency:           %.2lf usecs\n",
+		total ? time_to_usecs(min_tim) : 0.0);
+	printf("maximum latency:           %.2lf usecs\n", time_to_usecs(max_tim));
+	print_hist();
+	exit(0);
+}
+
+#define HZ 3000
+
+/* Time LPPTEST_TEST responses until Ctrl-C or <nr_of_requests> are seen. */
+int main (int argc, char **argv)
+{
+	unsigned int nr_requests = 0;
+
+	if (argc > 2) {
+		fprintf(stderr, "usage: testlpp [<nr_of_requests>]\n");
+		exit(-1);
+	}
+	if (argc == 2)
+		nr_requests = atol(argv[1]);
+
+	if (getuid() != 0) {
+		fprintf(stderr, "need to run as root!\n");
+		exit(-1);
+	}
+	mknod("/dev/lpptest", S_IFCHR|0666, makedev(LPPTEST_CHAR_MAJOR, 1));
+
+	fd = open("/dev/lpptest", O_RDWR);
+	if (fd == -1) {
+		fprintf(stderr, "could not open /dev/lpptest, your kernel doesnt have CONFIG_LPPTEST enabled?\n");
+		exit(-1);
+	}
+
+	signal(SIGINT,&cleanup);
+
+	ioctl (fd, LPPTEST_DISABLE, &tim);
+
+	fprintf(stderr, "calibrating cycles to usecs: ");
+	cycles_per_sec = calibrate_loop();
+	fprintf(stderr, "%llu cycles per usec\n", cycles_per_sec/1000000);
+	if (nr_requests)
+		fprintf(stderr, "[max # of requests: %u]\n", nr_requests);
+	fprintf(stderr, "starting %dHz test, hit Ctrl-C to stop:\n\n", HZ);
+
+	while(1) {
+		ioctl (fd, LPPTEST_TEST, &tim);
+		if (tim == 0)
+			printf ("No response from target.\n");
+		else {
+			hist_hit(time_to_usecs_l(tim));
+			if (tim > max_tim) {
+				printf ("new max latency: %.2lf usecs (%llu cycles)\n", time_to_usecs(tim), tim);
+				max_tim = tim;
+			}
+			if (tim < min_tim)
+				min_tim = tim;
+			sum_tim += tim;	/* before the break so the last sample counts */
+			total++;
+			if (total == nr_requests)
+				break;
+		}
+		usleep(1000000/HZ);
+	}
+	cleanup(0);
+	return 0;
+}
+
+
Index: linux/security/keys/process_keys.c
===================================================================
--- linux.orig/security/keys/process_keys.c
+++ linux/security/keys/process_keys.c
@@ -26,7 +26,7 @@ static DECLARE_MUTEX(key_session_sem);
 struct key_user root_key_user = {
 	.usage		= ATOMIC_INIT(3),
 	.consq		= LIST_HEAD_INIT(root_key_user.consq),
-	.lock		= SPIN_LOCK_UNLOCKED,
+	.lock		= SPIN_LOCK_UNLOCKED(root_key_user.lock),
 	.nkeys		= ATOMIC_INIT(2),
 	.nikeys		= ATOMIC_INIT(2),
 	.uid		= 0,
Index: linux/sound/core/pcm_lib.c
===================================================================
--- linux.orig/sound/core/pcm_lib.c
+++ linux/sound/core/pcm_lib.c
@@ -133,6 +133,7 @@ static void xrun(snd_pcm_substream_t *su
 	snd_pcm_stop(substream, SNDRV_PCM_STATE_XRUN);
 #ifdef CONFIG_SND_DEBUG
 	if (substream->pstr->xrun_debug) {
+		user_trace_stop();
 		snd_printd(KERN_DEBUG "XRUN: pcmC%dD%d%c\n",
 			   substream->pcm->card->number,
 			   substream->pcm->device,
Index: linux/sound/oss/dmasound/dmasound_core.c
===================================================================
--- linux.orig/sound/oss/dmasound/dmasound_core.c
+++ linux/sound/oss/dmasound/dmasound_core.c
@@ -230,7 +230,7 @@ static int shared_resources_initialised;
      *  Mid level stuff
      */
 
-struct sound_settings dmasound = { .lock = SPIN_LOCK_UNLOCKED };
+struct sound_settings dmasound = { .lock = SPIN_LOCK_UNLOCKED(dmasound.lock) };
 
 static inline void sound_silence(void)
 {
Index: linux/sound/oss/emu10k1/midi.c
===================================================================
--- linux.orig/sound/oss/emu10k1/midi.c
+++ linux/sound/oss/emu10k1/midi.c
@@ -45,7 +45,7 @@
 #include "../sound_config.h"
 #endif
 
-static DEFINE_SPINLOCK(midi_spinlock __attribute((unused)));
+static DEFINE_SPINLOCK(midi_spinlock);
 
 static void init_midi_hdr(struct midi_hdr *midihdr)
 {