diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v6-3.8.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v6-3.8.patch new file mode 100644 index 000000000..77c829a33 --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0001-block-cgroups-kconfig-build-bits-for-BFQ-v6-3.8.patch @@ -0,0 +1,98 @@ +# Calculate format=diff os_linux_system==desktop +From 59fd22f37d9acfa07186a02bb1cd2d64785d82b1 Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Sat, 4 Feb 2012 10:55:51 +0100 +Subject: [PATCH 1/2] block: cgroups, kconfig, build bits for BFQ-v6-3.8 + +Update Kconfig.iosched and do the related Makefile changes to include +kernel configuration options for BFQ. Also add the bfqio controller +to the cgroups subsystem. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/Kconfig.iosched | 26 ++++++++++++++++++++++++++ + block/Makefile | 1 + + include/linux/cgroup_subsys.h | 6 ++++++ + 3 files changed, 33 insertions(+) + +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 421bef9..56474b2 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -39,6 +39,28 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ ++ tristate "BFQ I/O scheduler" ++ depends on EXPERIMENTAL ++ default n ++ ---help--- ++ The BFQ I/O scheduler tries to distribute bandwidth among ++ all processes according to their weights. ++ It aims at distributing the bandwidth as desired, independently of ++ the disk parameters and with any workload. It also tries to ++ guarantee low latency to interactive and soft real-time ++ applications. If compiled built-in (saying Y here), BFQ can ++ be configured to support hierarchical scheduling. ++ ++config CGROUP_BFQIO ++ bool "BFQ hierarchical scheduling support" ++ depends on CGROUPS && IOSCHED_BFQ=y ++ default n ++ ---help--- ++ Enable hierarchical scheduling in BFQ, using the cgroups ++ filesystem interface. The name of the subsystem will be ++ bfqio. 
++ + choice + prompt "Default I/O scheduler" + default DEFAULT_CFQ +@@ -52,6 +74,9 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ ++ bool "BFQ" if IOSCHED_BFQ=y ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -61,6 +86,7 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq" if DEFAULT_BFQ + default "noop" if DEFAULT_NOOP + + endmenu +diff --git a/block/Makefile b/block/Makefile +index 39b76ba..c0d20fa 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -15,6 +15,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o + obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o + obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o + obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o ++obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_DEV_INTEGRITY) += blk-integrity.o +diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h +index f204a7a..b999bfa 100644 +--- a/include/linux/cgroup_subsys.h ++++ b/include/linux/cgroup_subsys.h +@@ -78,3 +78,9 @@ SUBSYS(hugetlb) + #endif + + /* */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++SUBSYS(bfqio) ++#endif ++ ++/* */ +-- +1.8.1.2 + diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0002-block-introduce-the-BFQ-v6-I-O-sched-for-3.8.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0002-block-introduce-the-BFQ-v6-I-O-sched-for-3.8.patch new file mode 100644 index 000000000..34cd92a6d --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0002-block-introduce-the-BFQ-v6-I-O-sched-for-3.8.patch @@ -0,0 +1,5856 @@ +# Calculate format=diff os_linux_system==desktop +From d2ba3dc4196b2b1579f6ccbb64880e662684b8ba Mon Sep 17 00:00:00 2001 +From: Arianna Avanzini +Date: Sun, 5 Feb 2012 01:04:27 +0100 +Subject: [PATCH 2/2] block: introduce the BFQ-v6 I/O sched for 3.8 + +Add the BFQ-v6 I/O scheduler to 3.8. +The general structure is borrowed from CFQ, as much code. A (bfq_)queue is +associated to each task doing I/O on a device, and each time a scheduling +decision has to be made a queue is selected and served until it expires. + + - Slices are given in the service domain: tasks are assigned budgets, + measured in number of sectors. Once got the disk, a task must + however consume its assigned budget within a configurable maximum time + (by default, the maximum possible value of the budgets is automatically + computed to comply with this timeout). This allows the desired latency + vs "throughput boosting" tradeoff to be set. + + - Budgets are scheduled according to a variant of WF2Q+, implemented + using an augmented rb-tree to take eligibility into account while + preserving an O(log N) overall complexity. + + - A low-latency tunable is provided; if enabled, both interactive and soft + real-time applications are guaranteed very low latency. + + - Latency guarantees are preserved also in presence of NCQ. + + - Also with flash-based devices, a high throughput is achieved while + still preserving latency guarantees. + + - A useful feature borrowed from CFQ: static fallback queue for OOM. + + - Differently from CFQ, BFQ uses a unified mechanism (Early Queue Merge, + EQM) to get a sequential read pattern, and hence a high throughput, + with any set of processes performing interleaved I/O. EQM also + preserves low latency. 
The code for detecting whether two queues have + to be merged is a slightly modified version of the CFQ code for + detecting whether two queues belong to cooperating processes and whether + the service of a queue should be preempted to boost the throughput. + + - BFQ supports full hierarchical scheduling, exporting a cgroups + interface. Each node has a full scheduler, so each group can + be assigned its own ioprio (mapped to a weight, see next point) + and an ioprio_class. + + - If the cgroups interface is used, weights can be explictly assigned, + otherwise ioprio values are mapped to weights using the relation + weight = IOPRIO_BE_NR - ioprio. + + - ioprio classes are served in strict priority order, i.e., lower + priority queues are not served as long as there are higher priority + queues. Among queues in the same class the bandwidth is distributed + in proportion to the weight of each queue. A very thin extra bandwidth + is however guaranteed to the Idle class, to prevent it from starving. + +Signed-off-by: Paolo Valente +Signed-off-by: Arianna Avanzini +--- + block/bfq-cgroup.c | 838 ++++++++++++++ + block/bfq-ioc.c | 36 + + block/bfq-iosched.c | 3218 +++++++++++++++++++++++++++++++++++++++++++++++++++ + block/bfq-sched.c | 1044 +++++++++++++++++ + block/bfq.h | 617 ++++++++++ + 5 files changed, 5753 insertions(+) + create mode 100644 block/bfq-cgroup.c + create mode 100644 block/bfq-ioc.c + create mode 100644 block/bfq-iosched.c + create mode 100644 block/bfq-sched.c + create mode 100644 block/bfq.h + +diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c +new file mode 100644 +index 0000000..1ae54d1 +--- /dev/null ++++ b/block/bfq-cgroup.c +@@ -0,0 +1,838 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++static struct bfqio_cgroup bfqio_root_cgroup = { ++ .weight = BFQ_DEFAULT_GRP_WEIGHT, ++ .ioprio = BFQ_DEFAULT_GRP_IOPRIO, ++ .ioprio_class = BFQ_DEFAULT_GRP_CLASS, ++}; ++ ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static struct bfqio_cgroup *cgroup_to_bfqio(struct cgroup *cgroup) ++{ ++ return container_of(cgroup_subsys_state(cgroup, bfqio_subsys_id), ++ struct bfqio_cgroup, css); ++} ++ ++/* ++ * Search the bfq_group for bfqd into the hash table (by now only a list) ++ * of bgrp. Must be called under rcu_read_lock(). 
++ */ ++static struct bfq_group *bfqio_lookup_group(struct bfqio_cgroup *bgrp, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_group *bfqg; ++ struct hlist_node *n; ++ void *key; ++ ++ hlist_for_each_entry_rcu(bfqg, n, &bgrp->group_data, group_node) { ++ key = rcu_dereference(bfqg->bfqd); ++ if (key == bfqd) ++ return bfqg; ++ } ++ ++ return NULL; ++} ++ ++static inline void bfq_group_init_entity(struct bfqio_cgroup *bgrp, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqg->entity; ++ ++ entity->weight = entity->new_weight = bgrp->weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio = bgrp->ioprio; ++ entity->ioprio_class = entity->new_ioprio_class = bgrp->ioprio_class; ++ entity->ioprio_changed = 1; ++ entity->my_sched_data = &bfqg->sched_data; ++} ++ ++static inline void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(parent == NULL); ++ BUG_ON(bfqg == NULL); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++/** ++ * bfq_group_chain_alloc - allocate a chain of groups. ++ * @bfqd: queue descriptor. ++ * @cgroup: the leaf cgroup this chain starts from. ++ * ++ * Allocate a chain of groups starting from the one belonging to ++ * @cgroup up to the root cgroup. Stop if a cgroup on the chain ++ * to the root has already an allocated group on @bfqd. ++ */ ++static struct bfq_group *bfq_group_chain_alloc(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *prev = NULL, *leaf = NULL; ++ ++ for (; cgroup != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) { ++ /* ++ * All the cgroups in the path from there to the ++ * root must have a bfq_group for bfqd, so we don't ++ * need any more allocations. ++ */ ++ break; ++ } ++ ++ bfqg = kzalloc(sizeof(*bfqg), GFP_ATOMIC); ++ if (bfqg == NULL) ++ goto cleanup; ++ ++ bfq_group_init_entity(bgrp, bfqg); ++ bfqg->my_entity = &bfqg->entity; ++ ++ if (leaf == NULL) { ++ leaf = bfqg; ++ prev = leaf; ++ } else { ++ bfq_group_set_parent(prev, bfqg); ++ /* ++ * Build a list of allocated nodes using the bfqd ++ * filed, that is still unused and will be initialized ++ * only after the node will be connected. ++ */ ++ prev->bfqd = bfqg; ++ prev = bfqg; ++ } ++ } ++ ++ return leaf; ++ ++cleanup: ++ while (leaf != NULL) { ++ prev = leaf; ++ leaf = leaf->bfqd; ++ kfree(prev); ++ } ++ ++ return NULL; ++} ++ ++/** ++ * bfq_group_chain_link - link an allocatd group chain to a cgroup hierarchy. ++ * @bfqd: the queue descriptor. ++ * @cgroup: the leaf cgroup to start from. ++ * @leaf: the leaf group (to be associated to @cgroup). ++ * ++ * Try to link a chain of groups to a cgroup hierarchy, connecting the ++ * nodes bottom-up, so we can be sure that when we find a cgroup in the ++ * hierarchy that already as a group associated to @bfqd all the nodes ++ * in the path to the root cgroup have one too. ++ * ++ * On locking: the queue lock protects the hierarchy (there is a hierarchy ++ * per device) while the bfqio_cgroup lock protects the list of groups ++ * belonging to the same cgroup. 
++ */ ++static void bfq_group_chain_link(struct bfq_data *bfqd, struct cgroup *cgroup, ++ struct bfq_group *leaf) ++{ ++ struct bfqio_cgroup *bgrp; ++ struct bfq_group *bfqg, *next, *prev = NULL; ++ unsigned long flags; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ for (; cgroup != NULL && leaf != NULL; cgroup = cgroup->parent) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ next = leaf->bfqd; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ BUG_ON(bfqg != NULL); ++ ++ spin_lock_irqsave(&bgrp->lock, flags); ++ ++ rcu_assign_pointer(leaf->bfqd, bfqd); ++ hlist_add_head_rcu(&leaf->group_node, &bgrp->group_data); ++ hlist_add_head(&leaf->bfqd_node, &bfqd->group_list); ++ ++ spin_unlock_irqrestore(&bgrp->lock, flags); ++ ++ prev = leaf; ++ leaf = next; ++ } ++ ++ BUG_ON(cgroup == NULL && leaf != NULL); ++ if (cgroup != NULL && prev != NULL) { ++ bgrp = cgroup_to_bfqio(cgroup); ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ bfq_group_set_parent(prev, bfqg); ++ } ++} ++ ++/** ++ * bfq_find_alloc_group - return the group associated to @bfqd in @cgroup. ++ * @bfqd: queue descriptor. ++ * @cgroup: cgroup being searched for. ++ * ++ * Return a group associated to @bfqd in @cgroup, allocating one if ++ * necessary. When a group is returned all the cgroups in the path ++ * to the root have a group associated to @bfqd. ++ * ++ * If the allocation fails, return the root group: this breaks guarantees ++ * but is a safe fallbak. If this loss becames a problem it can be ++ * mitigated using the equivalent weight (given by the product of the ++ * weights of the groups in the path from @group to the root) in the ++ * root scheduler. ++ * ++ * We allocate all the missing nodes in the path from the leaf cgroup ++ * to the root and we connect the nodes only after all the allocations ++ * have been successful. ++ */ ++static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, ++ struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct bfq_group *bfqg; ++ ++ bfqg = bfqio_lookup_group(bgrp, bfqd); ++ if (bfqg != NULL) ++ return bfqg; ++ ++ bfqg = bfq_group_chain_alloc(bfqd, cgroup); ++ if (bfqg != NULL) ++ bfq_group_chain_link(bfqd, cgroup, bfqg); ++ else ++ bfqg = bfqd->root_group; ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @entity: @bfqq's entity. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_entity *entity, struct bfq_group *bfqg) ++{ ++ int busy, resume; ++ ++ busy = bfq_bfqq_busy(bfqq); ++ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); ++ ++ BUG_ON(resume && !entity->on_st); ++ BUG_ON(busy && !resume && entity->on_st && bfqq != bfqd->active_queue); ++ ++ if (busy) { ++ BUG_ON(atomic_read(&bfqq->ref) < 2); ++ ++ if (!resume) ++ bfq_del_bfqq_busy(bfqd, bfqq, 0); ++ else ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ } else if (entity->on_st) ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ ++ /* ++ * Here we use a reference to bfqg. 
We don't need a refcounter ++ * as the cgroup reference will not be dropped, so that its ++ * destroy() callback will not be invoked. ++ */ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++ ++ if (busy && resume) ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ if (bfqd->active_queue == NULL && !bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @cgroup: the cgroup to move to. ++ * ++ * Move bic to cgroup, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. ++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_entity *entity; ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ ++ bgrp = cgroup_to_bfqio(cgroup); ++ ++ bfqg = bfq_find_alloc_group(bfqd, cgroup); ++ if (async_bfqq != NULL) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "bic_change_group: %p %d", ++ async_bfqq, atomic_read(&async_bfqq->ref)); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq != NULL) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bic: the bic being migrated. ++ * @cgroup: the destination cgroup. ++ * ++ * When the task owning @bic is moved to @cgroup, @bic is immediately ++ * moved into its new parent group. ++ */ ++static void bfq_bic_change_cgroup(struct bfq_io_cq *bic, ++ struct cgroup *cgroup) ++{ ++ struct bfq_data *bfqd; ++ unsigned long uninitialized_var(flags); ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ if (bfqd != NULL) { ++ __bfq_bic_change_cgroup(bfqd, bic, cgroup); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++} ++ ++/** ++ * bfq_bic_update_cgroup - update the cgroup of @bic. ++ * @bic: the @bic to update. ++ * ++ * Make sure that @bic is enqueued in the cgroup of the current task. ++ * We need this in addition to moving bics during the cgroup attach ++ * phase because the task owning @bic could be at its first disk ++ * access or we may end up in the root cgroup as the result of a ++ * memory allocation failure and here we try to move to the right ++ * group. ++ * ++ * Must be called under the queue lock. It is safe to use the returned ++ * value even after the rcu_read_unlock() as the migration/destruction ++ * paths act under the queue lock too. IOW it is impossible to race with ++ * group migration/destruction and end up with an invalid group as: ++ * a) here cgroup has not yet been destroyed, nor its destroy callback ++ * has started execution, as current holds a reference to it, ++ * b) if it is destroyed after rcu_read_unlock() [after current is ++ * migrated to a different cgroup] its attach() callback will have ++ * taken care of remove all the references to the old cgroup data. 
++ */ ++static struct bfq_group *bfq_bic_update_cgroup(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_group *bfqg; ++ struct cgroup *cgroup; ++ ++ BUG_ON(bfqd == NULL); ++ ++ rcu_read_lock(); ++ cgroup = task_cgroup(current, bfqio_subsys_id); ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, cgroup); ++ rcu_read_unlock(); ++ ++ return bfqg; ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static inline void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity != NULL; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, 0); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static inline void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(bfqq == NULL); ++ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); ++ return; ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ * ++ * Needs queue_lock to be taken and reference to be valid over the call. ++ */ ++static inline void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity != NULL ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.active_entity != NULL) ++ bfq_reparent_leaf_entity(bfqd, bfqg->sched_data.active_entity); ++ ++ return; ++} ++ ++/** ++ * bfq_destroy_group - destroy @bfqg. ++ * @bgrp: the bfqio_cgroup containing @bfqg. ++ * @bfqg: the group being destroyed. ++ * ++ * Destroy @bfqg, making sure that it is not referenced from its parent. ++ */ ++static void bfq_destroy_group(struct bfqio_cgroup *bgrp, struct bfq_group *bfqg) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_service_tree *st; ++ struct bfq_entity *entity = bfqg->my_entity; ++ unsigned long uninitialized_var(flags); ++ int i; ++ ++ hlist_del(&bfqg->group_node); ++ ++ /* ++ * Empty all service_trees belonging to this group before deactivating ++ * the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ st = bfqg->sched_data.service_tree + i; ++ ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. Noone else ++ * can access them so it's safe to act without any lock. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * under service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. 
++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_active != NULL); ++ BUG_ON(bfqg->sched_data.active_entity != NULL); ++ ++ /* ++ * We may race with device destruction, take extra care when ++ * dereferencing bfqg->bfqd. ++ */ ++ bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags); ++ if (bfqd != NULL) { ++ hlist_del(&bfqg->bfqd_node); ++ __bfq_deactivate_entity(entity, 0); ++ bfq_put_async_queues(bfqd, bfqg); ++ bfq_put_bfqd_unlock(bfqd, &flags); ++ } ++ BUG_ON(entity->tree != NULL); ++ ++ /* ++ * No need to defer the kfree() to the end of the RCU grace ++ * period: we are called from the destroy() callback of our ++ * cgroup, so we can be sure that noone is a) still using ++ * this cgroup or b) doing lookups in it. ++ */ ++ kfree(bfqg); ++} ++ ++/** ++ * bfq_disconnect_groups - diconnect @bfqd from all its groups. ++ * @bfqd: the device descriptor being exited. ++ * ++ * When the device exits we just make sure that no lookup can return ++ * the now unused group structures. They will be deallocated on cgroup ++ * destruction. ++ */ ++static void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ struct hlist_node *pos, *n; ++ struct bfq_group *bfqg; ++ ++ bfq_log(bfqd, "disconnect_groups beginning") ; ++ hlist_for_each_entry_safe(bfqg, pos, n, &bfqd->group_list, bfqd_node) { ++ hlist_del(&bfqg->bfqd_node); ++ ++ __bfq_deactivate_entity(bfqg->my_entity, 0); ++ ++ /* ++ * Don't remove from the group hash, just set an ++ * invalid key. No lookups can race with the ++ * assignment as bfqd is being destroyed; this ++ * implies also that new elements cannot be added ++ * to the list. ++ */ ++ rcu_assign_pointer(bfqg->bfqd, NULL); ++ ++ bfq_log(bfqd, "disconnect_groups: put async for group %p", ++ bfqg) ; ++ bfq_put_async_queues(bfqd, bfqg); ++ } ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ struct bfqio_cgroup *bgrp = &bfqio_root_cgroup; ++ struct bfq_group *bfqg = bfqd->root_group; ++ ++ bfq_put_async_queues(bfqd, bfqg); ++ ++ spin_lock_irq(&bgrp->lock); ++ hlist_del_rcu(&bfqg->group_node); ++ spin_unlock_irq(&bgrp->lock); ++ ++ /* ++ * No need to synchronize_rcu() here: since the device is gone ++ * there cannot be any read-side access to its root_group. 
++ */ ++ kfree(bfqg); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ struct bfqio_cgroup *bgrp; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ bfqg->entity.parent = NULL; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ bgrp = &bfqio_root_cgroup; ++ spin_lock_irq(&bgrp->lock); ++ rcu_assign_pointer(bfqg->bfqd, bfqd); ++ hlist_add_head_rcu(&bfqg->group_node, &bgrp->group_data); ++ spin_unlock_irq(&bgrp->lock); ++ ++ return bfqg; ++} ++ ++#define SHOW_FUNCTION(__VAR) \ ++static u64 bfqio_cgroup_##__VAR##_read(struct cgroup *cgroup, \ ++ struct cftype *cftype) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ u64 ret; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ spin_lock_irq(&bgrp->lock); \ ++ ret = bgrp->__VAR; \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return ret; \ ++} ++ ++SHOW_FUNCTION(weight); ++SHOW_FUNCTION(ioprio); ++SHOW_FUNCTION(ioprio_class); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__VAR, __MIN, __MAX) \ ++static int bfqio_cgroup_##__VAR##_write(struct cgroup *cgroup, \ ++ struct cftype *cftype, \ ++ u64 val) \ ++{ \ ++ struct bfqio_cgroup *bgrp; \ ++ struct bfq_group *bfqg; \ ++ struct hlist_node *n; \ ++ \ ++ if (val < (__MIN) || val > (__MAX)) \ ++ return -EINVAL; \ ++ \ ++ if (!cgroup_lock_live_group(cgroup)) \ ++ return -ENODEV; \ ++ \ ++ bgrp = cgroup_to_bfqio(cgroup); \ ++ \ ++ spin_lock_irq(&bgrp->lock); \ ++ bgrp->__VAR = (unsigned short)val; \ ++ hlist_for_each_entry(bfqg, n, &bgrp->group_data, group_node) { \ ++ bfqg->entity.new_##__VAR = (unsigned short)val; \ ++ smp_wmb(); \ ++ bfqg->entity.ioprio_changed = 1; \ ++ } \ ++ spin_unlock_irq(&bgrp->lock); \ ++ \ ++ cgroup_unlock(); \ ++ \ ++ return 0; \ ++} ++ ++STORE_FUNCTION(weight, BFQ_MIN_WEIGHT, BFQ_MAX_WEIGHT); ++STORE_FUNCTION(ioprio, 0, IOPRIO_BE_NR - 1); ++STORE_FUNCTION(ioprio_class, IOPRIO_CLASS_RT, IOPRIO_CLASS_IDLE); ++#undef STORE_FUNCTION ++ ++static struct cftype bfqio_files[] = { ++ { ++ .name = "weight", ++ .read_u64 = bfqio_cgroup_weight_read, ++ .write_u64 = bfqio_cgroup_weight_write, ++ }, ++ { ++ .name = "ioprio", ++ .read_u64 = bfqio_cgroup_ioprio_read, ++ .write_u64 = bfqio_cgroup_ioprio_write, ++ }, ++ { ++ .name = "ioprio_class", ++ .read_u64 = bfqio_cgroup_ioprio_class_read, ++ .write_u64 = bfqio_cgroup_ioprio_class_write, ++ }, ++ { }, /* terminate */ ++}; ++ ++static struct cgroup_subsys_state *bfqio_create(struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp; ++ ++ if (cgroup->parent != NULL) { ++ bgrp = kzalloc(sizeof(*bgrp), GFP_KERNEL); ++ if (bgrp == NULL) ++ return ERR_PTR(-ENOMEM); ++ } else ++ bgrp = &bfqio_root_cgroup; ++ ++ spin_lock_init(&bgrp->lock); ++ INIT_HLIST_HEAD(&bgrp->group_data); ++ bgrp->ioprio = BFQ_DEFAULT_GRP_IOPRIO; ++ bgrp->ioprio_class = BFQ_DEFAULT_GRP_CLASS; ++ ++ return &bgrp->css; ++} ++ ++/* ++ * We cannot support shared io contexts, as we have no means to support ++ * two tasks with the same ioc in two different groups without major rework ++ * of the main bic/bfqq data structures. By now we allow a task to change ++ * its cgroup only if it's the only owner of its ioc; the drawback of this ++ * behavior is that a group containing a task that forked using CLONE_IO ++ * will not be destroyed until the tasks sharing the ioc die. 
++ */ ++static int bfqio_can_attach(struct cgroup *cgroup, struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct io_context *ioc; ++ int ret = 0; ++ ++ cgroup_taskset_for_each(task, cgroup, tset) { ++ /* task_lock() is needed to avoid races with exit_io_context() */ ++ task_lock(task); ++ ioc = task->io_context; ++ if (ioc != NULL && atomic_read(&ioc->nr_tasks) > 1) ++ /* ++ * ioc == NULL means that the task is either too young or ++ * exiting: if it has still no ioc the ioc can't be shared, ++ * if the task is exiting the attach will fail anyway, no ++ * matter what we return here. ++ */ ++ ret = -EINVAL; ++ task_unlock(task); ++ if (ret) ++ break; ++ } ++ ++ return ret; ++} ++ ++static void bfqio_attach(struct cgroup *cgroup, struct cgroup_taskset *tset) ++{ ++ struct task_struct *task; ++ struct io_context *ioc; ++ struct io_cq *icq; ++ struct hlist_node *n; ++ ++ /* ++ * IMPORTANT NOTE: The move of more than one process at a time to a ++ * new group has not yet been tested. ++ */ ++ cgroup_taskset_for_each(task, cgroup, tset) { ++ ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); ++ if (ioc) { ++ /* ++ * Handle cgroup change here. ++ */ ++ rcu_read_lock(); ++ hlist_for_each_entry_rcu(icq, n, &ioc->icq_list, ioc_node) ++ if (!strncmp(icq->q->elevator->type->elevator_name, ++ "bfq", ELV_NAME_MAX)) ++ bfq_bic_change_cgroup(icq_to_bic(icq), ++ cgroup); ++ rcu_read_unlock(); ++ put_io_context(ioc); ++ } ++ } ++} ++ ++static void bfqio_destroy(struct cgroup *cgroup) ++{ ++ struct bfqio_cgroup *bgrp = cgroup_to_bfqio(cgroup); ++ struct hlist_node *n, *tmp; ++ struct bfq_group *bfqg; ++ ++ /* ++ * Since we are destroying the cgroup, there are no more tasks ++ * referencing it, and all the RCU grace periods that may have ++ * referenced it are ended (as the destruction of the parent ++ * cgroup is RCU-safe); bgrp->group_data will not be accessed by ++ * anything else and we don't need any synchronization. 
++ */ ++ hlist_for_each_entry_safe(bfqg, n, tmp, &bgrp->group_data, group_node) ++ bfq_destroy_group(bgrp, bfqg); ++ ++ BUG_ON(!hlist_empty(&bgrp->group_data)); ++ ++ kfree(bgrp); ++} ++ ++struct cgroup_subsys bfqio_subsys = { ++ .name = "bfqio", ++ .css_alloc = bfqio_create, ++ .can_attach = bfqio_can_attach, ++ .attach = bfqio_attach, ++ .css_free = bfqio_destroy, ++ .subsys_id = bfqio_subsys_id, ++ .base_cftypes = bfqio_files, ++}; ++#else ++static inline void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = entity->new_ioprio; ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static inline struct bfq_group * ++bfq_bic_update_cgroup(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ return bfqd->root_group; ++} ++ ++static inline void bfq_bfqq_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++} ++ ++static inline void bfq_disconnect_groups(struct bfq_data *bfqd) ++{ ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++} ++ ++static inline void bfq_free_root_group(struct bfq_data *bfqd) ++{ ++ kfree(bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_alloc_root_group(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (bfqg == NULL) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c +new file mode 100644 +index 0000000..326e3ec +--- /dev/null ++++ b/block/bfq-ioc.c +@@ -0,0 +1,36 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static inline struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * Queue lock must be held. ++ */ ++static inline struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ if(ioc) ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); ++ return NULL; ++} +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +new file mode 100644 +index 0000000..c9d57b0 +--- /dev/null ++++ b/block/bfq-iosched.c +@@ -0,0 +1,3218 @@ ++/* ++ * BFQ, or Budget Fair Queueing, disk scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ file. ++ * ++ * BFQ is a proportional share disk scheduling algorithm based on the ++ * slice-by-slice service scheme of CFQ. But BFQ assigns budgets, ++ * measured in number of sectors, to tasks instead of time slices. 
++ * The disk is not granted to the active task for a given time slice, ++ * but until it has exahusted its assigned budget. This change from ++ * the time to the service domain allows BFQ to distribute the disk ++ * bandwidth among tasks as desired, without any distortion due to ++ * ZBR, workload fluctuations or other factors. BFQ uses an ad hoc ++ * internal scheduler, called B-WF2Q+, to schedule tasks according to ++ * their budgets. Thanks to this accurate scheduler, BFQ can afford ++ * to assign high budgets to disk-bound non-seeky tasks (to boost the ++ * throughput), and yet guarantee low latencies to interactive and ++ * soft real-time applications. ++ * ++ * BFQ has been introduced in [1], where the interested reader can ++ * find an accurate description of the algorithm, the bandwidth ++ * distribution and latency guarantees it provides, plus formal proofs ++ * of all the properties. With respect to the algorithm presented in ++ * the paper, this implementation adds several little heuristics, and ++ * a hierarchical extension, based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente and F. Checconi, ``High Throughput Disk Scheduling ++ * with Deterministic Guarantees on Bandwidth Distribution,'', ++ * IEEE Transactions on Computer, May 2010. ++ * ++ * http://algo.ing.unimo.it/people/paolo/disk_sched/bfq-techreport.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "bfq.h" ++#include "blk.h" ++ ++/* Max number of dispatches in one round of service. */ ++static const int bfq_quantum = 4; ++ ++/* Expiration time of sync (0) and async (1) requests, in jiffies. */ ++static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = 16 * 1024; ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in jiffies. */ ++static int bfq_slice_idle = HZ / 125; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = 16 * 1024; ++static const int bfq_max_budget_async_rq = 4; ++ ++/* ++ * Async to sync throughput distribution is controlled as follows: ++ * when an async request is served, the entity is charged the number ++ * of sectors of the request, multipled by the factor below ++ */ ++static const int bfq_async_charge_factor = 10; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout_sync = HZ / 8; ++static int bfq_timeout_async = HZ / 25; ++ ++struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ms), we consider thinktime immediate. */ ++#define BFQ_MIN_TT 2 ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. 
*/ ++#define BFQ_HW_QUEUE_THRESHOLD 4 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) ++ ++/* Min samples used for peak rate estimation (for autotuning). */ ++#define BFQ_PEAK_RATE_SAMPLES 32 ++ ++/* Shift used for peak rate fixed precision calculations. */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * The duration of the weight raising for interactive applications is ++ * computed automatically (as default behaviour), using the following ++ * formula: duration = (R / r) * T, where r is the peak rate of the ++ * disk, and R and T are two reference parameters. In particular, R is ++ * the peak rate of a reference disk, and T is about the maximum time ++ * for starting popular large applications on that disk, under BFQ and ++ * while reading two files in parallel. Finally, BFQ uses two ++ * different pairs (R, T) depending on whether the disk is rotational ++ * or non-rotational. ++ */ ++#define T_rot (msecs_to_jiffies(5500)) ++#define T_nonrot (msecs_to_jiffies(2000)) ++/* Next two quantities are in sectors/usec, left-shifted by BFQ_RATE_SHIFT */ ++#define R_rot 17415 ++#define R_nonrot 34791 ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd); ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->entity.ioprio_class ==\ ++ IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * We regard a request as SYNC, if either it's a read or has the SYNC bit ++ * set (in which case it could also be a direct WRITE). ++ */ ++static inline int bfq_bio_sync(struct bio *bio) ++{ ++ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) ++ return 1; ++ ++ return 0; ++} ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static inline void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, "schedule dispatch"); ++ kblockd_schedule_work(bfqd->queue, &bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (rq1 == NULL || rq1 == rq2) ++ return rq2; ++ if (rq2 == NULL) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. 
++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ else { ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", ++ (long long unsigned)sector, ++ bfqq != NULL ? 
bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static void bfq_rq_pos_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfqd->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (__bfqq == NULL) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next = NULL, *prev = NULL; ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev != NULL) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext != NULL) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++static void bfq_del_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->active_queue) ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root != NULL) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static inline unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ return blk_rq_sectors(rq) * ++ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->raising_coeff == 1) * ++ bfq_async_charge_factor)); ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (next_rq == NULL) ++ return; ++ ++ if (bfqq == bfqd->active_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. 
++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->active_entity); ++ ++ new_budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", new_budget); ++ bfq_activate_bfqq(bfqd, bfqq); ++} ++ ++static inline unsigned int bfq_wrais_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_raising_max_time > 0) ++ return bfqd->bfq_raising_max_time; ++ ++ dur = bfqd->RT_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ return dur; ++} ++ ++static inline void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++{ ++ if (bic->saved_idle_window) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++ if (bic->raising_time_left) { ++ /* ++ * Start a weight raising period with the duration given by ++ * the raising_time_left snapshot. ++ */ ++ bfqq->raising_coeff = bfqq->bfqd->bfq_raising_coeff; ++ bfqq->raising_cur_max_time = bic->raising_time_left; ++ bfqq->last_rais_start_finish = jiffies; ++ } ++} ++ ++/* ++ * Must be called with the queue_lock held. ++ */ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++static void bfq_add_rq_rb(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned long old_raising_coeff = bfqq->raising_coeff; ++ int idle_for_long_time = bfqq->budget_timeout + ++ bfqd->bfq_raising_min_idle_time < jiffies; ++ ++ bfq_log_bfqq(bfqd, bfqq, "add_rq_rb %d", rq_is_sync(rq)); ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(next_rq == NULL); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ ++ if (! bfqd->low_latency) ++ goto add_bfqq_busy; ++ ++ if (bfq_bfqq_just_split(bfqq)) ++ goto set_ioprio_changed; ++ ++ /* ++ * If the queue: ++ * - is not being boosted, ++ * - has been idle for enough time, ++ * - is not a sync queue or is linked to a bfq_io_cq (it is ++ * shared "for its nature" or it is not shared and its ++ * requests have not been redirected to a shared queue) ++ * start a weight-raising period. 
++ */ ++ if(old_raising_coeff == 1 && (idle_for_long_time || soft_rt) && ++ (!bfq_bfqq_sync(bfqq) || bfqq->bic != NULL)) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } else if (old_raising_coeff > 1) { ++ if (idle_for_long_time) ++ bfqq->raising_cur_max_time = ++ bfq_wrais_duration(bfqd); ++ else if (bfqq->raising_cur_max_time == ++ bfqd->bfq_raising_rt_max_time && ++ !soft_rt) { ++ bfqq->raising_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ } ++set_ioprio_changed: ++ if (old_raising_coeff != bfqq->raising_coeff) ++ entity->ioprio_changed = 1; ++add_bfqq_busy: ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ } else { ++ if(bfqd->low_latency && old_raising_coeff == 1 && ++ !rq_is_sync(rq) && ++ bfqq->last_rais_start_finish + ++ bfqd->bfq_raising_min_inter_arr_async < jiffies) { ++ bfqq->raising_coeff = bfqd->bfq_raising_coeff; ++ bfqq->raising_cur_max_time = bfq_wrais_duration(bfqd); ++ ++ entity->ioprio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ } ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if(bfqd->low_latency && ++ (old_raising_coeff == 1 || bfqq->raising_coeff == 1 || ++ idle_for_long_time)) ++ bfqq->last_rais_start_finish = jiffies; ++} ++ ++static void bfq_reposition_rq_rb(struct bfq_queue *bfqq, struct request *rq) ++{ ++ elv_rb_del(&bfqq->sort_list, rq); ++ bfqq->queued[rq_is_sync(rq)]--; ++ bfqq->bfqd->queued--; ++ bfq_add_rq_rb(rq); ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (bic == NULL) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ if (bfqq != NULL) { ++ sector_t sector = bio->bi_sector + bio_sectors(bio); ++ ++ return elv_rb_find(&bfqq->sort_list, sector); ++ } ++ ++ return NULL; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ bfqd->rq_in_driver++; ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", ++ (long long unsigned)bfqd->last_position); ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ WARN_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ list_del_init(&rq->queuelist); ++ bfq_del_rq_rb(rq); ++ ++ if (rq->cmd_flags & REQ_META) { ++ WARN_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++} ++ ++static int bfq_merge(struct request_queue *q, struct request **req, ++ 
struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq != NULL && elv_rq_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ int type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ ++ bfq_reposition_rq_rb(bfqq, req); ++ } ++} ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * Reposition in fifo if next is older than rq. ++ */ ++ if (!list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ time_before(rq_fifo_time(next), rq_fifo_time(rq))) { ++ list_move(&rq->queuelist, &next->queuelist); ++ rq_set_fifo_time(rq, rq_fifo_time(next)); ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++} ++ ++static inline sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_sector; ++} ++ ++static inline sector_t bfq_dist_from(sector_t pos1, ++ sector_t pos2) ++{ ++ if (pos1 >= pos2) ++ return pos1 - pos2; ++ else ++ return pos2 - pos1; ++} ++ ++static inline int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return bfq_dist_from(bfq_io_struct_pos(io_struct, request), sector) <= ++ BFQQ_SEEK_THR; ++} ++ ++static struct bfq_queue *bfqq_close(struct bfq_data *bfqd, sector_t sector) ++{ ++ struct rb_root *root = &bfqd->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq != NULL) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by next_request ++ * position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (node == NULL) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++/* ++ * bfqd - obvious ++ * cur_bfqq - passed in so that we don't decide that the current queue ++ * is closely cooperating with itself ++ * sector - used as a reference point to search for a close queue ++ */ ++static struct bfq_queue *bfq_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ if (bfq_class_idle(cur_bfqq)) ++ return NULL; ++ if (!bfq_bfqq_sync(cur_bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(cur_bfqq)) ++ return NULL; ++ ++ /* If device has only one backlogged bfq_queue, don't search. */ ++ if (bfqd->busy_queues == 1) ++ return NULL; ++ ++ /* ++ * We should notice if some of the queues are cooperating, e.g. ++ * working closely on the same area of the disk. In that case, ++ * we can group them together and don't waste time idling. 
++ */ ++ bfqq = bfqq_close(bfqd, sector); ++ if (bfqq == NULL || bfqq == cur_bfqq) ++ return NULL; ++ ++ /* ++ * Do not merge queues from different bfq_groups. ++ */ ++ if (bfqq->entity.parent != cur_bfqq->entity.parent) ++ return NULL; ++ ++ /* ++ * It only makes sense to merge sync queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq)) ++ return NULL; ++ if (BFQQ_SEEKY(bfqq)) ++ return NULL; ++ ++ /* ++ * Do not merge queues of different priority classes. ++ */ ++ if (bfq_class_rt(bfqq) != bfq_class_rt(cur_bfqq)) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process owning ++ * one of the two queues are redirected to the other queue. The latter ++ * queue, in its turn, is set as shared if this is the first time that ++ * the requests of some process are redirected to it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have the ++ * io_cq of this process. So we can immediately configure this io_cq ++ * to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the active queue, the io_cq of ++ * new_bfqq is not available, because, if the active queue is shared, ++ * bfqd->active_bic may not point to the io_cq of the active queue. ++ * Redirecting the requests of the process owning bfqq to the currently ++ * active queue is in any case the best option, as we feed the active queue ++ * with new requests close to the last request served and, by doing so, ++ * hopefully increase the throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ atomic_add(process_refs, &new_bfqq->ref); ++ return new_bfqq; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently active queue or ++ * with a close queue among the scheduled queues. ++ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. 
++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *active_bfqq, *new_bfqq; ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (!io_struct) ++ return NULL; ++ ++ active_bfqq = bfqd->active_queue; ++ ++ if (active_bfqq == NULL || active_bfqq == bfqq || !bfqd->active_bic) ++ goto check_scheduled; ++ ++ if (bfq_class_idle(active_bfqq) || bfq_class_idle(bfqq)) ++ goto check_scheduled; ++ ++ if (bfq_class_rt(active_bfqq) != bfq_class_rt(bfqq)) ++ goto check_scheduled; ++ ++ if (active_bfqq->entity.parent != bfqq->entity.parent) ++ goto check_scheduled; ++ ++ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && ++ bfq_bfqq_sync(active_bfqq) && bfq_bfqq_sync(bfqq)) ++ if ((new_bfqq = bfq_setup_merge(bfqq, active_bfqq))) ++ return new_bfqq; /* Merge with the active queue */ ++ ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++check_scheduled: ++ new_bfqq = bfq_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ if (new_bfqq) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static inline void ++bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic == NULL, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (bfqq->bic == NULL) ++ return; ++ if (bfqq->raising_coeff > 1) { ++ unsigned long wrais_duration = ++ jiffies - bfqq->last_rais_start_finish; ++ /* ++ * It may happen that a queue's weight raising period lasts ++ * longer than its raising_cur_max_time, as weight raising is ++ * handled only when a request is enqueued or dispatched (it ++ * does not use any timer). If the weight raising period is ++ * about to end, don't save it. ++ */ ++ if (bfqq->raising_cur_max_time <= wrais_duration) ++ bfqq->bic->raising_time_left = 0; ++ else ++ bfqq->bic->raising_time_left = ++ bfqq->raising_cur_max_time - wrais_duration; ++ /* ++ * The bfq_queue is becoming shared or the requests of the ++ * process owning the queue are being redirected to a shared ++ * queue. Stop the weight raising period of the queue, as in ++ * both cases it should not be owned by an interactive or soft ++ * real-time application. ++ */ ++ bfqq->raising_coeff = 1; ++ bfqq->entity.ioprio_changed = 1; ++ } else ++ bfqq->bic->raising_time_left = 0; ++ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); ++} ++ ++static inline void ++bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (long unsigned)new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). 
++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* Merge queues (that is, let bic redirect its requests to new_bfqq) */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): set ++ * new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to any ++ * bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ bfq_put_queue(bfqq); ++} ++ ++static int bfq_allow_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) ++ return 0; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (bic == NULL) ++ return 0; ++ ++ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq != NULL && ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false))) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the shared queue, ++ * i.e., new_bfqq, so use new_bfqq to decide whether bio and ++ * rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } ++ ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static void __bfq_set_active_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq != NULL) { ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_mark_bfqq_budget_new(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set_active_queue, cur-budget = %lu", ++ bfqq->entity.budget); ++ } ++ ++ bfqd->active_queue = bfqq; ++} ++ ++/* ++ * Get and set a new active queue for service. ++ */ ++static struct bfq_queue *bfq_set_active_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_active_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static inline unsigned long bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static inline unsigned long bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < 194) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++/* ++ * Decides whether idling should be done for given device and ++ * given active queue. 
++ */ ++static inline bool bfq_queue_nonrot_noidle(struct bfq_data *bfqd, ++ struct bfq_queue *active_bfqq) ++{ ++ if (active_bfqq == NULL) ++ return false; ++ /* ++ * If device is SSD it has no seek penalty, disable idling; but ++ * do so only if: ++ * - device does not support queuing, otherwise we still have ++ * a problem with sync vs async workloads; ++ * - the queue is not weight-raised, to preserve guarantees. ++ */ ++ return (blk_queue_nonrot(bfqd->queue) && bfqd->hw_tag && ++ active_bfqq->raising_coeff == 1); ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ struct bfq_io_cq *bic; ++ unsigned long sl; ++ ++ WARN_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_queue_nonrot_noidle(bfqd, bfqq)) ++ return; ++ ++ /* Idling is disabled, either manually or by past process history. */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_idle_window(bfqq)) ++ return; ++ ++ /* Tasks have exited, don't wait. */ ++ bic = bfqd->active_bic; ++ if (bic == NULL || atomic_read(&bic->icq.ioc->active_ref) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ if (bfq_sample_valid(bfqq->seek_samples) && BFQQ_SEEKY(bfqq) && ++ bfqq->entity.service > bfq_max_budget(bfqd) / 8 && ++ bfqq->raising_coeff == 1) ++ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); ++ else if (bfqq->raising_coeff > 1) ++ sl = sl * 3; ++ bfqd->last_idling_start = ktime_get(); ++ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); ++ bfq_log(bfqd, "arm idle: %u/%u ms", ++ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); ++} ++ ++/* ++ * Set the maximum time for the active queue to consume its ++ * budget. This prevents seeky processes from lowering the disk ++ * throughput (always guaranteed with a time slice scheme as in CFQ). ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->active_queue; ++ unsigned int timeout_coeff; ++ if (bfqq->raising_cur_max_time == bfqd->bfq_raising_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfq_clear_bfqq_budget_new(bfqq); ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", ++ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * ++ timeout_coeff)); ++} ++ ++/* ++ * Move request from internal lists to the request queue dispatch list. ++ */ ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ bfq_remove_request(rq); ++ bfqq->dispatched++; ++ elv_dispatch_sort(q, rq); ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight++; ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. 
++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq) ++{ ++ struct request *rq = NULL; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ if (list_empty(&bfqq->fifo)) ++ return NULL; ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (time_before(jiffies, rq_fifo_time(rq))) ++ return NULL; ++ ++ return rq; ++} ++ ++static inline unsigned long bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ return entity->budget - entity->service; ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ __bfq_bfqd_reset_active(bfqd); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfq_del_bfqq_busy(bfqd, bfqq, 1); ++ /* ++ * overloading budget_timeout field to store when ++ * the queue remains with no backlog, used by ++ * the weight-raising mechanism ++ */ ++ bfqq->budget_timeout = jiffies ; ++ } else { ++ bfq_activate_bfqq(bfqd, bfqq); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_rq_pos_tree_add(bfqd, bfqq); ++ } ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget. See the body for detailed ++ * comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ unsigned long budget, min_budget; ++ ++ budget = bfqq->max_budget; ++ min_budget = bfq_min_budget(bfqd); ++ ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %lu, budg left %lu", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %lu, min budg %lu", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->active_queue)); ++ ++ if (bfq_bfqq_sync(bfqq)) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no requets of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still oustanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. 
++ */ ++ if (bfqq->dispatched > 0) /* still oustanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because: 1) it ++ * gives the chance to boost the throughput if ++ * this is not a seeky process (which may have ++ * bumped into this timeout because of, e.g., ++ * ZBR), 2) together with charge_full_budget ++ * it helps give seeky processes higher ++ * timestamps, and hence be served less ++ * frequently. ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * Leave the budget unchanged. ++ */ ++ default: ++ return; ++ } ++ } else /* async queue */ ++ /* async queues get always the maximum possible budget ++ * (their ability to dispatch is limited by ++ * @bfqd->bfq_max_budget_async_rq). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= 194 && bfqd->bfq_user_max_budget == 0 && ++ bfqq->max_budget > bfqd->bfq_max_budget) ++ bfqq->max_budget = bfqd->bfq_max_budget; ++ ++ /* ++ * Make sure that we have enough budget for the next request. ++ * Since the finish time of the bfqq must be kept in sync with ++ * the budget, be sure to call __bfq_bfqq_expire() after the ++ * update. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq != NULL) ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ else ++ bfqq->entity.budget = bfqq->max_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %lu", ++ next_rq != NULL ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) ++{ ++ unsigned long max_budget; ++ ++ /* ++ * The max_budget calculated when autotuning is equal to the ++ * amount of sectors transfered in timeout_sync at the ++ * estimated peak rate. ++ */ ++ max_budget = (unsigned long)(peak_rate * 1000 * ++ timeout >> BFQ_RATE_SHIFT); ++ ++ return max_budget; ++} ++ ++/* ++ * In addition to updating the peak rate, checks whether the process ++ * is "slow", and returns 1 if so. This slow flag is used, in addition ++ * to the budget timeout, to reduce the amount of service provided to ++ * seeky processes, and hence reduce their chances to lower the ++ * throughput. See the code for more details. ++ */ ++static int bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int compensate, enum bfqq_expiration reason) ++{ ++ u64 bw, usecs, expected, timeout; ++ ktime_t delta; ++ int update = 0; ++ ++ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (compensate) ++ delta = bfqd->last_idling_start; ++ else ++ delta = ktime_get(); ++ delta = ktime_sub(delta, bfqd->last_budget_start); ++ usecs = ktime_to_us(delta); ++ ++ /* Don't trust short/unrealistic values. */ ++ if (usecs < 100 || usecs >= LONG_MAX) ++ return 0; ++ ++ /* ++ * Calculate the bandwidth for the last slice. 
We use a 64 bit ++ * value to store the peak rate, in sectors per usec in fixed ++ * point math. We do so to have enough precision in the estimate ++ * and to avoid overflows. ++ */ ++ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; ++ do_div(bw, (unsigned long)usecs); ++ ++ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out spikes for ++ * the peak rate estimation. ++ */ ++ if (usecs > 20000) { ++ if (bw > bfqd->peak_rate || ++ (!BFQQ_SEEKY(bfqq) && ++ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { ++ bfq_log(bfqd, "measured bw =%llu", bw); ++ /* ++ * To smooth oscillations use a low-pass filter with ++ * alpha=7/8, i.e., ++ * new_rate = (7/8) * old_rate + (1/8) * bw ++ */ ++ do_div(bw, 8); ++ if (bw == 0) ++ return 0; ++ bfqd->peak_rate *= 7; ++ do_div(bfqd->peak_rate, 8); ++ bfqd->peak_rate += bw; ++ update = 1; ++ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); ++ } ++ ++ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; ++ ++ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) ++ bfqd->peak_rate_samples++; ++ ++ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && ++ update && bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ bfq_log(bfqd, "new max_budget=%lu", ++ bfqd->bfq_max_budget); ++ } ++ } ++ ++ /* ++ * If the process has been served for a too short time ++ * interval to let its possible sequential accesses prevail on ++ * the initial seek time needed to move the disk head on the ++ * first sector it requested, then give the process a chance ++ * and for the moment return false. ++ */ ++ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) ++ return 0; ++ ++ /* ++ * A process is considered ``slow'' (i.e., seeky, so that we ++ * cannot treat it fairly in the service domain, as it would ++ * slow down too much the other processes) if, when a slice ++ * ends for whatever reason, it has received service at a ++ * rate that would not be high enough to complete the budget ++ * before the budget timeout expiration. ++ */ ++ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; ++ ++ /* ++ * Caveat: processes doing IO in the slower disk zones will ++ * tend to be slow(er) even if not seeky. And the estimated ++ * peak rate will actually be an average over the disk ++ * surface. Hence, to not be too harsh with unlucky processes, ++ * we keep a budget/3 margin of safety before declaring a ++ * process slow. ++ */ ++ return expected > (4 * bfqq->entity.budget) / 3; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * ++ * If the process associated to the queue is slow (i.e., seeky), or in ++ * case of budget timeout, or, finally, if it is async, we ++ * artificially charge it an entire budget (independently of the ++ * actual service it received). As a consequence, the queue will get ++ * higher timestamps than the correct ones upon reactivation, and ++ * hence it will be rescheduled as if it had received more service ++ * than what it actually received. In the end, this class of processes ++ * will receive less service in proportion to how slowly they consume ++ * their budgets (and hence how seriously they tend to lower the ++ * throughput). 
++ * ++ * In contrast, when a queue expires because it has been idling for ++ * too much or because it exhausted its budget, we do not touch the ++ * amount of service it has received. Hence when the queue will be ++ * reactivated and its timestamps updated, the latter will be in sync ++ * with the actual service received by the queue until expiration. ++ * ++ * Charging a full budget to the first type of queues and the exact ++ * service to the others has the effect of using the WF2Q+ policy to ++ * schedule the former on a timeslice basis, without violating the ++ * service domain guarantees of the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int compensate, ++ enum bfqq_expiration reason) ++{ ++ int slow; ++ BUG_ON(bfqq != bfqd->active_queue); ++ ++ /* Update disk peak rate for autotuning and check whether the ++ * process is slow (see bfq_update_peak_rate). ++ */ ++ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); ++ ++ /* ++ * As above explained, 'punish' slow (i.e., seeky), timed-out ++ * and async queues, to favor sequential sync workloads. ++ * ++ * Processes doing IO in the slower disk zones will tend to be ++ * slow(er) even if not seeky. Hence, since the estimated peak ++ * rate is actually an average over the disk surface, these ++ * processes may timeout just for bad luck. To avoid punishing ++ * them we do not charge a full budget to a process that ++ * succeeded in consuming at least 2/3 of its budget. ++ */ ++ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) ++ bfq_bfqq_charge_full_budget(bfqq); ++ ++ if (bfqd->low_latency && bfqq->raising_coeff == 1) ++ bfqq->last_rais_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_raising_max_softrt_rate > 0) { ++ if(reason != BFQ_BFQQ_BUDGET_TIMEOUT) ++ bfqq->soft_rt_next_start = ++ jiffies + ++ HZ * bfqq->entity.service / ++ bfqd->bfq_raising_max_softrt_rate; ++ else ++ bfqq->soft_rt_next_start = -1; /* infinity */ ++ } ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, slow, ++ bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); ++ ++ /* Increase, decrease or leave budget unchanged according to reason */ ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ __bfq_bfqq_expire(bfqd, bfqq); ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static int bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_budget_new(bfqq)) ++ return 0; ++ ++ if (time_before(jiffies, bfqq->budget_timeout)) ++ return 0; ++ ++ return 1; ++} ++ ++/* ++ * If we expire a queue that is waiting for the arrival of a new ++ * request, we may prevent the fictitious timestamp backshifting that ++ * allows the guarantees of the queue to be preserved (see [1] for ++ * this tricky aspect). Hence we return true only if this condition ++ * does not hold, or if the queue is slow enough to deserve only to be ++ * kicked off for preserving a high throughput. 
++*/ ++static inline int bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "may_budget_timeout: wr %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++/* ++ * Select a queue for service. If we have a current active queue, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq == NULL) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "select_queue: already active queue"); ++ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ goto expire; ++ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq != NULL) { ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may not ++ * disable disk idling even when a new request arrives ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer)) { ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged the ++ * device, causing the dispatch to be invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. If there is no cooperator, and the active ++ * queue still has requests in flight or is idling for a new request, ++ * then keep it. ++ */ ++ if (timer_pending(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_bfqq_idle_window(bfqq) && ++ !bfq_queue_nonrot_noidle(bfqd, bfqq))) { ++ bfqq = NULL; ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, reason); ++new_queue: ++ bfqq = bfq_set_active_queue(bfqd); ++ bfq_log(bfqd, "select_queue: new queue %d returned", ++ bfqq != NULL ? 
bfqq->pid : 0); ++keep_queue: ++ return bfqq; ++} ++ ++static void update_raising_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ if (bfqq->raising_coeff > 1) { /* queue is being boosted */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, " ++ "old raising coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time), ++ bfqq->raising_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->active_queue && entity->weight != ++ entity->orig_weight * bfqq->raising_coeff); ++ if(entity->ioprio_changed) ++ bfq_log_bfqq(bfqd, bfqq, ++ "WARN: pending prio change"); ++ /* ++ * If too much time has elapsed from the beginning ++ * of this weight-raising period and process is not soft ++ * real-time, stop it ++ */ ++ if (jiffies - bfqq->last_rais_start_finish > ++ bfqq->raising_cur_max_time) { ++ int soft_rt = bfqd->bfq_raising_max_softrt_rate > 0 && ++ bfqq->soft_rt_next_start < jiffies; ++ ++ bfqq->last_rais_start_finish = jiffies; ++ if (soft_rt) ++ bfqq->raising_cur_max_time = ++ bfqd->bfq_raising_rt_max_time; ++ else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %llu msec," ++ "rais_max_time %u", ++ bfqq->last_rais_start_finish, ++ jiffies_to_msecs(bfqq-> ++ raising_cur_max_time)); ++ bfqq->raising_coeff = 1; ++ entity->ioprio_changed = 1; ++ } ++ } ++ } ++ /* Update weight both if it must be raised and if it must be lowered */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->raising_coeff > 1)) ++ __bfq_entity_update_weight_prio( ++ bfq_entity_service_tree(entity), ++ entity); ++} ++ ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. ++ */ ++static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Follow expired path, else get first next available. */ ++ rq = bfq_check_fifo(bfqq); ++ if (rq == NULL) ++ rq = bfqq->next_rq; ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * This may happen if the next rq is chosen ++ * in fifo order instead of sector order. ++ * The budget is properly dimensioned ++ * to be always sufficient to serve the next request ++ * only if it is chosen in sector order. The reason is ++ * that it would be quite inefficient and little useful ++ * to always make sure that the budget is large enough ++ * to serve even the possible next rq in fifo order. ++ * In fact, requests are seldom served in fifo order. ++ * ++ * Expire the queue for budget exhaustion, and ++ * make sure that the next act_budget is enough ++ * to serve the next request, even if it comes ++ * from the fifo expired path. ++ */ ++ bfqq->next_rq = rq; ++ /* ++ * Since this dispatch is failed, make sure that ++ * a new one will be performed ++ */ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ goto expire; ++ } ++ ++ /* Finally, insert request into driver dispatch list. 
*/ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ update_raising_data(bfqd, bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched %u sec req (%llu), " ++ "budg left %lu", ++ blk_rq_sectors(rq), ++ (long long unsigned)blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq)); ++ ++ dispatched++; ++ ++ if (bfqd->active_bic == NULL) { ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); ++ bfqd->active_bic = RQ_BIC(rq); ++ } ++ ++ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && ++ dispatched >= bfqd->bfq_max_budget_async_rq) || ++ bfq_class_idle(bfqq))) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq != NULL) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. Used for barriers and when switching ++ * io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->active_queue; ++ if (bfqq != NULL) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfqd->busy_queues != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ int max_dispatch; ++ ++ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); ++ if (bfqd->busy_queues == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ if((bfqq = bfq_select_queue(bfqd)) == NULL) ++ return 0; ++ ++ max_dispatch = bfqd->bfq_quantum; ++ if (bfq_class_idle(bfqq)) ++ max_dispatch = 1; ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ max_dispatch = bfqd->bfq_max_budget_async_rq; ++ ++ if (bfqq->dispatched >= max_dispatch) { ++ if (bfqd->busy_queues > 1) ++ return 0; ++ if (bfqq->dispatched >= 4 * max_dispatch) ++ return 0; ++ } ++ ++ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) ++ return 0; ++ ++ bfq_clear_bfqq_wait_request(bfqq); ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ if (! bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dispatched one request of %d" ++ "(max_disp %d)", bfqq->pid, max_dispatch); ++ ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. 
++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ BUG_ON(atomic_read(&bfqq->ref) <= 0); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ if (!atomic_dec_and_test(&bfqq->ref)) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list) != NULL); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree != NULL); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqd->active_queue == bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) { ++ WARN(1, "bfqq->new_bfqq loop detected.\n"); ++ break; ++ } ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->active_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ bic->ttime.last_end_request = jiffies; ++ bic->raising_time_left = 0; ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ ++ if (bic->bfqq[BLK_RW_ASYNC]) { ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); ++ bic->bfqq[BLK_RW_ASYNC] = NULL; ++ } ++ ++ if (bic->bfqq[BLK_RW_SYNC]) { ++ /* ++ * If the bic is using a shared queue, put the reference ++ * taken on the io_context when the bic started using a ++ * shared bfq_queue. ++ */ ++ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) ++ put_io_context(icq->ioc); ++ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); ++ bic->bfqq[BLK_RW_SYNC] = NULL; ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_init_prio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ if (!bfq_bfqq_prio_changed(bfqq)) ++ return; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ printk(KERN_ERR "bfq: bad prio %x\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. 
++ */ ++ bfqq->entity.new_ioprio = task_nice_ioprio(tsk); ++ bfqq->entity.new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->entity.new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->entity.new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->entity.new_ioprio = 7; ++ bfq_clear_bfqq_idle_window(bfqq); ++ break; ++ } ++ ++ bfqq->entity.ioprio_changed = 1; ++ ++ /* ++ * Keep track of original prio settings in case we have to temporarily ++ * elevate the priority of this queue. ++ */ ++ bfqq->org_ioprio = bfqq->entity.new_ioprio; ++ bfq_clear_bfqq_prio_changed(bfqq); ++} ++ ++static void bfq_changed_ioprio(struct bfq_io_cq *bic) ++{ ++ struct bfq_data *bfqd; ++ struct bfq_queue *bfqq, *new_bfqq; ++ struct bfq_group *bfqg; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), &flags); ++ /* ++ * This condition may trigger on a newly created bic, be sure to drop the ++ * lock before returning. ++ */ ++ if (unlikely(bfqd == NULL) || likely(bic->ioprio == ioprio)) ++ goto out; ++ ++ bfqq = bic->bfqq[BLK_RW_ASYNC]; ++ if (bfqq != NULL) { ++ bfqg = container_of(bfqq->entity.sched_data, struct bfq_group, ++ sched_data); ++ new_bfqq = bfq_get_queue(bfqd, bfqg, BLK_RW_ASYNC, bic, ++ GFP_ATOMIC); ++ if (new_bfqq != NULL) { ++ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, ++ "changed_ioprio: bfqq %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++ } ++ ++ bfqq = bic->bfqq[BLK_RW_SYNC]; ++ if (bfqq != NULL) ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ bic->ioprio = ioprio; ++ ++out: ++ bfq_put_bfqd_unlock(bfqd, &flags); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ ++ atomic_set(&bfqq->ref, 0); ++ bfqq->bfqd = bfqd; ++ ++ bfq_mark_bfqq_prio_changed(bfqq); ++ ++ if (is_sync) { ++ if (!bfq_class_idle(bfqq)) ++ bfq_mark_bfqq_idle_window(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ } ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->raising_coeff = 1; ++ bfqq->last_rais_start_finish = 0; ++ bfqq->soft_rt_next_start = -1; ++} ++ ++static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int is_sync, ++ struct bfq_io_cq *bic, ++ gfp_t gfp_mask) ++{ ++ struct bfq_queue *bfqq, *new_bfqq = NULL; ++ ++retry: ++ /* bic always exists here */ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ ++ /* ++ * Always try a new alloc if we fall back to the OOM bfqq ++ * originally, since it should just be a temporary situation. 
++ */ ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = NULL; ++ if (new_bfqq != NULL) { ++ bfqq = new_bfqq; ++ new_bfqq = NULL; ++ } else if (gfp_mask & __GFP_WAIT) { ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ new_bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ spin_lock_irq(bfqd->queue->queue_lock); ++ if (new_bfqq != NULL) ++ goto retry; ++ } else { ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ gfp_mask | __GFP_ZERO, ++ bfqd->queue->node); ++ } ++ ++ if (bfqq != NULL) { ++ bfq_init_bfqq(bfqd, bfqq, current->pid, is_sync); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ } ++ ++ bfq_init_prio_data(bfqq, bic); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ } ++ ++ if (new_bfqq != NULL) ++ kmem_cache_free(bfq_pool, new_bfqq); ++ ++ return bfqq; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct bfq_io_cq *bic, gfp_t gfp_mask) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ } ++ ++ if (bfqq == NULL) ++ bfqq = bfq_find_alloc_queue(bfqd, bfqg, is_sync, bic, gfp_mask); ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will prune it. ++ */ ++ if (!is_sync && *async_bfqq == NULL) { ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ *async_bfqq = bfqq; ++ } ++ ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ unsigned long elapsed = jiffies - bic->ttime.last_end_request; ++ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); ++ ++ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; ++ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / bic->ttime.ttime_samples; ++} ++ ++static void bfq_update_io_seektime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ sector_t sdist; ++ u64 total; ++ ++ if (bfqq->last_request_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; ++ else ++ sdist = bfqq->last_request_pos - blk_rq_pos(rq); ++ ++ /* ++ * Don't allow the seek distance to get too large from the ++ * odd fragment, pagein, etc. 
++ */ ++ if (bfqq->seek_samples == 0) /* first request, not really a seek */ ++ sdist = 0; ++ else if (bfqq->seek_samples <= 60) /* second & third seek */ ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); ++ else ++ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); ++ ++ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; ++ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; ++ total = bfqq->seek_total + (bfqq->seek_samples/2); ++ do_div(total, bfqq->seek_samples); ++ if (bfq_bfqq_coop(bfqq)) { ++ /* ++ * If the mean seektime increases for a (non-seeky) shared ++ * queue, some cooperator is likely to be idling too much. ++ * On the contrary, if it decreases, some cooperator has ++ * probably waked up. ++ * ++ */ ++ if ((sector_t)total < bfqq->seek_mean) ++ bfq_mark_bfqq_some_coop_idle(bfqq) ; ++ else if ((sector_t)total > bfqq->seek_mean) ++ bfq_clear_bfqq_some_coop_idle(bfqq) ; ++ } ++ bfqq->seek_mean = (sector_t)total; ++ ++ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, ++ (u64)bfqq->seek_mean); ++} ++ ++/* ++ * Disable idle window if the process thinks too long or seeks so much that ++ * it doesn't matter. ++ */ ++static void bfq_update_idle_window(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ int enable_idle; ++ ++ /* Don't idle for async or idle io prio class. */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. */ ++ if (bfq_bfqq_just_split(bfqq)) ++ return; ++ ++ enable_idle = bfq_bfqq_idle_window(bfqq); ++ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ bfqd->bfq_slice_idle == 0 || ++ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && ++ bfqq->raising_coeff == 1)) ++ enable_idle = 0; ++ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { ++ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && ++ bfqq->raising_coeff == 1) ++ enable_idle = 0; ++ else ++ enable_idle = 1; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", ++ enable_idle); ++ ++ if (enable_idle) ++ bfq_mark_bfqq_idle_window(bfqq); ++ else ++ bfq_clear_bfqq_idle_window(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || ++ !BFQQ_SEEKY(bfqq)) ++ bfq_update_idle_window(bfqd, bfqq, bic); ++ bfq_clear_bfqq_just_split(bfqq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", ++ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), ++ (long long unsigned)bfqq->seek_mean); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->active_queue) { ++ /* ++ * If there is just this request queued and the request ++ * is small, just exit. ++ * In this way, if the disk is being idled to wait for a new ++ * request from the active queue, we avoid unplugging the ++ * device now. ++ * ++ * By doing so, we spare the disk to be committed ++ * to serve just a small request. 
On the contrary, we wait for ++ * the block layer to decide when to unplug the device: ++ * hopefully, new requests will be merged to this ++ * one quickly, then the device will be unplugged ++ * and larger requests will be dispatched. ++ */ ++ if (bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32) { ++ return; ++ } ++ if (bfq_bfqq_wait_request(bfqq)) { ++ /* ++ * If we are waiting for a request for this queue, let ++ * it rip immediately and flag that we must not expire ++ * this queue just now. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ del_timer(&bfqd->idle_slice_timer); ++ /* ++ * Here we can safely expire the queue, in ++ * case of budget timeout, without wasting ++ * guarantees ++ */ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ __blk_run_queue(bfqd->queue); ++ } ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt() && ++ (new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true))) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ atomic_inc(&new_bfqq->ref); ++ bfq_put_queue(bfqq); ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ ++ bfq_init_prio_data(bfqq, RQ_BIC(rq)); ++ ++ bfq_add_rq_rb(rq); ++ ++ rq_set_fifo_time(rq, jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]); ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed %u sects req (%d)", ++ blk_rq_sectors(rq), sync); ++ ++ bfq_update_hw_tag(bfqd); ++ ++ WARN_ON(!bfqd->rq_in_driver); ++ WARN_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ ++ if (bfq_bfqq_sync(bfqq)) ++ bfqd->sync_flight--; ++ ++ if (sync) ++ RQ_BIC(rq)->ttime.last_end_request = jiffies; ++ ++ /* ++ * If this is the active queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. 
++ */ ++ if (bfqd->active_queue == bfqq) { ++ if (bfq_bfqq_budget_new(bfqq)) ++ bfq_set_budget_timeout(bfqd); ++ ++ /* Idling is disabled also for cooperation issues: ++ * 1) there is a close cooperator for the queue, or ++ * 2) the queue is shared and some cooperator is likely ++ * to be idle (in this case, by not arming the idle timer, ++ * we try to slow down the queue, to prevent the zones ++ * of the disk accessed by the active cooperators to become ++ * too distant from the zone that will be accessed by the ++ * currently idle cooperators) ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, 0, BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (sync && ++ (bfqd->rq_in_driver == 0 || ++ bfqq->raising_coeff > 1) ++ && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && !bfq_close_cooperator(bfqd, bfqq, bfqd->last_position) ++ && (!bfq_bfqq_coop(bfqq) || ++ !bfq_bfqq_some_coop_idle(bfqq))) ++ bfq_arm_slice_timer(bfqd); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++} ++ ++static inline int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, int rw) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be queued. ++ * So just lookup a possibly existing queue, or return 'may queue' ++ * if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (bic == NULL) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); ++ if (bfqq != NULL) { ++ bfq_init_prio_data(bfqq, bic); ++ ++ return __bfq_may_queue(bfqq); ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq != NULL) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to said bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_some_coop_idle(bfqq); ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. 
++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ unsigned long flags; ++ bool split = false; ++ ++ might_sleep_if(gfp_mask & __GFP_WAIT); ++ ++ bfq_changed_ioprio(bic); ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (bic == NULL) ++ goto queue_fail; ++ ++ bfqg = bfq_bic_update_cgroup(bic); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (bfqq == NULL || bfqq == &bfqd->oom_bfqq) { ++ bfqq = bfq_get_queue(bfqd, bfqg, is_sync, bic, gfp_mask); ++ bic_set_bfqq(bic, bfqq, is_sync); ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, ++ atomic_read(&bfqq->ref)); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ bfq_mark_bfqq_just_split(bfqq); ++ /* ++ * If the queue has just been split from a shared queue, ++ * restore the idle window and the possible weight ++ * raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bic); ++ } ++ } ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the active_queue ++ * is idling inside its time slice. ++ */ ++static void bfq_idle_slice_timer(unsigned long data) ++{ ++ struct bfq_data *bfqd = (struct bfq_data *)data; ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->active_queue; ++ /* ++ * Theoretical race here: active_queue can be NULL or different ++ * from the queue that was idling if the timer handler spins on ++ * the queue_lock and a new request arrives for the current ++ * queue and there is a full dispatch cycle that changes the ++ * active_queue. This can hardly happen, but in the worst case ++ * we just expire a queue too early. 
++ */ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the first ++ * request of the active queue arrives during ++ * disk idling ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, 1, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ del_timer_sync(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static inline void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); ++ if (bfqq != NULL) { ++ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure untill all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, 0); ++ ++ bfq_disconnect_groups(bfqd); ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ synchronize_rcu(); ++ ++ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); ++ ++ bfq_free_root_group(bfqd); ++ kfree(bfqd); ++} ++ ++static int bfq_init_queue(struct request_queue *q) ++{ ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ ++ bfqd = kmalloc_node(sizeof(*bfqd), GFP_KERNEL | __GFP_ZERO, q->node); ++ if (bfqd == NULL) ++ return -ENOMEM; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. 
++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, 1, 0); ++ atomic_inc(&bfqd->oom_bfqq.ref); ++ ++ bfqd->queue = q; ++ q->elevator->elevator_data = bfqd; ++ ++ bfqg = bfq_alloc_root_group(bfqd, q->node); ++ if (bfqg == NULL) { ++ kfree(bfqd); ++ return -ENOMEM; ++ } ++ ++ bfqd->root_group = bfqg; ++ ++ init_timer(&bfqd->idle_slice_timer); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ bfqd->idle_slice_timer.data = (unsigned long)bfqd; ++ ++ bfqd->rq_pos_tree = RB_ROOT; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_quantum = bfq_quantum; ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_class_idle_last_service = 0; ++ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; ++ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; ++ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; ++ ++ bfqd->low_latency = true; ++ ++ bfqd->bfq_raising_coeff = 20; ++ bfqd->bfq_raising_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_raising_max_time = 0; ++ bfqd->bfq_raising_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_raising_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_raising_max_softrt_rate = 7000; ++ ++ /* Initially estimate the device's peak rate as the reference rate */ ++ if (blk_queue_nonrot(bfqd->queue)) { ++ bfqd->RT_prod = R_nonrot * T_nonrot; ++ bfqd->peak_rate = R_nonrot; ++ } else { ++ bfqd->RT_prod = R_rot * T_rot; ++ bfqd->peak_rate = R_rot; ++ } ++ ++ return 0; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ if (bfq_pool != NULL) ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (bfq_pool == NULL) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%d\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, size_t count) ++{ ++ unsigned long new_val; ++ int ret = strict_strtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_raising_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ return sprintf(page, "%d\n", bfqd->bfq_raising_max_time > 0 ? 
++ bfqd->bfq_raising_max_time : ++ bfq_wrais_duration(bfqd)); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d," ++ " dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1], ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_rais_start_finish), ++ jiffies_to_msecs(bfqq->raising_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned int __data = __VAR; \ ++ if (__CONV) \ ++ __data = jiffies_to_msecs(__data); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_quantum_show, bfqd->bfq_quantum, 0); ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_max_budget_async_rq_show, bfqd->bfq_max_budget_async_rq, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); ++SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_raising_coeff_show, bfqd->bfq_raising_coeff, 0); ++SHOW_FUNCTION(bfq_raising_rt_max_time_show, bfqd->bfq_raising_rt_max_time, 1); ++SHOW_FUNCTION(bfq_raising_min_idle_time_show, bfqd->bfq_raising_min_idle_time, ++ 1); ++SHOW_FUNCTION(bfq_raising_min_inter_arr_async_show, ++ bfqd->bfq_raising_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_raising_max_softrt_rate_show, ++ bfqd->bfq_raising_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long __data; \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_quantum_store, &bfqd->bfq_quantum, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_back_seek_max_store, 
&bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, ++ 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_coeff_store, &bfqd->bfq_raising_coeff, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_raising_max_time_store, &bfqd->bfq_raising_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_rt_max_time_store, &bfqd->bfq_raising_rt_max_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_idle_time_store, ++ &bfqd->bfq_raising_min_idle_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_min_inter_arr_async_store, ++ &bfqd->bfq_raising_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_raising_max_softrt_rate_store, ++ &bfqd->bfq_raising_max_softrt_rate, 0, INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static inline unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) ++{ ++ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); ++ ++ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) ++ return bfq_calc_max_budget(bfqd->peak_rate, timeout); ++ else ++ return bfq_default_max_budget; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long __data; ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(quantum), ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(max_budget_async_rq), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(timeout_async), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(raising_coeff), ++ BFQ_ATTR(raising_max_time), ++ BFQ_ATTR(raising_rt_max_time), ++ BFQ_ATTR(raising_min_idle_time), ++ BFQ_ATTR(raising_min_inter_arr_async), ++ BFQ_ATTR(raising_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ 
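Editor's aside (illustrative only, not part of the patch): the show/store helpers and the bfq_attrs[] table above are what expose BFQ's tunables through sysfs. Once a disk is switched to bfq, these attributes are expected to appear under /sys/block/<dev>/queue/iosched/. The small userspace sketch below reads and then enables low_latency as an example of how such a tunable is driven; the device name "sda" is only a placeholder.

/* Toy userspace client for one BFQ sysfs tunable (assumes bfq is active on sda). */
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	const char *attr = "/sys/block/sda/queue/iosched/low_latency";
	char buf[16];
	int fd = open(attr, O_RDWR);

	if (fd < 0) {
		perror("open");		/* bfq not in use on sda, or no permission */
		return 1;
	}

	ssize_t n = read(fd, buf, sizeof(buf) - 1);
	if (n > 0) {
		buf[n] = '\0';
		printf("low_latency is currently %s", buf);	/* sysfs value ends with \n */
	}

	/* Equivalent of "echo 1 > .../low_latency". */
	lseek(fd, 0, SEEK_SET);
	if (write(fd, "1\n", 2) != 2)
		perror("write");

	close(fd);
	return 0;
}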
.ops = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++ .elevator_allow_merge_fn = bfq_allow_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++static int __init bfq_init(void) ++{ ++ /* ++ * Can be 0 on HZ < 1000 setups. ++ */ ++ if (bfq_slice_idle == 0) ++ bfq_slice_idle = 1; ++ ++ if (bfq_timeout_async == 0) ++ bfq_timeout_async = 1; ++ ++ if (bfq_slab_setup()) ++ return -ENOMEM; ++ ++ elv_register(&iosched_bfq); ++ ++ return 0; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("Budget Fair Queueing IO scheduler"); +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +new file mode 100644 +index 0000000..a0edaa2 +--- /dev/null ++++ b/block/bfq-sched.c +@@ -0,0 +1,1044 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++#ifdef CONFIG_CGROUP_BFQIO ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = entity->parent) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd); ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ ++ BUG_ON(next_active == NULL); ++ ++ group_sd = next_active->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an active entity. ++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity != NULL) ++ bfqg_entity->budget = next_active->budget; ++} ++ ++static int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ struct bfq_entity *next_active; ++ ++ if (sd->active_entity != NULL) ++ /* will update/requeue at the end of service */ ++ return 0; ++ ++ /* ++ * NOTE: this can be improved in many ways, such as returning ++ * 1 (and thus propagating upwards the update) only when the ++ * budget changes, or caching the bfqq that will be scheduled ++ * next from this subtree. By now we worry more about ++ * correctness than about performance... 
++ */ ++ next_active = bfq_lookup_next_entity(sd, 0, NULL); ++ sd->next_active = next_active; ++ ++ if (next_active != NULL) ++ bfq_update_budget(next_active); ++ ++ return 1; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(sd->next_active != entity); ++} ++#else ++#define for_each_entity(entity) \ ++ for (; entity != NULL; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity != NULL; entity = parent) ++ ++static inline int bfq_update_next_active(struct bfq_sched_data *sd) ++{ ++ return 0; ++} ++ ++static inline void bfq_check_next_active(struct bfq_sched_data *sd, ++ struct bfq_entity *entity) ++{ ++} ++ ++static inline void bfq_update_budget(struct bfq_entity *next_active) ++{ ++} ++#endif ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static inline int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static inline struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(entity == NULL); ++ ++ if (entity->my_sched_data == NULL) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain. ++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static inline u64 bfq_delta(unsigned long service, ++ unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static inline void bfq_calc_finish(struct bfq_entity *entity, ++ unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ if (bfqq != NULL) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "calc_finish: start %llu, finish %llu, delta %llu", ++ entity->start, entity->finish, ++ bfq_delta(service, entity->weight)); ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static inline struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node != NULL) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. 
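Editor's aside (illustrative only, not part of the patch): bfq_gt() and bfq_delta() defined above are easy to sanity-check in isolation. The standalone sketch below re-implements them in plain userspace C and shows that the signed-difference trick keeps timestamp comparisons correct across a 64-bit wraparound, and that doubling the weight halves the virtual-time delta charged for the same service; the constant 22 mirrors WFQ_SERVICE_SHIFT above.

/* Standalone sanity check for the timestamp helpers above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define WFQ_SERVICE_SHIFT	22	/* same shift as in the patch */

/* Wrap-safe "a > b" on 64-bit virtual timestamps. */
static int bfq_gt(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) > 0;
}

/* Map an amount of service to a virtual-time delta for a given weight. */
static uint64_t bfq_delta(unsigned long service, unsigned long weight)
{
	return ((uint64_t)service << WFQ_SERVICE_SHIFT) / weight;
}

int main(void)
{
	/* Comparison survives wraparound: b is just below 2^64, a just after it. */
	uint64_t b = UINT64_MAX - 1, a = b + 4;	/* a == 2 after wrapping */
	assert(bfq_gt(a, b));
	assert(!bfq_gt(b, a));

	/* Twice the weight -> half the virtual-time delta for the same service. */
	assert(bfq_delta(4096, 200) == bfq_delta(4096, 100) / 2);
	printf("delta(4096, 100) = %llu\n",
	       (unsigned long long)bfq_delta(4096, 100));
	return 0;
}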
++ */ ++static inline void bfq_extract(struct rb_root *root, ++ struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree != NULL); ++ ++ while (*node != NULL) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static inline void bfq_update_min(struct bfq_entity *entity, ++ struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node != NULL) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static inline void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. 
++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (parent == NULL) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right != NULL) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left != NULL) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left != NULL) ++ node = node->rb_left; ++ else if (node->rb_right != NULL) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ WARN_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return IOPRIO_BE_NR - ioprio; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as mush as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ WARN_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight; ++} ++ ++static inline void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ if (bfqq != NULL) { ++ sd = entity->sched_data; ++ atomic_inc(&bfqq->ref); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (node->rb_right == NULL && node->rb_left == NULL) ++ deepest = rb_parent(node); ++ else if (node->rb_right == NULL) ++ deepest = node->rb_left; ++ else if (node->rb_left == NULL) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right != NULL) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. 
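Editor's aside (illustrative only, not part of the patch): bfq_ioprio_to_weight() and bfq_weight_to_ioprio() above are a simple affine mapping between CFQ-style ioprios and BFQ weights. The sketch below mirrors them in userspace C, assuming IOPRIO_BE_NR == 8 as in the mainline headers, and exercises the clamping to the escape value 0 for weights that have no ioprio equivalent.

/* Standalone copy of the ioprio <-> weight mapping used above. */
#include <assert.h>
#include <stdio.h>

#define IOPRIO_BE_NR	8	/* number of best-effort ioprio levels */

/* Lower ioprio number == higher priority == larger weight. */
static unsigned short bfq_ioprio_to_weight(int ioprio)
{
	return IOPRIO_BE_NR - ioprio;
}

/* Weights >= IOPRIO_BE_NR cannot be expressed as an ioprio: report 0 instead. */
static unsigned short bfq_weight_to_ioprio(int weight)
{
	return IOPRIO_BE_NR - weight < 0 ? 0 : IOPRIO_BE_NR - weight;
}

int main(void)
{
	assert(bfq_ioprio_to_weight(0) == 8);	/* highest BE priority */
	assert(bfq_ioprio_to_weight(7) == 1);	/* lowest BE priority */
	assert(bfq_weight_to_ioprio(3) == 5);	/* round-trips for small weights */
	assert(bfq_weight_to_ioprio(20) == 0);	/* cgroup-style weight: escape value */
	printf("ioprio 4 -> weight %u\n", bfq_ioprio_to_weight(4));
	return 0;
}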
++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node != NULL) ++ bfq_update_active_tree(node); ++ ++ if (bfqq != NULL) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (first_idle == NULL || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (last_idle == NULL || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq != NULL) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - remove an entity from the wfq trees. ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * ++ * Update the device status and forget everything about @entity, putting ++ * the device reference to it, if it is a queue. Entities belonging to ++ * groups are not refcounted. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd; ++ ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = 0; ++ st->wsum -= entity->weight; ++ if (bfqq != NULL) { ++ sd = entity->sched_data; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", ++ bfqq, atomic_read(&bfqq->ref)); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle != NULL && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. 
++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle != NULL && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->ioprio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ entity->orig_weight = entity->new_weight; ++ entity->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } else if (entity->new_ioprio != entity->ioprio) { ++ entity->ioprio = entity->new_ioprio; ++ entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ } else ++ entity->new_weight = entity->orig_weight = ++ bfq_ioprio_to_weight(entity->ioprio); ++ ++ entity->ioprio_class = entity->new_ioprio_class; ++ entity->ioprio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ entity->weight = entity->orig_weight * ++ (bfqq != NULL ? bfqq->raising_coeff : 1); ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) ++ entity->start = new_st->vtime; ++ } ++ ++ return new_st; ++} ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, unsigned long served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ BUG_ON(entity->service > entity->budget); ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %lu secs", served); ++} ++ ++/** ++ * bfq_bfqq_charge_full_budget - set the service to the entity budget. ++ * @bfqq: the queue that needs a service update. ++ * ++ * When it's not possible to be fair in the service domain, because ++ * a queue is not consuming its budget fast enough (the meaning of ++ * fast depends on the timeout parameter), we charge it a full ++ * budget. In this way we should obtain a sort of time-domain ++ * fairness among all the seeky/slow queues. ++ */ ++static inline void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); ++ ++ bfq_bfqq_served(bfqq, entity->budget - entity->service); ++} ++ ++/** ++ * __bfq_activate_entity - activate an entity. ++ * @entity: the entity being activated. ++ * ++ * Called whenever an entity is activated, i.e., it is not active and one ++ * of its children receives a new request, or has to be reactivated due to ++ * budget exhaustion. It uses the current budget of the entity (and the ++ * service received if @entity is active) of the queue to calculate its ++ * timestamps. 
++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (entity == sd->active_entity) { ++ BUG_ON(entity->tree != NULL); ++ /* ++ * If we are requeueing the current entity we have ++ * to take care of not charging to it service it has ++ * not received. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) { ++ /* ++ * Requeueing an entity due to a change of some ++ * next_active entity below it. We reuse the old ++ * start time. ++ */ ++ bfq_active_extract(st, entity); ++ } else if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ entity->start = bfq_gt(st->vtime, entity->finish) ? ++ st->vtime : entity->finish; ++ } else { ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = st->vtime; ++ st->wsum += entity->weight; ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st); ++ entity->on_st = 1; ++ } ++ ++ st = __bfq_entity_update_weight_prio(st, entity); ++ bfq_calc_finish(entity, entity->budget); ++ bfq_active_insert(st, entity); ++} ++ ++/** ++ * bfq_activate_entity - activate an entity and its ancestors if necessary. ++ * @entity: the entity to activate. ++ * ++ * Activate @entity and all the entities on the path from it to the root. ++ */ ++static void bfq_activate_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ /* ++ * No need to propagate the activation to the ++ * upper entities, as they will be updated when ++ * the active entity is rescheduled. ++ */ ++ break; ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - deactivate an entity from its service tree. ++ * @entity: the entity to deactivate. ++ * @requeue: if false, the entity will not be put into the idle tree. ++ * ++ * Deactivate an entity, independently from its previous state. If the ++ * entity was not on a service tree just return, otherwise if it is on ++ * any scheduler tree, extract it from that tree, and if necessary ++ * and if the caller did not specify @requeue, put it on the idle tree. ++ * ++ * Return %1 if the caller should update the entity hierarchy, i.e., ++ * if the entity was under service or if it was the next_active for ++ * its sched_data; return %0 otherwise. 
++ */ ++static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ int was_active = entity == sd->active_entity; ++ int ret = 0; ++ ++ if (!entity->on_st) ++ return 0; ++ ++ BUG_ON(was_active && entity->tree != NULL); ++ ++ if (was_active) { ++ bfq_calc_finish(entity, entity->service); ++ sd->active_entity = NULL; ++ } else if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree != NULL) ++ BUG(); ++ ++ if (was_active || sd->next_active == entity) ++ ret = bfq_update_next_active(sd); ++ ++ if (!requeue || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity); ++ else ++ bfq_idle_insert(st, entity); ++ ++ BUG_ON(sd->active_entity == entity); ++ BUG_ON(sd->next_active == entity); ++ ++ return ret; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity. ++ * @entity: the entity to deactivate. ++ * @requeue: true if the entity can be put on the idle tree ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ if (!__bfq_deactivate_entity(entity, requeue)) ++ /* ++ * The parent entity is still backlogged, and ++ * we don't need to update it as it is still ++ * under service. ++ */ ++ break; ++ ++ if (sd->next_active != NULL) ++ /* ++ * The parent entity is still backlogged and ++ * the budgets on the path towards the root ++ * need to be updated. ++ */ ++ goto update; ++ ++ /* ++ * If we reach there the parent is no more backlogged and ++ * we want to propagate the dequeue upwards. ++ */ ++ requeue = 1; ++ } ++ ++ return; ++ ++update: ++ entity = parent; ++ for_each_entity(entity) { ++ __bfq_activate_entity(entity); ++ ++ sd = entity->sched_data; ++ if (!bfq_update_next_active(sd)) ++ break; ++ } ++} ++ ++/** ++ * bfq_update_vtime - update vtime if necessary. ++ * @st: the service tree to act upon. ++ * ++ * If necessary update the service tree vtime to have at least one ++ * eligible entity, skipping to its start time. Assumes that the ++ * active tree of the device is not empty. ++ * ++ * NOTE: this hierarchical implementation updates vtimes quite often, ++ * we may end up with reactivated tasks getting timestamps after a ++ * vtime skip done because we needed a ->first_active entity on some ++ * intermediate node. ++ */ ++static void bfq_update_vtime(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry; ++ struct rb_node *node = st->active.rb_node; ++ ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entry->min_start, st->vtime)) { ++ st->vtime = entry->min_start; ++ bfq_forget_idle(st); ++ } ++} ++ ++/** ++ * bfq_first_active - find the eligible entity with the smallest finish time ++ * @st: the service tree to select from. ++ * ++ * This function searches the first schedulable entity, starting from the ++ * root of the tree and going on the left every time on this side there is ++ * a subtree with at least one eligible (start >= vtime) entity. The path ++ * on the right is followed only if a) the left subtree contains no eligible ++ * entities and b) no eligible entity has been found yet. 
++ */ ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entry, *first = NULL; ++ struct rb_node *node = st->active.rb_node; ++ ++ while (node != NULL) { ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++left: ++ if (!bfq_gt(entry->start, st->vtime)) ++ first = entry; ++ ++ BUG_ON(bfq_gt(entry->min_start, st->vtime)); ++ ++ if (node->rb_left != NULL) { ++ entry = rb_entry(node->rb_left, ++ struct bfq_entity, rb_node); ++ if (!bfq_gt(entry->min_start, st->vtime)) { ++ node = node->rb_left; ++ goto left; ++ } ++ } ++ if (first != NULL) ++ break; ++ node = node->rb_right; ++ } ++ ++ BUG_ON(first == NULL && !RB_EMPTY_ROOT(&st->active)); ++ return first; ++} ++ ++/** ++ * __bfq_lookup_next_entity - return the first eligible entity in @st. ++ * @st: the service tree. ++ * ++ * Update the virtual time in @st and return the first eligible entity ++ * it contains. ++ */ ++static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, ++ bool force) ++{ ++ struct bfq_entity *entity, *new_next_active = NULL; ++ ++ if (RB_EMPTY_ROOT(&st->active)) ++ return NULL; ++ ++ bfq_update_vtime(st); ++ entity = bfq_first_active_entity(st); ++ BUG_ON(bfq_gt(entity->start, st->vtime)); ++ ++ /* ++ * If the chosen entity does not match with the sched_data's ++ * next_active and we are forcedly serving the IDLE priority ++ * class tree, bubble up budget update. ++ */ ++ if (unlikely(force && entity != entity->sched_data->next_active)) { ++ new_next_active = entity; ++ for_each_entity(new_next_active) ++ bfq_update_budget(new_next_active); ++ } ++ ++ return entity; ++} ++ ++/** ++ * bfq_lookup_next_entity - return the first eligible entity in @sd. ++ * @sd: the sched_data. ++ * @extract: if true the returned entity will be also extracted from @sd. ++ * ++ * NOTE: since we cache the next_active entity at each level of the ++ * hierarchy, the complexity of the lookup can be decreased with ++ * absolutely no effort just returning the cached next_active value; ++ * we prefer to do full lookups to test the consistency of * the data ++ * structures. ++ */ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ int extract, ++ struct bfq_data *bfqd) ++{ ++ struct bfq_service_tree *st = sd->service_tree; ++ struct bfq_entity *entity; ++ int i=0; ++ ++ BUG_ON(sd->active_entity != NULL); ++ ++ if (bfqd != NULL && ++ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { ++ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, true); ++ if (entity != NULL) { ++ i = BFQ_IOPRIO_CLASSES - 1; ++ bfqd->bfq_class_idle_last_service = jiffies; ++ sd->next_active = entity; ++ } ++ } ++ for (; i < BFQ_IOPRIO_CLASSES; i++) { ++ entity = __bfq_lookup_next_entity(st + i, false); ++ if (entity != NULL) { ++ if (extract) { ++ bfq_check_next_active(sd, entity); ++ bfq_active_extract(st + i, entity); ++ sd->active_entity = entity; ++ sd->next_active = NULL; ++ } ++ break; ++ } ++ } ++ ++ return entity; ++} ++ ++/* ++ * Get next queue for service. 
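Editor's aside (illustrative only, not part of the patch): the lookup code above walks the augmented tree to find, among the entities whose start time is eligible (start <= vtime), the one with the smallest finish time, in O(log N). The toy below applies the same selection rule with a plain linear scan and without wrap-safe comparisons, just to make the rule itself concrete.

/* Simplified, O(N) version of the eligible-with-minimum-finish selection. */
#include <stdint.h>
#include <stdio.h>

struct toy_entity {
	uint64_t start;		/* virtual start time (S_i) */
	uint64_t finish;	/* virtual finish time (F_i) */
};

/* Return the index of the eligible entity with minimum finish, or -1. */
static int pick_next(const struct toy_entity *e, int n, uint64_t vtime)
{
	int best = -1;

	for (int i = 0; i < n; i++) {
		if (e[i].start > vtime)		/* not eligible yet */
			continue;
		if (best < 0 || e[i].finish < e[best].finish)
			best = i;
	}
	return best;
}

int main(void)
{
	struct toy_entity e[] = {
		{ .start = 0,  .finish = 40 },
		{ .start = 0,  .finish = 25 },	/* eligible, smallest finish */
		{ .start = 90, .finish = 95 },	/* not eligible at vtime 50 */
	};

	printf("next entity: %d\n", pick_next(e, 3, 50));	/* prints 1 */
	return 0;
}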
++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->active_queue != NULL); ++ ++ if (bfqd->busy_queues == 0) ++ return NULL; ++ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd != NULL; sd = entity->my_sched_data) { ++ entity = bfq_lookup_next_entity(sd, 1, bfqd); ++ BUG_ON(entity == NULL); ++ entity->service = 0; ++ } ++ ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(bfqq == NULL); ++ ++ return bfqq; ++} ++ ++static void __bfq_bfqd_reset_active(struct bfq_data *bfqd) ++{ ++ if (bfqd->active_bic != NULL) { ++ put_io_context(bfqd->active_bic->icq.ioc); ++ bfqd->active_bic = NULL; ++ } ++ ++ bfqd->active_queue = NULL; ++ del_timer(&bfqd->idle_slice_timer); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq == bfqd->active_queue) ++ __bfq_bfqd_reset_active(bfqd); ++ ++ bfq_deactivate_entity(entity, requeue); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_entity(entity); ++} ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ int requeue) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfqd->busy_queues == 0); ++ bfqd->busy_queues--; ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, requeue); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->active_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues++; ++} +diff --git a/block/bfq.h b/block/bfq.h +new file mode 100644 +index 0000000..ce3c474 +--- /dev/null ++++ b/block/bfq.h +@@ -0,0 +1,617 @@ ++/* ++ * BFQ-v5r1 for 3.7.0: data structures and common functions prototypes. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe ++ * ++ * Copyright (C) 2008 Fabio Checconi ++ * Paolo Valente ++ * ++ * Copyright (C) 2010 Paolo Valente ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include ++#include ++#include ++#include ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT HZ/5 ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++ ++#define BFQ_DEFAULT_GRP_WEIGHT 10 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * @active: tree for active entities (i.e., those backlogged). ++ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). ++ * @first_idle: idle entity with minimum F_i. ++ * @last_idle: idle entity with maximum F_i. ++ * @vtime: scheduler virtual time. ++ * @wsum: scheduler weight sum; active and idle entities contribute to it. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. 
++ */ ++struct bfq_service_tree { ++ struct rb_root active; ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; ++ struct bfq_entity *last_idle; ++ ++ u64 vtime; ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * @active_entity: entity under service. ++ * @next_active: head-of-the-line entity in the scheduler. ++ * @service_tree: array of service trees, one per ioprio_class. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as ++ * an intermediate queue on a hierarchical setup. ++ * @next_active points to the active entity of the sched_data service ++ * trees that will be scheduled next. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *active_entity; ++ struct bfq_entity *next_active; ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * @rb_node: service_tree member. ++ * @on_st: flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree). ++ * @finish: B-WF2Q+ finish timestamp (aka F_i). ++ * @start: B-WF2Q+ start timestamp (aka S_i). ++ * @tree: tree the entity is enqueued into; %NULL if not on a tree. ++ * @min_start: minimum start time of the (active) subtree rooted at ++ * this entity; used for O(log N) lookups into active trees. ++ * @service: service received during the last round of service. ++ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. ++ * @weight: weight of the queue ++ * @parent: parent entity, for hierarchical scheduling. ++ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the ++ * associated scheduler queue, %NULL on leaf nodes. ++ * @sched_data: the scheduler queue this entity belongs to. ++ * @ioprio: the ioprio in use. ++ * @new_weight: when a weight change is requested, the new weight value. ++ * @orig_weight: original weight, used to implement weight boosting ++ * @new_ioprio: when an ioprio change is requested, the new ioprio value. ++ * @ioprio_class: the ioprio_class in use. ++ * @new_ioprio_class: when an ioprio_class change is requested, the new ++ * ioprio_class value. ++ * @ioprio_changed: flag, true when the user requested a weight, ioprio or ++ * ioprio_class change. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @ioprio_changed flag. 
As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; ++ ++ int on_st; ++ ++ u64 finish; ++ u64 start; ++ ++ struct rb_root *tree; ++ ++ u64 min_start; ++ ++ unsigned long service, budget; ++ unsigned short weight, new_weight; ++ unsigned short orig_weight; ++ ++ struct bfq_entity *parent; ++ ++ struct bfq_sched_data *my_sched_data; ++ struct bfq_sched_data *sched_data; ++ ++ unsigned short ioprio, new_ioprio; ++ unsigned short ioprio_class, new_ioprio_class; ++ ++ int ioprio_changed; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * @ref: reference counter. ++ * @bfqd: parent bfq_data. ++ * @new_bfqq: shared bfq_queue if queue is cooperating with ++ * one or more other queues. ++ * @pos_node: request-position tree member (see bfq_data's @rq_pos_tree). ++ * @pos_root: request-position tree root (see bfq_data's @rq_pos_tree). ++ * @sort_list: sorted list of pending requests. ++ * @next_rq: if fifo isn't expired, next request to serve. ++ * @queued: nr of requests queued in @sort_list. ++ * @allocated: currently allocated requests. ++ * @meta_pending: pending metadata requests. ++ * @fifo: fifo list of requests in sort_list. ++ * @entity: entity representing this queue in the scheduler. ++ * @max_budget: maximum budget allowed from the feedback mechanism. ++ * @budget_timeout: budget expiration (in jiffies). ++ * @dispatched: number of requests on the dispatch list or inside driver. ++ * @org_ioprio: saved ioprio during boosted periods. ++ * @flags: status flags. ++ * @bfqq_list: node for active/idle bfqq list inside our bfqd. ++ * @seek_samples: number of seeks sampled ++ * @seek_total: sum of the distances of the seeks sampled ++ * @seek_mean: mean seek distance ++ * @last_request_pos: position of the last request enqueued ++ * @pid: pid of the process owning the queue, used for logging purposes. ++ * @last_rais_start_time: last (idle -> weight-raised) transition attempt ++ * @raising_cur_max_time: current max raising time for this queue ++ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the ++ * queue is shared ++ * ++ * A bfq_queue is a leaf request queue; it can be associated to an io_context ++ * or more (if it is an async one). @cgroup holds a reference to the ++ * cgroup, to be sure that it does not disappear while a bfqq still ++ * references it (mostly to avoid races between request issuing and task ++ * migration followed by cgroup distruction). ++ * All the fields are protected by the queue lock of the containing bfqd. 
++ */ ++struct bfq_queue { ++ atomic_t ref; ++ struct bfq_data *bfqd; ++ ++ /* fields for cooperating queues handling */ ++ struct bfq_queue *new_bfqq; ++ struct rb_node pos_node; ++ struct rb_root *pos_root; ++ ++ struct rb_root sort_list; ++ struct request *next_rq; ++ int queued[2]; ++ int allocated[2]; ++ int meta_pending; ++ struct list_head fifo; ++ ++ struct bfq_entity entity; ++ ++ unsigned long max_budget; ++ unsigned long budget_timeout; ++ ++ int dispatched; ++ ++ unsigned short org_ioprio; ++ ++ unsigned int flags; ++ ++ struct list_head bfqq_list; ++ ++ unsigned int seek_samples; ++ u64 seek_total; ++ sector_t seek_mean; ++ sector_t last_request_pos; ++ ++ pid_t pid; ++ struct bfq_io_cq *bic; ++ ++ /* weight-raising fields */ ++ unsigned int raising_cur_max_time; ++ u64 last_rais_start_finish, soft_rt_next_start; ++ unsigned int raising_coeff; ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ * @ttime_total: total process thinktime ++ * @ttime_samples: number of thinktime samples ++ * @ttime_mean: average process thinktime ++ */ ++struct bfq_ttime { ++ unsigned long last_end_request; ++ ++ unsigned long ttime_total; ++ unsigned long ttime_samples; ++ unsigned long ttime_mean; ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ * @icq: associated io_cq structure ++ * @bfqq: array of two process queues, the sync and the async ++ * @ttime: associated @bfq_ttime struct ++ * @raising_time_left: snapshot of the time left before weight raising ends ++ * for the sync queue associated to this process; this ++ * snapshot is taken to remember this value while the weight ++ * raising is suspended because the queue is merged with a ++ * shared queue, and is used to set @raising_cur_max_time ++ * when the queue is split from the shared queue and its ++ * weight is raised again ++ * @saved_idle_window: same purpose as the previous field for the idle window ++ */ ++struct bfq_io_cq { ++ struct io_cq icq; /* must be the first member */ ++ struct bfq_queue *bfqq[2]; ++ struct bfq_ttime ttime; ++ int ioprio; ++ ++ unsigned int raising_time_left; ++ unsigned int saved_idle_window; ++}; ++ ++/** ++ * struct bfq_data - per device data structure. ++ * @queue: request queue for the managed device. ++ * @root_group: root bfq_group for the device. ++ * @rq_pos_tree: rbtree sorted by next_request position, ++ * used when determining if two or more queues ++ * have interleaving requests (see bfq_close_cooperator). ++ * @busy_queues: number of bfq_queues containing requests (including the ++ * queue under service, even if it is idling). ++ * @queued: number of queued requests. ++ * @rq_in_driver: number of requests dispatched and waiting for completion. ++ * @sync_flight: number of sync requests in the driver. ++ * @max_rq_in_driver: max number of reqs in driver in the last @hw_tag_samples ++ * completed requests . ++ * @hw_tag_samples: nr of samples used to calculate hw_tag. ++ * @hw_tag: flag set to one if the driver is showing a queueing behavior. ++ * @budgets_assigned: number of budgets assigned. ++ * @idle_slice_timer: timer set when idling for the next sequential request ++ * from the queue under service. ++ * @unplug_work: delayed work to restart dispatching on the request queue. ++ * @active_queue: bfq_queue under service. ++ * @active_bic: bfq_io_cq (bic) associated with the @active_queue. ++ * @last_position: on-disk position of the last served request. ++ * @last_budget_start: beginning of the last budget. 
++ * @last_idling_start: beginning of the last idle slice. ++ * @peak_rate: peak transfer rate observed for a budget. ++ * @peak_rate_samples: number of samples used to calculate @peak_rate. ++ * @bfq_max_budget: maximum budget allotted to a bfq_queue before rescheduling. ++ * @group_list: list of all the bfq_groups active on the device. ++ * @active_list: list of all the bfq_queues active on the device. ++ * @idle_list: list of all the bfq_queues idle on the device. ++ * @bfq_quantum: max number of requests dispatched per dispatch round. ++ * @bfq_fifo_expire: timeout for async/sync requests; when it expires ++ * requests are served in fifo order. ++ * @bfq_back_penalty: weight of backward seeks wrt forward ones. ++ * @bfq_back_max: maximum allowed backward seek. ++ * @bfq_slice_idle: maximum idling time. ++ * @bfq_user_max_budget: user-configured max budget value (0 for auto-tuning). ++ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to ++ * async queues. ++ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to ++ * to prevent seeky queues to impose long latencies to well ++ * behaved ones (this also implies that seeky queues cannot ++ * receive guarantees in the service domain; after a timeout ++ * they are charged for the whole allocated budget, to try ++ * to preserve a behavior reasonably fair among them, but ++ * without service-domain guarantees). ++ * @bfq_raising_coeff: Maximum factor by which the weight of a boosted ++ * queue is multiplied ++ * @bfq_raising_max_time: maximum duration of a weight-raising period (jiffies) ++ * @bfq_raising_rt_max_time: maximum duration for soft real-time processes ++ * @bfq_raising_min_idle_time: minimum idle period after which weight-raising ++ * may be reactivated for a queue (in jiffies) ++ * @bfq_raising_min_inter_arr_async: minimum period between request arrivals ++ * after which weight-raising may be ++ * reactivated for an already busy queue ++ * (in jiffies) ++ * @bfq_raising_max_softrt_rate: max service-rate for a soft real-time queue, ++ * sectors per seconds ++ * @RT_prod: cached value of the product R*T used for computing the maximum ++ * duration of the weight raising automatically ++ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions ++ * ++ * All the fields are protected by the @queue lock. 
++ */ ++struct bfq_data { ++ struct request_queue *queue; ++ ++ struct bfq_group *root_group; ++ ++ struct rb_root rq_pos_tree; ++ ++ int busy_queues; ++ int queued; ++ int rq_in_driver; ++ int sync_flight; ++ ++ int max_rq_in_driver; ++ int hw_tag_samples; ++ int hw_tag; ++ ++ int budgets_assigned; ++ ++ struct timer_list idle_slice_timer; ++ struct work_struct unplug_work; ++ ++ struct bfq_queue *active_queue; ++ struct bfq_io_cq *active_bic; ++ ++ sector_t last_position; ++ ++ ktime_t last_budget_start; ++ ktime_t last_idling_start; ++ int peak_rate_samples; ++ u64 peak_rate; ++ unsigned long bfq_max_budget; ++ ++ struct hlist_head group_list; ++ struct list_head active_list; ++ struct list_head idle_list; ++ ++ unsigned int bfq_quantum; ++ unsigned int bfq_fifo_expire[2]; ++ unsigned int bfq_back_penalty; ++ unsigned int bfq_back_max; ++ unsigned int bfq_slice_idle; ++ u64 bfq_class_idle_last_service; ++ ++ unsigned int bfq_user_max_budget; ++ unsigned int bfq_max_budget_async_rq; ++ unsigned int bfq_timeout[2]; ++ ++ bool low_latency; ++ ++ /* parameters of the low_latency heuristics */ ++ unsigned int bfq_raising_coeff; ++ unsigned int bfq_raising_max_time; ++ unsigned int bfq_raising_rt_max_time; ++ unsigned int bfq_raising_min_idle_time; ++ unsigned int bfq_raising_min_inter_arr_async; ++ unsigned int bfq_raising_max_softrt_rate; ++ u64 RT_prod; ++ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is under service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ ++ BFQ_BFQQ_FLAG_prio_changed, /* task priority has changed */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be splitted */ ++ BFQ_BFQQ_FLAG_some_coop_idle, /* some cooperator is inactive */ ++ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static inline void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static inline int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(idle_window); ++BFQ_BFQQ_FNS(prio_changed); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(budget_new); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(some_coop_idle); ++BFQ_BFQQ_FNS(just_split); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) ++ ++/* Expiration reasons. 
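Editor's aside (illustrative only, not part of the patch): BFQ_BFQQ_FNS(busy) above expands to the mark/clear/test triple shown below, one triple per flag bit in bfqq->flags. The standalone copy only exists so the expansion can be compiled and exercised outside the kernel.

/* What BFQ_BFQQ_FNS(busy) expands to, as a self-contained userspace program. */
#include <assert.h>

struct bfq_queue { unsigned int flags; };	/* reduced stand-in for the real struct */

enum bfqq_state_flags { BFQ_BFQQ_FLAG_busy = 0 };

static inline void bfq_mark_bfqq_busy(struct bfq_queue *bfqq)
{
	bfqq->flags |= (1 << BFQ_BFQQ_FLAG_busy);
}
static inline void bfq_clear_bfqq_busy(struct bfq_queue *bfqq)
{
	bfqq->flags &= ~(1 << BFQ_BFQQ_FLAG_busy);
}
static inline int bfq_bfqq_busy(const struct bfq_queue *bfqq)
{
	return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_busy)) != 0;
}

int main(void)
{
	struct bfq_queue q = { .flags = 0 };

	bfq_mark_bfqq_busy(&q);
	assert(bfq_bfqq_busy(&q));
	bfq_clear_bfqq_busy(&q);
	assert(!bfq_bfqq_busy(&q));
	return 0;
}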
*/ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* queue has been idling for too long */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++}; ++ ++#ifdef CONFIG_CGROUP_BFQIO ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @group_node: node to be inserted into the bfqio_cgroup->group_data ++ * list of the containing cgroup's bfqio_cgroup. ++ * @bfqd_node: node to be inserted into the @bfqd->group_list list ++ * of the groups active on the same device; used for cleanup. ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/migration. ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @group_node is protected by the bfqio_cgroup lock, and is accessed ++ * via RCU from its readers. ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ struct hlist_node group_node; ++ struct hlist_node bfqd_node; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++}; ++ ++/** ++ * struct bfqio_cgroup - bfq cgroup data structure. ++ * @css: subsystem state for bfq in the containing cgroup. ++ * @weight: cgroup weight. ++ * @ioprio: cgroup ioprio. ++ * @ioprio_class: cgroup ioprio_class. ++ * @lock: spinlock that protects @ioprio, @ioprio_class and @group_data. ++ * @group_data: list containing the bfq_group belonging to this cgroup. ++ * ++ * @group_data is accessed using RCU, with @lock protecting the updates, ++ * @ioprio and @ioprio_class are protected by @lock. 
++ */ ++struct bfqio_cgroup { ++ struct cgroup_subsys_state css; ++ ++ unsigned short weight, ioprio, ioprio_class; ++ ++ spinlock_t lock; ++ struct hlist_head group_data; ++}; ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++}; ++#endif ++ ++static inline struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ unsigned int idx = entity->ioprio_class - 1; ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ return sched_data->service_tree + idx; ++} ++ ++static inline struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, ++ int is_sync) ++{ ++ return bic->bfqq[!!is_sync]; ++} ++ ++static inline void bic_set_bfqq(struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, int is_sync) ++{ ++ bic->bfqq[!!is_sync] = bfqq; ++} ++ ++static inline struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++/** ++ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. ++ * @ptr: a pointer to a bfqd. ++ * @flags: storage for the flags to be saved. ++ * ++ * This function allows bfqg->bfqd to be protected by the ++ * queue lock of the bfqd they reference; the pointer is dereferenced ++ * under RCU, so the storage for bfqd is assured to be safe as long ++ * as the RCU read side critical section does not end. After the ++ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be ++ * sure that no other writer accessed it. If we raced with a writer, ++ * the function returns NULL, with the queue unlocked, otherwise it ++ * returns the dereferenced pointer, with the queue locked. 
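++ *
++ * A minimal usage sketch (hypothetical caller; bfqg is assumed to be a
++ * struct bfq_group pointer whose ->bfqd field holds the RCU-published
++ * bfq_data pointer described above):
++ *
++ *   unsigned long flags;
++ *   struct bfq_data *bfqd = bfq_get_bfqd_locked(&bfqg->bfqd, &flags);
++ *
++ *   if (bfqd != NULL) {
++ *           ... work on bfqd under bfqd->queue->queue_lock ...
++ *           bfq_put_bfqd_unlock(bfqd, &flags);
++ *   }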
++ */ ++static inline struct bfq_data *bfq_get_bfqd_locked(void **ptr, ++ unsigned long *flags) ++{ ++ struct bfq_data *bfqd; ++ ++ rcu_read_lock(); ++ bfqd = rcu_dereference(*(struct bfq_data **)ptr); ++ ++ if (bfqd != NULL) { ++ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); ++ if (*ptr == bfqd) ++ goto out; ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++ } ++ ++ bfqd = NULL; ++out: ++ rcu_read_unlock(); ++ return bfqd; ++} ++ ++static inline void bfq_put_bfqd_unlock(struct bfq_data *bfqd, ++ unsigned long *flags) ++{ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); ++} ++ ++static void bfq_changed_ioprio(struct bfq_io_cq *bic); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, int is_sync, ++ struct bfq_io_cq *bic, gfp_t gfp_mask); ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++#endif +-- +1.8.1.2 + diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0004-tuxonice-for-linux.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0004-tuxonice-for-linux.patch new file mode 100644 index 000000000..27848ccc2 --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/0004-tuxonice-for-linux.patch @@ -0,0 +1,22319 @@ +# Calculate format=diff os_linux_system==desktop +diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt +index 9aa8ff3..2ca1256 100644 +--- a/Documentation/kernel-parameters.txt ++++ b/Documentation/kernel-parameters.txt +@@ -3073,6 +3073,9 @@ bytes respectively. Such letter suffixes can also be entirely omitted. + HIGHMEM regardless of setting + of CONFIG_HIGHPTE. + ++ uuid_debug= (Boolean) whether to enable debugging of TuxOnIce's ++ uuid support. ++ + vdso= [X86,SH] + vdso=2: enable compat VDSO (default with COMPAT_VDSO) + vdso=1: enable VDSO (default) +diff --git a/Documentation/power/tuxonice-internals.txt b/Documentation/power/tuxonice-internals.txt +new file mode 100644 +index 0000000..7a96186 +--- /dev/null ++++ b/Documentation/power/tuxonice-internals.txt +@@ -0,0 +1,477 @@ ++ TuxOnIce 3.0 Internal Documentation. ++ Updated to 26 March 2009 ++ ++1. Introduction. ++ ++ TuxOnIce 3.0 is an addition to the Linux Kernel, designed to ++ allow the user to quickly shutdown and quickly boot a computer, without ++ needing to close documents or programs. It is equivalent to the ++ hibernate facility in some laptops. This implementation, however, ++ requires no special BIOS or hardware support. ++ ++ The code in these files is based upon the original implementation ++ prepared by Gabor Kuti and additional work by Pavel Machek and a ++ host of others. This code has been substantially reworked by Nigel ++ Cunningham, again with the help and testing of many others, not the ++ least of whom is Michael Frank. At its heart, however, the operation is ++ essentially the same as Gabor's version. ++ ++2. Overview of operation. ++ ++ The basic sequence of operations is as follows: ++ ++ a. Quiesce all other activity. ++ b. Ensure enough memory and storage space are available, and attempt ++ to free memory/storage if necessary. ++ c. Allocate the required memory and storage space. ++ d. Write the image. ++ e. Power down. 
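 As a rough sketch of this sequence (plain userspace C with illustrative
 function names, not TuxOnIce's actual code): each step can fail, and any
 failure after freezing has to thaw everything again so the user gets a
 working system back.

    #include <stdio.h>

    static int quiesce_activity(void)           { puts("a. freeze processes"); return 0; }
    static int prepare_memory_and_storage(void) { puts("b. check/free memory and storage"); return 0; }
    static int allocate_image(void)             { puts("c. allocate metadata and storage"); return 0; }
    static int write_image(void)                { puts("d. write pagesets and header"); return 0; }
    static void power_down(void)                { puts("e. power down"); }
    static void thaw_activity(void)             { puts("abort: thaw processes"); }

    int main(void)
    {
        if (quiesce_activity())
            return 1;

        if (prepare_memory_and_storage() || allocate_image() || write_image()) {
            thaw_activity();    /* on any failure, hand back a usable system */
            return 1;
        }

        power_down();
        return 0;
    }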
++ ++ There are a number of complicating factors which mean that things are ++ not as simple as the above would imply, however... ++ ++ o The activity of each process must be stopped at a point where it will ++ not be holding locks necessary for saving the image, or unexpectedly ++ restart operations due to something like a timeout and thereby make ++ our image inconsistent. ++ ++ o It is desirous that we sync outstanding I/O to disk before calculating ++ image statistics. This reduces corruption if one should suspend but ++ then not resume, and also makes later parts of the operation safer (see ++ below). ++ ++ o We need to get as close as we can to an atomic copy of the data. ++ Inconsistencies in the image will result in inconsistent memory contents at ++ resume time, and thus in instability of the system and/or file system ++ corruption. This would appear to imply a maximum image size of one half of ++ the amount of RAM, but we have a solution... (again, below). ++ ++ o In 2.6, we choose to play nicely with the other suspend-to-disk ++ implementations. ++ ++3. Detailed description of internals. ++ ++ a. Quiescing activity. ++ ++ Safely quiescing the system is achieved using three separate but related ++ aspects. ++ ++ First, we note that the vast majority of processes don't need to run during ++ suspend. They can be 'frozen'. We therefore implement a refrigerator ++ routine, which processes enter and in which they remain until the cycle is ++ complete. Processes enter the refrigerator via try_to_freeze() invocations ++ at appropriate places. A process cannot be frozen in any old place. It ++ must not be holding locks that will be needed for writing the image or ++ freezing other processes. For this reason, userspace processes generally ++ enter the refrigerator via the signal handling code, and kernel threads at ++ the place in their event loops where they drop locks and yield to other ++ processes or sleep. ++ ++ The task of freezing processes is complicated by the fact that there can be ++ interdependencies between processes. Freezing process A before process B may ++ mean that process B cannot be frozen, because it stops at waiting for ++ process A rather than in the refrigerator. This issue is seen where ++ userspace waits on freezeable kernel threads or fuse filesystem threads. To ++ address this issue, we implement the following algorithm for quiescing ++ activity: ++ ++ - Freeze filesystems (including fuse - userspace programs starting ++ new requests are immediately frozen; programs already running ++ requests complete their work before being frozen in the next ++ step) ++ - Freeze userspace ++ - Thaw filesystems (this is safe now that userspace is frozen and no ++ fuse requests are outstanding). ++ - Invoke sys_sync (noop on fuse). ++ - Freeze filesystems ++ - Freeze kernel threads ++ ++ If we need to free memory, we thaw kernel threads and filesystems, but not ++ userspace. We can then free caches without worrying about deadlocks due to ++ swap files being on frozen filesystems or such like. ++ ++ b. Ensure enough memory & storage are available. ++ ++ We have a number of constraints to meet in order to be able to successfully ++ suspend and resume. ++ ++ First, the image will be written in two parts, described below. One of these ++ parts needs to have an atomic copy made, which of course implies a maximum ++ size of one half of the amount of system memory. The other part ('pageset') ++ is not atomically copied, and can therefore be as large or small as desired. 
++ ++ Second, we have constraints on the amount of storage available. In these ++ calculations, we may also consider any compression that will be done. The ++ cryptoapi module allows the user to configure an expected compression ratio. ++ ++ Third, the user can specify an arbitrary limit on the image size, in ++ megabytes. This limit is treated as a soft limit, so that we don't fail the ++ attempt to suspend if we cannot meet this constraint. ++ ++ c. Allocate the required memory and storage space. ++ ++ Having done the initial freeze, we determine whether the above constraints ++ are met, and seek to allocate the metadata for the image. If the constraints ++ are not met, or we fail to allocate the required space for the metadata, we ++ seek to free the amount of memory that we calculate is needed and try again. ++ We allow up to four iterations of this loop before aborting the cycle. If we ++ do fail, it should only be because of a bug in TuxOnIce's calculations. ++ ++ These steps are merged together in the prepare_image function, found in ++ prepare_image.c. The functions are merged because of the cyclical nature ++ of the problem of calculating how much memory and storage is needed. Since ++ the data structures containing the information about the image must ++ themselves take memory and use storage, the amount of memory and storage ++ required changes as we prepare the image. Since the changes are not large, ++ only one or two iterations will be required to achieve a solution. ++ ++ The recursive nature of the algorithm is miminised by keeping user space ++ frozen while preparing the image, and by the fact that our records of which ++ pages are to be saved and which pageset they are saved in use bitmaps (so ++ that changes in number or fragmentation of the pages to be saved don't ++ feedback via changes in the amount of memory needed for metadata). The ++ recursiveness is thus limited to any extra slab pages allocated to store the ++ extents that record storage used, and the effects of seeking to free memory. ++ ++ d. Write the image. ++ ++ We previously mentioned the need to create an atomic copy of the data, and ++ the half-of-memory limitation that is implied in this. This limitation is ++ circumvented by dividing the memory to be saved into two parts, called ++ pagesets. ++ ++ Pageset2 contains most of the page cache - the pages on the active and ++ inactive LRU lists that aren't needed or modified while TuxOnIce is ++ running, so they can be safely written without an atomic copy. They are ++ therefore saved first and reloaded last. While saving these pages, ++ TuxOnIce carefully ensures that the work of writing the pages doesn't make ++ the image inconsistent. With the support for Kernel (Video) Mode Setting ++ going into the kernel at the time of writing, we need to check for pages ++ on the LRU that are used by KMS, and exclude them from pageset2. They are ++ atomically copied as part of pageset 1. ++ ++ Once pageset2 has been saved, we prepare to do the atomic copy of remaining ++ memory. As part of the preparation, we power down drivers, thereby providing ++ them with the opportunity to have their state recorded in the image. The ++ amount of memory allocated by drivers for this is usually negligible, but if ++ DRI is in use, video drivers may require significants amounts. Ideally we ++ would be able to query drivers while preparing the image as to the amount of ++ memory they will need. Unfortunately no such mechanism exists at the time of ++ writing. 
For this reason, TuxOnIce allows the user to set an ++ 'extra_pages_allowance', which is used to seek to ensure sufficient memory ++ is available for drivers at this point. TuxOnIce also lets the user set this ++ value to 0. In this case, a test driver suspend is done while preparing the ++ image, and the difference (plus a margin) used instead. TuxOnIce will also ++ automatically restart the hibernation process (twice at most) if it finds ++ that the extra pages allowance is not sufficient. It will then use what was ++ actually needed (plus a margin, again). Failure to hibernate should thus ++ be an extremely rare occurence. ++ ++ Having suspended the drivers, we save the CPU context before making an ++ atomic copy of pageset1, resuming the drivers and saving the atomic copy. ++ After saving the two pagesets, we just need to save our metadata before ++ powering down. ++ ++ As we mentioned earlier, the contents of pageset2 pages aren't needed once ++ they've been saved. We therefore use them as the destination of our atomic ++ copy. In the unlikely event that pageset1 is larger, extra pages are ++ allocated while the image is being prepared. This is normally only a real ++ possibility when the system has just been booted and the page cache is ++ small. ++ ++ This is where we need to be careful about syncing, however. Pageset2 will ++ probably contain filesystem meta data. If this is overwritten with pageset1 ++ and then a sync occurs, the filesystem will be corrupted - at least until ++ resume time and another sync of the restored data. Since there is a ++ possibility that the user might not resume or (may it never be!) that ++ TuxOnIce might oops, we do our utmost to avoid syncing filesystems after ++ copying pageset1. ++ ++ e. Power down. ++ ++ Powering down uses standard kernel routines. TuxOnIce supports powering down ++ using the ACPI S3, S4 and S5 methods or the kernel's non-ACPI power-off. ++ Supporting suspend to ram (S3) as a power off option might sound strange, ++ but it allows the user to quickly get their system up and running again if ++ the battery doesn't run out (we just need to re-read the overwritten pages) ++ and if the battery does run out (or the user removes power), they can still ++ resume. ++ ++4. Data Structures. ++ ++ TuxOnIce uses three main structures to store its metadata and configuration ++ information: ++ ++ a) Pageflags bitmaps. ++ ++ TuxOnIce records which pages will be in pageset1, pageset2, the destination ++ of the atomic copy and the source of the atomically restored image using ++ bitmaps. The code used is that written for swsusp, with small improvements ++ to match TuxOnIce's requirements. ++ ++ The pageset1 bitmap is thus easily stored in the image header for use at ++ resume time. ++ ++ As mentioned above, using bitmaps also means that the amount of memory and ++ storage required for recording the above information is constant. This ++ greatly simplifies the work of preparing the image. In earlier versions of ++ TuxOnIce, extents were used to record which pages would be stored. In that ++ case, however, eating memory could result in greater fragmentation of the ++ lists of pages, which in turn required more memory to store the extents and ++ more storage in the image header. These could in turn require further ++ freeing of memory, and another iteration. All of this complexity is removed ++ by having bitmaps. ++ ++ Bitmaps also make a lot of sense because TuxOnIce only ever iterates ++ through the lists. 
There is therefore no cost to not being able to find the ++ nth page in order 0 time. We only need to worry about the cost of finding ++ the n+1th page, given the location of the nth page. Bitwise optimisations ++ help here. ++ ++ b) Extents for block data. ++ ++ TuxOnIce supports writing the image to multiple block devices. In the case ++ of swap, multiple partitions and/or files may be in use, and we happily use ++ them all (with the exception of compcache pages, which we allocate but do ++ not use). This use of multiple block devices is accomplished as follows: ++ ++ Whatever the actual source of the allocated storage, the destination of the ++ image can be viewed in terms of one or more block devices, and on each ++ device, a list of sectors. To simplify matters, we only use contiguous, ++ PAGE_SIZE aligned sectors, like the swap code does. ++ ++ Since sector numbers on each bdev may well not start at 0, it makes much ++ more sense to use extents here. Contiguous ranges of pages can thus be ++ represented in the extents by contiguous values. ++ ++ Variations in block size are taken account of in transforming this data ++ into the parameters for bio submission. ++ ++ We can thus implement a layer of abstraction wherein the core of TuxOnIce ++ doesn't have to worry about which device we're currently writing to or ++ where in the device we are. It simply requests that the next page in the ++ pageset or header be written, leaving the details to this lower layer. ++ The lower layer remembers where in the sequence of devices and blocks each ++ pageset starts. The header always starts at the beginning of the allocated ++ storage. ++ ++ So extents are: ++ ++ struct extent { ++ unsigned long minimum, maximum; ++ struct extent *next; ++ } ++ ++ These are combined into chains of extents for a device: ++ ++ struct extent_chain { ++ int size; /* size of the extent ie sum (max-min+1) */ ++ int allocs, frees; ++ char *name; ++ struct extent *first, *last_touched; ++ }; ++ ++ For each bdev, we need to store a little more info: ++ ++ struct suspend_bdev_info { ++ struct block_device *bdev; ++ dev_t dev_t; ++ int bmap_shift; ++ int blocks_per_page; ++ }; ++ ++ The dev_t is used to identify the device in the stored image. As a result, ++ we expect devices at resume time to have the same major and minor numbers ++ as they had while suspending. This is primarily a concern where the user ++ utilises LVM for storage, as they will need to dmsetup their partitions in ++ such a way as to maintain this consistency at resume time. ++ ++ bmap_shift and blocks_per_page apply the effects of variations in blocks ++ per page settings for the filesystem and underlying bdev. For most ++ filesystems, these are the same, but for xfs, they can have independant ++ values. ++ ++ Combining these two structures together, we have everything we need to ++ record what devices and what blocks on each device are being used to ++ store the image, and to submit i/o using bio_submit. ++ ++ The last elements in the picture are a means of recording how the storage ++ is being used. 
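 Before getting to that tracking, a toy illustration of the extent
 representation may help. This is not TuxOnIce code (the allocs, frees and
 name bookkeeping of struct extent_chain is dropped), but it shows how runs
 of contiguous block numbers collapse into a few extents:

    #include <stdio.h>
    #include <stdlib.h>

    struct extent {
        unsigned long minimum, maximum;
        struct extent *next;
    };

    struct extent_chain {
        int size;                     /* sum over extents of (max - min + 1) */
        struct extent *first, *last_touched;
    };

    /* Toy append: extend the tail extent if the value is contiguous,
     * otherwise start a new extent at the end of the chain. */
    static void chain_add(struct extent_chain *c, unsigned long value)
    {
        struct extent *last = c->last_touched;

        if (last && value == last->maximum + 1) {
            last->maximum = value;
        } else {
            struct extent *e = malloc(sizeof(*e));
            e->minimum = e->maximum = value;
            e->next = NULL;
            if (last)
                last->next = e;
            else
                c->first = e;
            c->last_touched = e;
        }
        c->size++;
    }

    int main(void)
    {
        struct extent_chain chain = { 0 };
        unsigned long blocks[] = { 100, 101, 102, 200, 201, 500 };
        struct extent *e;

        for (unsigned i = 0; i < sizeof(blocks) / sizeof(blocks[0]); i++)
            chain_add(&chain, blocks[i]);

        for (e = chain.first; e; e = e->next)
            printf("extent [%lu-%lu]\n", e->minimum, e->maximum);
        printf("%d blocks in total\n", chain.size);
        return 0;
    }

 Running it prints extent [100-102], extent [200-201], extent [500-500] and
 a total of 6 blocks, which is exactly the compactness the extent chains are
 after.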
++ ++ We do this first and foremost by implementing a layer of abstraction on ++ top of the devices and extent chains which allows us to view however many ++ devices there might be as one long storage tape, with a single 'head' that ++ tracks a 'current position' on the tape: ++ ++ struct extent_iterate_state { ++ struct extent_chain *chains; ++ int num_chains; ++ int current_chain; ++ struct extent *current_extent; ++ unsigned long current_offset; ++ }; ++ ++ That is, *chains points to an array of size num_chains of extent chains. ++ For the filewriter, this is always a single chain. For the swapwriter, the ++ array is of size MAX_SWAPFILES. ++ ++ current_chain, current_extent and current_offset thus point to the current ++ index in the chains array (and into a matching array of struct ++ suspend_bdev_info), the current extent in that chain (to optimise access), ++ and the current value in the offset. ++ ++ The image is divided into three parts: ++ - The header ++ - Pageset 1 ++ - Pageset 2 ++ ++ The header always starts at the first device and first block. We know its ++ size before we begin to save the image because we carefully account for ++ everything that will be stored in it. ++ ++ The second pageset (LRU) is stored first. It begins on the next page after ++ the end of the header. ++ ++ The first pageset is stored second. It's start location is only known once ++ pageset2 has been saved, since pageset2 may be compressed as it is written. ++ This location is thus recorded at the end of saving pageset2. It is page ++ aligned also. ++ ++ Since this information is needed at resume time, and the location of extents ++ in memory will differ at resume time, this needs to be stored in a portable ++ way: ++ ++ struct extent_iterate_saved_state { ++ int chain_num; ++ int extent_num; ++ unsigned long offset; ++ }; ++ ++ We can thus implement a layer of abstraction wherein the core of TuxOnIce ++ doesn't have to worry about which device we're currently writing to or ++ where in the device we are. It simply requests that the next page in the ++ pageset or header be written, leaving the details to this layer, and ++ invokes the routines to remember and restore the position, without having ++ to worry about the details of how the data is arranged on disk or such like. ++ ++ c) Modules ++ ++ One aim in designing TuxOnIce was to make it flexible. We wanted to allow ++ for the implementation of different methods of transforming a page to be ++ written to disk and different methods of getting the pages stored. ++ ++ In early versions (the betas and perhaps Suspend1), compression support was ++ inlined in the image writing code, and the data structures and code for ++ managing swap were intertwined with the rest of the code. A number of people ++ had expressed interest in implementing image encryption, and alternative ++ methods of storing the image. ++ ++ In order to achieve this, TuxOnIce was given a modular design. ++ ++ A module is a single file which encapsulates the functionality needed ++ to transform a pageset of data (encryption or compression, for example), ++ or to write the pageset to a device. The former type of module is called ++ a 'page-transformer', the later a 'writer'. ++ ++ Modules are linked together in pipeline fashion. There may be zero or more ++ page transformers in a pipeline, and there is always exactly one writer. 
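 As a rough sketch of that chaining (plain userspace C with hypothetical
 names; the real module structure, with its rw_init/rw_cleanup/write_chunk/
 read_chunk hooks, is described below), each stage transforms a page and
 hands it to the next one, with the single writer at the end:

    #include <ctype.h>
    #include <stdio.h>

    #define PAGE_SIZE 16              /* toy page size, keeps the demo readable */

    /* Hypothetical module type: one transform stage or the final writer. */
    struct module {
        const char *name;
        void (*write_chunk)(struct module *self, char *page);
        struct module *next;          /* next stage, NULL for the writer */
    };

    /* A page transformer: here it just upper-cases the page, then passes it on. */
    static void transform_write(struct module *self, char *page)
    {
        for (int i = 0; i < PAGE_SIZE && page[i]; i++)
            page[i] = (char)toupper((unsigned char)page[i]);
        self->next->write_chunk(self->next, page);
    }

    /* The single writer at the end of the chain. */
    static void writer_write(struct module *self, char *page)
    {
        printf("[%s] storing: %s\n", self->name, page);
    }

    int main(void)
    {
        struct module writer      = { "writer",      writer_write,    NULL };
        struct module transformer = { "transformer", transform_write, &writer };
        char page[PAGE_SIZE] = "page data";

        /* The core only ever talks to the first module in the pipeline. */
        transformer.write_chunk(&transformer, page);
        return 0;
    }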
++ The pipeline follows this pattern: ++ ++ --------------------------------- ++ | TuxOnIce Core | ++ --------------------------------- ++ | ++ | ++ --------------------------------- ++ | Page transformer 1 | ++ --------------------------------- ++ | ++ | ++ --------------------------------- ++ | Page transformer 2 | ++ --------------------------------- ++ | ++ | ++ --------------------------------- ++ | Writer | ++ --------------------------------- ++ ++ During the writing of an image, the core code feeds pages one at a time ++ to the first module. This module performs whatever transformations it ++ implements on the incoming data, completely consuming the incoming data and ++ feeding output in a similar manner to the next module. ++ ++ All routines are SMP safe, and the final result of the transformations is ++ written with an index (provided by the core) and size of the output by the ++ writer. As a result, we can have multithreaded I/O without needing to ++ worry about the sequence in which pages are written (or read). ++ ++ During reading, the pipeline works in the reverse direction. The core code ++ calls the first module with the address of a buffer which should be filled. ++ (Note that the buffer size is always PAGE_SIZE at this time). This module ++ will in turn request data from the next module and so on down until the ++ writer is made to read from the stored image. ++ ++ Part of definition of the structure of a module thus looks like this: ++ ++ int (*rw_init) (int rw, int stream_number); ++ int (*rw_cleanup) (int rw); ++ int (*write_chunk) (struct page *buffer_page); ++ int (*read_chunk) (struct page *buffer_page, int sync); ++ ++ It should be noted that the _cleanup routine may be called before the ++ full stream of data has been read or written. While writing the image, ++ the user may (depending upon settings) choose to abort suspending, and ++ if we are in the midst of writing the last portion of the image, a portion ++ of the second pageset may be reread. This may also happen if an error ++ occurs and we seek to abort the process of writing the image. ++ ++ The modular design is also useful in a number of other ways. It provides ++ a means where by we can add support for: ++ ++ - providing overall initialisation and cleanup routines; ++ - serialising configuration information in the image header; ++ - providing debugging information to the user; ++ - determining memory and image storage requirements; ++ - dis/enabling components at run-time; ++ - configuring the module (see below); ++ ++ ...and routines for writers specific to their work: ++ - Parsing a resume= location; ++ - Determining whether an image exists; ++ - Marking a resume as having been attempted; ++ - Invalidating an image; ++ ++ Since some parts of the core - the user interface and storage manager ++ support - have use for some of these functions, they are registered as ++ 'miscellaneous' modules as well. ++ ++ d) Sysfs data structures. ++ ++ This brings us naturally to support for configuring TuxOnIce. We desired to ++ provide a way to make TuxOnIce as flexible and configurable as possible. ++ The user shouldn't have to reboot just because they want to now hibernate to ++ a file instead of a partition, for example. ++ ++ To accomplish this, TuxOnIce implements a very generic means whereby the ++ core and modules can register new sysfs entries. All TuxOnIce entries use ++ a single _store and _show routine, both of which are found in ++ tuxonice_sysfs.c in the kernel/power directory. 
These routines handle the ++ most common operations - getting and setting the values of bits, integers, ++ longs, unsigned longs and strings in one place, and allow overrides for ++ customised get and set options as well as side-effect routines for all ++ reads and writes. ++ ++ When combined with some simple macros, a new sysfs entry can then be defined ++ in just a couple of lines: ++ ++ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, ++ 2048, 0, NULL), ++ ++ This defines a sysfs entry named "progress_granularity" which is rw and ++ allows the user to access an integer stored at &progress_granularity, giving ++ it a value between 1 and 2048 inclusive. ++ ++ Sysfs entries are registered under /sys/power/tuxonice, and entries for ++ modules are located in a subdirectory named after the module. ++ +diff --git a/Documentation/power/tuxonice.txt b/Documentation/power/tuxonice.txt +new file mode 100644 +index 0000000..3bf0575 +--- /dev/null ++++ b/Documentation/power/tuxonice.txt +@@ -0,0 +1,948 @@ ++ --- TuxOnIce, version 3.0 --- ++ ++1. What is it? ++2. Why would you want it? ++3. What do you need to use it? ++4. Why not just use the version already in the kernel? ++5. How do you use it? ++6. What do all those entries in /sys/power/tuxonice do? ++7. How do you get support? ++8. I think I've found a bug. What should I do? ++9. When will XXX be supported? ++10 How does it work? ++11. Who wrote TuxOnIce? ++ ++1. What is it? ++ ++ Imagine you're sitting at your computer, working away. For some reason, you ++ need to turn off your computer for a while - perhaps it's time to go home ++ for the day. When you come back to your computer next, you're going to want ++ to carry on where you left off. Now imagine that you could push a button and ++ have your computer store the contents of its memory to disk and power down. ++ Then, when you next start up your computer, it loads that image back into ++ memory and you can carry on from where you were, just as if you'd never ++ turned the computer off. You have far less time to start up, no reopening of ++ applications or finding what directory you put that file in yesterday. ++ That's what TuxOnIce does. ++ ++ TuxOnIce has a long heritage. It began life as work by Gabor Kuti, who, ++ with some help from Pavel Machek, got an early version going in 1999. The ++ project was then taken over by Florent Chabaud while still in alpha version ++ numbers. Nigel Cunningham came on the scene when Florent was unable to ++ continue, moving the project into betas, then 1.0, 2.0 and so on up to ++ the present series. During the 2.0 series, the name was contracted to ++ Suspend2 and the website suspend2.net created. Beginning around July 2007, ++ a transition to calling the software TuxOnIce was made, to seek to help ++ make it clear that TuxOnIce is more concerned with hibernation than suspend ++ to ram. ++ ++ Pavel Machek's swsusp code, which was merged around 2.5.17 retains the ++ original name, and was essentially a fork of the beta code until Rafael ++ Wysocki came on the scene in 2005 and began to improve it further. ++ ++2. Why would you want it? ++ ++ Why wouldn't you want it? ++ ++ Being able to save the state of your system and quickly restore it improves ++ your productivity - you get a useful system in far less time than through ++ the normal boot process. 
You also get to be completely 'green', using zero ++ power, or as close to that as possible (the computer may still provide ++ minimal power to some devices, so they can initiate a power on, but that ++ will be the same amount of power as would be used if you told the computer ++ to shutdown. ++ ++3. What do you need to use it? ++ ++ a. Kernel Support. ++ ++ i) The TuxOnIce patch. ++ ++ TuxOnIce is part of the Linux Kernel. This version is not part of Linus's ++ 2.6 tree at the moment, so you will need to download the kernel source and ++ apply the latest patch. Having done that, enable the appropriate options in ++ make [menu|x]config (under Power Management Options - look for "Enhanced ++ Hibernation"), compile and install your kernel. TuxOnIce works with SMP, ++ Highmem, preemption, fuse filesystems, x86-32, PPC and x86_64. ++ ++ TuxOnIce patches are available from http://tuxonice.net. ++ ++ ii) Compression support. ++ ++ Compression support is implemented via the cryptoapi. You will therefore want ++ to select any Cryptoapi transforms that you want to use on your image from ++ the Cryptoapi menu while configuring your kernel. We recommend the use of the ++ LZO compression method - it is very fast and still achieves good compression. ++ ++ You can also tell TuxOnIce to write its image to an encrypted and/or ++ compressed filesystem/swap partition. In that case, you don't need to do ++ anything special for TuxOnIce when it comes to kernel configuration. ++ ++ iii) Configuring other options. ++ ++ While you're configuring your kernel, try to configure as much as possible ++ to build as modules. We recommend this because there are a number of drivers ++ that are still in the process of implementing proper power management ++ support. In those cases, the best way to work around their current lack is ++ to build them as modules and remove the modules while hibernating. You might ++ also bug the driver authors to get their support up to speed, or even help! ++ ++ b. Storage. ++ ++ i) Swap. ++ ++ TuxOnIce can store the hibernation image in your swap partition, a swap file or ++ a combination thereof. Whichever combination you choose, you will probably ++ want to create enough swap space to store the largest image you could have, ++ plus the space you'd normally use for swap. A good rule of thumb would be ++ to calculate the amount of swap you'd want without using TuxOnIce, and then ++ add the amount of memory you have. This swapspace can be arranged in any way ++ you'd like. It can be in one partition or file, or spread over a number. The ++ only requirement is that they be active when you start a hibernation cycle. ++ ++ There is one exception to this requirement. TuxOnIce has the ability to turn ++ on one swap file or partition at the start of hibernating and turn it back off ++ at the end. If you want to ensure you have enough memory to store a image ++ when your memory is fully used, you might want to make one swap partition or ++ file for 'normal' use, and another for TuxOnIce to activate & deactivate ++ automatically. (Further details below). ++ ++ ii) Normal files. ++ ++ TuxOnIce includes a 'file allocator'. The file allocator can store your ++ image in a simple file. Since Linux has the concept of everything being a ++ file, this is more powerful than it initially sounds. If, for example, you ++ were to set up a network block device file, you could hibernate to a network ++ server. This has been tested and works to a point, but nbd itself isn't ++ stateless enough for our purposes. 
++ ++ Take extra care when setting up the file allocator. If you just type ++ commands without thinking and then try to hibernate, you could cause ++ irreversible corruption on your filesystems! Make sure you have backups. ++ ++ Most people will only want to hibernate to a local file. To achieve that, do ++ something along the lines of: ++ ++ echo "TuxOnIce" > /hibernation-file ++ dd if=/dev/zero bs=1M count=512 >> /hibernation-file ++ ++ This will create a 512MB file called /hibernation-file. To get TuxOnIce to use ++ it: ++ ++ echo /hibernation-file > /sys/power/tuxonice/file/target ++ ++ Then ++ ++ cat /sys/power/tuxonice/resume ++ ++ Put the results of this into your bootloader's configuration (see also step ++ C, below): ++ ++ ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- ++ # cat /sys/power/tuxonice/resume ++ file:/dev/hda2:0x1e001 ++ ++ In this example, we would edit the append= line of our lilo.conf|menu.lst ++ so that it included: ++ ++ resume=file:/dev/hda2:0x1e001 ++ ---EXAMPLE-ONLY-DON'T-COPY-AND-PASTE--- ++ ++ For those who are thinking 'Could I make the file sparse?', the answer is ++ 'No!'. At the moment, there is no way for TuxOnIce to fill in the holes in ++ a sparse file while hibernating. In the longer term (post merge!), I'd like ++ to change things so that the file could be dynamically resized and have ++ holes filled as needed. Right now, however, that's not possible and not a ++ priority. ++ ++ c. Bootloader configuration. ++ ++ Using TuxOnIce also requires that you add an extra parameter to ++ your lilo.conf or equivalent. Here's an example for a swap partition: ++ ++ append="resume=swap:/dev/hda1" ++ ++ This would tell TuxOnIce that /dev/hda1 is a swap partition you ++ have. TuxOnIce will use the swap signature of this partition as a ++ pointer to your data when you hibernate. This means that (in this example) ++ /dev/hda1 doesn't need to be _the_ swap partition where all of your data ++ is actually stored. It just needs to be a swap partition that has a ++ valid signature. ++ ++ You don't need to have a swap partition for this purpose. TuxOnIce ++ can also use a swap file, but usage is a little more complex. Having made ++ your swap file, turn it on and do ++ ++ cat /sys/power/tuxonice/swap/headerlocations ++ ++ (this assumes you've already compiled your kernel with TuxOnIce ++ support and booted it). The results of the cat command will tell you ++ what you need to put in lilo.conf: ++ ++ For swap partitions like /dev/hda1, simply use resume=/dev/hda1. ++ For swapfile `swapfile`, use resume=swap:/dev/hda2:0x242d. ++ ++ If the swapfile changes for any reason (it is moved to a different ++ location, it is deleted and recreated, or the filesystem is ++ defragmented) then you will have to check ++ /sys/power/tuxonice/swap/headerlocations for a new resume_block value. ++ ++ Once you've compiled and installed the kernel and adjusted your bootloader ++ configuration, you should only need to reboot for the most basic part ++ of TuxOnIce to be ready. ++ ++ If you only compile in the swap allocator, or only compile in the file ++ allocator, you don't need to add the "swap:" part of the resume= ++ parameters above. resume=/dev/hda2:0x242d will work just as well. If you ++ have compiled both and your storage is on swap, you can also use this ++ format (the swap allocator is the default allocator). 
++ ++ When compiling your kernel, one of the options in the 'Power Management ++ Support' menu, just above the 'Enhanced Hibernation (TuxOnIce)' entry is ++ called 'Default resume partition'. This can be used to set a default value ++ for the resume= parameter. ++ ++ d. The hibernate script. ++ ++ Since the driver model in 2.6 kernels is still being developed, you may need ++ to do more than just configure TuxOnIce. Users of TuxOnIce usually start the ++ process via a script which prepares for the hibernation cycle, tells the ++ kernel to do its stuff and then restore things afterwards. This script might ++ involve: ++ ++ - Switching to a text console and back if X doesn't like the video card ++ status on resume. ++ - Un/reloading drivers that don't play well with hibernation. ++ ++ Note that you might not be able to unload some drivers if there are ++ processes using them. You might have to kill off processes that hold ++ devices open. Hint: if your X server accesses an USB mouse, doing a ++ 'chvt' to a text console releases the device and you can unload the ++ module. ++ ++ Check out the latest script (available on tuxonice.net). ++ ++ e. The userspace user interface. ++ ++ TuxOnIce has very limited support for displaying status if you only apply ++ the kernel patch - it can printk messages, but that is all. In addition, ++ some of the functions mentioned in this document (such as cancelling a cycle ++ or performing interactive debugging) are unavailable. To utilise these ++ functions, or simply get a nice display, you need the 'userui' component. ++ Userui comes in three flavours, usplash, fbsplash and text. Text should ++ work on any console. Usplash and fbsplash require the appropriate ++ (distro specific?) support. ++ ++ To utilise a userui, TuxOnIce just needs to be told where to find the ++ userspace binary: ++ ++ echo "/usr/local/sbin/tuxoniceui_fbsplash" > /sys/power/tuxonice/user_interface/program ++ ++ The hibernate script can do this for you, and a default value for this ++ setting can be configured when compiling the kernel. This path is also ++ stored in the image header, so if you have an initrd or initramfs, you can ++ use the userui during the first part of resuming (prior to the atomic ++ restore) by putting the binary in the same path in your initrd/ramfs. ++ Alternatively, you can put it in a different location and do an echo ++ similar to the above prior to the echo > do_resume. The value saved in the ++ image header will then be ignored. ++ ++4. Why not just use the version already in the kernel? ++ ++ The version in the vanilla kernel has a number of drawbacks. The most ++ serious of these are: ++ - it has a maximum image size of 1/2 total memory; ++ - it doesn't allocate storage until after it has snapshotted memory. ++ This means that you can't be sure hibernating will work until you ++ see it start to write the image; ++ - it does not allow you to press escape to cancel a cycle; ++ - it does not allow you to press escape to cancel resuming; ++ - it does not allow you to automatically swapon a file when ++ starting a cycle; ++ - it does not allow you to use multiple swap partitions or files; ++ - it does not allow you to use ordinary files; ++ - it just invalidates an image and continues to boot if you ++ accidentally boot the wrong kernel after hibernating; ++ - it doesn't support any sort of nice display while hibernating; ++ - it is moving toward requiring that you have an initrd/initramfs ++ to ever have a hope of resuming (uswsusp). 
While uswsusp will ++ address some of the concerns above, it won't address all of them, ++ and will be more complicated to get set up; ++ - it doesn't have support for suspend-to-both (write a hibernation ++ image, then suspend to ram; I think this is known as ReadySafe ++ under M$). ++ ++5. How do you use it? ++ ++ A hibernation cycle can be started directly by doing: ++ ++ echo > /sys/power/tuxonice/do_hibernate ++ ++ In practice, though, you'll probably want to use the hibernate script ++ to unload modules, configure the kernel the way you like it and so on. ++ In that case, you'd do (as root): ++ ++ hibernate ++ ++ See the hibernate script's man page for more details on the options it ++ takes. ++ ++ If you're using the text or splash user interface modules, one feature of ++ TuxOnIce that you might find useful is that you can press Escape at any time ++ during hibernating, and the process will be aborted. ++ ++ Due to the way hibernation works, this means you'll have your system back and ++ perfectly usable almost instantly. The only exception is when it's at the ++ very end of writing the image. Then it will need to reload a small (usually ++ 4-50MBs, depending upon the image characteristics) portion first. ++ ++ Likewise, when resuming, you can press escape and resuming will be aborted. ++ The computer will then powerdown again according to settings at that time for ++ the powerdown method or rebooting. ++ ++ You can change the settings for powering down while the image is being ++ written by pressing 'R' to toggle rebooting and 'O' to toggle between ++ suspending to ram and powering down completely). ++ ++ If you run into problems with resuming, adding the "noresume" option to ++ the kernel command line will let you skip the resume step and recover your ++ system. This option shouldn't normally be needed, because TuxOnIce modifies ++ the image header prior to the atomic restore, and will thus prompt you ++ if it detects that you've tried to resume an image before (this flag is ++ removed if you press Escape to cancel a resume, so you won't be prompted ++ then). ++ ++ Recent kernels (2.6.24 onwards) add support for resuming from a different ++ kernel to the one that was hibernated (thanks to Rafael for his work on ++ this - I've just embraced and enhanced the support for TuxOnIce). This ++ should further reduce the need for you to use the noresume option. ++ ++6. What do all those entries in /sys/power/tuxonice do? ++ ++ /sys/power/tuxonice is the directory which contains files you can use to ++ tune and configure TuxOnIce to your liking. The exact contents of ++ the directory will depend upon the version of TuxOnIce you're ++ running and the options you selected at compile time. In the following ++ descriptions, names in brackets refer to compile time options. ++ (Note that they're all dependant upon you having selected CONFIG_TUXONICE ++ in the first place!). ++ ++ Since the values of these settings can open potential security risks, the ++ writeable ones are accessible only to the root user. You may want to ++ configure sudo to allow you to invoke your hibernate script as an ordinary ++ user. ++ ++ - alloc/failure_test ++ ++ This debugging option provides a way of testing TuxOnIce's handling of ++ memory allocation failures. Each allocation type that TuxOnIce makes has ++ been given a unique number (see the source code). Echo the appropriate ++ number into this entry, and when TuxOnIce attempts to do that allocation, ++ it will pretend there was a failure and act accordingly. 
++ ++ - alloc/find_max_mem_allocated ++ ++ This debugging option will cause TuxOnIce to find the maximum amount of ++ memory it used during a cycle, and report that information in debugging ++ information at the end of the cycle. ++ ++ - alt_resume_param ++ ++ Instead of powering down after writing a hibernation image, TuxOnIce ++ supports resuming from a different image. This entry lets you set the ++ location of the signature for that image (the resume= value you'd use ++ for it). Using an alternate image and keep_image mode, you can do things ++ like using an alternate image to power down an uninterruptible power ++ supply. ++ ++ - block_io/target_outstanding_io ++ ++ This value controls the amount of memory that the block I/O code says it ++ needs when the core code is calculating how much memory is needed for ++ hibernating and for resuming. It doesn't directly control the amount of ++ I/O that is submitted at any one time - that depends on the amount of ++ available memory (we may have more available than we asked for), the ++ throughput that is being achieved and the ability of the CPU to keep up ++ with disk throughput (particularly where we're compressing pages). ++ ++ - checksum/enabled ++ ++ Use cryptoapi hashing routines to verify that Pageset2 pages don't change ++ while we're saving the first part of the image, and to get any pages that ++ do change resaved in the atomic copy. This should normally not be needed, ++ but if you're seeing issues, please enable this. If your issues stop you ++ being able to resume, enable this option, hibernate and cancel the cycle ++ after the atomic copy is done. If the debugging info shows a non-zero ++ number of pages resaved, please report this to Nigel. ++ ++ - compression/algorithm ++ ++ Set the cryptoapi algorithm used for compressing the image. ++ ++ - compression/expected_compression ++ ++ These values allow you to set an expected compression ratio, which TuxOnice ++ will use in calculating whether it meets constraints on the image size. If ++ this expected compression ratio is not attained, the hibernation cycle will ++ abort, so it is wise to allow some spare. You can see what compression ++ ratio is achieved in the logs after hibernating. ++ ++ - debug_info: ++ ++ This file returns information about your configuration that may be helpful ++ in diagnosing problems with hibernating. ++ ++ - did_suspend_to_both: ++ ++ This file can be used when you hibernate with powerdown method 3 (ie suspend ++ to ram after writing the image). There can be two outcomes in this case. We ++ can resume from the suspend-to-ram before the battery runs out, or we can run ++ out of juice and and up resuming like normal. This entry lets you find out, ++ post resume, which way we went. If the value is 1, we resumed from suspend ++ to ram. This can be useful when actions need to be run post suspend-to-ram ++ that don't need to be run if we did the normal resume from power off. ++ ++ - do_hibernate: ++ ++ When anything is written to this file, the kernel side of TuxOnIce will ++ begin to attempt to write an image to disk and power down. You'll normally ++ want to run the hibernate script instead, to get modules unloaded first. ++ ++ - do_resume: ++ ++ When anything is written to this file TuxOnIce will attempt to read and ++ restore an image. If there is no image, it will return almost immediately. ++ If an image exists, the echo > will never return. Instead, the original ++ kernel context will be restored and the original echo > do_hibernate will ++ return. 
++ ++ - */enabled ++ ++ These option can be used to temporarily disable various parts of TuxOnIce. ++ ++ - extra_pages_allowance ++ ++ When TuxOnIce does its atomic copy, it calls the driver model suspend ++ and resume methods. If you have DRI enabled with a driver such as fglrx, ++ this can result in the driver allocating a substantial amount of memory ++ for storing its state. Extra_pages_allowance tells TuxOnIce how much ++ extra memory it should ensure is available for those allocations. If ++ your attempts at hibernating end with a message in dmesg indicating that ++ insufficient extra pages were allowed, you need to increase this value. ++ ++ - file/target: ++ ++ Read this value to get the current setting. Write to it to point TuxOnice ++ at a new storage location for the file allocator. See section 3.b.ii above ++ for details of how to set up the file allocator. ++ ++ - freezer_test ++ ++ This entry can be used to get TuxOnIce to just test the freezer and prepare ++ an image without actually doing a hibernation cycle. It is useful for ++ diagnosing freezing and image preparation issues. ++ ++ - full_pageset2 ++ ++ TuxOnIce divides the pages that are stored in an image into two sets. The ++ difference between the two sets is that pages in pageset 1 are atomically ++ copied, and pages in pageset 2 are written to disk without being copied ++ first. A page CAN be written to disk without being copied first if and only ++ if its contents will not be modified or used at any time after userspace ++ processes are frozen. A page MUST be in pageset 1 if its contents are ++ modified or used at any time after userspace processes have been frozen. ++ ++ Normally (ie if this option is enabled), TuxOnIce will put all pages on the ++ per-zone LRUs in pageset2, then remove those pages used by any userspace ++ user interface helper and TuxOnIce storage manager that are running, ++ together with pages used by the GEM memory manager introduced around 2.6.28 ++ kernels. ++ ++ If this option is disabled, a much more conservative approach will be taken. ++ The only pages in pageset2 will be those belonging to userspace processes, ++ with the exclusion of those belonging to the TuxOnIce userspace helpers ++ mentioned above. This will result in a much smaller pageset2, and will ++ therefore result in smaller images than are possible with this option ++ enabled. ++ ++ - ignore_rootfs ++ ++ TuxOnIce records which device is mounted as the root filesystem when ++ writing the hibernation image. It will normally check at resume time that ++ this device isn't already mounted - that would be a cause of filesystem ++ corruption. In some particular cases (RAM based root filesystems), you ++ might want to disable this check. This option allows you to do that. ++ ++ - image_exists: ++ ++ Can be used in a script to determine whether a valid image exists at the ++ location currently pointed to by resume=. Returns up to three lines. ++ The first is whether an image exists (-1 for unsure, otherwise 0 or 1). ++ If an image eixsts, additional lines will return the machine and version. ++ Echoing anything to this entry removes any current image. ++ ++ - image_size_limit: ++ ++ The maximum size of hibernation image written to disk, measured in megabytes ++ (1024*1024). ++ ++ - last_result: ++ ++ The result of the last hibernation cycle, as defined in ++ include/linux/suspend-debug.h with the values SUSPEND_ABORTED to ++ SUSPEND_KEPT_IMAGE. This is a bitmask. 
++ ++ - late_cpu_hotplug: ++ ++ This sysfs entry controls whether cpu hotplugging is done - as normal - just ++ before (unplug) and after (replug) the atomic copy/restore (so that all ++ CPUs/cores are available for multithreaded I/O). The alternative is to ++ unplug all secondary CPUs/cores at the start of hibernating/resuming, and ++ replug them at the end of resuming. No multithreaded I/O will be possible in ++ this configuration, but the odd machine has been reported to require it. ++ ++ - lid_file: ++ ++ This determines which ACPI button file we look in to determine whether the ++ lid is open or closed after resuming from suspend to disk or power off. ++ If the entry is set to "lid/LID", we'll open /proc/acpi/button/lid/LID/state ++ and check its contents at the appropriate moment. See post_wake_state below ++ for more details on how this entry is used. ++ ++ - log_everything (CONFIG_PM_DEBUG): ++ ++ Setting this option results in all messages printed being logged. Normally, ++ only a subset are logged, so as to not slow the process and not clutter the ++ logs. Useful for debugging. It can be toggled during a cycle by pressing ++ 'L'. ++ ++ - no_load_direct: ++ ++ This is a debugging option. If, when loading the atomically copied pages of ++ an image, TuxOnIce finds that the destination address for a page is free, ++ it will normally allocate the image, load the data directly into that ++ address and skip it in the atomic restore. If this option is disabled, the ++ page will be loaded somewhere else and atomically restored like other pages. ++ ++ - no_flusher_thread: ++ ++ When doing multithreaded I/O (see below), the first online CPU can be used ++ to _just_ submit compressed pages when writing the image, rather than ++ compressing and submitting data. This option is normally disabled, but has ++ been included because Nigel would like to see whether it will be more useful ++ as the number of cores/cpus in computers increases. ++ ++ - no_multithreaded_io: ++ ++ TuxOnIce will normally create one thread per cpu/core on your computer, ++ each of which will then perform I/O. This will generally result in ++ throughput that's the maximum the storage medium can handle. There ++ shouldn't be any reason to disable multithreaded I/O now, but this option ++ has been retained for debugging purposes. ++ ++ - no_pageset2 ++ ++ See the entry for full_pageset2 above for an explanation of pagesets. ++ Enabling this option causes TuxOnIce to do an atomic copy of all pages, ++ thereby limiting the maximum image size to 1/2 of memory, as swsusp does. ++ ++ - no_pageset2_if_unneeded ++ ++ See the entry for full_pageset2 above for an explanation of pagesets. ++ Enabling this option causes TuxOnIce to act like no_pageset2 was enabled ++ if and only it isn't needed anyway. This option may still make TuxOnIce ++ less reliable because pageset2 pages are normally used to store the ++ atomic copy - drivers that want to do allocations of larger amounts of ++ memory in one shot will be more likely to find that those amounts aren't ++ available if this option is enabled. ++ ++ - pause_between_steps (CONFIG_PM_DEBUG): ++ ++ This option is used during debugging, to make TuxOnIce pause between ++ each step of the process. It is ignored when the nice display is on. ++ ++ - post_wake_state: ++ ++ TuxOnIce provides support for automatically waking after a user-selected ++ delay, and using a different powerdown method if the lid is still closed. ++ (Yes, we're assuming a laptop). 
This entry lets you choose what state ++ should be entered next. The values are those described under ++ powerdown_method, below. It can be used to suspend to RAM after hibernating, ++ then powerdown properly (say) 20 minutes. It can also be used to power down ++ properly, then wake at (say) 6.30am and suspend to RAM until you're ready ++ to use the machine. ++ ++ - powerdown_method: ++ ++ Used to select a method by which TuxOnIce should powerdown after writing the ++ image. Currently: ++ ++ 0: Don't use ACPI to power off. ++ 3: Attempt to enter Suspend-to-ram. ++ 4: Attempt to enter ACPI S4 mode. ++ 5: Attempt to power down via ACPI S5 mode. ++ ++ Note that these options are highly dependant upon your hardware & software: ++ ++ 3: When succesful, your machine suspends to ram instead of powering off. ++ The advantage of using this mode is that it doesn't matter whether your ++ battery has enough charge to make it through to your next resume. If it ++ lasts, you will simply resume from suspend to ram (and the image on disk ++ will be discarded). If the battery runs out, you will resume from disk ++ instead. The disadvantage is that it takes longer than a normal ++ suspend-to-ram to enter the state, since the suspend-to-disk image needs ++ to be written first. ++ 4/5: When successful, your machine will be off and comsume (almost) no power. ++ But it might still react to some external events like opening the lid or ++ trafic on a network or usb device. For the bios, resume is then the same ++ as warm boot, similar to a situation where you used the command `reboot' ++ to reboot your machine. If your machine has problems on warm boot or if ++ you want to protect your machine with the bios password, this is probably ++ not the right choice. Mode 4 may be necessary on some machines where ACPI ++ wake up methods need to be run to properly reinitialise hardware after a ++ hibernation cycle. ++ 0: Switch the machine completely off. The only possible wakeup is the power ++ button. For the bios, resume is then the same as a cold boot, in ++ particular you would have to provide your bios boot password if your ++ machine uses that feature for booting. ++ ++ - progressbar_granularity_limit: ++ ++ This option can be used to limit the granularity of the progress bar ++ displayed with a bootsplash screen. The value is the maximum number of ++ steps. That is, 10 will make the progress bar jump in 10% increments. ++ ++ - reboot: ++ ++ This option causes TuxOnIce to reboot rather than powering down ++ at the end of saving an image. It can be toggled during a cycle by pressing ++ 'R'. ++ ++ - resume: ++ ++ This sysfs entry can be used to read and set the location in which TuxOnIce ++ will look for the signature of an image - the value set using resume= at ++ boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By ++ writing to this file as well as modifying your bootloader's configuration ++ file (eg menu.lst), you can set or reset the location of your image or the ++ method of storing the image without rebooting. ++ ++ - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP): ++ ++ This option makes ++ ++ echo disk > /sys/power/state ++ ++ activate TuxOnIce instead of swsusp. Regardless of whether this option is ++ enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce ++ to check for an image too. This is due to the fact that at resume time, we ++ can't know whether this option was enabled until we see if an image is there ++ for us to resume from. 
++
++ - progressbar_granularity_limit:
++
++ This option can be used to limit the granularity of the progress bar
++ displayed with a bootsplash screen. The value is the maximum number of
++ steps. That is, 10 will make the progress bar jump in 10% increments.
++
++ - reboot:
++
++ This option causes TuxOnIce to reboot rather than powering down
++ at the end of saving an image. It can be toggled during a cycle by pressing
++ 'R'.
++
++ - resume:
++
++ This sysfs entry can be used to read and set the location in which TuxOnIce
++ will look for the signature of an image - the value set using resume= at
++ boot time or CONFIG_PM_STD_PARTITION ("Default resume partition"). By
++ writing to this file as well as modifying your bootloader's configuration
++ file (eg menu.lst), you can set or reset the location of your image or the
++ method of storing the image without rebooting.
++
++ - replace_swsusp (CONFIG_TOI_REPLACE_SWSUSP):
++
++ This option makes
++
++ echo disk > /sys/power/state
++
++ activate TuxOnIce instead of swsusp. Regardless of whether this option is
++ enabled, any invocation of swsusp's resume time trigger will cause TuxOnIce
++ to check for an image too. This is due to the fact that at resume time, we
++ can't know whether this option was enabled until we see if an image is there
++ for us to resume from. (And when an image exists, we don't care whether we
++ did replace swsusp anyway - we just want to resume).
++
++ - resume_commandline:
++
++ This entry can be read after resuming to see the commandline that was used
++ when resuming began. You might use this to set up two bootloader entries
++ that are the same apart from the fact that one includes an extra append=
++ argument "at_work=1". You could then grep resume_commandline in your
++ post-resume scripts and configure networking (for example) differently
++ depending upon whether you're at home or work. resume_commandline can be
++ set to arbitrary text if you wish to remove sensitive contents.
++
++ - swap/swapfilename:
++
++ This entry is used to specify the swapfile or partition that
++ TuxOnIce will attempt to swapon/swapoff automatically. Thus, if
++ I normally use /dev/hda1 for swap, and want to use /dev/hda2 specifically
++ for my hibernation image, I would
++
++ echo /dev/hda2 > /sys/power/tuxonice/swap/swapfile
++
++ /dev/hda2 would then be automatically swapon'd and swapoff'd. Note that the
++ swapon and swapoff occur while other processes are frozen (including kswapd)
++ so this swap file will not be used up when attempting to free memory. The
++ partition/file is also given the highest priority, so other swapfiles/partitions
++ will only be used to save the image when this one is filled.
++
++ The value of this file is used by headerlocations along with any currently
++ activated swapfiles/partitions.
++
++ - swap/headerlocations:
++
++ This option tells you the resume= options to use for swap devices you
++ currently have activated. It is particularly useful when you only want to
++ use a swap file to store your image. See above for further details.
++
++ - test_bio
++
++ This is a debugging option. When enabled, TuxOnIce will not hibernate.
++ Instead, when asked to write an image, it will skip the atomic copy,
++ just doing the writing of the image and then returning control to the
++ user at the point where it would have powered off. This is useful for
++ testing throughput in different configurations.
++
++ - test_filter_speed
++
++ This is a debugging option. When enabled, TuxOnIce will not hibernate.
++ Instead, when asked to write an image, it will not write anything or do
++ an atomic copy, but will only run any enabled compression algorithm on the
++ data that would have been written (the source pages of the atomic copy in
++ the case of pageset 1). This is useful for comparing the performance of
++ compression algorithms and for determining the extent to which an upgrade
++ to your storage method would improve hibernation speed.
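++
++ As a rough sketch of how these two options can be combined (assuming they
++ are 0/1 flags like the other debugging options, the same /sys/power/tuxonice
++ paths as above, and that replace_swsusp is enabled so that
++ "echo disk > /sys/power/state" starts a cycle - otherwise start the cycle
++ however you normally do), you could time a compression-only run and then a
++ write-only run to get a feel for whether compression or storage speed is
++ your bottleneck:
++
++ echo 1 > /sys/power/tuxonice/test_filter_speed
++ echo disk > /sys/power/state        # compression only, nothing written
++ echo 0 > /sys/power/tuxonice/test_filter_speed
++
++ echo 1 > /sys/power/tuxonice/test_bio
++ echo disk > /sys/power/state        # writing, but no atomic copy
++ echo 0 > /sys/power/tuxonice/test_bio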
++
++ - user_interface/debug_sections (CONFIG_PM_DEBUG):
++
++ This value, together with the console log level, controls what debugging
++ information is displayed. The console log level determines the level of
++ detail, and this value determines what detail is displayed. This value is
++ a bit vector, and the meaning of the bits can be found in the kernel tree
++ in include/linux/tuxonice.h. It can be overridden using the kernel's
++ command line option suspend_dbg.
++
++ - user_interface/default_console_level (CONFIG_PM_DEBUG):
++
++ This determines the value of the console log level at the start of a
++ hibernation cycle. If debugging is compiled in, the console log level can be
++ changed during a cycle by pressing the digit keys. Meanings are:
++
++ 0: Nice display.
++ 1: Nice display plus numerical progress.
++ 2: Errors only.
++ 3: Low level debugging info.
++ 4: Medium level debugging info.
++ 5: High level debugging info.
++ 6: Verbose debugging info.
++
++ - user_interface/enable_escape:
++
++ Setting this to "1" will enable you to abort a hibernation cycle or resume
++ by pressing escape; "0" (default) disables this feature. Note that enabling
++ this option means that you cannot initiate a hibernation cycle and then walk
++ away from your computer, expecting it to be secure. With the feature
++ disabled, you can validly have this expectation once TuxOnIce begins to
++ write the image to disk. (Prior to this point, it is possible that TuxOnIce
++ might abort because of a failure to freeze all processes or because
++ constraints on its ability to save the image are not met).
++
++ - user_interface/program
++
++ This entry is used to tell TuxOnIce what userspace program to use for
++ providing a user interface while hibernating. The program uses a netlink
++ socket to pass messages back and forth to the kernel, allowing it to provide
++ all of the functions formerly implemented in the kernel user interface
++ components.
++
++ - version:
++
++ The version of TuxOnIce you have compiled into the currently running kernel.
++
++ - wake_alarm_dir:
++
++ As mentioned above (post_wake_state), TuxOnIce supports automatically waking
++ after some delay. This entry allows you to select which wake alarm to use.
++ It should contain the value "rtc0" if you want to use
++ /sys/class/rtc/rtc0.
++
++ - wake_delay:
++
++ This value determines the delay from the end of writing the image until the
++ wake alarm is triggered. You can set an absolute time by writing the desired
++ time into /sys/class/rtc/<wake_alarm_dir>/wakealarm and leaving these values
++ empty.
++
++ Note that for the wakeup to actually occur, you may need to modify entries
++ in /proc/acpi/wakeup. This is done by echoing the name of the button in the
++ first column (eg PBTN) into the file.
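++
++ Many of these entries can also simply be read back to check the current
++ configuration; for example (same /sys/power/tuxonice paths as above):
++
++ cat /sys/power/tuxonice/version
++ cat /sys/power/tuxonice/resume
++ cat /sys/power/tuxonice/swap/headerlocations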
++
++7. How do you get support?
++
++ Glad you asked. TuxOnIce is being actively maintained and supported
++ by Nigel (the guy doing most of the kernel coding at the moment), Bernard
++ (who maintains the hibernate script and userspace user interface components)
++ and its users.
++
++ Resources available include HowTos, FAQs and a Wiki, all available via
++ tuxonice.net. You can find the mailing lists there.
++
++8. I think I've found a bug. What should I do?
++
++ By far and away, the most common problems people have with TuxOnIce
++ relate to drivers not having adequate power management support. In this
++ case, it is not a bug with TuxOnIce, but we can still help you. As we
++ mentioned above, such issues can usually be worked around by building the
++ functionality as modules and unloading them while hibernating. Please visit
++ the Wiki for up-to-date lists of known issues and workarounds.
++
++ If this information doesn't help, try running:
++
++ hibernate --bug-report
++
++ ...and sending the output to the users mailing list.
++
++ Good information on how to provide us with useful information from an
++ oops is found in the file REPORTING-BUGS, in the top level directory
++ of the kernel tree. If you get an oops, please especially note the
++ information about running what is printed on the screen through ksymoops.
++ The raw information is useless.
++
++9. When will XXX be supported?
++
++ If there's a feature missing from TuxOnIce that you'd like, feel free to
++ ask. We try to be obliging, within reason.
++
++ Patches are welcome. Please send to the list.
++
++10. How does it work?
++
++ TuxOnIce does its work in a number of steps.
++
++ a. Freezing system activity.
++
++ The first main stage in hibernating is to stop all other activity. This is
++ achieved in stages. Processes are considered in four groups, which we will
++ describe in reverse order for clarity's sake: Threads with the PF_NOFREEZE
++ flag, kernel threads without this flag, userspace processes with the
++ PF_SYNCTHREAD flag and all other processes. The first set (PF_NOFREEZE) are
++ untouched by the refrigerator code. They are allowed to run during hibernating
++ and resuming, and are used to support user interaction, storage access or the
++ like. Other kernel threads (those unneeded while hibernating) are frozen last.
++ This leaves us with userspace processes that need to be frozen. When a
++ process enters one of the *_sync system calls, we set a PF_SYNCTHREAD flag on
++ that process for the duration of that call. Processes that have this flag are
++ frozen after processes without it, so that we can seek to ensure that dirty
++ data is synced to disk as quickly as possible in a situation where other
++ processes may be submitting writes at the same time. Freezing the processes
++ that are submitting data stops new I/O from being submitted. Syncthreads can
++ then cleanly finish their work. So the order is:
++
++ - Userspace processes without PF_SYNCTHREAD or PF_NOFREEZE;
++ - Userspace processes with PF_SYNCTHREAD (they won't have NOFREEZE);
++ - Kernel processes without PF_NOFREEZE.
++
++ b. Eating memory.
++
++ For a successful hibernation cycle, you need to have enough disk space to
++ store the image and enough memory for the various limitations of TuxOnIce's
++ algorithm. You can also specify a maximum image size. In order to meet
++ those constraints, TuxOnIce may 'eat' memory. If, after freezing
++ processes, the constraints aren't met, TuxOnIce will thaw all the
++ other processes and begin to eat memory until its calculations indicate
++ the constraints are met. It will then freeze processes again and recheck
++ its calculations.
++
++ c. Allocation of storage.
++
++ Next, TuxOnIce allocates the storage that will be used to save
++ the image.
++
++ The core of TuxOnIce knows nothing about how or where pages are stored. We
++ therefore request the active allocator (remember you might have compiled in
++ more than one!) to allocate enough storage for our expected image size. If
++ this request cannot be fulfilled, we eat more memory and try again. If it
++ is fulfilled, we seek to allocate additional storage, just in case our
++ expected compression ratio (if any) isn't achieved. This time, however, we
++ just continue if we can't allocate enough storage.
++
++ If these calls to our allocator change the characteristics of the image
++ such that we haven't allocated enough memory, we also loop. (The allocator
++ may well need to allocate space for its storage information).
++
++ d. Write the first part of the image.
++
++ TuxOnIce stores the image in two sets of pages called 'pagesets'.
++ Pageset 2 contains pages on the active and inactive lists; essentially
++ the page cache. Pageset 1 contains all other pages, including the kernel.
++ We use two pagesets for one important reason: We need to make an atomic copy
++ of the kernel to ensure consistency of the image. Without a second pageset,
++ that would limit us to an image that was at most half the amount of memory
++ available. Using two pagesets allows us to store a full image.
++ Since pageset 2 pages won't be needed in saving pageset 1, we first save
++ pageset 2 pages.
++ We can then make our atomic copy of the remaining pages using both pageset 2
++ pages and any other pages that are free. While saving both pagesets, we are
++ careful not to corrupt the image. Among other things, we use lowlevel block
++ I/O routines that don't change the pagecache contents.
++
++ The next step, then, is writing pageset 2.
++
++ e. Suspending drivers and storing processor context.
++
++ Having written pageset 2, TuxOnIce calls the power management functions to
++ notify drivers of the hibernation, and saves the processor state in preparation
++ for the atomic copy of memory we are about to make.
++
++ f. Atomic copy.
++
++ At this stage, everything else but the TuxOnIce code is halted. Processes
++ are frozen or idling, drivers are quiesced and have stored (ideally and where
++ necessary) their configuration in memory we are about to atomically copy.
++ In our lowlevel architecture specific code, we have saved the CPU state.
++ We can therefore now do our atomic copy before resuming drivers etc.
++
++ g. Save the atomic copy (pageset 1).
++
++ TuxOnIce can then write the atomic copy of the remaining pages. Since we
++ have copied the pages into other locations, we can continue to use the
++ normal block I/O routines without fear of corrupting our image.
++
++ h. Save the image header.
++
++ Nearly there! We save our settings and other parameters needed for
++ reloading pageset 1 in an 'image header'. We also tell our allocator to
++ serialise its data at this stage, so that it can reread the image at resume
++ time.
++
++ i. Set the image header.
++
++ Finally, we edit the header at our resume= location. The signature is
++ changed by the allocator to reflect the fact that an image exists, and to
++ point to the start of that data if necessary (swap allocator).
++
++ j. Power down.
++
++ Or reboot if we're debugging and the appropriate option is selected.
++
++ Whew!
++
++ Reloading the image.
++ --------------------
++
++ Reloading the image is essentially the reverse of all the above. We load
++ our copy of pageset 1, being careful to choose locations that aren't going
++ to be overwritten as we copy it back (We start very early in the boot
++ process, so there are no other processes to quiesce here). We then copy
++ pageset 1 back to its original location in memory and restore the process
++ context. We are now running with the original kernel. Next, we reload the
++ pageset 2 pages, free the memory and swap used by TuxOnIce, restore
++ the pageset header and restart processes. Sounds easy in comparison to
++ hibernating, doesn't it!
++
++ There is of course more to TuxOnIce than this, but this explanation
++ should be a good start. If there's interest, I'll write further
++ documentation on range pages and the low level I/O.
++
++11. Who wrote TuxOnIce?
++
++ (Answer based on the writings of Florent Chabaud, credits in files and
++ Nigel's limited knowledge; apologies to anyone missed out!)
++
++ The main developers of TuxOnIce have been...
++
++ Gabor Kuti
++ Pavel Machek
++ Florent Chabaud
++ Bernard Blackham
++ Nigel Cunningham
++
++ Significant portions of swsusp, the code in the vanilla kernel which
++ TuxOnIce enhances, have been worked on by Rafael Wysocki. Thanks should
++ also be expressed to him.
++ ++ The above mentioned developers have been aided in their efforts by a host ++ of hundreds, if not thousands of testers and people who have submitted bug ++ fixes & suggestions. Of special note are the efforts of Michael Frank, who ++ had his computers repetitively hibernate and resume for literally tens of ++ thousands of cycles and developed scripts to stress the system and test ++ TuxOnIce far beyond the point most of us (Nigel included!) would consider ++ testing. His efforts have contributed as much to TuxOnIce as any of the ++ names above. +diff --git a/MAINTAINERS b/MAINTAINERS +index a92f485..4b47f3a 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -7831,6 +7831,13 @@ S: Maintained + F: drivers/tc/ + F: include/linux/tc.h + ++TUXONICE (ENHANCED HIBERNATION) ++P: Nigel Cunningham ++M: nigel@tuxonice.net ++L: tuxonice-devel@tuxonice.net ++W: http://tuxonice.net ++S: Maintained ++ + U14-34F SCSI DRIVER + M: Dario Ballabio + L: linux-scsi@vger.kernel.org +diff --git a/arch/powerpc/mm/pgtable_32.c b/arch/powerpc/mm/pgtable_32.c +index 6c856fb..749ee94 100644 +--- a/arch/powerpc/mm/pgtable_32.c ++++ b/arch/powerpc/mm/pgtable_32.c +@@ -433,6 +433,7 @@ void kernel_map_pages(struct page *page, int numpages, int enable) + + change_page_attr(page, numpages, enable ? PAGE_KERNEL : __pgprot(0)); + } ++EXPORT_SYMBOL_GPL(kernel_map_pages); + #endif /* CONFIG_DEBUG_PAGEALLOC */ + + static int fixmaps; +diff --git a/arch/powerpc/platforms/83xx/suspend.c b/arch/powerpc/platforms/83xx/suspend.c +index 1d769a2..e2877cf 100644 +--- a/arch/powerpc/platforms/83xx/suspend.c ++++ b/arch/powerpc/platforms/83xx/suspend.c +@@ -263,6 +263,8 @@ static int mpc83xx_suspend_begin(suspend_state_t state) + + static int agent_thread_fn(void *data) + { ++ set_freezable(); ++ + while (1) { + wait_event_interruptible(agent_wq, pci_pm_state >= 2); + try_to_freeze(); +diff --git a/arch/powerpc/platforms/ps3/device-init.c b/arch/powerpc/platforms/ps3/device-init.c +index 3f175e8..b5d59c6 100644 +--- a/arch/powerpc/platforms/ps3/device-init.c ++++ b/arch/powerpc/platforms/ps3/device-init.c +@@ -841,6 +841,8 @@ static int ps3_probe_thread(void *data) + if (res) + goto fail_free_irq; + ++ set_freezable(); ++ + /* Loop here processing the requested notification events. 
*/ + do { + try_to_freeze(); +diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c +index a1b1c88..41a5d8b 100644 +--- a/arch/x86/mm/pageattr.c ++++ b/arch/x86/mm/pageattr.c +@@ -1368,6 +1368,8 @@ void kernel_map_pages(struct page *page, int numpages, int enable) + __flush_tlb_all(); + } + ++EXPORT_SYMBOL_GPL(kernel_map_pages); ++ + #ifdef CONFIG_HIBERNATION + + bool kernel_page_present(struct page *page) +@@ -1381,7 +1383,7 @@ bool kernel_page_present(struct page *page) + pte = lookup_address((unsigned long)page_address(page), &level); + return (pte_val(*pte) & _PAGE_PRESENT); + } +- ++EXPORT_SYMBOL_GPL(kernel_page_present); + #endif /* CONFIG_HIBERNATION */ + + #endif /* CONFIG_DEBUG_PAGEALLOC */ +diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c +index 120cee1..53b9691 100644 +--- a/arch/x86/power/cpu.c ++++ b/arch/x86/power/cpu.c +@@ -118,9 +118,7 @@ void save_processor_state(void) + __save_processor_state(&saved_context); + x86_platform.save_sched_clock_state(); + } +-#ifdef CONFIG_X86_32 + EXPORT_SYMBOL(save_processor_state); +-#endif + + static void do_fpu_end(void) + { +diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c +index 7d28c88..4f1dd95 100644 +--- a/arch/x86/power/hibernate_32.c ++++ b/arch/x86/power/hibernate_32.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -161,6 +162,7 @@ int swsusp_arch_resume(void) + restore_image(); + return 0; + } ++EXPORT_SYMBOL_GPL(swsusp_arch_resume); + + /* + * pfn_is_nosave - check if given pfn is in the 'nosave' section +diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c +index a0fde91..9e8ce13 100644 +--- a/arch/x86/power/hibernate_64.c ++++ b/arch/x86/power/hibernate_64.c +@@ -11,6 +11,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -97,6 +98,7 @@ int swsusp_arch_resume(void) + restore_image(); + return 0; + } ++EXPORT_SYMBOL_GPL(swsusp_arch_resume); + + /* + * pfn_is_nosave - check if given pfn is in the 'nosave' section +@@ -147,3 +149,4 @@ int arch_hibernation_header_restore(void *addr) + restore_cr3 = rdr->cr3; + return (rdr->magic == RESTORE_MAGIC) ? 0 : -EINVAL; + } ++EXPORT_SYMBOL_GPL(arch_hibernation_header_restore); +diff --git a/block/Makefile b/block/Makefile +index 39b76ba..0a0125a 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -6,7 +6,7 @@ obj-$(CONFIG_BLOCK) := elevator.o blk-core.o blk-tag.o blk-sysfs.o \ + blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ + blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-iopoll.o blk-lib.o ioctl.o genhd.o scsi_ioctl.o \ +- partition-generic.o partitions/ ++ uuid.o partition-generic.o partitions/ + + obj-$(CONFIG_BLK_DEV_BSG) += bsg.o + obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o +diff --git a/block/blk-core.c b/block/blk-core.c +index 277134c..c420a5c 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -44,6 +44,9 @@ EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); + + DEFINE_IDA(blk_queue_ida); + ++int trap_non_toi_io; ++EXPORT_SYMBOL_GPL(trap_non_toi_io); ++ + /* + * For the allocated request tables + */ +@@ -1854,6 +1857,9 @@ void submit_bio(int rw, struct bio *bio) + { + bio->bi_rw |= rw; + ++ if (unlikely(trap_non_toi_io)) ++ BUG_ON(!(bio->bi_rw & REQ_TOI)); ++ + /* + * If it's a regular read/write or a barrier with data attached, + * go through the normal accounting stuff before submission. 
+diff --git a/block/genhd.c b/block/genhd.c +index 3993ebf..6eba3d2 100644 +--- a/block/genhd.c ++++ b/block/genhd.c +@@ -17,6 +17,8 @@ + #include + #include + #include ++#include ++#include + #include + + #include "blk.h" +@@ -1373,6 +1375,87 @@ int invalidate_partition(struct gendisk *disk, int partno) + + EXPORT_SYMBOL(invalidate_partition); + ++dev_t blk_lookup_fs_info(struct fs_info *seek) ++{ ++ dev_t devt = MKDEV(0, 0); ++ struct class_dev_iter iter; ++ struct device *dev; ++ int best_score = 0; ++ ++ class_dev_iter_init(&iter, &block_class, NULL, &disk_type); ++ while (best_score < 3 && (dev = class_dev_iter_next(&iter))) { ++ struct gendisk *disk = dev_to_disk(dev); ++ struct disk_part_iter piter; ++ struct hd_struct *part; ++ ++ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); ++ ++ while (best_score < 3 && (part = disk_part_iter_next(&piter))) { ++ int score = part_matches_fs_info(part, seek); ++ if (score > best_score) { ++ devt = part_devt(part); ++ best_score = score; ++ } ++ } ++ disk_part_iter_exit(&piter); ++ } ++ class_dev_iter_exit(&iter); ++ return devt; ++} ++EXPORT_SYMBOL_GPL(blk_lookup_fs_info); ++ ++/* Caller uses NULL, key to start. For each match found, we return a bdev on ++ * which we have done blkdev_get, and we do the blkdev_put on block devices ++ * that are passed to us. When no more matches are found, we return NULL. ++ */ ++struct block_device *next_bdev_of_type(struct block_device *last, ++ const char *key) ++{ ++ dev_t devt = MKDEV(0, 0); ++ struct class_dev_iter iter; ++ struct device *dev; ++ struct block_device *next = NULL, *bdev; ++ int got_last = 0; ++ ++ if (!key) ++ goto out; ++ ++ class_dev_iter_init(&iter, &block_class, NULL, &disk_type); ++ while (!devt && (dev = class_dev_iter_next(&iter))) { ++ struct gendisk *disk = dev_to_disk(dev); ++ struct disk_part_iter piter; ++ struct hd_struct *part; ++ ++ disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); ++ ++ while ((part = disk_part_iter_next(&piter))) { ++ bdev = bdget(part_devt(part)); ++ if (last && !got_last) { ++ if (last == bdev) ++ got_last = 1; ++ continue; ++ } ++ ++ if (blkdev_get(bdev, FMODE_READ, 0)) ++ continue; ++ ++ if (bdev_matches_key(bdev, key)) { ++ next = bdev; ++ break; ++ } ++ ++ blkdev_put(bdev, FMODE_READ); ++ } ++ disk_part_iter_exit(&piter); ++ } ++ class_dev_iter_exit(&iter); ++out: ++ if (last) ++ blkdev_put(last, FMODE_READ); ++ return next; ++} ++EXPORT_SYMBOL_GPL(next_bdev_of_type); ++ + /* + * Disk events - monitor disk events like media change and eject request. + */ +diff --git a/block/uuid.c b/block/uuid.c +new file mode 100644 +index 0000000..7ae50d3 +--- /dev/null ++++ b/block/uuid.c +@@ -0,0 +1,510 @@ ++#include ++#include ++#include ++#include ++#include ++ ++static int debug_enabled; ++ ++#define PRINTK(fmt, args...) do { \ ++ if (debug_enabled) \ ++ printk(KERN_DEBUG fmt, ## args); \ ++ } while(0) ++ ++#define PRINT_HEX_DUMP(v1, v2, v3, v4, v5, v6, v7, v8) \ ++ do { \ ++ if (debug_enabled) \ ++ print_hex_dump(v1, v2, v3, v4, v5, v6, v7, v8); \ ++ } while(0) ++ ++/* ++ * Simple UUID translation ++ */ ++ ++struct uuid_info { ++ const char *key; ++ const char *name; ++ long bkoff; ++ unsigned sboff; ++ unsigned sig_len; ++ const char *magic; ++ int uuid_offset; ++ int last_mount_offset; ++ int last_mount_size; ++}; ++ ++/* ++ * Based on libuuid's blkid_magic array. Note that I don't ++ * have uuid offsets for all of these yet - mssing ones are 0x0. ++ * Further information welcome. 
++ * ++ * Rearranged by page of fs signature for optimisation. ++ */ ++static struct uuid_info uuid_list[] = { ++ { NULL, "oracleasm", 0, 32, 8, "ORCLDISK", 0x0, 0, 0 }, ++ { "ntfs", "ntfs", 0, 3, 8, "NTFS ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x52, 5, "MSWIN", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x52, 8, "FAT32 ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x36, 5, "MSDOS", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x36, 8, "FAT16 ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x36, 8, "FAT12 ", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0, 1, "\353", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0, 1, "\351", 0x0, 0, 0 }, ++ { "vfat", "vfat", 0, 0x1fe, 2, "\125\252", 0x0, 0, 0 }, ++ { "xfs", "xfs", 0, 0, 4, "XFSB", 0x20, 0, 0 }, ++ { "romfs", "romfs", 0, 0, 8, "-rom1fs-", 0x0, 0, 0 }, ++ { "bfs", "bfs", 0, 0, 4, "\316\372\173\033", 0, 0, 0 }, ++ { "cramfs", "cramfs", 0, 0, 4, "E=\315\050", 0x0, 0, 0 }, ++ { "qnx4", "qnx4", 0, 4, 6, "QNX4FS", 0, 0, 0 }, ++ { NULL, "crypt_LUKS", 0, 0, 6, "LUKS\xba\xbe", 0x0, 0, 0 }, ++ { "squashfs", "squashfs", 0, 0, 4, "sqsh", 0, 0, 0 }, ++ { "squashfs", "squashfs", 0, 0, 4, "hsqs", 0, 0, 0 }, ++ { "ocfs", "ocfs", 0, 8, 9, "OracleCFS", 0x0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 0, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "sysv", "sysv", 0, 0x3f8, 4, "\020~\030\375", 0, 0, 0 }, ++ { "ext", "ext", 1, 0x38, 2, "\123\357", 0x468, 0x42c, 4 }, ++ { "minix", "minix", 1, 0x10, 2, "\177\023", 0, 0, 0 }, ++ { "minix", "minix", 1, 0x10, 2, "\217\023", 0, 0, 0 }, ++ { "minix", "minix", 1, 0x10, 2, "\150\044", 0, 0, 0 }, ++ { "minix", "minix", 1, 0x10, 2, "\170\044", 0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 1, 0x018, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "vxfs", "vxfs", 1, 0, 4, "\365\374\001\245", 0, 0, 0 }, ++ { "hfsplus", "hfsplus", 1, 0, 2, "BD", 0x0, 0, 0 }, ++ { "hfsplus", "hfsplus", 1, 0, 2, "H+", 0x0, 0, 0 }, ++ { "hfsplus", "hfsplus", 1, 0, 2, "HX", 0x0, 0, 0 }, ++ { "hfs", "hfs", 1, 0, 2, "BD", 0x0, 0, 0 }, ++ { "ocfs2", "ocfs2", 1, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 0, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "lvm2pv", "lvm2pv", 1, 0x218, 8, "LVM2 001", 0x0, 0, 0 }, ++ { "ocfs2", "ocfs2", 2, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "swap", "swap", 0, 0xff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0xff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "ocfs2", "ocfs2", 4, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "ocfs2", "ocfs2", 8, 0, 6, "OCFSV2", 0x0, 0, 0 }, ++ { "hpfs", "hpfs", 8, 0, 4, "I\350\225\371", 0, 0, 0 }, ++ { "reiserfs", "reiserfs", 8, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, ++ { "reiserfs", "reiserfs", 8, 20, 8, "ReIsErFs", 0x10054, 0, 0 }, ++ { "zfs", "zfs", 8, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, ++ { "zfs", "zfs", 8, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, ++ { "ufs", "ufs", 8, 0x55c, 4, "T\031\001\000", 0, 0, 0 }, ++ { "swap", "swap", 0, 0x1ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0x1ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x1ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x1ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x1ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr2Fs", 0x10054, 0, 0 }, ++ { "reiserfs", "reiserfs", 64, 0x34, 9, "ReIsEr3Fs", 0x10054, 0, 0 }, ++ { "reiserfs", "reiserfs", 64, 0x34, 8, "ReIsErFs", 0x10054, 0, 0 }, 
++ { "reiser4", "reiser4", 64, 0, 7, "ReIsEr4", 0x100544, 0, 0 }, ++ { "gfs2", "gfs2", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, ++ { "gfs", "gfs", 64, 0, 4, "\x01\x16\x19\x70", 0x0, 0, 0 }, ++ { "btrfs", "btrfs", 64, 0x40, 8, "_BHRfS_M", 0x0, 0, 0 }, ++ { "swap", "swap", 0, 0x3ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0x3ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x3ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x3ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x3ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "BEA01", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "BOOT2", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "CD001", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "CDW02", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "NSR02", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "NSR03", 0x0, 0, 0 }, ++ { "udf", "udf", 32, 1, 5, "TEA01", 0x0, 0, 0 }, ++ { "iso9660", "iso9660", 32, 1, 5, "CD001", 0x0, 0, 0 }, ++ { "iso9660", "iso9660", 32, 9, 5, "CDROM", 0x0, 0, 0 }, ++ { "jfs", "jfs", 32, 0, 4, "JFS1", 0x88, 0, 0 }, ++ { "swap", "swap", 0, 0x7ff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0x7ff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x7ff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x7ff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0x7ff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0xfff6, 10, "SWAP-SPACE", 0x40c, 0, 0 }, ++ { "swap", "swap", 0, 0xfff6, 10, "SWAPSPACE2", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xfff6, 9, "S1SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xfff6, 9, "S2SUSPEND", 0x40c, 0, 0 }, ++ { "swap", "swsuspend", 0, 0xfff6, 9, "ULSUSPEND", 0x40c, 0, 0 }, ++ { "zfs", "zfs", 264, 0, 8, "\0\0\x02\xf5\xb0\x07\xb1\x0c", 0x0, 0, 0 }, ++ { "zfs", "zfs", 264, 0, 8, "\x0c\xb1\x07\xb0\xf5\x02\0\0", 0x0, 0, 0 }, ++ { NULL, NULL, 0, 0, 0, NULL, 0x0, 0, 0 } ++}; ++ ++static int null_uuid(const char *uuid) ++{ ++ int i; ++ ++ for (i = 0; i < 16 && !uuid[i]; i++); ++ ++ return (i == 16); ++} ++ ++ ++static void uuid_end_bio(struct bio *bio, int err) ++{ ++ struct page *page = bio->bi_io_vec[0].bv_page; ++ ++ if(!test_bit(BIO_UPTODATE, &bio->bi_flags)) ++ SetPageError(page); ++ ++ unlock_page(page); ++ bio_put(bio); ++} ++ ++ ++/** ++ * submit - submit BIO request ++ * @dev: The block device we're using. ++ * @page_num: The page we're reading. ++ * ++ * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the ++ * textbook - allocate and initialize the bio. If we're writing, make sure ++ * the page is marked as dirty. Then submit it and carry on." 
++ **/ ++static struct page *read_bdev_page(struct block_device *dev, int page_num) ++{ ++ struct bio *bio = NULL; ++ struct page *page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); ++ ++ if (!page) { ++ printk(KERN_ERR "Failed to allocate a page for reading data " ++ "in UUID checks."); ++ return NULL; ++ } ++ ++ bio = bio_alloc(GFP_NOFS, 1); ++ bio->bi_bdev = dev; ++ bio->bi_sector = page_num << 3; ++ bio->bi_end_io = uuid_end_bio; ++ ++ PRINTK("Submitting bio on device %lx, page %d using bio %p and page %p.\n", ++ (unsigned long) dev->bd_dev, page_num, bio, page); ++ ++ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { ++ printk(KERN_DEBUG "ERROR: adding page to bio at %d\n", ++ page_num); ++ bio_put(bio); ++ __free_page(page); ++ printk(KERN_DEBUG "read_bdev_page freed page %p (in error " ++ "path).\n", page); ++ return NULL; ++ } ++ ++ lock_page(page); ++ submit_bio(READ | REQ_SYNC | REQ_TOI, bio); ++ ++ wait_on_page_locked(page); ++ if (PageError(page)) { ++ __free_page(page); ++ page = NULL; ++ } ++ return page; ++} ++ ++int bdev_matches_key(struct block_device *bdev, const char *key) ++{ ++ unsigned char *data = NULL; ++ struct page *data_page = NULL; ++ ++ int dev_offset, pg_num, pg_off, i; ++ int last_pg_num = -1; ++ int result = 0; ++ char buf[50]; ++ ++ if (null_uuid(key)) { ++ PRINTK("Refusing to find a NULL key.\n"); ++ return 0; ++ } ++ ++ if (!bdev->bd_disk) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no bd_disk.\n", buf); ++ return 0; ++ } ++ ++ if (!bdev->bd_disk->queue) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no queue.\n", buf); ++ return 0; ++ } ++ ++ for (i = 0; uuid_list[i].name; i++) { ++ struct uuid_info *dat = &uuid_list[i]; ++ ++ if (!dat->key || strcmp(dat->key, key)) ++ continue; ++ ++ dev_offset = (dat->bkoff << 10) + dat->sboff; ++ pg_num = dev_offset >> 12; ++ pg_off = dev_offset & 0xfff; ++ ++ if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) ++ continue; ++ ++ if (pg_num != last_pg_num) { ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ data_page = read_bdev_page(bdev, pg_num); ++ if (!data_page) ++ continue; ++ data = kmap(data_page); ++ } ++ ++ last_pg_num = pg_num; ++ ++ if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) ++ continue; ++ ++ result = 1; ++ break; ++ } ++ ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ ++ return result; ++} ++ ++/* ++ * part_matches_fs_info - Does the given partition match the details given? ++ * ++ * Returns a score saying how good the match is. ++ * 0 = no UUID match. ++ * 1 = UUID but last mount time differs. ++ * 2 = UUID, last mount time but not dev_t ++ * 3 = perfect match ++ * ++ * This lets us cope elegantly with probing resulting in dev_ts changing ++ * from boot to boot, and with the case where a user copies a partition ++ * (UUID is non unique), and we need to check the last mount time of the ++ * correct partition. 
++ */ ++int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek) ++{ ++ struct block_device *bdev; ++ struct fs_info *got; ++ int result = 0; ++ char buf[50]; ++ ++ if (null_uuid((char *) &seek->uuid)) { ++ PRINTK("Refusing to find a NULL uuid.\n"); ++ return 0; ++ } ++ ++ bdev = bdget(part_devt(part)); ++ ++ PRINTK("part_matches fs info considering %x.\n", part_devt(part)); ++ ++ if (blkdev_get(bdev, FMODE_READ, 0)) { ++ PRINTK("blkdev_get failed.\n"); ++ return 0; ++ } ++ ++ if (!bdev->bd_disk) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no bd_disk.\n", buf); ++ goto out; ++ } ++ ++ if (!bdev->bd_disk->queue) { ++ bdevname(bdev, buf); ++ PRINTK("bdev %s has no queue.\n", buf); ++ goto out; ++ } ++ ++ got = fs_info_from_block_dev(bdev); ++ ++ if (got && !memcmp(got->uuid, seek->uuid, 16)) { ++ PRINTK(" Have matching UUID.\n"); ++ PRINTK(" Got: LMS %d, LM %p.\n", got->last_mount_size, got->last_mount); ++ PRINTK(" Seek: LMS %d, LM %p.\n", seek->last_mount_size, seek->last_mount); ++ result = 1; ++ ++ if (got->last_mount_size == seek->last_mount_size && ++ got->last_mount && seek->last_mount && ++ !memcmp(got->last_mount, seek->last_mount, ++ got->last_mount_size)) { ++ result = 2; ++ ++ PRINTK(" Matching last mount time.\n"); ++ ++ if (part_devt(part) == seek->dev_t) { ++ result = 3; ++ PRINTK(" Matching dev_t.\n"); ++ } else ++ PRINTK("Dev_ts differ (%x vs %x).\n", part_devt(part), seek->dev_t); ++ } ++ } ++ ++ PRINTK(" Score for %x is %d.\n", part_devt(part), result); ++ free_fs_info(got); ++out: ++ blkdev_put(bdev, FMODE_READ); ++ return result; ++} ++ ++void free_fs_info(struct fs_info *fs_info) ++{ ++ if (!fs_info || IS_ERR(fs_info)) ++ return; ++ ++ if (fs_info->last_mount) ++ kfree(fs_info->last_mount); ++ ++ kfree(fs_info); ++} ++EXPORT_SYMBOL_GPL(free_fs_info); ++ ++struct fs_info *fs_info_from_block_dev(struct block_device *bdev) ++{ ++ unsigned char *data = NULL; ++ struct page *data_page = NULL; ++ ++ int dev_offset, pg_num, pg_off; ++ int uuid_pg_num, uuid_pg_off, i; ++ unsigned char *uuid_data = NULL; ++ struct page *uuid_data_page = NULL; ++ ++ int last_pg_num = -1, last_uuid_pg_num = 0; ++ char buf[50]; ++ struct fs_info *fs_info = NULL; ++ ++ bdevname(bdev, buf); ++ ++ PRINTK("uuid_from_block_dev looking for partition type of %s.\n", buf); ++ ++ for (i = 0; uuid_list[i].name; i++) { ++ struct uuid_info *dat = &uuid_list[i]; ++ dev_offset = (dat->bkoff << 10) + dat->sboff; ++ pg_num = dev_offset >> 12; ++ pg_off = dev_offset & 0xfff; ++ uuid_pg_num = dat->uuid_offset >> 12; ++ uuid_pg_off = dat->uuid_offset & 0xfff; ++ ++ if ((((pg_num + 1) << 3) - 1) > bdev->bd_part->nr_sects >> 1) ++ continue; ++ ++ /* Ignore partition types with no UUID offset */ ++ if (!dat->uuid_offset) ++ continue; ++ ++ if (pg_num != last_pg_num) { ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ data_page = read_bdev_page(bdev, pg_num); ++ if (!data_page) ++ continue; ++ data = kmap(data_page); ++ } ++ ++ last_pg_num = pg_num; ++ ++ if (strncmp(&data[pg_off], dat->magic, dat->sig_len)) ++ continue; ++ ++ PRINTK("This partition looks like %s.\n", dat->name); ++ ++ fs_info = kzalloc(sizeof(struct fs_info), GFP_KERNEL); ++ ++ if (!fs_info) { ++ PRINTK("Failed to allocate fs_info struct."); ++ fs_info = ERR_PTR(-ENOMEM); ++ break; ++ } ++ ++ /* UUID can't be off the end of the disk */ ++ if ((uuid_pg_num > bdev->bd_part->nr_sects >> 3) || ++ !dat->uuid_offset) ++ goto no_uuid; ++ ++ if (!uuid_data || uuid_pg_num != last_uuid_pg_num) { ++ /* No need to 
reread the page from above */ ++ if (uuid_pg_num == pg_num && uuid_data) ++ memcpy(uuid_data, data, PAGE_SIZE); ++ else { ++ if (uuid_data_page) { ++ kunmap(uuid_data_page); ++ __free_page(uuid_data_page); ++ } ++ uuid_data_page = read_bdev_page(bdev, uuid_pg_num); ++ if (!uuid_data_page) ++ continue; ++ uuid_data = kmap(uuid_data_page); ++ } ++ } ++ ++ last_uuid_pg_num = uuid_pg_num; ++ memcpy(&fs_info->uuid, &uuid_data[uuid_pg_off], 16); ++ fs_info->dev_t = bdev->bd_dev; ++ ++no_uuid: ++ PRINT_HEX_DUMP(KERN_EMERG, "fs_info_from_block_dev " ++ "returning uuid ", DUMP_PREFIX_NONE, 16, 1, ++ fs_info->uuid, 16, 0); ++ ++ if (dat->last_mount_size) { ++ int pg = dat->last_mount_offset >> 12, sz; ++ int off = dat->last_mount_offset & 0xfff; ++ struct page *last_mount = read_bdev_page(bdev, pg); ++ unsigned char *last_mount_data; ++ char *ptr; ++ ++ if (!last_mount) { ++ fs_info = ERR_PTR(-ENOMEM); ++ break; ++ } ++ last_mount_data = kmap(last_mount); ++ sz = dat->last_mount_size; ++ ptr = kmalloc(sz, GFP_KERNEL); ++ ++ if (!ptr) { ++ printk(KERN_EMERG "fs_info_from_block_dev " ++ "failed to get memory for last mount " ++ "timestamp."); ++ free_fs_info(fs_info); ++ fs_info = ERR_PTR(-ENOMEM); ++ } else { ++ fs_info->last_mount = ptr; ++ fs_info->last_mount_size = sz; ++ memcpy(ptr, &last_mount_data[off], sz); ++ } ++ ++ kunmap(last_mount); ++ __free_page(last_mount); ++ } ++ break; ++ } ++ ++ if (data_page) { ++ kunmap(data_page); ++ __free_page(data_page); ++ } ++ ++ if (uuid_data_page) { ++ kunmap(uuid_data_page); ++ __free_page(uuid_data_page); ++ } ++ ++ return fs_info; ++} ++EXPORT_SYMBOL_GPL(fs_info_from_block_dev); ++ ++static int __init uuid_debug_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) ++ debug_enabled = value; ++ ++ return 1; ++} ++ ++__setup("uuid_debug", uuid_debug_setup); +diff --git a/drivers/acpi/acpi_pad.c b/drivers/acpi/acpi_pad.c +index 31de104..fbe4f9e 100644 +--- a/drivers/acpi/acpi_pad.c ++++ b/drivers/acpi/acpi_pad.c +@@ -154,6 +154,7 @@ static int power_saving_thread(void *data) + u64 last_jiffies = 0; + + sched_setscheduler(current, SCHED_RR, ¶m); ++ set_freezable(); + + while (!kthread_should_stop()) { + int cpu; +diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c +index 2b7f77d..45178ca 100644 +--- a/drivers/base/power/main.c ++++ b/drivers/base/power/main.c +@@ -716,6 +716,7 @@ void dpm_resume(pm_message_t state) + async_synchronize_full(); + dpm_show_time(starttime, state, NULL); + } ++EXPORT_SYMBOL_GPL(dpm_resume); + + /** + * device_complete - Complete a PM transition for given device. +@@ -792,6 +793,7 @@ void dpm_complete(pm_message_t state) + list_splice(&list, &dpm_list); + mutex_unlock(&dpm_list_mtx); + } ++EXPORT_SYMBOL_GPL(dpm_complete); + + /** + * dpm_resume_end - Execute "resume" callbacks and complete system transition. +@@ -1214,6 +1216,7 @@ int dpm_suspend(pm_message_t state) + dpm_show_time(starttime, state, NULL); + return error; + } ++EXPORT_SYMBOL_GPL(dpm_suspend); + + /** + * device_prepare - Prepare a device for system power transition. +@@ -1315,6 +1318,7 @@ int dpm_prepare(pm_message_t state) + mutex_unlock(&dpm_list_mtx); + return error; + } ++EXPORT_SYMBOL_GPL(dpm_prepare); + + /** + * dpm_suspend_start - Prepare devices for PM transition and suspend them. 
+diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c +index 79715e7..76e8bc2 100644 +--- a/drivers/base/power/wakeup.c ++++ b/drivers/base/power/wakeup.c +@@ -23,6 +23,7 @@ + * if wakeup events are registered during or immediately before the transition. + */ + bool events_check_enabled __read_mostly; ++EXPORT_SYMBOL_GPL(events_check_enabled); + + /* + * Combined counters of registered wakeup events and wakeup events in progress. +@@ -712,6 +713,7 @@ bool pm_wakeup_pending(void) + + return ret; + } ++EXPORT_SYMBOL_GPL(pm_wakeup_pending); + + /** + * pm_get_wakeup_count - Read the number of registered wakeup events. +diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c +index 5ac841f..f3cb20a 100644 +--- a/drivers/block/xen-blkback/blkback.c ++++ b/drivers/block/xen-blkback/blkback.c +@@ -397,6 +397,7 @@ int xen_blkif_schedule(void *arg) + struct xen_vbd *vbd = &blkif->vbd; + + xen_blkif_get(blkif); ++ set_freezable(); + + while (!kthread_should_stop()) { + if (try_to_freeze()) +diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c +index 24efae4..8db2c38 100644 +--- a/drivers/gpu/drm/drm_gem.c ++++ b/drivers/gpu/drm/drm_gem.c +@@ -139,7 +139,8 @@ int drm_gem_object_init(struct drm_device *dev, + BUG_ON((size & (PAGE_SIZE - 1)) != 0); + + obj->dev = dev; +- obj->filp = shmem_file_setup("drm mm object", size, VM_NORESERVE); ++ obj->filp = shmem_file_setup("drm mm object", size, ++ VM_NORESERVE, 1); + if (IS_ERR(obj->filp)) + return PTR_ERR(obj->filp); + +diff --git a/drivers/gpu/drm/ttm/ttm_tt.c b/drivers/gpu/drm/ttm/ttm_tt.c +index 7d759a4..83f8e3b 100644 +--- a/drivers/gpu/drm/ttm/ttm_tt.c ++++ b/drivers/gpu/drm/ttm/ttm_tt.c +@@ -337,7 +337,7 @@ int ttm_tt_swapout(struct ttm_tt *ttm, struct file *persistent_swap_storage) + if (!persistent_swap_storage) { + swap_storage = shmem_file_setup("ttm swap", + ttm->num_pages << PAGE_SHIFT, +- 0); ++ 0, 0); + if (unlikely(IS_ERR(swap_storage))) { + pr_err("Failed allocating swap storage\n"); + return PTR_ERR(swap_storage); +diff --git a/drivers/md/md.c b/drivers/md/md.c +index 3db3d1b..a09c18d 100644 +--- a/drivers/md/md.c ++++ b/drivers/md/md.c +@@ -33,6 +33,7 @@ + */ + + #include ++#include + #include + #include + #include +@@ -7331,10 +7332,14 @@ void md_do_sync(struct md_thread *thread) + * + */ + ++ set_freezable(); ++ + do { + mddev->curr_resync = 2; + + try_again: ++ try_to_freeze(); ++ + if (kthread_should_stop()) + set_bit(MD_RECOVERY_INTR, &mddev->recovery); + +@@ -7357,6 +7362,9 @@ void md_do_sync(struct md_thread *thread) + * time 'round when curr_resync == 2 + */ + continue; ++ ++ try_to_freeze(); ++ + /* We need to wait 'interruptible' so as not to + * contribute to the load average, and not to + * be caught by 'softlockup' +@@ -7369,6 +7377,7 @@ void md_do_sync(struct md_thread *thread) + " share one or more physical units)\n", + desc, mdname(mddev), mdname(mddev2)); + mddev_put(mddev2); ++ try_to_freeze(); + if (signal_pending(current)) + flush_signals(current); + schedule(); +@@ -7486,6 +7495,8 @@ void md_do_sync(struct md_thread *thread) + || kthread_should_stop()); + } + ++ try_to_freeze(); ++ + if (kthread_should_stop()) + goto interrupted; + +@@ -7530,6 +7541,7 @@ void md_do_sync(struct md_thread *thread) + last_mark = next; + } + ++ try_to_freeze(); + + if (kthread_should_stop()) + goto interrupted; +@@ -7743,8 +7755,10 @@ static void reap_sync_thread(struct mddev *mddev) + */ + void md_check_recovery(struct mddev *mddev) + { +- if (mddev->suspended) ++#ifdef 
CONFIG_FREEZER ++ if (mddev->suspended || unlikely(atomic_read(&system_freezing_cnt))) + return; ++#endif + + if (mddev->bitmap) + bitmap_daemon_work(mddev); +diff --git a/drivers/net/irda/stir4200.c b/drivers/net/irda/stir4200.c +index 876e709..b0653a2 100644 +--- a/drivers/net/irda/stir4200.c ++++ b/drivers/net/irda/stir4200.c +@@ -739,6 +739,8 @@ static int stir_transmit_thread(void *arg) + struct net_device *dev = stir->netdev; + struct sk_buff *skb; + ++ set_freezable(); ++ + while (!kthread_should_stop()) { + #ifdef CONFIG_PM + /* if suspending, then power off and wait */ +diff --git a/drivers/tty/vt/vt.c b/drivers/tty/vt/vt.c +index 1a27280..39a2c61 100644 +--- a/drivers/tty/vt/vt.c ++++ b/drivers/tty/vt/vt.c +@@ -2422,6 +2422,7 @@ int vt_kmsg_redirect(int new) + else + return kmsg_con; + } ++EXPORT_SYMBOL_GPL(vt_kmsg_redirect); + + /* + * Console on virtual terminal +diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c +index bdcb13c..ce8fc9c 100644 +--- a/drivers/uwb/uwbd.c ++++ b/drivers/uwb/uwbd.c +@@ -271,6 +271,7 @@ static int uwbd(void *param) + struct uwb_event *evt; + int should_stop = 0; + ++ set_freezable(); + while (1) { + wait_event_interruptible_timeout( + rc->uwbd.wq, +diff --git a/drivers/w1/w1.c b/drivers/w1/w1.c +index 7994d933..ced5cba 100644 +--- a/drivers/w1/w1.c ++++ b/drivers/w1/w1.c +@@ -974,6 +974,7 @@ int w1_process(void *data) + * time can be calculated in jiffies once. + */ + const unsigned long jtime = msecs_to_jiffies(w1_timeout * 1000); ++ set_freezable(); + + while (!kthread_should_stop()) { + if (dev->search_count) { +diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c +index 58b7d14..87080a8 100644 +--- a/fs/btrfs/async-thread.c ++++ b/fs/btrfs/async-thread.c +@@ -307,6 +307,8 @@ static int worker_loop(void *arg) + INIT_LIST_HEAD(&head); + INIT_LIST_HEAD(&prio_head); + ++ set_freezable(); ++ + do { + again: + while (1) { +diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c +index a8f652d..82588e5 100644 +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -1636,6 +1636,8 @@ static int cleaner_kthread(void *arg) + { + struct btrfs_root *root = arg; + ++ set_freezable(); ++ + do { + if (!(root->fs_info->sb->s_flags & MS_RDONLY) && + mutex_trylock(&root->fs_info->cleaner_mutex)) { +@@ -1665,6 +1667,8 @@ static int transaction_kthread(void *arg) + unsigned long delay; + bool cannot_commit; + ++ set_freezable(); ++ + do { + cannot_commit = false; + delay = HZ * 30; +diff --git a/fs/drop_caches.c b/fs/drop_caches.c +index c00e055..d023de0 100644 +--- a/fs/drop_caches.c ++++ b/fs/drop_caches.c +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include "internal.h" + + /* A global variable is a bit ugly, but it keeps the code simple */ +@@ -49,6 +50,13 @@ static void drop_slab(void) + } while (nr_objects > 10); + } + ++/* For TuxOnIce */ ++void drop_pagecache(void) ++{ ++ iterate_supers(drop_pagecache_sb, NULL); ++} ++EXPORT_SYMBOL_GPL(drop_pagecache); ++ + int drop_caches_sysctl_handler(ctl_table *table, int write, + void __user *buffer, size_t *length, loff_t *ppos) + { +diff --git a/fs/ext4/super.c b/fs/ext4/super.c +index 3d4fb81..4161100 100644 +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -2847,6 +2847,7 @@ static int ext4_lazyinit_thread(void *arg) + unsigned long next_wakeup, cur; + + BUG_ON(NULL == eli); ++ set_freezable(); + + cont_thread: + while (true) { +diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c +index 9a2ca8b..88c7502 100644 +--- a/fs/gfs2/log.c ++++ b/fs/gfs2/log.c +@@ -792,6 +792,8 @@ int gfs2_logd(void 
*data) + unsigned long t = 1; + DEFINE_WAIT(wait); + ++ set_freezable(); ++ + while (!kthread_should_stop()) { + + if (gfs2_jrnl_flush_reqd(sdp) || t == 0) { +diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c +index 06122d0..decc509 100644 +--- a/fs/gfs2/quota.c ++++ b/fs/gfs2/quota.c +@@ -1410,6 +1410,8 @@ int gfs2_quotad(void *data) + DEFINE_WAIT(wait); + int empty; + ++ set_freezable(); ++ + while (!kthread_should_stop()) { + + /* Update the master statfs file */ +diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c +index 2eb952c..ce8687e 100644 +--- a/fs/jfs/jfs_logmgr.c ++++ b/fs/jfs/jfs_logmgr.c +@@ -2337,6 +2337,8 @@ int jfsIOWait(void *arg) + { + struct lbuf *bp; + ++ set_freezable(); ++ + do { + spin_lock_irq(&log_redrive_lock); + while ((bp = log_redrive_list)) { +diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c +index 5fcc02e..e0bc87b 100644 +--- a/fs/jfs/jfs_txnmgr.c ++++ b/fs/jfs/jfs_txnmgr.c +@@ -2752,6 +2752,8 @@ int jfs_lazycommit(void *arg) + unsigned long flags; + struct jfs_sb_info *sbi; + ++ set_freezable(); ++ + do { + LAZY_LOCK(flags); + jfs_commit_thread_waking = 0; /* OK to wake another thread */ +@@ -2936,6 +2938,8 @@ int jfs_sync(void *arg) + struct jfs_inode_info *jfs_ip; + tid_t tid; + ++ set_freezable(); ++ + do { + /* + * write each inode on the anonymous inode list +diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c +index a5752a58..3ae43e5 100644 +--- a/fs/nilfs2/segment.c ++++ b/fs/nilfs2/segment.c +@@ -2440,6 +2440,8 @@ static int nilfs_segctor_thread(void *arg) + struct the_nilfs *nilfs = sci->sc_super->s_fs_info; + int timeout = 0; + ++ set_freezable(); ++ + sci->sc_timer.data = (unsigned long)current; + sci->sc_timer.function = nilfs_construction_timeout; + +diff --git a/fs/super.c b/fs/super.c +index 12f1237..411cb28 100644 +--- a/fs/super.c ++++ b/fs/super.c +@@ -38,6 +38,8 @@ + + + LIST_HEAD(super_blocks); ++EXPORT_SYMBOL_GPL(super_blocks); ++ + DEFINE_SPINLOCK(sb_lock); + + static char *sb_writers_name[SB_FREEZE_LEVELS] = { +diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c +index 0eda725..55de808 100644 +--- a/fs/xfs/xfs_trans_ail.c ++++ b/fs/xfs/xfs_trans_ail.c +@@ -511,6 +511,7 @@ xfsaild( + struct xfs_ail *ailp = data; + long tout = 0; /* milliseconds */ + ++ set_freezable(); + current->flags |= PF_MEMALLOC; + + while (!kthread_should_stop()) { +diff --git a/include/linux/bio.h b/include/linux/bio.h +index 820e7aa..b7d41d5 100644 +--- a/include/linux/bio.h ++++ b/include/linux/bio.h +@@ -32,6 +32,8 @@ + /* struct bio, bio_vec and BIO_* flags are defined in blk_types.h */ + #include + ++extern int trap_non_toi_io; ++ + #define BIO_DEBUG + + #ifdef BIO_DEBUG +diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h +index cdf1119..fc4c817 100644 +--- a/include/linux/blk_types.h ++++ b/include/linux/blk_types.h +@@ -175,6 +175,7 @@ enum rq_flag_bits { + __REQ_IO_STAT, /* account I/O stat */ + __REQ_MIXED_MERGE, /* merge of different types, fail separately */ + __REQ_KERNEL, /* direct IO to kernel pages */ ++ __REQ_TOI, /* TuxOnIce I/O */ + __REQ_NR_BITS, /* stops here */ + }; + +@@ -222,6 +223,7 @@ enum rq_flag_bits { + #define REQ_IO_STAT (1 << __REQ_IO_STAT) + #define REQ_MIXED_MERGE (1 << __REQ_MIXED_MERGE) + #define REQ_SECURE (1 << __REQ_SECURE) ++#define REQ_TOI (1 << __REQ_TOI) + #define REQ_KERNEL (1 << __REQ_KERNEL) + + #endif /* __LINUX_BLK_TYPES_H */ +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7d2e893..6b3856c 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -1632,6 +1632,8 @@ 
struct super_operations { + #define S_IMA 1024 /* Inode has an associated IMA struct */ + #define S_AUTOMOUNT 2048 /* Automount/referral quasi-directory */ + #define S_NOSEC 4096 /* no suid or xattr security attributes */ ++#define S_ATOMIC_COPY 8192 /* Pages mapped with this inode need to be ++ atomically copied (gem) */ + + /* + * Note that nosuid etc flags are inode-specific: setting some file-system +@@ -2055,6 +2057,13 @@ extern struct super_block *freeze_bdev(struct block_device *); + extern void emergency_thaw_all(void); + extern int thaw_bdev(struct block_device *bdev, struct super_block *sb); + extern int fsync_bdev(struct block_device *); ++extern int fsync_super(struct super_block *); ++extern int fsync_no_super(struct block_device *); ++#define FS_FREEZER_FUSE 1 ++#define FS_FREEZER_NORMAL 2 ++#define FS_FREEZER_ALL (FS_FREEZER_FUSE | FS_FREEZER_NORMAL) ++void freeze_filesystems(int which); ++void thaw_filesystems(int which); + #else + static inline void bd_forget(struct inode *inode) {} + static inline int sync_blockdev(struct block_device *bdev) { return 0; } +diff --git a/include/linux/fs_uuid.h b/include/linux/fs_uuid.h +new file mode 100644 +index 0000000..3234135 +--- /dev/null ++++ b/include/linux/fs_uuid.h +@@ -0,0 +1,19 @@ ++#include ++ ++struct hd_struct; ++struct block_device; ++ ++struct fs_info { ++ char uuid[16]; ++ dev_t dev_t; ++ char *last_mount; ++ int last_mount_size; ++}; ++ ++int part_matches_fs_info(struct hd_struct *part, struct fs_info *seek); ++dev_t blk_lookup_fs_info(struct fs_info *seek); ++struct fs_info *fs_info_from_block_dev(struct block_device *bdev); ++void free_fs_info(struct fs_info *fs_info); ++int bdev_matches_key(struct block_device *bdev, const char *key); ++struct block_device *next_bdev_of_type(struct block_device *last, ++ const char *key); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 9d9dcc3..8e3282e 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -1679,6 +1679,7 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, + unsigned long shrink_slab(struct shrink_control *shrink, + unsigned long nr_pages_scanned, + unsigned long lru_pages); ++void drop_pagecache(void); + + #ifndef CONFIG_MMU + #define randomize_va_space 0 +diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h +index 30aa0dc..b7ea3d4 100644 +--- a/include/linux/shmem_fs.h ++++ b/include/linux/shmem_fs.h +@@ -46,7 +46,8 @@ static inline struct shmem_inode_info *SHMEM_I(struct inode *inode) + extern int shmem_init(void); + extern int shmem_fill_super(struct super_block *sb, void *data, int silent); + extern struct file *shmem_file_setup(const char *name, +- loff_t size, unsigned long flags); ++ loff_t size, unsigned long flags, ++ int atomic_copy); + extern int shmem_zero_setup(struct vm_area_struct *); + extern int shmem_lock(struct file *file, int lock, struct user_struct *user); + extern void shmem_unlock_mapping(struct address_space *mapping); +diff --git a/include/linux/suspend.h b/include/linux/suspend.h +index d4e3f16..3f143b0 100644 +--- a/include/linux/suspend.h ++++ b/include/linux/suspend.h +@@ -418,6 +418,73 @@ extern bool pm_print_times_enabled; + #define pm_print_times_enabled (false) + #endif + ++enum { ++ TOI_CAN_HIBERNATE, ++ TOI_CAN_RESUME, ++ TOI_RESUME_DEVICE_OK, ++ TOI_NORESUME_SPECIFIED, ++ TOI_SANITY_CHECK_PROMPT, ++ TOI_CONTINUE_REQ, ++ TOI_RESUMED_BEFORE, ++ TOI_BOOT_TIME, ++ TOI_NOW_RESUMING, ++ TOI_IGNORE_LOGLEVEL, ++ TOI_TRYING_TO_RESUME, ++ TOI_LOADING_ALT_IMAGE, ++ TOI_STOP_RESUME, ++ TOI_IO_STOPPED, 
++ TOI_NOTIFIERS_PREPARE, ++ TOI_CLUSTER_MODE, ++ TOI_BOOT_KERNEL, ++}; ++ ++#ifdef CONFIG_TOI ++ ++/* Used in init dir files */ ++extern unsigned long toi_state; ++#define set_toi_state(bit) (set_bit(bit, &toi_state)) ++#define clear_toi_state(bit) (clear_bit(bit, &toi_state)) ++#define test_toi_state(bit) (test_bit(bit, &toi_state)) ++extern int toi_running; ++ ++#define test_action_state(bit) (test_bit(bit, &toi_bkd.toi_action)) ++extern int try_tuxonice_hibernate(void); ++ ++#else /* !CONFIG_TOI */ ++ ++#define toi_state (0) ++#define set_toi_state(bit) do { } while (0) ++#define clear_toi_state(bit) do { } while (0) ++#define test_toi_state(bit) (0) ++#define toi_running (0) ++ ++static inline int try_tuxonice_hibernate(void) { return 0; } ++#define test_action_state(bit) (0) ++ ++#endif /* CONFIG_TOI */ ++ ++#ifdef CONFIG_HIBERNATION ++#ifdef CONFIG_TOI ++extern void try_tuxonice_resume(void); ++#else ++#define try_tuxonice_resume() do { } while (0) ++#endif ++ ++extern int resume_attempted; ++extern int software_resume(void); ++ ++static inline void check_resume_attempted(void) ++{ ++ if (resume_attempted) ++ return; ++ ++ software_resume(); ++} ++#else ++#define check_resume_attempted() do { } while (0) ++#define resume_attempted (0) ++#endif ++ + #ifdef CONFIG_PM_AUTOSLEEP + + /* kernel/power/autosleep.c */ +diff --git a/include/linux/swap.h b/include/linux/swap.h +index 68df9c1..ceabd9d 100644 +--- a/include/linux/swap.h ++++ b/include/linux/swap.h +@@ -217,6 +217,7 @@ extern unsigned long totalram_pages; + extern unsigned long totalreserve_pages; + extern unsigned long dirty_balance_reserve; + extern unsigned int nr_free_buffer_pages(void); ++extern unsigned int nr_unallocated_buffer_pages(void); + extern unsigned int nr_free_pagecache_pages(void); + + /* Definition of global_page_state not available yet */ +@@ -264,6 +265,8 @@ extern unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + struct zone *zone, + unsigned long *nr_scanned); + extern unsigned long shrink_all_memory(unsigned long nr_pages); ++extern unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, ++ gfp_t mask); + extern int vm_swappiness; + extern int remove_mapping(struct address_space *mapping, struct page *page); + extern long vm_total_pages; +@@ -360,13 +363,17 @@ extern void swapcache_free(swp_entry_t, struct page *page); + extern int free_swap_and_cache(swp_entry_t); + extern int swap_type_of(dev_t, sector_t, struct block_device **); + extern unsigned int count_swap_pages(int, int); ++extern sector_t map_swap_entry(swp_entry_t entry, struct block_device **); + extern sector_t map_swap_page(struct page *, struct block_device **); + extern sector_t swapdev_block(int, pgoff_t); ++extern struct swap_info_struct *get_swap_info_struct(unsigned); + extern int page_swapcount(struct page *); + extern struct swap_info_struct *page_swap_info(struct page *); + extern int reuse_swap_page(struct page *); + extern int try_to_free_swap(struct page *); + struct backing_dev_info; ++extern void get_swap_range_of_type(int type, swp_entry_t *start, ++ swp_entry_t *end, unsigned int limit); + + #ifdef CONFIG_MEMCG + extern void +diff --git a/include/uapi/linux/netlink.h b/include/uapi/linux/netlink.h +index 78d5b8a..864847f 100644 +--- a/include/uapi/linux/netlink.h ++++ b/include/uapi/linux/netlink.h +@@ -26,6 +26,8 @@ + #define NETLINK_ECRYPTFS 19 + #define NETLINK_RDMA 20 + #define NETLINK_CRYPTO 21 /* Crypto layer */ ++#define NETLINK_TOI_USERUI 22 /* TuxOnIce's userui */ ++#define NETLINK_TOI_USM 23 
/* Userspace storage manager */ + + #define NETLINK_INET_DIAG NETLINK_SOCK_DIAG + +diff --git a/init/do_mounts.c b/init/do_mounts.c +index a2b49f2..58be071 100644 +--- a/init/do_mounts.c ++++ b/init/do_mounts.c +@@ -281,6 +281,7 @@ fail: + done: + return res; + } ++EXPORT_SYMBOL_GPL(name_to_dev_t); + + static int __init root_dev_setup(char *line) + { +@@ -582,6 +583,8 @@ void __init prepare_namespace(void) + if (is_floppy && rd_doload && rd_load_disk(0)) + ROOT_DEV = Root_RAM0; + ++ check_resume_attempted(); ++ + mount_root(); + out: + devtmpfs_mount("dev"); +diff --git a/init/do_mounts_initrd.c b/init/do_mounts_initrd.c +index a32ec1c..74460d0 100644 +--- a/init/do_mounts_initrd.c ++++ b/init/do_mounts_initrd.c +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + +@@ -75,6 +76,11 @@ static void __init handle_initrd(void) + + current->flags &= ~PF_FREEZER_SKIP; + ++ if (!resume_attempted) ++ printk(KERN_ERR "TuxOnIce: No attempt was made to resume from " ++ "any image that might exist.\n"); ++ clear_toi_state(TOI_BOOT_TIME); ++ + /* move initrd to rootfs' /old */ + sys_mount("..", ".", NULL, MS_MOVE, NULL); + /* switch root and cwd back to / of rootfs */ +diff --git a/init/main.c b/init/main.c +index 63534a1..010d242 100644 +--- a/init/main.c ++++ b/init/main.c +@@ -125,6 +125,7 @@ extern void softirq_init(void); + char __initdata boot_command_line[COMMAND_LINE_SIZE]; + /* Untouched saved command line (eg. for /proc) */ + char *saved_command_line; ++EXPORT_SYMBOL_GPL(saved_command_line); + /* Command line for parameter parsing */ + static char *static_command_line; + +diff --git a/ipc/shm.c b/ipc/shm.c +index 4fa6d8f..a9f4ae9 100644 +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -505,7 +505,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) + if ((shmflg & SHM_NORESERVE) && + sysctl_overcommit_memory != OVERCOMMIT_NEVER) + acctflag = VM_NORESERVE; +- file = shmem_file_setup(name, size, acctflag); ++ file = shmem_file_setup(name, size, acctflag, 0); + } + error = PTR_ERR(file); + if (IS_ERR(file)) +diff --git a/kernel/cpu.c b/kernel/cpu.c +index b5e4ab2..f9c94c3 100644 +--- a/kernel/cpu.c ++++ b/kernel/cpu.c +@@ -493,6 +493,7 @@ int disable_nonboot_cpus(void) + cpu_maps_update_done(); + return error; + } ++EXPORT_SYMBOL_GPL(disable_nonboot_cpus); + + void __weak arch_enable_nonboot_cpus_begin(void) + { +@@ -531,6 +532,7 @@ void __ref enable_nonboot_cpus(void) + out: + cpu_maps_update_done(); + } ++EXPORT_SYMBOL_GPL(enable_nonboot_cpus); + + static int __init alloc_frozen_cpus(void) + { +diff --git a/kernel/kmod.c b/kernel/kmod.c +index 56dd349..5d3c529 100644 +--- a/kernel/kmod.c ++++ b/kernel/kmod.c +@@ -450,6 +450,7 @@ void __usermodehelper_set_disable_depth(enum umh_disable_depth depth) + wake_up(&usermodehelper_disabled_waitq); + up_write(&umhelper_sem); + } ++EXPORT_SYMBOL_GPL(__usermodehelper_set_disable_depth); + + /** + * __usermodehelper_disable - Prevent new helpers from being started. 
+@@ -483,6 +484,7 @@ int __usermodehelper_disable(enum umh_disable_depth depth) + __usermodehelper_set_disable_depth(UMH_ENABLED); + return -EAGAIN; + } ++EXPORT_SYMBOL_GPL(__usermodehelper_disable); + + static void helper_lock(void) + { +diff --git a/kernel/kthread.c b/kernel/kthread.c +index 691dc2e..c037774 100644 +--- a/kernel/kthread.c ++++ b/kernel/kthread.c +@@ -490,6 +490,8 @@ int kthread_worker_fn(void *worker_ptr) + + WARN_ON(worker->task); + worker->task = current; ++ set_freezable(); ++ + repeat: + set_current_state(TASK_INTERRUPTIBLE); /* mb paired w/ kthread_stop */ + +diff --git a/kernel/pid.c b/kernel/pid.c +index f2c6a68..b46f32a 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -446,6 +446,7 @@ struct task_struct *find_task_by_pid_ns(pid_t nr, struct pid_namespace *ns) + " protection"); + return pid_task(find_pid_ns(nr, ns), PIDTYPE_PID); + } ++EXPORT_SYMBOL_GPL(find_task_by_pid_ns); + + struct task_struct *find_task_by_vpid(pid_t vnr) + { +diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig +index 5dfdc9e..788f046 100644 +--- a/kernel/power/Kconfig ++++ b/kernel/power/Kconfig +@@ -91,6 +91,286 @@ config PM_STD_PARTITION + suspended image to. It will simply pick the first available swap + device. + ++menuconfig TOI_CORE ++ tristate "Enhanced Hibernation (TuxOnIce)" ++ depends on HIBERNATION ++ default y ++ ---help--- ++ TuxOnIce is the 'new and improved' suspend support. ++ ++ See the TuxOnIce home page (tuxonice.net) ++ for FAQs, HOWTOs and other documentation. ++ ++ comment "Image Storage (you need at least one allocator)" ++ depends on TOI_CORE ++ ++ config TOI_FILE ++ tristate "File Allocator" ++ depends on TOI_CORE ++ default y ++ ---help--- ++ This option enables support for storing an image in a ++ simple file. You might want this if your swap is ++ sometimes full enough that you don't have enough spare ++ space to store an image. ++ ++ config TOI_SWAP ++ tristate "Swap Allocator" ++ depends on TOI_CORE && SWAP ++ default y ++ ---help--- ++ This option enables support for storing an image in your ++ swap space. ++ ++ comment "General Options" ++ depends on TOI_CORE ++ ++ config TOI_INCREMENTAL ++ tristate "Incremental Image Support" ++ depends on TOI_CORE && CRYPTO && BROKEN ++ select CRYPTO_SHA1 ++ default y ++ ---help--- ++ This option adds initial support for using hashing algorithms ++ (a quick, internal implementation of Fletcher16 and SHA1 via ++ cryptoapi) to discover the number of pages which are ++ unchanged since the image was last written. It is hoped that ++ this will be an initial step toward implementing storing just ++ the differences between consecutive images, which will ++ increase the amount of storage needed for the image, but also ++ increase the speed at which writing an image occurs and ++ reduce the wear and tear on drives. ++ ++ comment "No increemntal image support available without Cryptoapi support." ++ depends on TOI_CORE && !CRYPTO ++ ++ config TOI_PRUNE ++ tristate "Image pruning support" ++ depends on TOI_CORE && CRYPTO && BROKEN ++ default y ++ ---help--- ++ This option adds support for using cryptoapi hashing ++ algorithms to identify pages with the same content. We ++ then write a much smaller pointer to the first copy of ++ the data instead of a complete (perhaps compressed) ++ additional copy. ++ ++ You probably want this, so say Y here. ++ ++ comment "No image pruning support available without Cryptoapi support." 
++ depends on TOI_CORE && !CRYPTO ++ ++ config TOI_CRYPTO ++ tristate "Compression support" ++ depends on TOI_CORE && CRYPTO ++ default y ++ ---help--- ++ This option adds support for using cryptoapi compression ++ algorithms. Compression is particularly useful as it can ++ more than double your suspend and resume speed (depending ++ upon how well your image compresses). ++ ++ You probably want this, so say Y here. ++ ++ comment "No compression support available without Cryptoapi support." ++ depends on TOI_CORE && !CRYPTO ++ ++ config TOI_USERUI ++ tristate "Userspace User Interface support" ++ depends on TOI_CORE && NET && (VT || SERIAL_CONSOLE) ++ default y ++ ---help--- ++ This option enabled support for a userspace based user interface ++ to TuxOnIce, which allows you to have a nice display while suspending ++ and resuming, and also enables features such as pressing escape to ++ cancel a cycle or interactive debugging. ++ ++ config TOI_USERUI_DEFAULT_PATH ++ string "Default userui program location" ++ default "/usr/local/sbin/tuxoniceui_text" ++ depends on TOI_USERUI ++ ---help--- ++ This entry allows you to specify a default path to the userui binary. ++ ++ config TOI_DEFAULT_IMAGE_SIZE_LIMIT ++ int "Default image size limit" ++ range -2 65536 ++ default "-2" ++ depends on TOI_CORE ++ ---help--- ++ This entry allows you to specify a default image size limit. It can ++ be overridden at run-time using /sys/power/tuxonice/image_size_limit. ++ ++ config TOI_KEEP_IMAGE ++ bool "Allow Keep Image Mode" ++ depends on TOI_CORE ++ ---help--- ++ This option allows you to keep and image and reuse it. It is intended ++ __ONLY__ for use with systems where all filesystems are mounted read- ++ only (kiosks, for example). To use it, compile this option in and boot ++ normally. Set the KEEP_IMAGE flag in /sys/power/tuxonice and suspend. ++ When you resume, the image will not be removed. You will be unable to turn ++ off swap partitions (assuming you are using the swap allocator), but future ++ suspends simply do a power-down. The image can be updated using the ++ kernel command line parameter suspend_act= to turn off the keep image ++ bit. Keep image mode is a little less user friendly on purpose - it ++ should not be used without thought! ++ ++ config TOI_REPLACE_SWSUSP ++ bool "Replace swsusp by default" ++ default y ++ depends on TOI_CORE ++ ---help--- ++ TuxOnIce can replace swsusp. This option makes that the default state, ++ requiring you to echo 0 > /sys/power/tuxonice/replace_swsusp if you want ++ to use the vanilla kernel functionality. Note that your initrd/ramfs will ++ need to do this before trying to resume, too. ++ With overriding swsusp enabled, echoing disk to /sys/power/state will ++ start a TuxOnIce cycle. If resume= doesn't specify an allocator and both ++ the swap and file allocators are compiled in, the swap allocator will be ++ used by default. ++ ++ config TOI_IGNORE_LATE_INITCALL ++ bool "Wait for initrd/ramfs to run, by default" ++ default n ++ depends on TOI_CORE ++ ---help--- ++ When booting, TuxOnIce can check for an image and start to resume prior ++ to any initrd/ramfs running (via a late initcall). ++ ++ If you don't have an initrd/ramfs, this is what you want to happen - ++ otherwise you won't be able to safely resume. You should set this option ++ to 'No'. ++ ++ If, however, you want your initrd/ramfs to run anyway before resuming, ++ you need to tell TuxOnIce to ignore that earlier opportunity to resume. 
++ This can be done either by using this compile time option, or by ++ overriding this option with the boot-time parameter toi_initramfs_resume_only=1. ++ ++ Note that if TuxOnIce can't resume at the earlier opportunity, the ++ value of this option won't matter - the initramfs/initrd (if any) will ++ run anyway. ++ ++ menuconfig TOI_CLUSTER ++ tristate "Cluster support" ++ default n ++ depends on TOI_CORE && NET && BROKEN ++ ---help--- ++ Support for linking multiple machines in a cluster so that they suspend ++ and resume together. ++ ++ config TOI_DEFAULT_CLUSTER_INTERFACE ++ string "Default cluster interface" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default interface on which to communicate with other nodes in ++ the cluster. ++ ++ If no value is set here, cluster support will be disabled by default. ++ ++ config TOI_DEFAULT_CLUSTER_KEY ++ string "Default cluster key" ++ default "Default" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default key used by this node. All nodes in the same cluster ++ have the same key. Multiple clusters may coexist on the same lan ++ by using different values for this key. ++ ++ config TOI_CLUSTER_IMAGE_TIMEOUT ++ int "Timeout when checking for image" ++ default 15 ++ depends on TOI_CLUSTER ++ ---help--- ++ Timeout (seconds) before continuing to boot when waiting to see ++ whether other nodes might have an image. Set to -1 to wait ++ indefinitely. In WAIT_UNTIL_NODES is non zero, we might continue ++ booting sooner than this timeout. ++ ++ config TOI_CLUSTER_WAIT_UNTIL_NODES ++ int "Nodes without image before continuing" ++ default 0 ++ depends on TOI_CLUSTER ++ ---help--- ++ When booting and no image is found, we wait to see if other nodes ++ have an image before continuing to boot. This value lets us ++ continue after seeing a certain number of nodes without an image, ++ instead of continuing to wait for the timeout. Set to 0 to only ++ use the timeout. ++ ++ config TOI_DEFAULT_CLUSTER_PRE_HIBERNATE ++ string "Default pre-hibernate script" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default script to be called when starting to hibernate. ++ ++ config TOI_DEFAULT_CLUSTER_POST_HIBERNATE ++ string "Default post-hibernate script" ++ depends on TOI_CLUSTER ++ ---help--- ++ The default script to be called after resuming from hibernation. ++ ++ config TOI_DEFAULT_WAIT ++ int "Default waiting time for emergency boot messages" ++ default "25" ++ range -1 32768 ++ depends on TOI_CORE ++ help ++ TuxOnIce can display warnings very early in the process of resuming, ++ if (for example) it appears that you have booted a kernel that doesn't ++ match an image on disk. It can then give you the opportunity to either ++ continue booting that kernel, or reboot the machine. This option can be ++ used to control how long to wait in such circumstances. -1 means wait ++ forever. 0 means don't wait at all (do the default action, which will ++ generally be to continue booting and remove the image). Values of 1 or ++ more indicate a number of seconds (up to 255) to wait before doing the ++ default. ++ ++ config TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE ++ int "Default extra pages allowance" ++ default "2000" ++ range 500 32768 ++ depends on TOI_CORE ++ help ++ This value controls the default for the allowance TuxOnIce makes for ++ drivers to allocate extra memory during the atomic copy. The default ++ value of 2000 will be okay in most cases. 
If you are using ++ DRI, the easiest way to find what value to use is to try to hibernate ++ and look at how many pages were actually needed in the sysfs entry ++ /sys/power/tuxonice/debug_info (first number on the last line), adding ++ a little extra because the value is not always the same. ++ ++ config TOI_CHECKSUM ++ bool "Checksum pageset2" ++ default n ++ depends on TOI_CORE ++ select CRYPTO ++ select CRYPTO_ALGAPI ++ select CRYPTO_MD4 ++ ---help--- ++ Adds support for checksumming pageset2 pages, to ensure you really get an ++ atomic copy. Since some filesystems (XFS especially) change metadata even ++ when there's no other activity, we need this to check for pages that have ++ been changed while we were saving the page cache. If your debugging output ++ always says no pages were resaved, you may be able to safely disable this ++ option. ++ ++config TOI ++ bool ++ depends on TOI_CORE!=n ++ default y ++ ++config TOI_EXPORTS ++ bool ++ depends on TOI_SWAP=m || TOI_FILE=m || \ ++ TOI_CRYPTO=m || TOI_CLUSTER=m || \ ++ TOI_USERUI=m || TOI_CORE=m ++ default y ++ ++config TOI_ZRAM_SUPPORT ++ def_bool y ++ depends on TOI && ZRAM!=n ++ + config PM_SLEEP + def_bool y + depends on SUSPEND || HIBERNATE_CALLBACKS +diff --git a/kernel/power/Makefile b/kernel/power/Makefile +index 29472bf..dd5d4f2 100644 +--- a/kernel/power/Makefile ++++ b/kernel/power/Makefile +@@ -1,6 +1,37 @@ + + ccflags-$(CONFIG_PM_DEBUG) := -DDEBUG + ++tuxonice_core-y := tuxonice_modules.o ++ ++obj-$(CONFIG_TOI) += tuxonice_builtin.o ++ ++tuxonice_core-$(CONFIG_PM_DEBUG) += tuxonice_alloc.o ++ ++# Compile these in after allocation debugging, if used. ++ ++tuxonice_core-y += tuxonice_sysfs.o tuxonice_highlevel.o \ ++ tuxonice_io.o tuxonice_pagedir.o tuxonice_prepare_image.o \ ++ tuxonice_extent.o tuxonice_pageflags.o tuxonice_ui.o \ ++ tuxonice_power_off.o tuxonice_atomic_copy.o ++ ++tuxonice_core-$(CONFIG_TOI_CHECKSUM) += tuxonice_checksum.o ++ ++tuxonice_core-$(CONFIG_NET) += tuxonice_storage.o tuxonice_netlink.o ++ ++obj-$(CONFIG_TOI_CORE) += tuxonice_core.o ++obj-$(CONFIG_TOI_PRUNE) += tuxonice_prune.o ++obj-$(CONFIG_TOI_INCREMENTAL) += tuxonice_incremental.o ++obj-$(CONFIG_TOI_CRYPTO) += tuxonice_compress.o ++ ++tuxonice_bio-y := tuxonice_bio_core.o tuxonice_bio_chains.o \ ++ tuxonice_bio_signature.o ++ ++obj-$(CONFIG_TOI_SWAP) += tuxonice_bio.o tuxonice_swap.o ++obj-$(CONFIG_TOI_FILE) += tuxonice_bio.o tuxonice_file.o ++obj-$(CONFIG_TOI_CLUSTER) += tuxonice_cluster.o ++ ++obj-$(CONFIG_TOI_USERUI) += tuxonice_userui.o ++ + obj-y += qos.o + obj-$(CONFIG_PM) += main.o + obj-$(CONFIG_VT_CONSOLE_SLEEP) += console.o +diff --git a/kernel/power/console.c b/kernel/power/console.c +index b1dc456..bbf19a5 100644 +--- a/kernel/power/console.c ++++ b/kernel/power/console.c +@@ -23,6 +23,7 @@ int pm_prepare_console(void) + orig_kmsg = vt_kmsg_redirect(SUSPEND_CONSOLE); + return 0; + } ++EXPORT_SYMBOL_GPL(pm_prepare_console); + + void pm_restore_console(void) + { +@@ -31,3 +32,4 @@ void pm_restore_console(void) + vt_kmsg_redirect(orig_kmsg); + } + } ++EXPORT_SYMBOL_GPL(pm_restore_console); +diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c +index b26f5f1..9c04c3e 100644 +--- a/kernel/power/hibernate.c ++++ b/kernel/power/hibernate.c +@@ -29,14 +29,15 @@ + #include + #include + +-#include "power.h" ++#include "tuxonice.h" + + + static int nocompress; + static int noresume; + static int resume_wait; + static int resume_delay; +-static char resume_file[256] = CONFIG_PM_STD_PARTITION; ++char resume_file[256] = 
CONFIG_PM_STD_PARTITION; ++EXPORT_SYMBOL_GPL(resume_file); + dev_t swsusp_resume_device; + sector_t swsusp_resume_block; + int in_suspend __nosavedata; +@@ -114,21 +115,23 @@ static int hibernation_test(int level) { return 0; } + * platform_begin - Call platform to start hibernation. + * @platform_mode: Whether or not to use the platform driver. + */ +-static int platform_begin(int platform_mode) ++int platform_begin(int platform_mode) + { + return (platform_mode && hibernation_ops) ? + hibernation_ops->begin() : 0; + } ++EXPORT_SYMBOL_GPL(platform_begin); + + /** + * platform_end - Call platform to finish transition to the working state. + * @platform_mode: Whether or not to use the platform driver. + */ +-static void platform_end(int platform_mode) ++void platform_end(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->end(); + } ++EXPORT_SYMBOL_GPL(platform_end); + + /** + * platform_pre_snapshot - Call platform to prepare the machine for hibernation. +@@ -138,11 +141,12 @@ static void platform_end(int platform_mode) + * if so configured, and return an error code if that fails. + */ + +-static int platform_pre_snapshot(int platform_mode) ++int platform_pre_snapshot(int platform_mode) + { + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_snapshot() : 0; + } ++EXPORT_SYMBOL_GPL(platform_pre_snapshot); + + /** + * platform_leave - Call platform to prepare a transition to the working state. +@@ -153,11 +157,12 @@ static int platform_pre_snapshot(int platform_mode) + * + * This routine is called on one CPU with interrupts disabled. + */ +-static void platform_leave(int platform_mode) ++void platform_leave(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->leave(); + } ++EXPORT_SYMBOL_GPL(platform_leave); + + /** + * platform_finish - Call platform to switch the system to the working state. +@@ -168,11 +173,12 @@ static void platform_leave(int platform_mode) + * + * This routine must be called after platform_prepare(). + */ +-static void platform_finish(int platform_mode) ++void platform_finish(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->finish(); + } ++EXPORT_SYMBOL_GPL(platform_finish); + + /** + * platform_pre_restore - Prepare for hibernate image restoration. +@@ -184,11 +190,12 @@ static void platform_finish(int platform_mode) + * If the restore fails after this function has been called, + * platform_restore_cleanup() must be called. + */ +-static int platform_pre_restore(int platform_mode) ++int platform_pre_restore(int platform_mode) + { + return (platform_mode && hibernation_ops) ? + hibernation_ops->pre_restore() : 0; + } ++EXPORT_SYMBOL_GPL(platform_pre_restore); + + /** + * platform_restore_cleanup - Switch to the working state after failing restore. +@@ -201,21 +208,23 @@ static int platform_pre_restore(int platform_mode) + * function must be called too, regardless of the result of + * platform_pre_restore(). + */ +-static void platform_restore_cleanup(int platform_mode) ++void platform_restore_cleanup(int platform_mode) + { + if (platform_mode && hibernation_ops) + hibernation_ops->restore_cleanup(); + } ++EXPORT_SYMBOL_GPL(platform_restore_cleanup); + + /** + * platform_recover - Recover from a failure to suspend devices. + * @platform_mode: Whether or not to use the platform driver. 
+ */ +-static void platform_recover(int platform_mode) ++void platform_recover(int platform_mode) + { + if (platform_mode && hibernation_ops && hibernation_ops->recover) + hibernation_ops->recover(); + } ++EXPORT_SYMBOL_GPL(platform_recover); + + /** + * swsusp_show_speed - Print time elapsed between two events during hibernation. +@@ -573,6 +582,7 @@ int hibernation_platform_enter(void) + + return error; + } ++EXPORT_SYMBOL_GPL(hibernation_platform_enter); + + /** + * power_down - Shut the machine down for hibernation. +@@ -632,6 +642,9 @@ int hibernate(void) + { + int error; + ++ if (test_action_state(TOI_REPLACE_SWSUSP)) ++ return try_tuxonice_hibernate(); ++ + lock_system_sleep(); + /* The snapshot device should not be opened while we're running */ + if (!atomic_add_unless(&snapshot_device_available, -1, 0)) { +@@ -715,11 +728,19 @@ int hibernate(void) + * attempts to recover gracefully and make the kernel return to the normal mode + * of operation. + */ +-static int software_resume(void) ++int software_resume(void) + { + int error; + unsigned int flags; + ++ resume_attempted = 1; ++ ++ /* ++ * We can't know (until an image header - if any - is loaded), whether ++ * we did override swsusp. We therefore ensure that both are tried. ++ */ ++ try_tuxonice_resume(); ++ + /* + * If the user said "noresume".. bail out early. + */ +@@ -1094,6 +1115,7 @@ static int __init hibernate_setup(char *str) + static int __init noresume_setup(char *str) + { + noresume = 1; ++ set_toi_state(TOI_NORESUME_SPECIFIED); + return 1; + } + +diff --git a/kernel/power/main.c b/kernel/power/main.c +index d77663b..bbd0c86 100644 +--- a/kernel/power/main.c ++++ b/kernel/power/main.c +@@ -19,12 +19,14 @@ + #include "power.h" + + DEFINE_MUTEX(pm_mutex); ++EXPORT_SYMBOL_GPL(pm_mutex); + + #ifdef CONFIG_PM_SLEEP + + /* Routines for PM-transition notifications */ + +-static BLOCKING_NOTIFIER_HEAD(pm_chain_head); ++BLOCKING_NOTIFIER_HEAD(pm_chain_head); ++EXPORT_SYMBOL_GPL(pm_chain_head); + + int register_pm_notifier(struct notifier_block *nb) + { +@@ -44,6 +46,7 @@ int pm_notifier_call_chain(unsigned long val) + + return notifier_to_errno(ret); + } ++EXPORT_SYMBOL_GPL(pm_notifier_call_chain); + + /* If set, devices may be suspended and resumed asynchronously. */ + int pm_async_enabled = 1; +@@ -277,6 +280,7 @@ static inline void pm_print_times_init(void) {} + #endif /* CONFIG_PM_SLEEP_DEBUG */ + + struct kobject *power_kobj; ++EXPORT_SYMBOL_GPL(power_kobj); + + /** + * state - control system power state. +diff --git a/kernel/power/power.h b/kernel/power/power.h +index 7d4b7ff..98b9660 100644 +--- a/kernel/power/power.h ++++ b/kernel/power/power.h +@@ -35,8 +35,12 @@ static inline char *check_image_kernel(struct swsusp_info *info) + return arch_hibernation_header_restore(info) ? + "architecture specific data" : NULL; + } ++#else ++extern char *check_image_kernel(struct swsusp_info *info); + #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ ++extern int init_header(struct swsusp_info *info); + ++extern char resume_file[256]; + /* + * Keep some memory free so that I/O operations can succeed without paging + * [Might this be more than 4 MB?] 
+@@ -55,6 +59,7 @@ extern bool freezer_test_done; + extern int hibernation_snapshot(int platform_mode); + extern int hibernation_restore(int platform_mode); + extern int hibernation_platform_enter(void); ++extern void platform_recover(int platform_mode); + + #else /* !CONFIG_HIBERNATION */ + +@@ -74,6 +79,8 @@ static struct kobj_attribute _name##_attr = { \ + .store = _name##_store, \ + } + ++extern struct pbe *restore_pblist; ++ + /* Preferred image size in bytes (default 500 MB) */ + extern unsigned long image_size; + /* Size of memory reserved for drivers (default SPARE_PAGES x PAGE_SIZE) */ +@@ -268,6 +275,90 @@ static inline void suspend_thaw_processes(void) + } + #endif + ++extern struct page *saveable_page(struct zone *z, unsigned long p); ++#ifdef CONFIG_HIGHMEM ++extern struct page *saveable_highmem_page(struct zone *z, unsigned long p); ++#else ++static ++inline struct page *saveable_highmem_page(struct zone *z, unsigned long p) ++{ ++ return NULL; ++} ++#endif ++ ++#define PBES_PER_PAGE (PAGE_SIZE / sizeof(struct pbe)) ++extern struct list_head nosave_regions; ++ ++/** ++ * This structure represents a range of page frames the contents of which ++ * should not be saved during the suspend. ++ */ ++ ++struct nosave_region { ++ struct list_head list; ++ unsigned long start_pfn; ++ unsigned long end_pfn; ++}; ++ ++#define BM_END_OF_MAP (~0UL) ++ ++#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) ++ ++struct bm_block { ++ struct list_head hook; /* hook into a list of bitmap blocks */ ++ unsigned long start_pfn; /* pfn represented by the first bit */ ++ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ ++ unsigned long *data; /* bitmap representing pages */ ++}; ++ ++/* struct bm_position is used for browsing memory bitmaps */ ++ ++struct bm_position { ++ struct bm_block *block; ++ int bit; ++}; ++ ++struct memory_bitmap { ++ struct list_head blocks; /* list of bitmap blocks */ ++ struct linked_page *p_list; /* list of pages used to store zone ++ * bitmap objects and bitmap block ++ * objects ++ */ ++ struct bm_position *states; /* most recently used bit position */ ++ int num_states; /* when iterating over a bitmap and ++ * number of states we support. 
++ */ ++}; ++ ++extern int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, ++ int safe_needed); ++extern int memory_bm_create_index(struct memory_bitmap *bm, gfp_t gfp_mask, ++ int safe_needed, int index); ++extern void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); ++extern void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn); ++extern void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn); ++extern void memory_bm_clear_bit_index(struct memory_bitmap *bm, unsigned long pfn, int index); ++extern int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn); ++extern int memory_bm_test_bit_index(struct memory_bitmap *bm, unsigned long pfn, int index); ++extern unsigned long memory_bm_next_pfn(struct memory_bitmap *bm); ++extern unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, ++ int index); ++extern void memory_bm_position_reset(struct memory_bitmap *bm); ++extern void memory_bm_clear(struct memory_bitmap *bm); ++extern void memory_bm_copy(struct memory_bitmap *source, ++ struct memory_bitmap *dest); ++extern void memory_bm_dup(struct memory_bitmap *source, ++ struct memory_bitmap *dest); ++extern int memory_bm_set_iterators(struct memory_bitmap *bm, int number); ++ ++#ifdef CONFIG_TOI ++struct toi_module_ops; ++extern int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); ++extern int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)); ++#endif ++ + #ifdef CONFIG_PM_AUTOSLEEP + + /* kernel/power/autosleep.c */ +diff --git a/kernel/power/process.c b/kernel/power/process.c +index 98088e0..b340c98 100644 +--- a/kernel/power/process.c ++++ b/kernel/power/process.c +@@ -134,6 +134,7 @@ int freeze_processes(void) + thaw_processes(); + return error; + } ++EXPORT_SYMBOL_GPL(freeze_processes); + + /** + * freeze_kernel_threads - Make freezable kernel threads go to the refrigerator. +@@ -160,6 +161,7 @@ int freeze_kernel_threads(void) + thaw_kernel_threads(); + return error; + } ++EXPORT_SYMBOL_GPL(freeze_kernel_threads); + + void thaw_processes(void) + { +@@ -187,6 +189,7 @@ void thaw_processes(void) + schedule(); + printk("done.\n"); + } ++EXPORT_SYMBOL_GPL(thaw_processes); + + void thaw_kernel_threads(void) + { +@@ -207,3 +210,4 @@ void thaw_kernel_threads(void) + schedule(); + printk("done.\n"); + } ++EXPORT_SYMBOL_GPL(thaw_kernel_threads); +diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c +index 0de2857..c47a1a8 100644 +--- a/kernel/power/snapshot.c ++++ b/kernel/power/snapshot.c +@@ -35,6 +35,8 @@ + #include + + #include "power.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_pagedir.h" + + static int swsusp_page_is_free(struct page *); + static void swsusp_set_page_forbidden(struct page *); +@@ -71,6 +73,10 @@ void __init hibernate_image_size_init(void) + * directly to their "original" page frames. 
+ */ + struct pbe *restore_pblist; ++EXPORT_SYMBOL_GPL(restore_pblist); ++ ++int resume_attempted; ++EXPORT_SYMBOL_GPL(resume_attempted); + + /* Pointer to an auxiliary buffer (1 page) */ + static void *buffer; +@@ -113,6 +119,9 @@ static void *get_image_page(gfp_t gfp_mask, int safe_needed) + + unsigned long get_safe_page(gfp_t gfp_mask) + { ++ if (toi_running) ++ return toi_get_nonconflicting_page(); ++ + return (unsigned long)get_image_page(gfp_mask, PG_SAFE); + } + +@@ -249,47 +258,53 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) + * the represented memory area. + */ + +-#define BM_END_OF_MAP (~0UL) +- +-#define BM_BITS_PER_BLOCK (PAGE_SIZE * BITS_PER_BYTE) +- +-struct bm_block { +- struct list_head hook; /* hook into a list of bitmap blocks */ +- unsigned long start_pfn; /* pfn represented by the first bit */ +- unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ +- unsigned long *data; /* bitmap representing pages */ +-}; +- + static inline unsigned long bm_block_bits(struct bm_block *bb) + { + return bb->end_pfn - bb->start_pfn; + } + +-/* strcut bm_position is used for browsing memory bitmaps */ ++/* Functions that operate on memory bitmaps */ + +-struct bm_position { +- struct bm_block *block; +- int bit; +-}; ++void memory_bm_position_reset_index(struct memory_bitmap *bm, int index) ++{ ++ bm->states[index].block = list_entry(bm->blocks.next, ++ struct bm_block, hook); ++ bm->states[index].bit = 0; ++} ++EXPORT_SYMBOL_GPL(memory_bm_position_reset_index); + +-struct memory_bitmap { +- struct list_head blocks; /* list of bitmap blocks */ +- struct linked_page *p_list; /* list of pages used to store zone +- * bitmap objects and bitmap block +- * objects +- */ +- struct bm_position cur; /* most recently used bit position */ +-}; ++void memory_bm_position_reset(struct memory_bitmap *bm) ++{ ++ int i; + +-/* Functions that operate on memory bitmaps */ ++ for (i = 0; i < bm->num_states; i++) { ++ bm->states[i].block = list_entry(bm->blocks.next, ++ struct bm_block, hook); ++ bm->states[i].bit = 0; ++ } ++} ++EXPORT_SYMBOL_GPL(memory_bm_position_reset); + +-static void memory_bm_position_reset(struct memory_bitmap *bm) ++int memory_bm_set_iterators(struct memory_bitmap *bm, int number) + { +- bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); +- bm->cur.bit = 0; +-} ++ int bytes = number * sizeof(struct bm_position); ++ struct bm_position *new_states; ++ ++ if (number < bm->num_states) ++ return 0; + +-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); ++ new_states = kmalloc(bytes, GFP_KERNEL); ++ if (!new_states) ++ return -ENOMEM; ++ ++ if (bm->states) ++ kfree(bm->states); ++ ++ bm->states = new_states; ++ bm->num_states = number; ++ return 0; ++} ++EXPORT_SYMBOL_GPL(memory_bm_set_iterators); + + /** + * create_bm_block_list - create a list of block bitmap objects +@@ -397,8 +412,8 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) + /** + * memory_bm_create - allocate memory for a memory bitmap + */ +-static int +-memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) ++int memory_bm_create_index(struct memory_bitmap *bm, gfp_t gfp_mask, ++ int safe_needed, int states) + { + struct chain_allocator ca; + struct list_head mem_extents; +@@ -442,6 +457,9 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + } + } + ++ if (!error) ++ error = memory_bm_set_iterators(bm, states); ++ + bm->p_list = ca.chain; + 
memory_bm_position_reset(bm); + Exit: +@@ -453,11 +471,18 @@ memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) + memory_bm_free(bm, PG_UNSAFE_CLEAR); + goto Exit; + } ++EXPORT_SYMBOL_GPL(memory_bm_create_index); ++ ++int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) ++{ ++ return memory_bm_create_index(bm, gfp_mask, safe_needed, 1); ++} ++EXPORT_SYMBOL_GPL(memory_bm_create); + + /** + * memory_bm_free - free memory occupied by the memory bitmap @bm + */ +-static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) ++void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) + { + struct bm_block *bb; + +@@ -468,15 +493,22 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) + free_list_of_pages(bm->p_list, clear_nosave_free); + + INIT_LIST_HEAD(&bm->blocks); ++ ++ if (bm->states) { ++ kfree(bm->states); ++ bm->states = NULL; ++ bm->num_states = 0; ++ } + } ++EXPORT_SYMBOL_GPL(memory_bm_free); + + /** + * memory_bm_find_bit - find the bit in the bitmap @bm that corresponds + * to given pfn. The cur_zone_bm member of @bm and the cur_block member +- * of @bm->cur_zone_bm are updated. ++ * of @bm->states[i]_zone_bm are updated. + */ +-static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, +- void **addr, unsigned int *bit_nr) ++static int memory_bm_find_bit_index(struct memory_bitmap *bm, unsigned long pfn, ++ void **addr, unsigned int *bit_nr, int state) + { + struct bm_block *bb; + +@@ -484,7 +516,7 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ +- bb = bm->cur.block; ++ bb = bm->states[state].block; + if (pfn < bb->start_pfn) + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) +@@ -499,15 +531,21 @@ static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, + return -EFAULT; + + /* The block has been found */ +- bm->cur.block = bb; ++ bm->states[state].block = bb; + pfn -= bb->start_pfn; +- bm->cur.bit = pfn + 1; ++ bm->states[state].bit = pfn + 1; + *bit_nr = pfn; + *addr = bb->data; + return 0; + } + +-static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) ++static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, ++ void **addr, unsigned int *bit_nr) ++{ ++ return memory_bm_find_bit_index(bm, pfn, addr, bit_nr, 0); ++} ++ ++void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) + { + void *addr; + unsigned int bit; +@@ -517,6 +555,7 @@ static void memory_bm_set_bit(struct memory_bitmap *bm, unsigned long pfn) + BUG_ON(error); + set_bit(bit, addr); + } ++EXPORT_SYMBOL_GPL(memory_bm_set_bit); + + static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) + { +@@ -530,27 +569,43 @@ static int mem_bm_set_bit_check(struct memory_bitmap *bm, unsigned long pfn) + return error; + } + +-static void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) ++void memory_bm_clear_bit_index(struct memory_bitmap *bm, unsigned long pfn, ++ int index) + { + void *addr; + unsigned int bit; + int error; + +- error = memory_bm_find_bit(bm, pfn, &addr, &bit); ++ error = memory_bm_find_bit_index(bm, pfn, &addr, &bit, index); + BUG_ON(error); + clear_bit(bit, addr); + } ++EXPORT_SYMBOL_GPL(memory_bm_clear_bit_index); ++ ++void memory_bm_clear_bit(struct memory_bitmap *bm, unsigned long pfn) ++{ ++ 
memory_bm_clear_bit_index(bm, pfn, 0); ++} ++EXPORT_SYMBOL_GPL(memory_bm_clear_bit); + +-static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) ++int memory_bm_test_bit_index(struct memory_bitmap *bm, unsigned long pfn, ++ int index) + { + void *addr; + unsigned int bit; + int error; + +- error = memory_bm_find_bit(bm, pfn, &addr, &bit); ++ error = memory_bm_find_bit_index(bm, pfn, &addr, &bit, index); + BUG_ON(error); + return test_bit(bit, addr); + } ++EXPORT_SYMBOL_GPL(memory_bm_test_bit_index); ++ ++int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) ++{ ++ return memory_bm_test_bit_index(bm, pfn, 0); ++} ++EXPORT_SYMBOL_GPL(memory_bm_test_bit); + + static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) + { +@@ -569,43 +624,184 @@ static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) + * this function. + */ + +-static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) ++unsigned long memory_bm_next_pfn_index(struct memory_bitmap *bm, int index) + { + struct bm_block *bb; + int bit; + +- bb = bm->cur.block; ++ bb = bm->states[index].block; + do { +- bit = bm->cur.bit; ++ bit = bm->states[index].bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); +- bm->cur.block = bb; +- bm->cur.bit = 0; ++ bm->states[index].block = bb; ++ bm->states[index].bit = 0; + } while (&bb->hook != &bm->blocks); + +- memory_bm_position_reset(bm); ++ memory_bm_position_reset_index(bm, index); + return BM_END_OF_MAP; + + Return_pfn: +- bm->cur.bit = bit + 1; ++ bm->states[index].bit = bit + 1; + return bb->start_pfn + bit; + } ++EXPORT_SYMBOL_GPL(memory_bm_next_pfn_index); + +-/** +- * This structure represents a range of page frames the contents of which +- * should not be saved during the suspend. 
+- */ ++unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) ++{ ++ return memory_bm_next_pfn_index(bm, 0); ++} ++EXPORT_SYMBOL_GPL(memory_bm_next_pfn); + +-struct nosave_region { +- struct list_head list; +- unsigned long start_pfn; +- unsigned long end_pfn; +-}; ++void memory_bm_clear(struct memory_bitmap *bm) ++{ ++ unsigned long pfn; + +-static LIST_HEAD(nosave_regions); ++ memory_bm_position_reset(bm); ++ pfn = memory_bm_next_pfn(bm); ++ while (pfn != BM_END_OF_MAP) { ++ memory_bm_clear_bit(bm, pfn); ++ pfn = memory_bm_next_pfn(bm); ++ } ++} ++EXPORT_SYMBOL_GPL(memory_bm_clear); ++ ++void memory_bm_copy(struct memory_bitmap *source, struct memory_bitmap *dest) ++{ ++ unsigned long pfn; ++ ++ memory_bm_position_reset(source); ++ pfn = memory_bm_next_pfn(source); ++ while (pfn != BM_END_OF_MAP) { ++ memory_bm_set_bit(dest, pfn); ++ pfn = memory_bm_next_pfn(source); ++ } ++} ++EXPORT_SYMBOL_GPL(memory_bm_copy); ++ ++void memory_bm_dup(struct memory_bitmap *source, struct memory_bitmap *dest) ++{ ++ memory_bm_clear(dest); ++ memory_bm_copy(source, dest); ++} ++EXPORT_SYMBOL_GPL(memory_bm_dup); ++ ++#ifdef CONFIG_TOI ++#define DEFINE_MEMORY_BITMAP(name) \ ++struct memory_bitmap *name; \ ++EXPORT_SYMBOL_GPL(name) ++ ++DEFINE_MEMORY_BITMAP(pageset1_map); ++DEFINE_MEMORY_BITMAP(pageset1_copy_map); ++DEFINE_MEMORY_BITMAP(pageset2_map); ++DEFINE_MEMORY_BITMAP(page_resave_map); ++DEFINE_MEMORY_BITMAP(io_map); ++DEFINE_MEMORY_BITMAP(nosave_map); ++DEFINE_MEMORY_BITMAP(free_map); ++ ++int memory_bm_write(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) ++{ ++ int result = 0; ++ unsigned int nr = 0; ++ struct bm_block *bb; ++ ++ if (!bm) ++ return result; ++ ++ list_for_each_entry(bb, &bm->blocks, hook) ++ nr++; ++ ++ result = (*rw_chunk)(WRITE, NULL, (char *) &nr, sizeof(unsigned int)); ++ if (result) ++ return result; ++ ++ list_for_each_entry(bb, &bm->blocks, hook) { ++ result = (*rw_chunk)(WRITE, NULL, (char *) &bb->start_pfn, ++ 2 * sizeof(unsigned long)); ++ if (result) ++ return result; ++ ++ result = (*rw_chunk)(WRITE, NULL, (char *) bb->data, PAGE_SIZE); ++ if (result) ++ return result; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(memory_bm_write); ++ ++int memory_bm_read(struct memory_bitmap *bm, int (*rw_chunk) ++ (int rw, struct toi_module_ops *owner, char *buffer, int buffer_size)) ++{ ++ int result = 0; ++ unsigned int nr, i; ++ struct bm_block *bb; ++ ++ if (!bm) ++ return result; ++ ++ result = memory_bm_create(bm, GFP_KERNEL, 0); ++ ++ if (result) ++ return result; ++ ++ result = (*rw_chunk)(READ, NULL, (char *) &nr, sizeof(unsigned int)); ++ if (result) ++ goto Free; ++ ++ for (i = 0; i < nr; i++) { ++ unsigned long pfn; ++ ++ result = (*rw_chunk)(READ, NULL, (char *) &pfn, ++ sizeof(unsigned long)); ++ if (result) ++ goto Free; ++ ++ list_for_each_entry(bb, &bm->blocks, hook) ++ if (bb->start_pfn == pfn) ++ break; ++ ++ if (&bb->hook == &bm->blocks) { ++ printk(KERN_ERR ++ "TuxOnIce: Failed to load memory bitmap.\n"); ++ result = -EINVAL; ++ goto Free; ++ } ++ ++ result = (*rw_chunk)(READ, NULL, (char *) &pfn, ++ sizeof(unsigned long)); ++ if (result) ++ goto Free; ++ ++ if (pfn != bb->end_pfn) { ++ printk(KERN_ERR ++ "TuxOnIce: Failed to load memory bitmap. 
" ++ "End PFN doesn't match what was saved.\n"); ++ result = -EINVAL; ++ goto Free; ++ } ++ ++ result = (*rw_chunk)(READ, NULL, (char *) bb->data, PAGE_SIZE); ++ ++ if (result) ++ goto Free; ++ } ++ ++ return 0; ++ ++Free: ++ memory_bm_free(bm, PG_ANY); ++ return result; ++} ++EXPORT_SYMBOL_GPL(memory_bm_read); ++#endif ++ ++LIST_HEAD(nosave_regions); ++EXPORT_SYMBOL_GPL(nosave_regions); + + /** + * register_nosave_region - register a range of page frames the contents +@@ -843,7 +1039,7 @@ static unsigned int count_free_highmem_pages(void) + * We should save the page if it isn't Nosave or NosaveFree, or Reserved, + * and it isn't a part of a free chunk of pages. + */ +-static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) ++struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) + { + struct page *page; + +@@ -865,6 +1061,7 @@ static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) + + return page; + } ++EXPORT_SYMBOL_GPL(saveable_highmem_page); + + /** + * count_highmem_pages - compute the total number of saveable highmem +@@ -890,11 +1087,6 @@ static unsigned int count_highmem_pages(void) + } + return n; + } +-#else +-static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +-{ +- return NULL; +-} + #endif /* CONFIG_HIGHMEM */ + + /** +@@ -905,7 +1097,7 @@ static inline void *saveable_highmem_page(struct zone *z, unsigned long p) + * of pages statically defined as 'unsaveable', and it isn't a part of + * a free chunk of pages. + */ +-static struct page *saveable_page(struct zone *zone, unsigned long pfn) ++struct page *saveable_page(struct zone *zone, unsigned long pfn) + { + struct page *page; + +@@ -930,6 +1122,7 @@ static struct page *saveable_page(struct zone *zone, unsigned long pfn) + + return page; + } ++EXPORT_SYMBOL_GPL(saveable_page); + + /** + * count_data_pages - compute the total number of saveable non-highmem +@@ -1580,6 +1773,9 @@ asmlinkage int swsusp_save(void) + { + unsigned int nr_pages, nr_highmem; + ++ if (toi_running) ++ return toi_post_context_save(); ++ + printk(KERN_INFO "PM: Creating hibernation image:\n"); + + drain_local_pages(NULL); +@@ -1620,14 +1816,14 @@ asmlinkage int swsusp_save(void) + } + + #ifndef CONFIG_ARCH_HIBERNATION_HEADER +-static int init_header_complete(struct swsusp_info *info) ++int init_header_complete(struct swsusp_info *info) + { + memcpy(&info->uts, init_utsname(), sizeof(struct new_utsname)); + info->version_code = LINUX_VERSION_CODE; + return 0; + } + +-static char *check_image_kernel(struct swsusp_info *info) ++char *check_image_kernel(struct swsusp_info *info) + { + if (info->version_code != LINUX_VERSION_CODE) + return "kernel version"; +@@ -1641,6 +1837,7 @@ static char *check_image_kernel(struct swsusp_info *info) + return "machine"; + return NULL; + } ++EXPORT_SYMBOL_GPL(check_image_kernel); + #endif /* CONFIG_ARCH_HIBERNATION_HEADER */ + + unsigned long snapshot_get_image_size(void) +@@ -1648,7 +1845,7 @@ unsigned long snapshot_get_image_size(void) + return nr_copy_pages + nr_meta_pages + 1; + } + +-static int init_header(struct swsusp_info *info) ++int init_header(struct swsusp_info *info) + { + memset(info, 0, sizeof(struct swsusp_info)); + info->num_physpages = num_physpages; +@@ -1658,6 +1855,7 @@ static int init_header(struct swsusp_info *info) + info->size <<= PAGE_SHIFT; + return init_header_complete(info); + } ++EXPORT_SYMBOL_GPL(init_header); + + /** + * pack_pfns - pfns corresponding to the set bits found in the bitmap @bm +diff --git 
a/kernel/power/suspend.c b/kernel/power/suspend.c +index d4feda0..e1337b6 100644 +--- a/kernel/power/suspend.c ++++ b/kernel/power/suspend.c +@@ -286,6 +286,7 @@ int suspend_devices_and_enter(suspend_state_t state) + suspend_ops->recover(); + goto Resume_devices; + } ++EXPORT_SYMBOL_GPL(suspend_devices_and_enter); + + /** + * suspend_finish - Clean up before finishing the suspend sequence. +diff --git a/kernel/power/tuxonice.h b/kernel/power/tuxonice.h +new file mode 100644 +index 0000000..6f8d127 +--- /dev/null ++++ b/kernel/power/tuxonice.h +@@ -0,0 +1,227 @@ ++/* ++ * kernel/power/tuxonice.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains declarations used throughout swsusp. ++ * ++ */ ++ ++#ifndef KERNEL_POWER_TOI_H ++#define KERNEL_POWER_TOI_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice_pageflags.h" ++#include "power.h" ++ ++#define TOI_CORE_VERSION "3.3" ++#define TOI_HEADER_VERSION 3 ++#define MY_BOOT_KERNEL_DATA_VERSION 4 ++ ++struct toi_boot_kernel_data { ++ int version; ++ int size; ++ unsigned long toi_action; ++ unsigned long toi_debug_state; ++ u32 toi_default_console_level; ++ int toi_io_time[2][2]; ++ char toi_nosave_commandline[COMMAND_LINE_SIZE]; ++ unsigned long pages_used[33]; ++ unsigned long incremental_bytes_in; ++ unsigned long incremental_bytes_out; ++ unsigned long compress_bytes_in; ++ unsigned long compress_bytes_out; ++ unsigned long pruned_pages; ++}; ++ ++extern struct toi_boot_kernel_data toi_bkd; ++ ++/* Location of book kernel data struct in kernel being resumed */ ++extern unsigned long boot_kernel_data_buffer; ++ ++/* == Action states == */ ++ ++enum { ++ TOI_REBOOT, ++ TOI_PAUSE, ++ TOI_LOGALL, ++ TOI_CAN_CANCEL, ++ TOI_KEEP_IMAGE, ++ TOI_FREEZER_TEST, ++ TOI_SINGLESTEP, ++ TOI_PAUSE_NEAR_PAGESET_END, ++ TOI_TEST_FILTER_SPEED, ++ TOI_TEST_BIO, ++ TOI_NO_PAGESET2, ++ TOI_IGNORE_ROOTFS, ++ TOI_REPLACE_SWSUSP, ++ TOI_PAGESET2_FULL, ++ TOI_ABORT_ON_RESAVE_NEEDED, ++ TOI_NO_MULTITHREADED_IO, ++ TOI_NO_DIRECT_LOAD, /* Obsolete */ ++ TOI_LATE_CPU_HOTPLUG, ++ TOI_GET_MAX_MEM_ALLOCD, ++ TOI_NO_FLUSHER_THREAD, ++ TOI_NO_PS2_IF_UNNEEDED, ++ TOI_POST_RESUME_BREAKPOINT, ++ TOI_NO_READAHEAD, ++}; ++ ++extern unsigned long toi_bootflags_mask; ++ ++#define clear_action_state(bit) (test_and_clear_bit(bit, &toi_bkd.toi_action)) ++ ++/* == Result states == */ ++ ++enum { ++ TOI_ABORTED, ++ TOI_ABORT_REQUESTED, ++ TOI_NOSTORAGE_AVAILABLE, ++ TOI_INSUFFICIENT_STORAGE, ++ TOI_FREEZING_FAILED, ++ TOI_KEPT_IMAGE, ++ TOI_WOULD_EAT_MEMORY, ++ TOI_UNABLE_TO_FREE_ENOUGH_MEMORY, ++ TOI_PM_SEM, ++ TOI_DEVICE_REFUSED, ++ TOI_SYSDEV_REFUSED, ++ TOI_EXTRA_PAGES_ALLOW_TOO_SMALL, ++ TOI_UNABLE_TO_PREPARE_IMAGE, ++ TOI_FAILED_MODULE_INIT, ++ TOI_FAILED_MODULE_CLEANUP, ++ TOI_FAILED_IO, ++ TOI_OUT_OF_MEMORY, ++ TOI_IMAGE_ERROR, ++ TOI_PLATFORM_PREP_FAILED, ++ TOI_CPU_HOTPLUG_FAILED, ++ TOI_ARCH_PREPARE_FAILED, /* Removed Linux-3.0 */ ++ TOI_RESAVE_NEEDED, ++ TOI_CANT_SUSPEND, ++ TOI_NOTIFIERS_PREPARE_FAILED, ++ TOI_PRE_SNAPSHOT_FAILED, ++ TOI_PRE_RESTORE_FAILED, ++ TOI_USERMODE_HELPERS_ERR, ++ TOI_CANT_USE_ALT_RESUME, ++ TOI_HEADER_TOO_BIG, ++ TOI_WAKEUP_EVENT, ++ TOI_SYSCORE_REFUSED, ++ TOI_DPM_PREPARE_FAILED, ++ TOI_DPM_SUSPEND_FAILED, ++ TOI_NUM_RESULT_STATES /* Used in printing debug info only */ ++}; ++ ++extern unsigned long toi_result; ++ ++#define set_result_state(bit) (test_and_set_bit(bit, &toi_result)) ++#define set_abort_result(bit) 
(test_and_set_bit(TOI_ABORTED, &toi_result), \ ++ test_and_set_bit(bit, &toi_result)) ++#define clear_result_state(bit) (test_and_clear_bit(bit, &toi_result)) ++#define test_result_state(bit) (test_bit(bit, &toi_result)) ++ ++/* == Debug sections and levels == */ ++ ++/* debugging levels. */ ++enum { ++ TOI_STATUS = 0, ++ TOI_ERROR = 2, ++ TOI_LOW, ++ TOI_MEDIUM, ++ TOI_HIGH, ++ TOI_VERBOSE, ++}; ++ ++enum { ++ TOI_ANY_SECTION, ++ TOI_EAT_MEMORY, ++ TOI_IO, ++ TOI_HEADER, ++ TOI_WRITER, ++ TOI_MEMORY, ++ TOI_PAGEDIR, ++ TOI_COMPRESS, ++ TOI_BIO, ++}; ++ ++#define set_debug_state(bit) (test_and_set_bit(bit, &toi_bkd.toi_debug_state)) ++#define clear_debug_state(bit) \ ++ (test_and_clear_bit(bit, &toi_bkd.toi_debug_state)) ++#define test_debug_state(bit) (test_bit(bit, &toi_bkd.toi_debug_state)) ++ ++/* == Steps in hibernating == */ ++ ++enum { ++ STEP_HIBERNATE_PREPARE_IMAGE, ++ STEP_HIBERNATE_SAVE_IMAGE, ++ STEP_HIBERNATE_POWERDOWN, ++ STEP_RESUME_CAN_RESUME, ++ STEP_RESUME_LOAD_PS1, ++ STEP_RESUME_DO_RESTORE, ++ STEP_RESUME_READ_PS2, ++ STEP_RESUME_GO, ++ STEP_RESUME_ALT_IMAGE, ++ STEP_CLEANUP, ++ STEP_QUIET_CLEANUP ++}; ++ ++/* == TuxOnIce states == ++ (see also include/linux/suspend.h) */ ++ ++#define get_toi_state() (toi_state) ++#define restore_toi_state(saved_state) \ ++ do { toi_state = saved_state; } while (0) ++ ++/* == Module support == */ ++ ++struct toi_core_fns { ++ int (*post_context_save)(void); ++ unsigned long (*get_nonconflicting_page)(void); ++ int (*try_hibernate)(void); ++ void (*try_resume)(void); ++}; ++ ++extern struct toi_core_fns *toi_core_fns; ++ ++/* == All else == */ ++#define KB(x) ((x) << (PAGE_SHIFT - 10)) ++#define MB(x) ((x) >> (20 - PAGE_SHIFT)) ++ ++extern int toi_start_anything(int toi_or_resume); ++extern void toi_finish_anything(int toi_or_resume); ++ ++extern int save_image_part1(void); ++extern int toi_atomic_restore(void); ++ ++extern int toi_try_hibernate(void); ++extern void toi_try_resume(void); ++ ++extern int __toi_post_context_save(void); ++ ++extern unsigned int nr_hibernates; ++extern char alt_resume_param[256]; ++ ++extern void copyback_post(void); ++extern int toi_hibernate(void); ++extern unsigned long extra_pd1_pages_used; ++ ++#define SECTOR_SIZE 512 ++ ++extern void toi_early_boot_message(int can_erase_image, int default_answer, ++ char *warning_reason, ...); ++ ++extern int do_check_can_resume(void); ++extern int do_toi_step(int step); ++extern int toi_launch_userspace_program(char *command, int channel_no, ++ int wait, int debug); ++ ++extern char tuxonice_signature[9]; ++ ++extern int toi_start_other_threads(void); ++extern void toi_stop_other_threads(void); ++#endif +diff --git a/kernel/power/tuxonice_alloc.c b/kernel/power/tuxonice_alloc.c +new file mode 100644 +index 0000000..675f2b5 +--- /dev/null ++++ b/kernel/power/tuxonice_alloc.c +@@ -0,0 +1,314 @@ ++/* ++ * kernel/power/tuxonice_alloc.c ++ * ++ * Copyright (C) 2008-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ * ++ */ ++ ++#ifdef CONFIG_PM_DEBUG ++#include ++#include ++#include "tuxonice_modules.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice.h" ++ ++#define TOI_ALLOC_PATHS 40 ++ ++static DEFINE_MUTEX(toi_alloc_mutex); ++ ++static struct toi_module_ops toi_alloc_ops; ++ ++static int toi_fail_num; ++ ++static atomic_t toi_alloc_count[TOI_ALLOC_PATHS], ++ toi_free_count[TOI_ALLOC_PATHS], ++ toi_test_count[TOI_ALLOC_PATHS], ++ toi_fail_count[TOI_ALLOC_PATHS]; ++static int toi_cur_allocd[TOI_ALLOC_PATHS], toi_max_allocd[TOI_ALLOC_PATHS]; ++static int cur_allocd, max_allocd; ++ ++static char *toi_alloc_desc[TOI_ALLOC_PATHS] = { ++ "", /* 0 */ ++ "get_io_info_struct", ++ "extent", ++ "extent (loading chain)", ++ "userui channel", ++ "userui arg", /* 5 */ ++ "attention list metadata", ++ "extra pagedir memory metadata", ++ "bdev metadata", ++ "extra pagedir memory", ++ "header_locations_read", /* 10 */ ++ "bio queue", ++ "prepare_readahead", ++ "i/o buffer", ++ "writer buffer in bio_init", ++ "checksum buffer", /* 15 */ ++ "compression buffer", ++ "filewriter signature op", ++ "set resume param alloc1", ++ "set resume param alloc2", ++ "debugging info buffer", /* 20 */ ++ "check can resume buffer", ++ "write module config buffer", ++ "read module config buffer", ++ "write image header buffer", ++ "read pageset1 buffer", /* 25 */ ++ "get_have_image_data buffer", ++ "checksum page", ++ "worker rw loop", ++ "get nonconflicting page", ++ "ps1 load addresses", /* 30 */ ++ "remove swap image", ++ "swap image exists", ++ "swap parse sig location", ++ "sysfs kobj", ++ "swap mark resume attempted buffer", /* 35 */ ++ "cluster member", ++ "boot kernel data buffer", ++ "setting swap signature", ++ "block i/o bdev struct" ++}; ++ ++#define MIGHT_FAIL(FAIL_NUM, FAIL_VAL) \ ++ do { \ ++ BUG_ON(FAIL_NUM >= TOI_ALLOC_PATHS); \ ++ \ ++ if (FAIL_NUM == toi_fail_num) { \ ++ atomic_inc(&toi_test_count[FAIL_NUM]); \ ++ toi_fail_num = 0; \ ++ return FAIL_VAL; \ ++ } \ ++ } while (0) ++ ++static void alloc_update_stats(int fail_num, void *result, int size) ++{ ++ if (!result) { ++ atomic_inc(&toi_fail_count[fail_num]); ++ return; ++ } ++ ++ atomic_inc(&toi_alloc_count[fail_num]); ++ if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { ++ mutex_lock(&toi_alloc_mutex); ++ toi_cur_allocd[fail_num]++; ++ cur_allocd += size; ++ if (unlikely(cur_allocd > max_allocd)) { ++ int i; ++ ++ for (i = 0; i < TOI_ALLOC_PATHS; i++) ++ toi_max_allocd[i] = toi_cur_allocd[i]; ++ max_allocd = cur_allocd; ++ } ++ mutex_unlock(&toi_alloc_mutex); ++ } ++} ++ ++static void free_update_stats(int fail_num, int size) ++{ ++ BUG_ON(fail_num >= TOI_ALLOC_PATHS); ++ atomic_inc(&toi_free_count[fail_num]); ++ if (unlikely(atomic_read(&toi_free_count[fail_num]) > ++ atomic_read(&toi_alloc_count[fail_num]))) ++ dump_stack(); ++ if (unlikely(test_action_state(TOI_GET_MAX_MEM_ALLOCD))) { ++ mutex_lock(&toi_alloc_mutex); ++ cur_allocd -= size; ++ toi_cur_allocd[fail_num]--; ++ mutex_unlock(&toi_alloc_mutex); ++ } ++} ++ ++void *toi_kzalloc(int fail_num, size_t size, gfp_t flags) ++{ ++ void *result; ++ ++ if (toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, NULL); ++ result = kzalloc(size, flags); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, result, size); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_kzalloc); ++ ++unsigned long toi_get_free_pages(int fail_num, gfp_t mask, ++ unsigned int order) ++{ ++ unsigned long result; ++ ++ if 
(toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, 0); ++ result = __get_free_pages(mask, order); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, (void *) result, ++ PAGE_SIZE << order); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_get_free_pages); ++ ++struct page *toi_alloc_page(int fail_num, gfp_t mask) ++{ ++ struct page *result; ++ ++ if (toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, NULL); ++ result = alloc_page(mask); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_alloc_page); ++ ++unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask) ++{ ++ unsigned long result; ++ ++ if (toi_alloc_ops.enabled) ++ MIGHT_FAIL(fail_num, 0); ++ result = get_zeroed_page(mask); ++ if (toi_alloc_ops.enabled) ++ alloc_update_stats(fail_num, (void *) result, PAGE_SIZE); ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_get_zeroed_page); ++ ++void toi_kfree(int fail_num, const void *arg, int size) ++{ ++ if (arg && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, size); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ kfree(arg); ++} ++EXPORT_SYMBOL_GPL(toi_kfree); ++ ++void toi_free_page(int fail_num, unsigned long virt) ++{ ++ if (virt && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, PAGE_SIZE); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ free_page(virt); ++} ++EXPORT_SYMBOL_GPL(toi_free_page); ++ ++void toi__free_page(int fail_num, struct page *page) ++{ ++ if (page && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, PAGE_SIZE); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ __free_page(page); ++} ++EXPORT_SYMBOL_GPL(toi__free_page); ++ ++void toi_free_pages(int fail_num, struct page *page, int order) ++{ ++ if (page && toi_alloc_ops.enabled) ++ free_update_stats(fail_num, PAGE_SIZE << order); ++ ++ if (fail_num == toi_trace_allocs) ++ dump_stack(); ++ __free_pages(page, order); ++} ++ ++void toi_alloc_print_debug_stats(void) ++{ ++ int i, header_done = 0; ++ ++ if (!toi_alloc_ops.enabled) ++ return; ++ ++ for (i = 0; i < TOI_ALLOC_PATHS; i++) ++ if (atomic_read(&toi_alloc_count[i]) != ++ atomic_read(&toi_free_count[i])) { ++ if (!header_done) { ++ printk(KERN_INFO "Idx Allocs Frees Tests " ++ " Fails Max Description\n"); ++ header_done = 1; ++ } ++ ++ printk(KERN_INFO "%3d %7d %7d %7d %7d %7d %s\n", i, ++ atomic_read(&toi_alloc_count[i]), ++ atomic_read(&toi_free_count[i]), ++ atomic_read(&toi_test_count[i]), ++ atomic_read(&toi_fail_count[i]), ++ toi_max_allocd[i], ++ toi_alloc_desc[i]); ++ } ++} ++EXPORT_SYMBOL_GPL(toi_alloc_print_debug_stats); ++ ++static int toi_alloc_initialise(int starting_cycle) ++{ ++ int i; ++ ++ if (!starting_cycle) ++ return 0; ++ ++ if (toi_trace_allocs) ++ dump_stack(); ++ ++ for (i = 0; i < TOI_ALLOC_PATHS; i++) { ++ atomic_set(&toi_alloc_count[i], 0); ++ atomic_set(&toi_free_count[i], 0); ++ atomic_set(&toi_test_count[i], 0); ++ atomic_set(&toi_fail_count[i], 0); ++ toi_cur_allocd[i] = 0; ++ toi_max_allocd[i] = 0; ++ }; ++ ++ max_allocd = 0; ++ cur_allocd = 0; ++ return 0; ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("failure_test", SYSFS_RW, &toi_fail_num, 0, 99, 0, NULL), ++ SYSFS_INT("trace", SYSFS_RW, &toi_trace_allocs, 0, TOI_ALLOC_PATHS, 0, ++ NULL), ++ SYSFS_BIT("find_max_mem_allocated", SYSFS_RW, &toi_bkd.toi_action, ++ 
TOI_GET_MAX_MEM_ALLOCD, 0), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_alloc_ops.enabled, 0, 1, 0, ++ NULL) ++}; ++ ++static struct toi_module_ops toi_alloc_ops = { ++ .type = MISC_HIDDEN_MODULE, ++ .name = "allocation debugging", ++ .directory = "alloc", ++ .module = THIS_MODULE, ++ .early = 1, ++ .initialise = toi_alloc_initialise, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++int toi_alloc_init(void) ++{ ++ int result = toi_register_module(&toi_alloc_ops); ++ return result; ++} ++ ++void toi_alloc_exit(void) ++{ ++ toi_unregister_module(&toi_alloc_ops); ++} ++#endif +diff --git a/kernel/power/tuxonice_alloc.h b/kernel/power/tuxonice_alloc.h +new file mode 100644 +index 0000000..099ee51 +--- /dev/null ++++ b/kernel/power/tuxonice_alloc.h +@@ -0,0 +1,54 @@ ++/* ++ * kernel/power/tuxonice_alloc.h ++ * ++ * Copyright (C) 2008-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ */ ++ ++#include ++#define TOI_WAIT_GFP (GFP_NOFS | __GFP_NOWARN) ++#define TOI_ATOMIC_GFP (GFP_ATOMIC | __GFP_NOWARN) ++ ++#ifdef CONFIG_PM_DEBUG ++extern void *toi_kzalloc(int fail_num, size_t size, gfp_t flags); ++extern void toi_kfree(int fail_num, const void *arg, int size); ++ ++extern unsigned long toi_get_free_pages(int fail_num, gfp_t mask, ++ unsigned int order); ++#define toi_get_free_page(FAIL_NUM, MASK) toi_get_free_pages(FAIL_NUM, MASK, 0) ++extern unsigned long toi_get_zeroed_page(int fail_num, gfp_t mask); ++extern void toi_free_page(int fail_num, unsigned long buf); ++extern void toi__free_page(int fail_num, struct page *page); ++extern void toi_free_pages(int fail_num, struct page *page, int order); ++extern struct page *toi_alloc_page(int fail_num, gfp_t mask); ++extern int toi_alloc_init(void); ++extern void toi_alloc_exit(void); ++ ++extern void toi_alloc_print_debug_stats(void); ++ ++#else /* CONFIG_PM_DEBUG */ ++ ++#define toi_kzalloc(FAIL, SIZE, FLAGS) (kzalloc(SIZE, FLAGS)) ++#define toi_kfree(FAIL, ALLOCN, SIZE) (kfree(ALLOCN)) ++ ++#define toi_get_free_pages(FAIL, FLAGS, ORDER) __get_free_pages(FLAGS, ORDER) ++#define toi_get_free_page(FAIL, FLAGS) __get_free_page(FLAGS) ++#define toi_get_zeroed_page(FAIL, FLAGS) get_zeroed_page(FLAGS) ++#define toi_free_page(FAIL, ALLOCN) do { free_page(ALLOCN); } while (0) ++#define toi__free_page(FAIL, PAGE) __free_page(PAGE) ++#define toi_free_pages(FAIL, PAGE, ORDER) __free_pages(PAGE, ORDER) ++#define toi_alloc_page(FAIL, MASK) alloc_page(MASK) ++static inline int toi_alloc_init(void) ++{ ++ return 0; ++} ++ ++static inline void toi_alloc_exit(void) { } ++ ++static inline void toi_alloc_print_debug_stats(void) { } ++ ++#endif ++ ++extern int toi_trace_allocs; +diff --git a/kernel/power/tuxonice_atomic_copy.c b/kernel/power/tuxonice_atomic_copy.c +new file mode 100644 +index 0000000..c524acb +--- /dev/null ++++ b/kernel/power/tuxonice_atomic_copy.c +@@ -0,0 +1,473 @@ ++/* ++ * kernel/power/tuxonice_atomic_copy.c ++ * ++ * Copyright 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * Routines for doing the atomic save/restore. 
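++ *
++ * toi_copy_pageset1() makes the atomic copy of pageset1, toi_hibernate()
++ * and toi_atomic_restore() wrap the low-level architecture code, and
++ * toi_go_atomic()/toi_end_atomic() quiesce and reawaken the rest of the
++ * system around the copy.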
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_io.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_pageflags.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_atomic_copy.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_modules.h" ++ ++unsigned long extra_pd1_pages_used; ++ ++/** ++ * free_pbe_list - free page backup entries used by the atomic copy code. ++ * @list: List to free. ++ * @highmem: Whether the list is in highmem. ++ * ++ * Normally, this function isn't used. If, however, we need to abort before ++ * doing the atomic copy, we use this to free the pbes previously allocated. ++ **/ ++static void free_pbe_list(struct pbe **list, int highmem) ++{ ++ while (*list) { ++ int i; ++ struct pbe *free_pbe, *next_page = NULL; ++ struct page *page; ++ ++ if (highmem) { ++ page = (struct page *) *list; ++ free_pbe = (struct pbe *) kmap(page); ++ } else { ++ page = virt_to_page(*list); ++ free_pbe = *list; ++ } ++ ++ for (i = 0; i < PBES_PER_PAGE; i++) { ++ if (!free_pbe) ++ break; ++ if (highmem) ++ toi__free_page(29, free_pbe->address); ++ else ++ toi_free_page(29, ++ (unsigned long) free_pbe->address); ++ free_pbe = free_pbe->next; ++ } ++ ++ if (highmem) { ++ if (free_pbe) ++ next_page = free_pbe; ++ kunmap(page); ++ } else { ++ if (free_pbe) ++ next_page = free_pbe; ++ } ++ ++ toi__free_page(29, page); ++ *list = (struct pbe *) next_page; ++ }; ++} ++ ++/** ++ * copyback_post - post atomic-restore actions ++ * ++ * After doing the atomic restore, we have a few more things to do: ++ * 1) We want to retain some values across the restore, so we now copy ++ * these from the nosave variables to the normal ones. ++ * 2) Set the status flags. ++ * 3) Resume devices. ++ * 4) Tell userui so it can redraw & restore settings. ++ * 5) Reread the page cache. ++ **/ ++void copyback_post(void) ++{ ++ struct toi_boot_kernel_data *bkd = ++ (struct toi_boot_kernel_data *) boot_kernel_data_buffer; ++ ++ if (toi_activate_storage(1)) ++ panic("Failed to reactivate our storage."); ++ ++ toi_post_atomic_restore_modules(bkd); ++ ++ toi_cond_pause(1, "About to reload secondary pagedir."); ++ ++ if (read_pageset2(0)) ++ panic("Unable to successfully reread the page cache."); ++ ++ /* ++ * If the user wants to sleep again after resuming from full-off, ++ * it's most likely to be in order to suspend to ram, so we'll ++ * do this check after loading pageset2, to give them the fastest ++ * wakeup when they are ready to use the computer again. ++ */ ++ toi_check_resleep(); ++} ++ ++/** ++ * toi_copy_pageset1 - do the atomic copy of pageset1 ++ * ++ * Make the atomic copy of pageset1. We can't use copy_page (as we once did) ++ * because we can't be sure what side effects it has. On my old Duron, with ++ * 3DNOW, kernel_fpu_begin increments preempt count, making our preempt ++ * count at resume time 4 instead of 3. ++ * ++ * We don't want to call kmap_atomic unconditionally because it has the side ++ * effect of incrementing the preempt count, which will leave it one too high ++ * post resume (the page containing the preempt count will be copied after ++ * its incremented. This is essentially the same problem. 
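++ *
++ * Instead, the copy below is a plain word-by-word loop: kmap_atomic() is
++ * only used where a page really is in highmem, and pages that are not
++ * marked present are temporarily mapped with kernel_map_pages() around
++ * the copy and unmapped again afterwards.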
++ **/ ++void toi_copy_pageset1(void) ++{ ++ int i; ++ unsigned long source_index, dest_index; ++ ++ memory_bm_position_reset(pageset1_map); ++ memory_bm_position_reset(pageset1_copy_map); ++ ++ source_index = memory_bm_next_pfn(pageset1_map); ++ dest_index = memory_bm_next_pfn(pageset1_copy_map); ++ ++ for (i = 0; i < pagedir1.size; i++) { ++ unsigned long *origvirt, *copyvirt; ++ struct page *origpage, *copypage; ++ int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1, ++ was_present1, was_present2; ++ ++ origpage = pfn_to_page(source_index); ++ copypage = pfn_to_page(dest_index); ++ ++ origvirt = PageHighMem(origpage) ? ++ kmap_atomic(origpage) : ++ page_address(origpage); ++ ++ copyvirt = PageHighMem(copypage) ? ++ kmap_atomic(copypage) : ++ page_address(copypage); ++ ++ was_present1 = kernel_page_present(origpage); ++ if (!was_present1) ++ kernel_map_pages(origpage, 1, 1); ++ ++ was_present2 = kernel_page_present(copypage); ++ if (!was_present2) ++ kernel_map_pages(copypage, 1, 1); ++ ++ while (loop >= 0) { ++ *(copyvirt + loop) = *(origvirt + loop); ++ loop--; ++ } ++ ++ if (!was_present1) ++ kernel_map_pages(origpage, 1, 0); ++ ++ if (!was_present2) ++ kernel_map_pages(copypage, 1, 0); ++ ++ if (PageHighMem(origpage)) ++ kunmap_atomic(origvirt); ++ ++ if (PageHighMem(copypage)) ++ kunmap_atomic(copyvirt); ++ ++ source_index = memory_bm_next_pfn(pageset1_map); ++ dest_index = memory_bm_next_pfn(pageset1_copy_map); ++ } ++} ++ ++/** ++ * __toi_post_context_save - steps after saving the cpu context ++ * ++ * Steps taken after saving the CPU state to make the actual ++ * atomic copy. ++ * ++ * Called from swsusp_save in snapshot.c via toi_post_context_save. ++ **/ ++int __toi_post_context_save(void) ++{ ++ unsigned long old_ps1_size = pagedir1.size; ++ ++ check_checksums(); ++ ++ free_checksum_pages(); ++ ++ toi_recalculate_image_contents(1); ++ ++ extra_pd1_pages_used = pagedir1.size > old_ps1_size ? ++ pagedir1.size - old_ps1_size : 0; ++ ++ if (extra_pd1_pages_used > extra_pd1_pages_allowance) { ++ printk(KERN_INFO "Pageset1 has grown by %lu pages. " ++ "extra_pages_allowance is currently only %lu.\n", ++ pagedir1.size - old_ps1_size, ++ extra_pd1_pages_allowance); ++ ++ /* ++ * Highlevel code will see this, clear the state and ++ * retry if we haven't already done so twice. ++ */ ++ if (any_to_free(1)) { ++ set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); ++ return 1; ++ } ++ if (try_allocate_extra_memory()) { ++ printk(KERN_INFO "Failed to allocate the extra memory" ++ " needed. Restarting the process."); ++ set_abort_result(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); ++ return 1; ++ } ++ printk(KERN_INFO "However it looks like there's enough" ++ " free ram and storage to handle this, so " ++ " continuing anyway."); ++ /* ++ * What if try_allocate_extra_memory above calls ++ * toi_allocate_extra_pagedir_memory and it allocs a new ++ * slab page via toi_kzalloc which should be in ps1? So... ++ */ ++ toi_recalculate_image_contents(1); ++ } ++ ++ if (!test_action_state(TOI_TEST_FILTER_SPEED) && ++ !test_action_state(TOI_TEST_BIO)) ++ toi_copy_pageset1(); ++ ++ return 0; ++} ++ ++/** ++ * toi_hibernate - high level code for doing the atomic copy ++ * ++ * High-level code which prepares to do the atomic copy. Loosely based ++ * on the swsusp version, but with the following twists: ++ * - We set toi_running so the swsusp code uses our code paths. ++ * - We give better feedback regarding what goes wrong if there is a ++ * problem. 
++ * - We use an extra function to call the assembly, just in case this code ++ * is in a module (return address). ++ **/ ++int toi_hibernate(void) ++{ ++ int error; ++ ++ toi_running = 1; /* For the swsusp code we use :< */ ++ ++ error = toi_lowlevel_builtin(); ++ ++ if (!error) { ++ struct toi_boot_kernel_data *bkd = ++ (struct toi_boot_kernel_data *) boot_kernel_data_buffer; ++ ++ /* ++ * The boot kernel's data may be larger (newer version) or ++ * smaller (older version) than ours. Copy the minimum ++ * of the two sizes, so that we don't overwrite valid values ++ * from pre-atomic copy. ++ */ ++ ++ memcpy(&toi_bkd, (char *) boot_kernel_data_buffer, ++ min_t(int, sizeof(struct toi_boot_kernel_data), ++ bkd->size)); ++ } ++ ++ toi_running = 0; ++ return error; ++} ++ ++/** ++ * toi_atomic_restore - prepare to do the atomic restore ++ * ++ * Get ready to do the atomic restore. This part gets us into the same ++ * state we are in prior to do calling do_toi_lowlevel while ++ * hibernating: hot-unplugging secondary cpus and freeze processes, ++ * before starting the thread that will do the restore. ++ **/ ++int toi_atomic_restore(void) ++{ ++ int error; ++ ++ toi_running = 1; ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Atomic restore."); ++ ++ memcpy(&toi_bkd.toi_nosave_commandline, saved_command_line, ++ strlen(saved_command_line)); ++ ++ toi_pre_atomic_restore_modules(&toi_bkd); ++ ++ if (add_boot_kernel_data_pbe()) ++ goto Failed; ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); ++ ++ if (toi_go_atomic(PMSG_QUIESCE, 0)) ++ goto Failed; ++ ++ /* We'll ignore saved state, but this gets preempt count (etc) right */ ++ save_processor_state(); ++ ++ error = swsusp_arch_resume(); ++ /* ++ * Code below is only ever reached in case of failure. Otherwise ++ * execution continues at place where swsusp_arch_suspend was called. ++ * ++ * We don't know whether it's safe to continue (this shouldn't happen), ++ * so lets err on the side of caution. ++ */ ++ BUG(); ++ ++Failed: ++ free_pbe_list(&restore_pblist, 0); ++#ifdef CONFIG_HIGHMEM ++ free_pbe_list(&restore_highmem_pblist, 1); ++#endif ++ toi_running = 0; ++ return 1; ++} ++ ++/** ++ * toi_go_atomic - do the actual atomic copy/restore ++ * @state: The state to use for dpm_suspend_start & power_down calls. ++ * @suspend_time: Whether we're suspending or resuming. ++ **/ ++int toi_go_atomic(pm_message_t state, int suspend_time) ++{ ++ if (suspend_time) { ++ if (platform_begin(1)) { ++ set_abort_result(TOI_PLATFORM_PREP_FAILED); ++ toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); ++ return 1; ++ } ++ ++ if (dpm_prepare(PMSG_FREEZE)) { ++ set_abort_result(TOI_DPM_PREPARE_FAILED); ++ dpm_complete(PMSG_RECOVER); ++ toi_end_atomic(ATOMIC_STEP_PLATFORM_END, suspend_time, 3); ++ return 1; ++ } ++ } ++ ++ suspend_console(); ++ ftrace_stop(); ++ pm_restrict_gfp_mask(); ++ ++ if (suspend_time) { ++ if (dpm_suspend(state)) { ++ set_abort_result(TOI_DPM_SUSPEND_FAILED); ++ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); ++ return 1; ++ } ++ } else { ++ if (dpm_suspend_start(state)) { ++ set_abort_result(TOI_DPM_SUSPEND_FAILED); ++ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 3); ++ return 1; ++ } ++ } ++ ++ /* At this point, dpm_suspend_start() has been called, but *not* ++ * dpm_suspend_noirq(). We *must* dpm_suspend_noirq() now. ++ * Otherwise, drivers for some devices (e.g. 
interrupt controllers) ++ * become desynchronized with the actual state of the hardware ++ * at resume time, and evil weirdness ensues. ++ */ ++ ++ if (dpm_suspend_end(state)) { ++ set_abort_result(TOI_DEVICE_REFUSED); ++ toi_end_atomic(ATOMIC_STEP_DEVICE_RESUME, suspend_time, 1); ++ return 1; ++ } ++ ++ if (suspend_time) { ++ if (platform_pre_snapshot(1)) ++ set_abort_result(TOI_PRE_SNAPSHOT_FAILED); ++ } else { ++ if (platform_pre_restore(1)) ++ set_abort_result(TOI_PRE_RESTORE_FAILED); ++ } ++ ++ if (test_result_state(TOI_ABORTED)) { ++ toi_end_atomic(ATOMIC_STEP_PLATFORM_FINISH, suspend_time, 1); ++ return 1; ++ } ++ ++ if (test_action_state(TOI_LATE_CPU_HOTPLUG)) { ++ if (disable_nonboot_cpus()) { ++ set_abort_result(TOI_CPU_HOTPLUG_FAILED); ++ toi_end_atomic(ATOMIC_STEP_CPU_HOTPLUG, ++ suspend_time, 1); ++ return 1; ++ } ++ } ++ ++ local_irq_disable(); ++ ++ if (syscore_suspend()) { ++ set_abort_result(TOI_SYSCORE_REFUSED); ++ toi_end_atomic(ATOMIC_STEP_IRQS, suspend_time, 1); ++ return 1; ++ } ++ ++ if (suspend_time && pm_wakeup_pending()) { ++ set_abort_result(TOI_WAKEUP_EVENT); ++ toi_end_atomic(ATOMIC_STEP_SYSCORE_RESUME, suspend_time, 1); ++ return 1; ++ } ++ return 0; ++} ++ ++/** ++ * toi_end_atomic - post atomic copy/restore routines ++ * @stage: What step to start at. ++ * @suspend_time: Whether we're suspending or resuming. ++ * @error: Whether we're recovering from an error. ++ **/ ++void toi_end_atomic(int stage, int suspend_time, int error) ++{ ++ pm_message_t msg = suspend_time ? (error ? PMSG_RECOVER : PMSG_THAW) : ++ PMSG_RESTORE; ++ ++ switch (stage) { ++ case ATOMIC_ALL_STEPS: ++ if (!suspend_time) { ++ events_check_enabled = false; ++ platform_leave(1); ++ } ++ case ATOMIC_STEP_SYSCORE_RESUME: ++ syscore_resume(); ++ case ATOMIC_STEP_IRQS: ++ local_irq_enable(); ++ case ATOMIC_STEP_CPU_HOTPLUG: ++ if (test_action_state(TOI_LATE_CPU_HOTPLUG)) ++ enable_nonboot_cpus(); ++ case ATOMIC_STEP_PLATFORM_FINISH: ++ if (!suspend_time && error & 2) ++ platform_restore_cleanup(1); ++ else ++ platform_finish(1); ++ dpm_resume_start(msg); ++ case ATOMIC_STEP_DEVICE_RESUME: ++ if (suspend_time && (error & 2)) ++ platform_recover(1); ++ dpm_resume(msg); ++ if (error || !toi_in_suspend()) ++ pm_restore_gfp_mask(); ++ ftrace_start(); ++ resume_console(); ++ case ATOMIC_STEP_DPM_COMPLETE: ++ dpm_complete(msg); ++ case ATOMIC_STEP_PLATFORM_END: ++ platform_end(1); ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Post atomic."); ++ } ++} +diff --git a/kernel/power/tuxonice_atomic_copy.h b/kernel/power/tuxonice_atomic_copy.h +new file mode 100644 +index 0000000..6a989c1 +--- /dev/null ++++ b/kernel/power/tuxonice_atomic_copy.h +@@ -0,0 +1,23 @@ ++/* ++ * kernel/power/tuxonice_atomic_copy.h ++ * ++ * Copyright 2008-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * Routines for doing the atomic save/restore. 
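++ *
++ * The enum below lists the tear-down stages used by toi_end_atomic(); its
++ * switch statement falls through from the stage passed in to the end, so
++ * passing a later stage skips the earlier clean-up steps.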
++ */ ++ ++enum { ++ ATOMIC_ALL_STEPS, ++ ATOMIC_STEP_SYSCORE_RESUME, ++ ATOMIC_STEP_IRQS, ++ ATOMIC_STEP_CPU_HOTPLUG, ++ ATOMIC_STEP_PLATFORM_FINISH, ++ ATOMIC_STEP_DEVICE_RESUME, ++ ATOMIC_STEP_DPM_COMPLETE, ++ ATOMIC_STEP_PLATFORM_END, ++}; ++ ++int toi_go_atomic(pm_message_t state, int toi_time); ++void toi_end_atomic(int stage, int toi_time, int error); +diff --git a/kernel/power/tuxonice_bio.h b/kernel/power/tuxonice_bio.h +new file mode 100644 +index 0000000..9627ccc +--- /dev/null ++++ b/kernel/power/tuxonice_bio.h +@@ -0,0 +1,77 @@ ++/* ++ * kernel/power/tuxonice_bio.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file contains declarations for functions exported from ++ * tuxonice_bio.c, which contains low level io functions. ++ */ ++ ++#include ++#include "tuxonice_extent.h" ++ ++void toi_put_extent_chain(struct hibernate_extent_chain *chain); ++int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, ++ unsigned long start, unsigned long end); ++ ++struct hibernate_extent_saved_state { ++ int extent_num; ++ struct hibernate_extent *extent_ptr; ++ unsigned long offset; ++}; ++ ++struct toi_bdev_info { ++ struct toi_bdev_info *next; ++ struct hibernate_extent_chain blocks; ++ struct block_device *bdev; ++ struct toi_module_ops *allocator; ++ int allocator_index; ++ struct hibernate_extent_chain allocations; ++ char name[266]; /* "swap on " or "file " + up to 256 chars */ ++ ++ /* Saved in header */ ++ char uuid[17]; ++ dev_t dev_t; ++ int prio; ++ int bmap_shift; ++ int blocks_per_page; ++ unsigned long pages_used; ++ struct hibernate_extent_saved_state saved_state[4]; ++}; ++ ++struct toi_extent_iterate_state { ++ struct toi_bdev_info *current_chain; ++ int num_chains; ++ int saved_chain_number[4]; ++ struct toi_bdev_info *saved_chain_ptr[4]; ++}; ++ ++/* ++ * Our exported interface so the swapwriter and filewriter don't ++ * need these functions duplicated. ++ */ ++struct toi_bio_ops { ++ int (*bdev_page_io) (int rw, struct block_device *bdev, long pos, ++ struct page *page); ++ int (*register_storage)(struct toi_bdev_info *new); ++ void (*free_storage)(void); ++}; ++ ++struct toi_allocator_ops { ++ unsigned long (*toi_swap_storage_available) (void); ++}; ++ ++extern struct toi_bio_ops toi_bio_ops; ++ ++extern char *toi_writer_buffer; ++extern int toi_writer_buffer_posn; ++ ++struct toi_bio_allocator_ops { ++ int (*register_storage) (void); ++ unsigned long (*storage_available)(void); ++ int (*allocate_storage) (struct toi_bdev_info *, unsigned long); ++ int (*bmap) (struct toi_bdev_info *); ++ void (*free_storage) (struct toi_bdev_info *); ++}; +diff --git a/kernel/power/tuxonice_bio_chains.c b/kernel/power/tuxonice_bio_chains.c +new file mode 100644 +index 0000000..c214d18 +--- /dev/null ++++ b/kernel/power/tuxonice_bio_chains.c +@@ -0,0 +1,1048 @@ ++/* ++ * kernel/power/tuxonice_bio_devinfo.c ++ * ++ * Copyright (C) 2009-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ */ ++ ++#include ++#include "tuxonice_bio.h" ++#include "tuxonice_bio_internal.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_ui.h" ++#include "tuxonice.h" ++#include "tuxonice_io.h" ++ ++static struct toi_bdev_info *prio_chain_head; ++static int num_chains; ++ ++/* Pointer to current entry being loaded/saved. 
*/ ++struct toi_extent_iterate_state toi_writer_posn; ++ ++#define metadata_size (sizeof(struct toi_bdev_info) - \ ++ offsetof(struct toi_bdev_info, uuid)) ++ ++/* ++ * After section 0 (header) comes 2 => next_section[0] = 2 ++ */ ++static int next_section[3] = { 2, 3, 1 }; ++ ++/** ++ * dump_block_chains - print the contents of the bdev info array. ++ **/ ++void dump_block_chains(void) ++{ ++ int i = 0; ++ int j; ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ ++ while (cur_chain) { ++ struct hibernate_extent *this = cur_chain->blocks.first; ++ ++ printk(KERN_DEBUG "Chain %d (prio %d):", i, cur_chain->prio); ++ ++ while (this) { ++ printk(KERN_CONT " [%lu-%lu]%s", this->start, ++ this->end, this->next ? "," : ""); ++ this = this->next; ++ } ++ ++ printk("\n"); ++ cur_chain = cur_chain->next; ++ i++; ++ } ++ ++ printk(KERN_DEBUG "Saved states:\n"); ++ for (i = 0; i < 4; i++) { ++ printk(KERN_DEBUG "Slot %d: Chain %d.\n", ++ i, toi_writer_posn.saved_chain_number[i]); ++ ++ cur_chain = prio_chain_head; ++ j = 0; ++ while (cur_chain) { ++ printk(KERN_DEBUG " Chain %d: Extent %d. Offset %lu.\n", ++ j, cur_chain->saved_state[i].extent_num, ++ cur_chain->saved_state[i].offset); ++ cur_chain = cur_chain->next; ++ j++; ++ } ++ printk(KERN_CONT "\n"); ++ } ++} ++ ++/** ++ * ++ **/ ++static void toi_extent_chain_next(void) ++{ ++ struct toi_bdev_info *this = toi_writer_posn.current_chain; ++ ++ if (!this->blocks.current_extent) ++ return; ++ ++ if (this->blocks.current_offset == this->blocks.current_extent->end) { ++ if (this->blocks.current_extent->next) { ++ this->blocks.current_extent = ++ this->blocks.current_extent->next; ++ this->blocks.current_offset = ++ this->blocks.current_extent->start; ++ } else { ++ this->blocks.current_extent = NULL; ++ this->blocks.current_offset = 0; ++ } ++ } else ++ this->blocks.current_offset++; ++} ++ ++/** ++ * ++ */ ++ ++static struct toi_bdev_info *__find_next_chain_same_prio(void) ++{ ++ struct toi_bdev_info *start_chain = toi_writer_posn.current_chain; ++ struct toi_bdev_info *this = start_chain; ++ int orig_prio = this->prio; ++ ++ do { ++ this = this->next; ++ ++ if (!this) ++ this = prio_chain_head; ++ ++ /* Back on original chain? Use it again. */ ++ if (this == start_chain) ++ return start_chain; ++ ++ } while (!this->blocks.current_extent || this->prio != orig_prio); ++ ++ return this; ++} ++ ++static void find_next_chain(void) ++{ ++ struct toi_bdev_info *this; ++ ++ this = __find_next_chain_same_prio(); ++ ++ /* ++ * If we didn't get another chain of the same priority that we ++ * can use, look for the next priority. ++ */ ++ while (this && !this->blocks.current_extent) ++ this = this->next; ++ ++ toi_writer_posn.current_chain = this; ++} ++ ++/** ++ * toi_extent_state_next - go to the next extent ++ * @blocks: The number of values to progress. ++ * @stripe_mode: Whether to spread usage across all chains. ++ * ++ * Given a state, progress to the next valid entry. We may begin in an ++ * invalid state, as we do when invoked after extent_state_goto_start below. ++ * ++ * When using compression and expected_compression > 0, we let the image size ++ * be larger than storage, so we can validly run out of data to return. 
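++ *
++ * The position is advanced @blocks entries along the current chain. The
++ * header stream is not striped: it only moves to another chain once the
++ * current one is exhausted, while the other streams try the next usable
++ * chain on every call, spreading I/O across the registered devices.
++ * Returns 0 while a position is available and -ENOSPC once no usable
++ * chain remains.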
++ **/ ++static unsigned long toi_extent_state_next(int blocks, int current_stream) ++{ ++ int i; ++ ++ if (!toi_writer_posn.current_chain) ++ return -ENOSPC; ++ ++ /* Assume chains always have lengths that are multiples of @blocks */ ++ for (i = 0; i < blocks; i++) ++ toi_extent_chain_next(); ++ ++ /* The header stream is not striped */ ++ if (current_stream || ++ !toi_writer_posn.current_chain->blocks.current_extent) ++ find_next_chain(); ++ ++ return toi_writer_posn.current_chain ? 0 : -ENOSPC; ++} ++ ++static void toi_insert_chain_in_prio_list(struct toi_bdev_info *this) ++{ ++ struct toi_bdev_info **prev_ptr; ++ struct toi_bdev_info *cur; ++ ++ /* Loop through the existing chain, finding where to insert it */ ++ prev_ptr = &prio_chain_head; ++ cur = prio_chain_head; ++ ++ while (cur && cur->prio >= this->prio) { ++ prev_ptr = &cur->next; ++ cur = cur->next; ++ } ++ ++ this->next = *prev_ptr; ++ *prev_ptr = this; ++ ++ this = prio_chain_head; ++ while (this) ++ this = this->next; ++ num_chains++; ++} ++ ++/** ++ * toi_extent_state_goto_start - reinitialize an extent chain iterator ++ * @state: Iterator to reinitialize ++ **/ ++void toi_extent_state_goto_start(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ ++ while (this) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Setting current extent to %p.", this->blocks.first); ++ this->blocks.current_extent = this->blocks.first; ++ if (this->blocks.current_extent) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Setting current offset to %lu.", ++ this->blocks.current_extent->start); ++ this->blocks.current_offset = ++ this->blocks.current_extent->start; ++ } ++ ++ this = this->next; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Setting current chain to %p.", ++ prio_chain_head); ++ toi_writer_posn.current_chain = prio_chain_head; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Leaving extent state goto start."); ++} ++ ++/** ++ * toi_extent_state_save - save state of the iterator ++ * @state: Current state of the chain ++ * @saved_state: Iterator to populate ++ * ++ * Given a state and a struct hibernate_extent_state_store, save the current ++ * position in a format that can be used with relocated chains (at ++ * resume time). 
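++ *
++ * No pointers are stored: the position is saved as the number of the
++ * current chain together with, for each chain, the index of its current
++ * extent and the offset within it, so it can be re-applied after the
++ * chains have been rebuilt at resume time.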
++ **/ ++void toi_extent_state_save(int slot) ++{ ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ struct hibernate_extent *extent; ++ struct hibernate_extent_saved_state *chain_state; ++ int i = 0; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_extent_state_save, slot %d.", ++ slot); ++ ++ if (!toi_writer_posn.current_chain) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current chain => " ++ "chain_num = -1."); ++ toi_writer_posn.saved_chain_number[slot] = -1; ++ return; ++ } ++ ++ while (cur_chain) { ++ i++; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saving chain %d (%p) " ++ "state, slot %d.", i, cur_chain, slot); ++ ++ chain_state = &cur_chain->saved_state[slot]; ++ ++ chain_state->offset = cur_chain->blocks.current_offset; ++ ++ if (toi_writer_posn.current_chain == cur_chain) { ++ toi_writer_posn.saved_chain_number[slot] = i; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "This is the chain " ++ "we were on => chain_num is %d.", i); ++ } ++ ++ if (!cur_chain->blocks.current_extent) { ++ chain_state->extent_num = 0; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "No current extent " ++ "for this chain => extent_num %d is 0.", ++ i); ++ cur_chain = cur_chain->next; ++ continue; ++ } ++ ++ extent = cur_chain->blocks.first; ++ chain_state->extent_num = 1; ++ ++ while (extent != cur_chain->blocks.current_extent) { ++ chain_state->extent_num++; ++ extent = extent->next; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "extent num %d is %d.", i, ++ chain_state->extent_num); ++ ++ cur_chain = cur_chain->next; ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Completed saving extent state slot %d.", slot); ++} ++ ++/** ++ * toi_extent_state_restore - restore the position saved by extent_state_save ++ * @state: State to populate ++ * @saved_state: Iterator saved to restore ++ **/ ++void toi_extent_state_restore(int slot) ++{ ++ int i = 0; ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ struct hibernate_extent_saved_state *chain_state; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "toi_extent_state_restore - slot %d.", slot); ++ ++ if (toi_writer_posn.saved_chain_number[slot] == -1) { ++ toi_writer_posn.current_chain = NULL; ++ return; ++ } ++ ++ while (cur_chain) { ++ int posn; ++ int j; ++ i++; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Restoring chain %d (%p) " ++ "state, slot %d.", i, cur_chain, slot); ++ ++ chain_state = &cur_chain->saved_state[slot]; ++ ++ posn = chain_state->extent_num; ++ ++ cur_chain->blocks.current_extent = cur_chain->blocks.first; ++ cur_chain->blocks.current_offset = chain_state->offset; ++ ++ if (i == toi_writer_posn.saved_chain_number[slot]) { ++ toi_writer_posn.current_chain = cur_chain; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Found current chain."); ++ } ++ ++ for (j = 0; j < 4; j++) ++ if (i == toi_writer_posn.saved_chain_number[j]) { ++ toi_writer_posn.saved_chain_ptr[j] = cur_chain; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Found saved chain ptr %d (%p) (offset" ++ " %d).", j, cur_chain, ++ cur_chain->saved_state[j].offset); ++ } ++ ++ if (posn) { ++ while (--posn) ++ cur_chain->blocks.current_extent = ++ cur_chain->blocks.current_extent->next; ++ } else ++ cur_chain->blocks.current_extent = NULL; ++ ++ cur_chain = cur_chain->next; ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done."); ++ if (test_action_state(TOI_LOGALL)) ++ dump_block_chains(); ++} ++ ++/* ++ * Storage needed ++ * ++ * Returns amount of space in the image header required ++ * for the chain data. This ignores the links between ++ * pages, which we factor in when allocating the space. 
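++ *
++ * The space accounted for is: the number of chains, then for each chain
++ * its device metadata, its extent count and a start/end pair per extent,
++ * and finally the four saved chain numbers.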
++ */ ++int toi_bio_devinfo_storage_needed(void) ++{ ++ int result = sizeof(num_chains); ++ struct toi_bdev_info *chain = prio_chain_head; ++ ++ while (chain) { ++ result += metadata_size; ++ ++ /* Chain size */ ++ result += sizeof(int); ++ ++ /* Extents */ ++ result += (2 * sizeof(unsigned long) * ++ chain->blocks.num_extents); ++ ++ chain = chain->next; ++ } ++ ++ result += 4 * sizeof(int); ++ return result; ++} ++ ++static unsigned long chain_pages_used(struct toi_bdev_info *chain) ++{ ++ struct hibernate_extent *this = chain->blocks.first; ++ struct hibernate_extent_saved_state *state = &chain->saved_state[3]; ++ unsigned long size = 0; ++ int extent_idx = 1; ++ ++ if (!state->extent_num) { ++ if (!this) ++ return 0; ++ else ++ return chain->blocks.size; ++ } ++ ++ while (extent_idx < state->extent_num) { ++ size += (this->end - this->start + 1); ++ this = this->next; ++ extent_idx++; ++ } ++ ++ /* We didn't use the one we're sitting on, so don't count it */ ++ return size + state->offset - this->start; ++} ++ ++/** ++ * toi_serialise_extent_chain - write a chain in the image ++ * @chain: Chain to write. ++ **/ ++static int toi_serialise_extent_chain(struct toi_bdev_info *chain) ++{ ++ struct hibernate_extent *this; ++ int ret; ++ int i = 1; ++ ++ chain->pages_used = chain_pages_used(chain); ++ ++ if (test_action_state(TOI_LOGALL)) ++ dump_block_chains(); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Serialising chain (dev_t %lx).", ++ chain->dev_t); ++ /* Device info - dev_t, prio, bmap_shift, blocks per page, positions */ ++ ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, ++ (char *) &chain->uuid, metadata_size); ++ if (ret) ++ return ret; ++ ++ /* Num extents */ ++ ret = toiActiveAllocator->rw_header_chunk(WRITE, &toi_blockwriter_ops, ++ (char *) &chain->blocks.num_extents, sizeof(int)); ++ if (ret) ++ return ret; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", ++ chain->blocks.num_extents); ++ ++ this = chain->blocks.first; ++ while (this) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i); ++ ret = toiActiveAllocator->rw_header_chunk(WRITE, ++ &toi_blockwriter_ops, ++ (char *) this, 2 * sizeof(this->start)); ++ if (ret) ++ return ret; ++ this = this->next; ++ i++; ++ } ++ ++ return ret; ++} ++ ++int toi_serialise_extent_chains(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ int result; ++ ++ /* Write the number of chains */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Write number of chains (%d)", ++ num_chains); ++ result = toiActiveAllocator->rw_header_chunk(WRITE, ++ &toi_blockwriter_ops, (char *) &num_chains, ++ sizeof(int)); ++ if (result) ++ return result; ++ ++ /* Then the chains themselves */ ++ while (this) { ++ result = toi_serialise_extent_chain(this); ++ if (result) ++ return result; ++ this = this->next; ++ } ++ ++ /* ++ * Finally, the chain we should be on at the start of each ++ * section. 
++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Saved chain numbers."); ++ result = toiActiveAllocator->rw_header_chunk(WRITE, ++ &toi_blockwriter_ops, ++ (char *) &toi_writer_posn.saved_chain_number[0], ++ 4 * sizeof(int)); ++ ++ return result; ++} ++ ++int toi_register_storage_chain(struct toi_bdev_info *new) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Inserting chain %p into list.", ++ new); ++ toi_insert_chain_in_prio_list(new); ++ return 0; ++} ++ ++static void free_bdev_info(struct toi_bdev_info *chain) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Free chain %p.", chain); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Block extents."); ++ toi_put_extent_chain(&chain->blocks); ++ ++ /* ++ * The allocator may need to do more than just free the chains ++ * (swap_free, for example). Don't call from boot kernel. ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Allocator extents."); ++ if (chain->allocator) ++ chain->allocator->bio_allocator_ops->free_storage(chain); ++ ++ /* ++ * Dropping out of reading atomic copy? Need to undo ++ * toi_open_by_devnum. ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Bdev."); ++ if (chain->bdev && !IS_ERR(chain->bdev) && ++ chain->bdev != resume_block_device && ++ chain->bdev != header_block_device && ++ test_toi_state(TOI_TRYING_TO_RESUME)) ++ toi_close_bdev(chain->bdev); ++ ++ /* Poison */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " - Struct."); ++ toi_kfree(39, chain, sizeof(*chain)); ++ ++ if (prio_chain_head == chain) ++ prio_chain_head = NULL; ++ ++ num_chains--; ++} ++ ++void free_all_bdev_info(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ ++ while (this) { ++ struct toi_bdev_info *next = this->next; ++ free_bdev_info(this); ++ this = next; ++ } ++ ++ memset((char *) &toi_writer_posn, 0, sizeof(toi_writer_posn)); ++ prio_chain_head = NULL; ++} ++ ++static void set_up_start_position(void) ++{ ++ toi_writer_posn.current_chain = prio_chain_head; ++ go_next_page(0, 0); ++} ++ ++/** ++ * toi_load_extent_chain - read back a chain saved in the image ++ * @chain: Chain to load ++ * ++ * The linked list of extents is reconstructed from the disk. chain will point ++ * to the first entry. 
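++ *
++ * The data is read back in the order toi_serialise_extent_chain() wrote
++ * it: the device metadata, the number of extents, then one start/end pair
++ * per extent. The chain's block device is opened (or the resume/header
++ * device reused) when the first extent is read, and the chain is inserted
++ * into the priority list at that point so it can be used to read further
++ * header pages.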
++ **/ ++int toi_load_extent_chain(int index, int *num_loaded) ++{ ++ struct toi_bdev_info *chain = toi_kzalloc(39, ++ sizeof(struct toi_bdev_info), GFP_ATOMIC); ++ struct hibernate_extent *this, *last = NULL; ++ int i, ret; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Loading extent chain %d.", index); ++ /* Get dev_t, prio, bmap_shift, blocks per page, positions */ ++ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, ++ (char *) &chain->uuid, metadata_size); ++ ++ if (ret) { ++ printk(KERN_ERR "Failed to read the size of extent chain.\n"); ++ toi_kfree(39, chain, sizeof(*chain)); ++ return 1; ++ } ++ ++ toi_bkd.pages_used[index] = chain->pages_used; ++ ++ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, ++ (char *) &chain->blocks.num_extents, sizeof(int)); ++ if (ret) { ++ printk(KERN_ERR "Failed to read the size of extent chain.\n"); ++ toi_kfree(39, chain, sizeof(*chain)); ++ return 1; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d extents.", ++ chain->blocks.num_extents); ++ ++ for (i = 0; i < chain->blocks.num_extents; i++) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent %d.", i + 1); ++ ++ this = toi_kzalloc(2, sizeof(struct hibernate_extent), ++ TOI_ATOMIC_GFP); ++ if (!this) { ++ printk(KERN_INFO "Failed to allocate a new extent.\n"); ++ free_bdev_info(chain); ++ return -ENOMEM; ++ } ++ this->next = NULL; ++ /* Get the next page */ ++ ret = toiActiveAllocator->rw_header_chunk_noreadahead(READ, ++ NULL, (char *) this, 2 * sizeof(this->start)); ++ if (ret) { ++ printk(KERN_INFO "Failed to read an extent.\n"); ++ toi_kfree(2, this, sizeof(struct hibernate_extent)); ++ free_bdev_info(chain); ++ return 1; ++ } ++ ++ if (last) ++ last->next = this; ++ else { ++ char b1[32], b2[32], b3[32]; ++ /* ++ * Open the bdev ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Chain dev_t is %s. Resume dev t is %s. Header" ++ " bdev_t is %s.\n", ++ format_dev_t(b1, chain->dev_t), ++ format_dev_t(b2, resume_dev_t), ++ format_dev_t(b3, toi_sig_data->header_dev_t)); ++ ++ if (chain->dev_t == resume_dev_t) ++ chain->bdev = resume_block_device; ++ else if (chain->dev_t == toi_sig_data->header_dev_t) ++ chain->bdev = header_block_device; ++ else { ++ chain->bdev = toi_open_bdev(chain->uuid, ++ chain->dev_t, 1); ++ if (IS_ERR(chain->bdev)) { ++ free_bdev_info(chain); ++ return -ENODEV; ++ } ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Chain bmap shift " ++ "is %d and blocks per page is %d.", ++ chain->bmap_shift, ++ chain->blocks_per_page); ++ ++ chain->blocks.first = this; ++ ++ /* ++ * Couldn't do this earlier, but can't do ++ * goto_start now - we may have already used blocks ++ * in the first chain. ++ */ ++ chain->blocks.current_extent = this; ++ chain->blocks.current_offset = this->start; ++ ++ /* ++ * Can't wait until we've read the whole chain ++ * before we insert it in the list. We might need ++ * this chain to read the next page in the header ++ */ ++ toi_insert_chain_in_prio_list(chain); ++ } ++ ++ /* ++ * We have to wait until 2 extents are loaded before setting up ++ * properly because if the first extent has only one page, we ++ * will need to put the position on the second extent. Sounds ++ * obvious, but it wasn't! ++ */ ++ (*num_loaded)++; ++ if ((*num_loaded) == 2) ++ set_up_start_position(); ++ last = this; ++ } ++ ++ /* ++ * Shouldn't get empty chains, but it's not impossible. Link them in so ++ * they get freed properly later. 
++ */ ++ if (!chain->blocks.num_extents) ++ toi_insert_chain_in_prio_list(chain); ++ ++ if (!chain->blocks.current_extent) { ++ chain->blocks.current_extent = chain->blocks.first; ++ if (chain->blocks.current_extent) ++ chain->blocks.current_offset = ++ chain->blocks.current_extent->start; ++ } ++ return 0; ++} ++ ++int toi_load_extent_chains(void) ++{ ++ int result; ++ int to_load; ++ int i; ++ int extents_loaded = 0; ++ ++ result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, NULL, ++ (char *) &to_load, ++ sizeof(int)); ++ if (result) ++ return result; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "%d chains to read.", to_load); ++ ++ for (i = 0; i < to_load; i++) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " >> Loading chain %d/%d.", ++ i, to_load); ++ result = toi_load_extent_chain(i, &extents_loaded); ++ if (result) ++ return result; ++ } ++ ++ /* If we never got to a second extent, we still need to do this. */ ++ if (extents_loaded == 1) ++ set_up_start_position(); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Save chain numbers."); ++ result = toiActiveAllocator->rw_header_chunk_noreadahead(READ, ++ &toi_blockwriter_ops, ++ (char *) &toi_writer_posn.saved_chain_number[0], ++ 4 * sizeof(int)); ++ ++ return result; ++} ++ ++static int toi_end_of_stream(int writing, int section_barrier) ++{ ++ struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; ++ int compare_to = next_section[current_stream]; ++ struct toi_bdev_info *compare_chain = ++ toi_writer_posn.saved_chain_ptr[compare_to]; ++ int compare_offset = compare_chain ? ++ compare_chain->saved_state[compare_to].offset : 0; ++ ++ if (!section_barrier) ++ return 0; ++ ++ if (!cur_chain) ++ return 1; ++ ++ if (cur_chain == compare_chain && ++ cur_chain->blocks.current_offset == compare_offset) { ++ if (writing) { ++ if (!current_stream) { ++ debug_broken_header(); ++ return 1; ++ } ++ } else { ++ more_readahead = 0; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Reached the end of stream %d " ++ "(not an error).", current_stream); ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * go_next_page - skip blocks to the start of the next page ++ * @writing: Whether we're reading or writing the image. ++ * ++ * Go forward one page. ++ **/ ++int go_next_page(int writing, int section_barrier) ++{ ++ struct toi_bdev_info *cur_chain = toi_writer_posn.current_chain; ++ int max = cur_chain ? cur_chain->blocks_per_page : 1; ++ ++ /* Nope. Go foward a page - or maybe two. Don't stripe the header, ++ * so that bad fragmentation doesn't put the extent data containing ++ * the location of the second page out of the first header page. ++ */ ++ if (toi_extent_state_next(max, current_stream)) { ++ /* Don't complain if readahead falls off the end */ ++ if (writing && section_barrier) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Extent state eof. " ++ "Expected compression ratio too optimistic?"); ++ if (test_action_state(TOI_LOGALL)) ++ dump_block_chains(); ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Ran out of extents to " ++ "read/write. (Not necessarily a fatal error."); ++ return -ENOSPC; ++ } ++ ++ return 0; ++} ++ ++int devices_of_same_priority(struct toi_bdev_info *this) ++{ ++ struct toi_bdev_info *check = prio_chain_head; ++ int i = 0; ++ ++ while (check) { ++ if (check->prio == this->prio) ++ i++; ++ check = check->next; ++ } ++ ++ return i; ++} ++ ++/** ++ * toi_bio_rw_page - do i/o on the next disk page in the image ++ * @writing: Whether reading or writing. ++ * @page: Page to do i/o on. 
++ * @is_readahead: Whether we're doing readahead ++ * @free_group: The group used in allocating the page ++ * ++ * Submit a page for reading or writing, possibly readahead. ++ * Pass the group used in allocating the page as well, as it should ++ * be freed on completion of the bio if we're writing the page. ++ **/ ++int toi_bio_rw_page(int writing, struct page *page, ++ int is_readahead, int free_group) ++{ ++ int result = toi_end_of_stream(writing, 1); ++ struct toi_bdev_info *dev_info = toi_writer_posn.current_chain; ++ ++ if (result) { ++ if (writing) ++ abort_hibernate(TOI_INSUFFICIENT_STORAGE, ++ "Insufficient storage for your image."); ++ else ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking to " ++ "read/write another page when stream has " ++ "ended."); ++ return -ENOSPC; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "%s %lx:%ld", ++ writing ? "Write" : "Read", ++ dev_info->dev_t, dev_info->blocks.current_offset); ++ ++ result = toi_do_io(writing, dev_info->bdev, ++ dev_info->blocks.current_offset << dev_info->bmap_shift, ++ page, is_readahead, 0, free_group); ++ ++ /* Ignore the result here - will check end of stream if come in again */ ++ go_next_page(writing, 1); ++ ++ if (result) ++ printk(KERN_ERR "toi_do_io returned %d.\n", result); ++ return result; ++} ++ ++dev_t get_header_dev_t(void) ++{ ++ return prio_chain_head->dev_t; ++} ++ ++struct block_device *get_header_bdev(void) ++{ ++ return prio_chain_head->bdev; ++} ++ ++unsigned long get_headerblock(void) ++{ ++ return prio_chain_head->blocks.first->start << ++ prio_chain_head->bmap_shift; ++} ++ ++int get_main_pool_phys_params(void) ++{ ++ struct toi_bdev_info *this = prio_chain_head; ++ int result; ++ ++ while (this) { ++ result = this->allocator->bio_allocator_ops->bmap(this); ++ if (result) ++ return result; ++ this = this->next; ++ } ++ ++ return 0; ++} ++ ++static int apply_header_reservation(void) ++{ ++ int i; ++ ++ if (!header_pages_reserved) { ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "No header pages reserved at the moment."); ++ return 0; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Applying header reservation."); ++ ++ /* Apply header space reservation */ ++ toi_extent_state_goto_start(); ++ ++ for (i = 0; i < header_pages_reserved; i++) ++ if (go_next_page(1, 0)) ++ return -ENOSPC; ++ ++ /* The end of header pages will be the start of pageset 2 */ ++ toi_extent_state_save(2); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Finished applying header reservation."); ++ return 0; ++} ++ ++static int toi_bio_register_storage(void) ++{ ++ int result = 0; ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type != BIO_ALLOCATOR_MODULE) ++ continue; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Registering storage from %s.", ++ this_module->name); ++ result = this_module->bio_allocator_ops->register_storage(); ++ if (result) ++ break; ++ } ++ ++ return result; ++} ++ ++int toi_bio_allocate_storage(unsigned long request) ++{ ++ struct toi_bdev_info *chain = prio_chain_head; ++ unsigned long to_get = request; ++ unsigned long extra_pages, needed; ++ int no_free = 0; ++ ++ if (!chain) { ++ int result = toi_bio_register_storage(); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_allocate_storage: " ++ "Registering storage."); ++ if (result) ++ return 0; ++ chain = prio_chain_head; ++ if (!chain) { ++ printk("TuxOnIce: No storage was registered.\n"); ++ return 0; ++ } ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, 
"toi_bio_allocate_storage: " ++ "Request is %lu pages.", request); ++ extra_pages = DIV_ROUND_UP(request * (sizeof(unsigned long) ++ + sizeof(int)), PAGE_SIZE); ++ needed = request + extra_pages + header_pages_reserved; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Adding %lu extra pages and %lu " ++ "for header => %lu.", ++ extra_pages, header_pages_reserved, needed); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Already allocated %lu pages.", ++ raw_pages_allocd); ++ ++ to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : 0; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Need to get %lu pages.", to_get); ++ ++ if (!to_get) ++ return apply_header_reservation(); ++ ++ while (to_get && chain) { ++ int num_group = devices_of_same_priority(chain); ++ int divisor = num_group - no_free; ++ int i; ++ unsigned long portion = DIV_ROUND_UP(to_get, divisor); ++ unsigned long got = 0; ++ unsigned long got_this_round = 0; ++ struct toi_bdev_info *top = chain; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ " Start of loop. To get is %lu. Divisor is %d.", ++ to_get, divisor); ++ no_free = 0; ++ ++ /* ++ * We're aiming to spread the allocated storage as evenly ++ * as possible, but we also want to get all the storage we ++ * can off this priority. ++ */ ++ for (i = 0; i < num_group; i++) { ++ struct toi_bio_allocator_ops *ops = ++ chain->allocator->bio_allocator_ops; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ " Asking for %lu pages from chain %p.", ++ portion, chain); ++ got = ops->allocate_storage(chain, portion); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ " Got %lu pages from allocator %p.", ++ got, chain); ++ if (!got) ++ no_free++; ++ got_this_round += got; ++ chain = chain->next; ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, " Loop finished. Got a " ++ "total of %lu pages from %d allocators.", ++ got_this_round, divisor - no_free); ++ ++ raw_pages_allocd += got_this_round; ++ to_get = needed > raw_pages_allocd ? needed - raw_pages_allocd : ++ 0; ++ ++ /* ++ * If we got anything from chains of this priority and we ++ * still have storage to allocate, go over this priority ++ * again. ++ */ ++ if (got_this_round && to_get) ++ chain = top; ++ else ++ no_free = 0; ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Finished allocating. Calling " ++ "get_main_pool_phys_params"); ++ /* Now let swap allocator bmap the pages */ ++ get_main_pool_phys_params(); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Done. Reserving header."); ++ return apply_header_reservation(); ++} ++ ++void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd) ++{ ++ int i = 0; ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ ++ while (cur_chain) { ++ cur_chain->pages_used = bkd->pages_used[i]; ++ cur_chain = cur_chain->next; ++ i++; ++ } ++} ++ ++int toi_bio_chains_debug_info(char *buffer, int size) ++{ ++ /* Show what we actually used */ ++ struct toi_bdev_info *cur_chain = prio_chain_head; ++ int len = 0; ++ ++ while (cur_chain) { ++ len += scnprintf(buffer + len, size - len, " Used %lu pages " ++ "from %s.\n", cur_chain->pages_used, ++ cur_chain->name); ++ cur_chain = cur_chain->next; ++ } ++ ++ return len; ++} +diff --git a/kernel/power/tuxonice_bio_core.c b/kernel/power/tuxonice_bio_core.c +new file mode 100644 +index 0000000..790b829 +--- /dev/null ++++ b/kernel/power/tuxonice_bio_core.c +@@ -0,0 +1,1838 @@ ++/* ++ * kernel/power/tuxonice_bio.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file contains block io functions for TuxOnIce. 
These are ++ * used by the swapwriter and it is planned that they will also ++ * be used by the NFSwriter. ++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_io.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_bio_internal.h" ++ ++#define MEMORY_ONLY 1 ++#define THROTTLE_WAIT 2 ++ ++/* #define MEASURE_MUTEX_CONTENTION */ ++#ifndef MEASURE_MUTEX_CONTENTION ++#define my_mutex_lock(index, the_lock) mutex_lock(the_lock) ++#define my_mutex_unlock(index, the_lock) mutex_unlock(the_lock) ++#else ++unsigned long mutex_times[2][2][NR_CPUS]; ++#define my_mutex_lock(index, the_lock) do { \ ++ int have_mutex; \ ++ have_mutex = mutex_trylock(the_lock); \ ++ if (!have_mutex) { \ ++ mutex_lock(the_lock); \ ++ mutex_times[index][0][smp_processor_id()]++; \ ++ } else { \ ++ mutex_times[index][1][smp_processor_id()]++; \ ++ } ++ ++#define my_mutex_unlock(index, the_lock) \ ++ mutex_unlock(the_lock); \ ++} while (0) ++#endif ++ ++static int page_idx, reset_idx; ++ ++static int target_outstanding_io = 1024; ++static int max_outstanding_writes, max_outstanding_reads; ++ ++static struct page *bio_queue_head, *bio_queue_tail; ++static atomic_t toi_bio_queue_size; ++static DEFINE_SPINLOCK(bio_queue_lock); ++ ++static int free_mem_throttle, throughput_throttle; ++int more_readahead = 1; ++static struct page *readahead_list_head, *readahead_list_tail; ++ ++static struct page *waiting_on; ++ ++static atomic_t toi_io_in_progress, toi_io_done; ++static DECLARE_WAIT_QUEUE_HEAD(num_in_progress_wait); ++ ++int current_stream; ++/* Not static, so that the allocators can setup and complete ++ * writing the header */ ++char *toi_writer_buffer; ++int toi_writer_buffer_posn; ++ ++static DEFINE_MUTEX(toi_bio_mutex); ++static DEFINE_MUTEX(toi_bio_readahead_mutex); ++ ++static struct task_struct *toi_queue_flusher; ++static int toi_bio_queue_flush_pages(int dedicated_thread); ++ ++struct toi_module_ops toi_blockwriter_ops; ++ ++#define TOTAL_OUTSTANDING_IO (atomic_read(&toi_io_in_progress) + \ ++ atomic_read(&toi_bio_queue_size)) ++ ++unsigned long raw_pages_allocd, header_pages_reserved; ++ ++/** ++ * set_free_mem_throttle - set the point where we pause to avoid oom. ++ * ++ * Initially, this value is zero, but when we first fail to allocate memory, ++ * we set it (plus a buffer) and thereafter throttle i/o once that limit is ++ * reached. ++ **/ ++static void set_free_mem_throttle(void) ++{ ++ int new_throttle = nr_unallocated_buffer_pages() + 256; ++ ++ if (new_throttle > free_mem_throttle) ++ free_mem_throttle = new_throttle; ++} ++ ++#define NUM_REASONS 7 ++static atomic_t reasons[NUM_REASONS]; ++static char *reason_name[NUM_REASONS] = { ++ "readahead not ready", ++ "bio allocation", ++ "synchronous I/O", ++ "toi_bio_get_new_page", ++ "memory low", ++ "readahead buffer allocation", ++ "throughput_throttle", ++}; ++ ++/* User Specified Parameters. */ ++unsigned long resume_firstblock; ++dev_t resume_dev_t; ++struct block_device *resume_block_device; ++static atomic_t resume_bdev_open_count; ++ ++struct block_device *header_block_device; ++ ++/** ++ * toi_open_bdev: Open a bdev at resume time. ++ * ++ * index: The swap index. 
May be MAX_SWAPFILES for the resume_dev_t ++ * (the user can have resume= pointing at a swap partition/file that isn't ++ * swapon'd when they hibernate. MAX_SWAPFILES+1 for the first page of the ++ * header. It will be from a swap partition that was enabled when we hibernated, ++ * but we don't know it's real index until we read that first page. ++ * dev_t: The device major/minor. ++ * display_errs: Whether to try to do this quietly. ++ * ++ * We stored a dev_t in the image header. Open the matching device without ++ * requiring /dev/ in most cases and record the details needed ++ * to close it later and avoid duplicating work. ++ */ ++struct block_device *toi_open_bdev(char *uuid, dev_t default_device, ++ int display_errs) ++{ ++ struct block_device *bdev; ++ dev_t device = default_device; ++ char buf[32]; ++ int retried = 0; ++ ++retry: ++ if (uuid) { ++ struct fs_info seek; ++ strncpy((char *) &seek.uuid, uuid, 16); ++ seek.dev_t = 0; ++ seek.last_mount_size = 0; ++ device = blk_lookup_fs_info(&seek); ++ if (!device) { ++ device = default_device; ++ printk(KERN_DEBUG "Unable to resolve uuid. Falling back" ++ " to dev_t.\n"); ++ } else ++ printk(KERN_DEBUG "Resolved uuid to device %s.\n", ++ format_dev_t(buf, device)); ++ } ++ ++ if (!device) { ++ printk(KERN_ERR "TuxOnIce attempting to open a " ++ "blank dev_t!\n"); ++ dump_stack(); ++ return NULL; ++ } ++ bdev = toi_open_by_devnum(device); ++ ++ if (IS_ERR(bdev) || !bdev) { ++ if (!retried) { ++ retried = 1; ++ wait_for_device_probe(); ++ goto retry; ++ } ++ if (display_errs) ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ "Failed to get access to block device " ++ "\"%x\" (error %d).\n Maybe you need " ++ "to run mknod and/or lvmsetup in an " ++ "initrd/ramfs?", device, bdev); ++ return ERR_PTR(-EINVAL); ++ } ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "TuxOnIce got bdev %p for dev_t %x.", ++ bdev, device); ++ ++ return bdev; ++} ++ ++static void toi_bio_reserve_header_space(unsigned long request) ++{ ++ header_pages_reserved = request; ++} ++ ++/** ++ * do_bio_wait - wait for some TuxOnIce I/O to complete ++ * @reason: The array index of the reason we're waiting. ++ * ++ * Wait for a particular page of I/O if we're after a particular page. ++ * If we're not after a particular page, wait instead for all in flight ++ * I/O to be completed or for us to have enough free memory to be able ++ * to submit more I/O. ++ * ++ * If we wait, we also update our statistics regarding why we waited. ++ **/ ++static void do_bio_wait(int reason) ++{ ++ struct page *was_waiting_on = waiting_on; ++ ++ /* On SMP, waiting_on can be reset, so we make a copy */ ++ if (was_waiting_on) { ++ wait_on_page_locked(was_waiting_on); ++ atomic_inc(&reasons[reason]); ++ } else { ++ atomic_inc(&reasons[reason]); ++ ++ wait_event(num_in_progress_wait, ++ !atomic_read(&toi_io_in_progress) || ++ nr_unallocated_buffer_pages() > free_mem_throttle); ++ } ++} ++ ++/** ++ * throttle_if_needed - wait for I/O completion if throttle points are reached ++ * @flags: What to check and how to act. ++ * ++ * Check whether we need to wait for some I/O to complete. We always check ++ * whether we have enough memory available, but may also (depending upon ++ * @reason) check if the throughput throttle limit has been reached. ++ **/ ++static int throttle_if_needed(int flags) ++{ ++ int free_pages = nr_unallocated_buffer_pages(); ++ ++ /* Getting low on memory and I/O is in progress? 
*/ ++ while (unlikely(free_pages < free_mem_throttle) && ++ atomic_read(&toi_io_in_progress) && ++ !test_result_state(TOI_ABORTED)) { ++ if (!(flags & THROTTLE_WAIT)) ++ return -ENOMEM; ++ do_bio_wait(4); ++ free_pages = nr_unallocated_buffer_pages(); ++ } ++ ++ while (!(flags & MEMORY_ONLY) && throughput_throttle && ++ TOTAL_OUTSTANDING_IO >= throughput_throttle && ++ !test_result_state(TOI_ABORTED)) { ++ int result = toi_bio_queue_flush_pages(0); ++ if (result) ++ return result; ++ atomic_inc(&reasons[6]); ++ wait_event(num_in_progress_wait, ++ !atomic_read(&toi_io_in_progress) || ++ TOTAL_OUTSTANDING_IO < throughput_throttle); ++ } ++ ++ return 0; ++} ++ ++/** ++ * update_throughput_throttle - update the raw throughput throttle ++ * @jif_index: The number of times this function has been called. ++ * ++ * This function is called four times per second by the core, and used to limit ++ * the amount of I/O we submit at once, spreading out our waiting through the ++ * whole job and letting userui get an opportunity to do its work. ++ * ++ * We don't start limiting I/O until 1/4s has gone so that we get a ++ * decent sample for our initial limit, and keep updating it because ++ * throughput may vary (on rotating media, eg) with our block number. ++ * ++ * We throttle to 1/10s worth of I/O. ++ **/ ++static void update_throughput_throttle(int jif_index) ++{ ++ int done = atomic_read(&toi_io_done); ++ throughput_throttle = done * 2 / 5 / jif_index; ++} ++ ++/** ++ * toi_finish_all_io - wait for all outstanding i/o to complete ++ * ++ * Flush any queued but unsubmitted I/O and wait for it all to complete. ++ **/ ++static int toi_finish_all_io(void) ++{ ++ int result = toi_bio_queue_flush_pages(0); ++ toi_bio_queue_flusher_should_finish = 1; ++ wake_up(&toi_io_queue_flusher); ++ wait_event(num_in_progress_wait, !TOTAL_OUTSTANDING_IO); ++ return result; ++} ++ ++/** ++ * toi_end_bio - bio completion function. ++ * @bio: bio that has completed. ++ * @err: Error value. Yes, like end_swap_bio_read, we ignore it. ++ * ++ * Function called by the block driver from interrupt context when I/O is ++ * completed. If we were writing the page, we want to free it and will have ++ * set bio->bi_private to the parameter we should use in telling the page ++ * allocation accounting code what the page was allocated for. If we're ++ * reading the page, it will be in the singly linked list made from ++ * page->private pointers. ++ **/ ++static void toi_end_bio(struct bio *bio, int err) ++{ ++ struct page *page = bio->bi_io_vec[0].bv_page; ++ ++ BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags)); ++ ++ unlock_page(page); ++ bio_put(bio); ++ ++ if (waiting_on == page) ++ waiting_on = NULL; ++ ++ put_page(page); ++ ++ if (bio->bi_private) ++ toi__free_page((int) ((unsigned long) bio->bi_private) , page); ++ ++ bio_put(bio); ++ ++ atomic_dec(&toi_io_in_progress); ++ atomic_inc(&toi_io_done); ++ ++ wake_up(&num_in_progress_wait); ++} ++ ++/** ++ * submit - submit BIO request ++ * @writing: READ or WRITE. ++ * @dev: The block device we're using. ++ * @first_block: The first sector we're using. ++ * @page: The page being used for I/O. ++ * @free_group: If writing, the group that was used in allocating the page ++ * and which will be used in freeing the page from the completion ++ * routine. ++ * ++ * Based on Patrick Mochell's pmdisk code from long ago: "Straight from the ++ * textbook - allocate and initialize the bio. If we're writing, make sure ++ * the page is marked as dirty. Then submit it and carry on." 
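++ *
++ * For writes, throttle_if_needed() may first wait until enough memory is
++ * free, and the bio allocation itself is retried after do_bio_wait() if
++ * it fails. The count of in-flight I/O is updated so that the maximum
++ * numbers of outstanding reads and writes can be reported in the debug
++ * statistics.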
++ * ++ * If we're just testing the speed of our own code, we fake having done all ++ * the hard work and all toi_end_bio immediately. ++ **/ ++static int submit(int writing, struct block_device *dev, sector_t first_block, ++ struct page *page, int free_group) ++{ ++ struct bio *bio = NULL; ++ int cur_outstanding_io, result; ++ ++ /* ++ * Shouldn't throttle if reading - can deadlock in the single ++ * threaded case as pages are only freed when we use the ++ * readahead. ++ */ ++ if (writing) { ++ result = throttle_if_needed(MEMORY_ONLY | THROTTLE_WAIT); ++ if (result) ++ return result; ++ } ++ ++ while (!bio) { ++ bio = bio_alloc(TOI_ATOMIC_GFP, 1); ++ if (!bio) { ++ set_free_mem_throttle(); ++ do_bio_wait(1); ++ } ++ } ++ ++ bio->bi_bdev = dev; ++ bio->bi_sector = first_block; ++ bio->bi_private = (void *) ((unsigned long) free_group); ++ bio->bi_end_io = toi_end_bio; ++ ++ if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { ++ printk(KERN_DEBUG "ERROR: adding page to bio at %lld\n", ++ (unsigned long long) first_block); ++ bio_put(bio); ++ return -EFAULT; ++ } ++ ++ bio_get(bio); ++ ++ cur_outstanding_io = atomic_add_return(1, &toi_io_in_progress); ++ if (writing) { ++ if (cur_outstanding_io > max_outstanding_writes) ++ max_outstanding_writes = cur_outstanding_io; ++ } else { ++ if (cur_outstanding_io > max_outstanding_reads) ++ max_outstanding_reads = cur_outstanding_io; ++ } ++ ++ ++ /* Still read the header! */ ++ if (unlikely(test_action_state(TOI_TEST_BIO) && writing)) { ++ /* Fake having done the hard work */ ++ set_bit(BIO_UPTODATE, &bio->bi_flags); ++ toi_end_bio(bio, 0); ++ } else ++ submit_bio(writing | REQ_TOI | REQ_SYNC, bio); ++ ++ return 0; ++} ++ ++/** ++ * toi_do_io: Prepare to do some i/o on a page and submit or batch it. ++ * ++ * @writing: Whether reading or writing. ++ * @bdev: The block device which we're using. ++ * @block0: The first sector we're reading or writing. ++ * @page: The page on which I/O is being done. ++ * @readahead_index: If doing readahead, the index (reset this flag when done). ++ * @syncio: Whether the i/o is being done synchronously. ++ * ++ * Prepare and start a read or write operation. ++ * ++ * Note that we always work with our own page. If writing, we might be given a ++ * compression buffer that will immediately be used to start compressing the ++ * next page. For reading, we do readahead and therefore don't know the final ++ * address where the data needs to go. ++ **/ ++int toi_do_io(int writing, struct block_device *bdev, long block0, ++ struct page *page, int is_readahead, int syncio, int free_group) ++{ ++ page->private = 0; ++ ++ /* Do here so we don't race against toi_bio_get_next_page_read */ ++ lock_page(page); ++ ++ if (is_readahead) { ++ if (readahead_list_head) ++ readahead_list_tail->private = (unsigned long) page; ++ else ++ readahead_list_head = page; ++ ++ readahead_list_tail = page; ++ } ++ ++ /* Done before submitting to avoid races. */ ++ if (syncio) ++ waiting_on = page; ++ ++ /* Submit the page */ ++ get_page(page); ++ ++ if (submit(writing, bdev, block0, page, free_group)) ++ return -EFAULT; ++ ++ if (syncio) ++ do_bio_wait(2); ++ ++ return 0; ++} ++ ++/** ++ * toi_bdev_page_io - simpler interface to do directly i/o on a single page ++ * @writing: Whether reading or writing. ++ * @bdev: Block device on which we're operating. ++ * @pos: Sector at which page to read or write starts. ++ * @page: Page to be read/written. ++ * ++ * A simple interface to submit a page of I/O and wait for its completion. 
++ * The caller must free the page used. ++ **/ ++static int toi_bdev_page_io(int writing, struct block_device *bdev, ++ long pos, struct page *page) ++{ ++ return toi_do_io(writing, bdev, pos, page, 0, 1, 0); ++} ++ ++/** ++ * toi_bio_memory_needed - report the amount of memory needed for block i/o ++ * ++ * We want to have at least enough memory so as to have target_outstanding_io ++ * or more transactions on the fly at once. If we can do more, fine. ++ **/ ++static int toi_bio_memory_needed(void) ++{ ++ return target_outstanding_io * (PAGE_SIZE + sizeof(struct request) + ++ sizeof(struct bio)); ++} ++ ++/** ++ * toi_bio_print_debug_stats - put out debugging info in the buffer provided ++ * @buffer: A buffer of size @size into which text should be placed. ++ * @size: The size of @buffer. ++ * ++ * Fill a buffer with debugging info. This is used for both our debug_info sysfs ++ * entry and for recording the same info in dmesg. ++ **/ ++static int toi_bio_print_debug_stats(char *buffer, int size) ++{ ++ int len = 0; ++ ++ if (toiActiveAllocator != &toi_blockwriter_ops) { ++ len = scnprintf(buffer, size, ++ "- Block I/O inactive.\n"); ++ return len; ++ } ++ ++ len = scnprintf(buffer, size, "- Block I/O active.\n"); ++ ++ len += toi_bio_chains_debug_info(buffer + len, size - len); ++ ++ len += scnprintf(buffer + len, size - len, ++ "- Max outstanding reads %d. Max writes %d.\n", ++ max_outstanding_reads, max_outstanding_writes); ++ ++ len += scnprintf(buffer + len, size - len, ++ " Memory_needed: %d x (%lu + %u + %u) = %d bytes.\n", ++ target_outstanding_io, ++ PAGE_SIZE, (unsigned int) sizeof(struct request), ++ (unsigned int) sizeof(struct bio), toi_bio_memory_needed()); ++ ++#ifdef MEASURE_MUTEX_CONTENTION ++ { ++ int i; ++ ++ len += scnprintf(buffer + len, size - len, ++ " Mutex contention while reading:\n Contended Free\n"); ++ ++ for_each_online_cpu(i) ++ len += scnprintf(buffer + len, size - len, ++ " %9lu %9lu\n", ++ mutex_times[0][0][i], mutex_times[0][1][i]); ++ ++ len += scnprintf(buffer + len, size - len, ++ " Mutex contention while writing:\n Contended Free\n"); ++ ++ for_each_online_cpu(i) ++ len += scnprintf(buffer + len, size - len, ++ " %9lu %9lu\n", ++ mutex_times[1][0][i], mutex_times[1][1][i]); ++ ++ } ++#endif ++ ++ return len + scnprintf(buffer + len, size - len, ++ " Free mem throttle point reached %d.\n", free_mem_throttle); ++} ++ ++static int total_header_bytes; ++static int unowned; ++ ++void debug_broken_header(void) ++{ ++ printk(KERN_DEBUG "Image header too big for size allocated!\n"); ++ print_toi_header_storage_for_modules(); ++ printk(KERN_DEBUG "Page flags : %d.\n", toi_pageflags_space_needed()); ++ printk(KERN_DEBUG "toi_header : %zu.\n", sizeof(struct toi_header)); ++ printk(KERN_DEBUG "Total unowned : %d.\n", unowned); ++ printk(KERN_DEBUG "Total used : %d (%ld pages).\n", total_header_bytes, ++ DIV_ROUND_UP(total_header_bytes, PAGE_SIZE)); ++ printk(KERN_DEBUG "Space needed now : %ld.\n", ++ get_header_storage_needed()); ++ dump_block_chains(); ++ abort_hibernate(TOI_HEADER_TOO_BIG, "Header reservation too small."); ++} ++ ++/** ++ * toi_rw_init - prepare to read or write a stream in the image ++ * @writing: Whether reading or writing. ++ * @stream number: Section of the image being processed. ++ * ++ * Prepare to read or write a section ('stream') in the image. 
++ **/ ++static int toi_rw_init(int writing, int stream_number) ++{ ++ if (stream_number) ++ toi_extent_state_restore(stream_number); ++ else ++ toi_extent_state_goto_start(); ++ ++ if (writing) { ++ reset_idx = 0; ++ if (!current_stream) ++ page_idx = 0; ++ } else { ++ reset_idx = 1; ++ } ++ ++ atomic_set(&toi_io_done, 0); ++ if (!toi_writer_buffer) ++ toi_writer_buffer = (char *) toi_get_zeroed_page(11, ++ TOI_ATOMIC_GFP); ++ toi_writer_buffer_posn = writing ? 0 : PAGE_SIZE; ++ ++ current_stream = stream_number; ++ ++ more_readahead = 1; ++ ++ return toi_writer_buffer ? 0 : -ENOMEM; ++} ++ ++/** ++ * toi_bio_queue_write - queue a page for writing ++ * @full_buffer: Pointer to a page to be queued ++ * ++ * Add a page to the queue to be submitted. If we're the queue flusher, ++ * we'll do this once we've dropped toi_bio_mutex, so other threads can ++ * continue to submit I/O while we're on the slow path doing the actual ++ * submission. ++ **/ ++static void toi_bio_queue_write(char **full_buffer) ++{ ++ struct page *page = virt_to_page(*full_buffer); ++ unsigned long flags; ++ ++ *full_buffer = NULL; ++ page->private = 0; ++ ++ spin_lock_irqsave(&bio_queue_lock, flags); ++ if (!bio_queue_head) ++ bio_queue_head = page; ++ else ++ bio_queue_tail->private = (unsigned long) page; ++ ++ bio_queue_tail = page; ++ atomic_inc(&toi_bio_queue_size); ++ ++ spin_unlock_irqrestore(&bio_queue_lock, flags); ++ wake_up(&toi_io_queue_flusher); ++} ++ ++/** ++ * toi_rw_cleanup - Cleanup after i/o. ++ * @writing: Whether we were reading or writing. ++ * ++ * Flush all I/O and clean everything up after reading or writing a ++ * section of the image. ++ **/ ++static int toi_rw_cleanup(int writing) ++{ ++ int i, result = 0; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_rw_cleanup."); ++ if (writing) { ++ if (toi_writer_buffer_posn && !test_result_state(TOI_ABORTED)) ++ toi_bio_queue_write(&toi_writer_buffer); ++ ++ while (bio_queue_head && !result) ++ result = toi_bio_queue_flush_pages(0); ++ ++ if (result) ++ return result; ++ ++ if (current_stream == 2) ++ toi_extent_state_save(1); ++ else if (current_stream == 1) ++ toi_extent_state_save(3); ++ } ++ ++ result = toi_finish_all_io(); ++ ++ while (readahead_list_head) { ++ void *next = (void *) readahead_list_head->private; ++ toi__free_page(12, readahead_list_head); ++ readahead_list_head = next; ++ } ++ ++ readahead_list_tail = NULL; ++ ++ if (!current_stream) ++ return result; ++ ++ for (i = 0; i < NUM_REASONS; i++) { ++ if (!atomic_read(&reasons[i])) ++ continue; ++ printk(KERN_DEBUG "Waited for i/o due to %s %d times.\n", ++ reason_name[i], atomic_read(&reasons[i])); ++ atomic_set(&reasons[i], 0); ++ } ++ ++ current_stream = 0; ++ return result; ++} ++ ++/** ++ * toi_start_one_readahead - start one page of readahead ++ * @dedicated_thread: Is this a thread dedicated to doing readahead? ++ * ++ * Start one new page of readahead. If this is being called by a thread ++ * whose only just is to submit readahead, don't quit because we failed ++ * to allocate a page. ++ **/ ++static int toi_start_one_readahead(int dedicated_thread) ++{ ++ char *buffer = NULL; ++ int oom = 0, result; ++ ++ result = throttle_if_needed(dedicated_thread ? 
THROTTLE_WAIT : 0); ++ if (result) ++ return result; ++ ++ mutex_lock(&toi_bio_readahead_mutex); ++ ++ while (!buffer) { ++ buffer = (char *) toi_get_zeroed_page(12, ++ TOI_ATOMIC_GFP); ++ if (!buffer) { ++ if (oom && !dedicated_thread) { ++ mutex_unlock(&toi_bio_readahead_mutex); ++ return -ENOMEM; ++ } ++ ++ oom = 1; ++ set_free_mem_throttle(); ++ do_bio_wait(5); ++ } ++ } ++ ++ result = toi_bio_rw_page(READ, virt_to_page(buffer), 1, 0); ++ if (result == -ENOSPC) ++ toi__free_page(12, virt_to_page(buffer)); ++ mutex_unlock(&toi_bio_readahead_mutex); ++ if (result) { ++ if (result == -ENOSPC) ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, ++ "Last readahead page submitted."); ++ else ++ printk(KERN_DEBUG "toi_bio_rw_page returned %d.\n", ++ result); ++ } ++ return result; ++} ++ ++/** ++ * toi_start_new_readahead - start new readahead ++ * @dedicated_thread: Are we dedicated to this task? ++ * ++ * Start readahead of image pages. ++ * ++ * We can be called as a thread dedicated to this task (may be helpful on ++ * systems with lots of CPUs), in which case we don't exit until there's no ++ * more readahead. ++ * ++ * If this is not called by a dedicated thread, we top up our queue until ++ * there's no more readahead to submit, we've submitted the number given ++ * in target_outstanding_io or the number in progress exceeds the target ++ * outstanding I/O value. ++ * ++ * No mutex needed because this is only ever called by the first cpu. ++ **/ ++static int toi_start_new_readahead(int dedicated_thread) ++{ ++ int last_result, num_submitted = 0; ++ ++ /* Start a new readahead? */ ++ if (!more_readahead) ++ return 0; ++ ++ do { ++ last_result = toi_start_one_readahead(dedicated_thread); ++ ++ if (last_result) { ++ if (last_result == -ENOMEM || last_result == -ENOSPC) ++ return 0; ++ ++ printk(KERN_DEBUG ++ "Begin read chunk returned %d.\n", ++ last_result); ++ } else ++ num_submitted++; ++ ++ } while (more_readahead && !last_result && ++ (dedicated_thread || ++ (num_submitted < target_outstanding_io && ++ atomic_read(&toi_io_in_progress) < target_outstanding_io))); ++ ++ return last_result; ++} ++ ++/** ++ * bio_io_flusher - start the dedicated I/O flushing routine ++ * @writing: Whether we're writing the image. ++ **/ ++static int bio_io_flusher(int writing) ++{ ++ ++ if (writing) ++ return toi_bio_queue_flush_pages(1); ++ else ++ return toi_start_new_readahead(1); ++} ++ ++/** ++ * toi_bio_get_next_page_read - read a disk page, perhaps with readahead ++ * @no_readahead: Whether we can use readahead ++ * ++ * Read a page from disk, submitting readahead and cleaning up finished i/o ++ * while we wait for the page we're after. ++ **/ ++static int toi_bio_get_next_page_read(int no_readahead) ++{ ++ char *virt; ++ struct page *old_readahead_list_head; ++ ++ /* ++ * When reading the second page of the header, we have to ++ * delay submitting the read until after we've gotten the ++ * extents out of the first page. ++ */ ++ if (unlikely(no_readahead && toi_start_one_readahead(0))) { ++ printk(KERN_EMERG "No readahead and toi_start_one_readahead " ++ "returned non-zero.\n"); ++ return -EIO; ++ } ++ ++ if (unlikely(!readahead_list_head)) { ++ /* ++ * If the last page finishes exactly on the page ++ * boundary, we will be called one extra time and ++ * have no data to return. In this case, we should ++ * not BUG(), like we used to! 
++ */ ++ if (!more_readahead) { ++ printk(KERN_EMERG "No more readahead.\n"); ++ return -ENOSPC; ++ } ++ if (unlikely(toi_start_one_readahead(0))) { ++ printk(KERN_EMERG "No readahead and " ++ "toi_start_one_readahead returned non-zero.\n"); ++ return -EIO; ++ } ++ } ++ ++ if (PageLocked(readahead_list_head)) { ++ waiting_on = readahead_list_head; ++ do_bio_wait(0); ++ } ++ ++ virt = page_address(readahead_list_head); ++ memcpy(toi_writer_buffer, virt, PAGE_SIZE); ++ ++ mutex_lock(&toi_bio_readahead_mutex); ++ old_readahead_list_head = readahead_list_head; ++ readahead_list_head = (struct page *) readahead_list_head->private; ++ mutex_unlock(&toi_bio_readahead_mutex); ++ toi__free_page(12, old_readahead_list_head); ++ return 0; ++} ++ ++/** ++ * toi_bio_queue_flush_pages - flush the queue of pages queued for writing ++ * @dedicated_thread: Whether we're a dedicated thread ++ * ++ * Flush the queue of pages ready to be written to disk. ++ * ++ * If we're a dedicated thread, stay in here until told to leave, ++ * sleeping in wait_event. ++ * ++ * The first thread is normally the only one to come in here. Another ++ * thread can enter this routine too, though, via throttle_if_needed. ++ * Since that's the case, we must be careful to only have one thread ++ * doing this work at a time. Otherwise we have a race and could save ++ * pages out of order. ++ * ++ * If an error occurs, free all remaining pages without submitting them ++ * for I/O. ++ **/ ++ ++int toi_bio_queue_flush_pages(int dedicated_thread) ++{ ++ unsigned long flags; ++ int result = 0; ++ static DEFINE_MUTEX(busy); ++ ++ if (!mutex_trylock(&busy)) ++ return 0; ++ ++top: ++ spin_lock_irqsave(&bio_queue_lock, flags); ++ while (bio_queue_head) { ++ struct page *page = bio_queue_head; ++ bio_queue_head = (struct page *) page->private; ++ if (bio_queue_tail == page) ++ bio_queue_tail = NULL; ++ atomic_dec(&toi_bio_queue_size); ++ spin_unlock_irqrestore(&bio_queue_lock, flags); ++ ++ /* Don't generate more error messages if already had one */ ++ if (!result) ++ result = toi_bio_rw_page(WRITE, page, 0, 11); ++ /* ++ * If writing the page failed, don't drop out. ++ * Flush the rest of the queue too. ++ */ ++ if (result) ++ toi__free_page(11 , page); ++ spin_lock_irqsave(&bio_queue_lock, flags); ++ } ++ spin_unlock_irqrestore(&bio_queue_lock, flags); ++ ++ if (dedicated_thread) { ++ wait_event(toi_io_queue_flusher, bio_queue_head || ++ toi_bio_queue_flusher_should_finish); ++ if (likely(!toi_bio_queue_flusher_should_finish)) ++ goto top; ++ toi_bio_queue_flusher_should_finish = 0; ++ } ++ ++ mutex_unlock(&busy); ++ return result; ++} ++ ++/** ++ * toi_bio_get_new_page - get a new page for I/O ++ * @full_buffer: Pointer to a page to allocate. ++ **/ ++static int toi_bio_get_new_page(char **full_buffer) ++{ ++ int result = throttle_if_needed(THROTTLE_WAIT); ++ if (result) ++ return result; ++ ++ while (!*full_buffer) { ++ *full_buffer = (char *) toi_get_zeroed_page(11, TOI_ATOMIC_GFP); ++ if (!*full_buffer) { ++ set_free_mem_throttle(); ++ do_bio_wait(3); ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * toi_rw_buffer - combine smaller buffers into PAGE_SIZE I/O ++ * @writing: Bool - whether writing (or reading). ++ * @buffer: The start of the buffer to write or fill. ++ * @buffer_size: The size of the buffer to write or fill. ++ * @no_readahead: Don't try to start readhead (when getting extents). 
++ **/ ++static int toi_rw_buffer(int writing, char *buffer, int buffer_size, ++ int no_readahead) ++{ ++ int bytes_left = buffer_size, result = 0; ++ ++ while (bytes_left) { ++ char *source_start = buffer + buffer_size - bytes_left; ++ char *dest_start = toi_writer_buffer + toi_writer_buffer_posn; ++ int capacity = PAGE_SIZE - toi_writer_buffer_posn; ++ char *to = writing ? dest_start : source_start; ++ char *from = writing ? source_start : dest_start; ++ ++ if (bytes_left <= capacity) { ++ memcpy(to, from, bytes_left); ++ toi_writer_buffer_posn += bytes_left; ++ return 0; ++ } ++ ++ /* Complete this page and start a new one */ ++ memcpy(to, from, capacity); ++ bytes_left -= capacity; ++ ++ if (!writing) { ++ /* ++ * Perform actual I/O: ++ * read readahead_list_head into toi_writer_buffer ++ */ ++ int result = toi_bio_get_next_page_read(no_readahead); ++ if (result) { ++ printk("toi_bio_get_next_page_read " ++ "returned %d.\n", result); ++ return result; ++ } ++ } else { ++ toi_bio_queue_write(&toi_writer_buffer); ++ result = toi_bio_get_new_page(&toi_writer_buffer); ++ if (result) { ++ printk(KERN_ERR "toi_bio_get_new_page returned " ++ "%d.\n", result); ++ return result; ++ } ++ } ++ ++ toi_writer_buffer_posn = 0; ++ toi_cond_pause(0, NULL); ++ } ++ ++ return 0; ++} ++ ++/** ++ * toi_bio_read_page - read a page of the image ++ * @pfn: The pfn where the data belongs. ++ * @buffer_page: The page containing the (possibly compressed) data. ++ * @buf_size: The number of bytes on @buffer_page used (PAGE_SIZE). ++ * ++ * Read a (possibly compressed) page from the image, into buffer_page, ++ * returning its pfn and the buffer size. ++ **/ ++static int toi_bio_read_page(unsigned long *pfn, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ int result = 0; ++ int this_idx; ++ char *buffer_virt = TOI_MAP(buf_type, buffer_page); ++ ++ /* ++ * Only call start_new_readahead if we don't have a dedicated thread ++ * and we're the queue flusher. ++ */ ++ if (current == toi_queue_flusher && more_readahead && ++ !test_action_state(TOI_NO_READAHEAD)) { ++ int result2 = toi_start_new_readahead(0); ++ if (result2) { ++ printk(KERN_DEBUG "Queue flusher and " ++ "toi_start_one_readahead returned non-zero.\n"); ++ result = -EIO; ++ goto out; ++ } ++ } ++ ++ my_mutex_lock(0, &toi_bio_mutex); ++ ++ /* ++ * Structure in the image: ++ * [destination pfn|page size|page data] ++ * buf_size is PAGE_SIZE ++ * We can validly find there's nothing to read in a multithreaded ++ * situation. ++ */ ++ if (toi_rw_buffer(READ, (char *) &this_idx, sizeof(int), 0) || ++ toi_rw_buffer(READ, (char *) pfn, sizeof(unsigned long), 0) || ++ toi_rw_buffer(READ, (char *) buf_size, sizeof(int), 0) || ++ toi_rw_buffer(READ, buffer_virt, *buf_size, 0)) { ++ result = -ENODATA; ++ goto out_unlock; ++ } ++ ++ if (reset_idx) { ++ page_idx = this_idx; ++ reset_idx = 0; ++ } else { ++ page_idx++; ++ if (!this_idx) ++ result = -ENODATA; ++ else if (page_idx != this_idx) ++ printk(KERN_ERR "Got page index %d, expected %d.\n", ++ this_idx, page_idx); ++ } ++ ++out_unlock: ++ my_mutex_unlock(0, &toi_bio_mutex); ++out: ++ TOI_UNMAP(buf_type, buffer_page); ++ return result; ++} ++ ++/** ++ * toi_bio_write_page - write a page of the image ++ * @pfn: The pfn where the data belongs. ++ * @buffer_page: The page containing the (possibly compressed) data. ++ * @buf_size: The number of bytes on @buffer_page used. ++ * ++ * Write a (possibly compressed) page to the image from the buffer, together ++ * with it's index and buffer size. 
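The on-image record layout documented here ([index|pfn|size|data], packed into PAGE_SIZE buffers by toi_rw_buffer()) can be illustrated with a small standalone sketch. The code below is plain userspace C under invented names (pack_bytes, flush_page) and simplifies a few details; in the TuxOnIce code the flush step queues the full page for block I/O (toi_bio_queue_write()) and obtains a fresh buffer rather than just counting pages.

#include <stdio.h>
#include <string.h>

#define EXAMPLE_PAGE_SIZE 4096

static unsigned char page_buf[EXAMPLE_PAGE_SIZE];
static int buf_posn, pages_flushed;

static void flush_page(void)
{
	pages_flushed++;		/* stand-in for queueing the page for I/O */
	buf_posn = 0;
}

/* Copy a record of arbitrary size into consecutive page-sized buffers. */
static void pack_bytes(const void *src, int len)
{
	const unsigned char *from = src;

	while (len) {
		int space = EXAMPLE_PAGE_SIZE - buf_posn;
		int chunk = len < space ? len : space;

		memcpy(page_buf + buf_posn, from, chunk);
		buf_posn += chunk;
		from += chunk;
		len -= chunk;
		if (buf_posn == EXAMPLE_PAGE_SIZE)
			flush_page();
	}
}

int main(void)
{
	/* One image record: [index | pfn | size | data], as described above. */
	int index = 1;
	unsigned long pfn = 0x1234;
	int size = EXAMPLE_PAGE_SIZE;
	unsigned char data[EXAMPLE_PAGE_SIZE] = { 0 };

	pack_bytes(&index, sizeof(index));
	pack_bytes(&pfn, sizeof(pfn));
	pack_bytes(&size, sizeof(size));
	pack_bytes(data, size);

	/* One full page gets flushed; the header bytes that spilled over stay pending. */
	printf("flushed %d full page(s), %d bytes pending\n",
	       pages_flushed, buf_posn);
	return 0;
}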
++ **/ ++static int toi_bio_write_page(unsigned long pfn, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ char *buffer_virt; ++ int result = 0, result2 = 0; ++ ++ if (unlikely(test_action_state(TOI_TEST_FILTER_SPEED))) ++ return 0; ++ ++ my_mutex_lock(1, &toi_bio_mutex); ++ ++ if (test_result_state(TOI_ABORTED)) { ++ my_mutex_unlock(1, &toi_bio_mutex); ++ return 0; ++ } ++ ++ buffer_virt = TOI_MAP(buf_type, buffer_page); ++ page_idx++; ++ ++ /* ++ * Structure in the image: ++ * [destination pfn|page size|page data] ++ * buf_size is PAGE_SIZE ++ */ ++ if (toi_rw_buffer(WRITE, (char *) &page_idx, sizeof(int), 0) || ++ toi_rw_buffer(WRITE, (char *) &pfn, sizeof(unsigned long), 0) || ++ toi_rw_buffer(WRITE, (char *) &buf_size, sizeof(int), 0) || ++ toi_rw_buffer(WRITE, buffer_virt, buf_size, 0)) { ++ printk(KERN_DEBUG "toi_rw_buffer returned non-zero to " ++ "toi_bio_write_page.\n"); ++ result = -EIO; ++ } ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ my_mutex_unlock(1, &toi_bio_mutex); ++ ++ if (current == toi_queue_flusher) ++ result2 = toi_bio_queue_flush_pages(0); ++ ++ return result ? result : result2; ++} ++ ++/** ++ * _toi_rw_header_chunk - read or write a portion of the image header ++ * @writing: Whether reading or writing. ++ * @owner: The module for which we're writing. ++ * Used for confirming that modules ++ * don't use more header space than they asked for. ++ * @buffer: Address of the data to write. ++ * @buffer_size: Size of the data buffer. ++ * @no_readahead: Don't try to start readhead (when getting extents). ++ * ++ * Perform PAGE_SIZE I/O. Start readahead if needed. ++ **/ ++static int _toi_rw_header_chunk(int writing, struct toi_module_ops *owner, ++ char *buffer, int buffer_size, int no_readahead) ++{ ++ int result = 0; ++ ++ if (owner) { ++ owner->header_used += buffer_size; ++ toi_message(TOI_HEADER, TOI_LOW, 1, ++ "Header: %s : %d bytes (%d/%d) from offset %d.", ++ owner->name, ++ buffer_size, owner->header_used, ++ owner->header_requested, ++ toi_writer_buffer_posn); ++ if (owner->header_used > owner->header_requested && writing) { ++ printk(KERN_EMERG "TuxOnIce module %s is using more " ++ "header space (%u) than it requested (%u).\n", ++ owner->name, ++ owner->header_used, ++ owner->header_requested); ++ return buffer_size; ++ } ++ } else { ++ unowned += buffer_size; ++ toi_message(TOI_HEADER, TOI_LOW, 1, ++ "Header: (No owner): %d bytes (%d total so far) from " ++ "offset %d.", buffer_size, unowned, ++ toi_writer_buffer_posn); ++ } ++ ++ if (!writing && !no_readahead && more_readahead) { ++ result = toi_start_new_readahead(0); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Start new readahead " ++ "returned %d.", result); ++ } ++ ++ if (!result) { ++ result = toi_rw_buffer(writing, buffer, buffer_size, ++ no_readahead); ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "rw_buffer returned " ++ "%d.", result); ++ } ++ ++ total_header_bytes += buffer_size; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "_toi_rw_header_chunk returning " ++ "%d.", result); ++ return result; ++} ++ ++static int toi_rw_header_chunk(int writing, struct toi_module_ops *owner, ++ char *buffer, int size) ++{ ++ return _toi_rw_header_chunk(writing, owner, buffer, size, 1); ++} ++ ++static int toi_rw_header_chunk_noreadahead(int writing, ++ struct toi_module_ops *owner, char *buffer, int size) ++{ ++ return _toi_rw_header_chunk(writing, owner, buffer, size, 1); ++} ++ ++/** ++ * toi_bio_storage_needed - get the amount of storage needed for my fns ++ **/ ++static int toi_bio_storage_needed(void) ++{ 
++ return sizeof(int) + PAGE_SIZE + toi_bio_devinfo_storage_needed(); ++} ++ ++/** ++ * toi_bio_save_config_info - save block I/O config to image header ++ * @buf: PAGE_SIZE'd buffer into which data should be saved. ++ **/ ++static int toi_bio_save_config_info(char *buf) ++{ ++ int *ints = (int *) buf; ++ ints[0] = target_outstanding_io; ++ return sizeof(int); ++} ++ ++/** ++ * toi_bio_load_config_info - restore block I/O config ++ * @buf: Data to be reloaded. ++ * @size: Size of the buffer saved. ++ **/ ++static void toi_bio_load_config_info(char *buf, int size) ++{ ++ int *ints = (int *) buf; ++ target_outstanding_io = ints[0]; ++} ++ ++void close_resume_dev_t(int force) ++{ ++ if (!resume_block_device) ++ return; ++ ++ if (force) ++ atomic_set(&resume_bdev_open_count, 0); ++ else ++ atomic_dec(&resume_bdev_open_count); ++ ++ if (!atomic_read(&resume_bdev_open_count)) { ++ toi_close_bdev(resume_block_device); ++ resume_block_device = NULL; ++ } ++} ++ ++int open_resume_dev_t(int force, int quiet) ++{ ++ if (force) { ++ close_resume_dev_t(1); ++ atomic_set(&resume_bdev_open_count, 1); ++ } else ++ atomic_inc(&resume_bdev_open_count); ++ ++ if (resume_block_device) ++ return 0; ++ ++ resume_block_device = toi_open_bdev(NULL, resume_dev_t, 0); ++ if (IS_ERR(resume_block_device)) { ++ if (!quiet) ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ "Failed to open device %x, where" ++ " the header should be found.", ++ resume_dev_t); ++ resume_block_device = NULL; ++ atomic_set(&resume_bdev_open_count, 0); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * toi_bio_initialise - initialise bio code at start of some action ++ * @starting_cycle: Whether starting a hibernation cycle, or just reading or ++ * writing a sysfs value. ++ **/ ++static int toi_bio_initialise(int starting_cycle) ++{ ++ int result; ++ ++ if (!starting_cycle || !resume_dev_t) ++ return 0; ++ ++ max_outstanding_writes = 0; ++ max_outstanding_reads = 0; ++ current_stream = 0; ++ toi_queue_flusher = current; ++#ifdef MEASURE_MUTEX_CONTENTION ++ { ++ int i, j, k; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 2; j++) ++ for_each_online_cpu(k) ++ mutex_times[i][j][k] = 0; ++ } ++#endif ++ result = open_resume_dev_t(0, 1); ++ ++ if (result) ++ return result; ++ ++ return get_signature_page(); ++} ++ ++static unsigned long raw_to_real(unsigned long raw) ++{ ++ unsigned long extra; ++ ++ extra = (raw * (sizeof(unsigned long) + sizeof(int)) + ++ (PAGE_SIZE + sizeof(unsigned long) + sizeof(int) + 1)) / ++ (PAGE_SIZE + sizeof(unsigned long) + sizeof(int)); ++ ++ return raw > extra ? raw - extra : 0; ++} ++ ++static unsigned long toi_bio_storage_available(void) ++{ ++ unsigned long sum = 0; ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type != BIO_ALLOCATOR_MODULE) ++ continue; ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Seeking storage " ++ "available from %s.", this_module->name); ++ sum += this_module->bio_allocator_ops->storage_available(); ++ } ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Total storage available is %lu " ++ "pages (%d header pages).", sum, header_pages_reserved); ++ ++ return sum > header_pages_reserved ? ++ raw_to_real(sum - header_pages_reserved) : 0; ++ ++} ++ ++static unsigned long toi_bio_storage_allocated(void) ++{ ++ return raw_pages_allocd > header_pages_reserved ? 
++ raw_to_real(raw_pages_allocd - header_pages_reserved) : 0; ++} ++ ++/* ++ * If we have read part of the image, we might have filled memory with ++ * data that should be zeroed out. ++ */ ++static void toi_bio_noresume_reset(void) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_noresume_reset."); ++ toi_rw_cleanup(READ); ++ free_all_bdev_info(); ++} ++ ++/** ++ * toi_bio_cleanup - cleanup after some action ++ * @finishing_cycle: Whether completing a cycle. ++ **/ ++static void toi_bio_cleanup(int finishing_cycle) ++{ ++ if (!finishing_cycle) ++ return; ++ ++ if (toi_writer_buffer) { ++ toi_free_page(11, (unsigned long) toi_writer_buffer); ++ toi_writer_buffer = NULL; ++ } ++ ++ forget_signature_page(); ++ ++ if (header_block_device && toi_sig_data && ++ toi_sig_data->header_dev_t != resume_dev_t) ++ toi_close_bdev(header_block_device); ++ ++ header_block_device = NULL; ++ ++ close_resume_dev_t(0); ++} ++ ++static int toi_bio_write_header_init(void) ++{ ++ int result; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_write_header_init"); ++ toi_rw_init(WRITE, 0); ++ toi_writer_buffer_posn = 0; ++ ++ /* Info needed to bootstrap goes at the start of the header. ++ * First we save the positions and devinfo, including the number ++ * of header pages. Then we save the structs containing data needed ++ * for reading the header pages back. ++ * Note that even if header pages take more than one page, when we ++ * read back the info, we will have restored the location of the ++ * next header page by the time we go to use it. ++ */ ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise extent chains."); ++ result = toi_serialise_extent_chains(); ++ ++ if (result) ++ return result; ++ ++ /* ++ * Signature page hasn't been modified at this point. Write it in ++ * the header so we can restore it later. ++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "serialise signature page."); ++ return toi_rw_header_chunk_noreadahead(WRITE, &toi_blockwriter_ops, ++ (char *) toi_cur_sig_page, ++ PAGE_SIZE); ++} ++ ++static int toi_bio_write_header_cleanup(void) ++{ ++ int result = 0; ++ ++ if (toi_writer_buffer_posn) ++ toi_bio_queue_write(&toi_writer_buffer); ++ ++ result = toi_finish_all_io(); ++ ++ unowned = 0; ++ total_header_bytes = 0; ++ ++ /* Set signature to save we have an image */ ++ if (!result) ++ result = toi_bio_mark_have_image(); ++ ++ return result; ++} ++ ++/* ++ * toi_bio_read_header_init() ++ * ++ * Description: ++ * 1. Attempt to read the device specified with resume=. ++ * 2. Check the contents of the swap header for our signature. ++ * 3. Warn, ignore, reset and/or continue as appropriate. ++ * 4. If continuing, read the toi_swap configuration section ++ * of the header and set up block device info so we can read ++ * the rest of the header & image. ++ * ++ * Returns: ++ * May not return if user choose to reboot at a warning. ++ * -EINVAL if cannot resume at this time. Booting should continue ++ * normally. ++ */ ++ ++static int toi_bio_read_header_init(void) ++{ ++ int result = 0; ++ char buf[32]; ++ ++ toi_writer_buffer_posn = 0; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_init"); ++ ++ if (!toi_sig_data) { ++ printk(KERN_INFO "toi_bio_read_header_init called when we " ++ "haven't verified there is an image!\n"); ++ return -EINVAL; ++ } ++ ++ /* ++ * If the header is not on the resume_swap_dev_t, get the resume device ++ * first. 
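The raw_to_real() helper a little earlier in this file converts raw pages of storage into usable image pages by subtracting the index overhead of sizeof(unsigned long) + sizeof(int) bytes per saved page. A worked example as standalone C, assuming a 64-bit build (12-byte overhead) and 4096-byte pages; the numbers are invented:

#include <stdio.h>

#define EX_PAGE_SIZE 4096UL

static unsigned long ex_raw_to_real(unsigned long raw)
{
	unsigned long per_page = sizeof(unsigned long) + sizeof(int);
	unsigned long extra;

	/* Same rounding expression as raw_to_real() in the patch above. */
	extra = (raw * per_page + (EX_PAGE_SIZE + per_page + 1)) /
		(EX_PAGE_SIZE + per_page);

	return raw > extra ? raw - extra : 0;
}

int main(void)
{
	/* 10000 raw pages -> ~30 pages of index data -> 9970 usable pages. */
	printf("real = %lu\n", ex_raw_to_real(10000));
	return 0;
}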
++ */ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "Header dev_t is %lx.", ++ toi_sig_data->header_dev_t); ++ if (toi_sig_data->have_uuid) { ++ struct fs_info seek; ++ dev_t device; ++ ++ strncpy((char *) seek.uuid, toi_sig_data->header_uuid, 16); ++ seek.dev_t = toi_sig_data->header_dev_t; ++ seek.last_mount_size = 0; ++ device = blk_lookup_fs_info(&seek); ++ if (device) { ++ printk("Using dev_t %s, returned by blk_lookup_fs_info.\n", ++ format_dev_t(buf, device)); ++ toi_sig_data->header_dev_t = device; ++ } ++ } ++ if (toi_sig_data->header_dev_t != resume_dev_t) { ++ header_block_device = toi_open_bdev(NULL, ++ toi_sig_data->header_dev_t, 1); ++ ++ if (IS_ERR(header_block_device)) ++ return PTR_ERR(header_block_device); ++ } else ++ header_block_device = resume_block_device; ++ ++ if (!toi_writer_buffer) ++ toi_writer_buffer = (char *) toi_get_zeroed_page(11, ++ TOI_ATOMIC_GFP); ++ more_readahead = 1; ++ ++ /* ++ * Read toi_swap configuration. ++ * Headerblock size taken into account already. ++ */ ++ result = toi_bio_ops.bdev_page_io(READ, header_block_device, ++ toi_sig_data->first_header_block, ++ virt_to_page((unsigned long) toi_writer_buffer)); ++ if (result) ++ return result; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "load extent chains."); ++ result = toi_load_extent_chains(); ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "load original signature page."); ++ toi_orig_sig_page = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); ++ if (!toi_orig_sig_page) { ++ printk(KERN_ERR "Failed to allocate memory for the current" ++ " image signature.\n"); ++ return -ENOMEM; ++ } ++ ++ return toi_rw_header_chunk_noreadahead(READ, &toi_blockwriter_ops, ++ (char *) toi_orig_sig_page, ++ PAGE_SIZE); ++} ++ ++static int toi_bio_read_header_cleanup(void) ++{ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_read_header_cleanup."); ++ return toi_rw_cleanup(READ); ++} ++ ++/* Works only for digits and letters, but small and fast */ ++#define TOLOWER(x) ((x) | 0x20) ++ ++/* ++ * UUID must be 32 chars long. It may have dashes, but nothing ++ * else. ++ */ ++char *uuid_from_commandline(char *commandline) ++{ ++ int low = 0; ++ char *result = NULL, *output, *ptr; ++ ++ if (strncmp(commandline, "UUID=", 5)) ++ return NULL; ++ ++ result = kzalloc(17, GFP_KERNEL); ++ if (!result) { ++ printk("Failed to kzalloc UUID text memory.\n"); ++ return NULL; ++ } ++ ++ ptr = commandline + 5; ++ output = result; ++ ++ while (*ptr && (output - result) < 16) { ++ if (isxdigit(*ptr)) { ++ int value = isdigit(*ptr) ? *ptr - '0' : ++ TOLOWER(*ptr) - 'a' + 10; ++ if (low) { ++ *output += value; ++ output++; ++ } else { ++ *output = value << 4; ++ } ++ low = !low; ++ } else if (*ptr != '-') ++ break; ++ ptr++; ++ } ++ ++ if ((output - result) < 16 || *ptr) { ++ printk(KERN_DEBUG "Found resume=UUID=, but the value looks " ++ "invalid.\n"); ++ kfree(result); ++ result = NULL; ++ } ++ ++ return result; ++} ++ ++#define retry_if_fails(command) \ ++do { \ ++ command; \ ++ if (!resume_dev_t && !waited_for_device_probe) { \ ++ wait_for_device_probe(); \ ++ command; \ ++ waited_for_device_probe = 1; \ ++ } \ ++} while(0) ++ ++/** ++ * try_to_open_resume_device: Try to parse and open resume= ++ * ++ * Any "swap:" has been stripped away and we just have the path to deal with. ++ * We attempt to do name_to_dev_t, open and stat the file. Having opened the ++ * file, get the struct block_device * to match. 
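uuid_from_commandline() above packs the 32 hex digits of a UUID= argument (dashes permitted) into 16 raw bytes, high nibble first. The following is a userspace sketch of the same nibble packing with hypothetical names; the kernel's allocation, logging and trailing-character checks are omitted.

#include <ctype.h>
#include <stdio.h>
#include <string.h>

static int parse_uuid(const char *arg, unsigned char out[16])
{
	int nibbles = 0;

	if (strncmp(arg, "UUID=", 5))
		return -1;

	for (arg += 5; *arg && nibbles < 32; arg++) {
		int value;

		if (*arg == '-')
			continue;			/* dashes are skipped */
		if (!isxdigit((unsigned char) *arg))
			return -1;

		value = isdigit((unsigned char) *arg) ?
			*arg - '0' : tolower((unsigned char) *arg) - 'a' + 10;

		if (nibbles & 1)
			out[nibbles / 2] |= value;	/* low nibble */
		else
			out[nibbles / 2] = value << 4;	/* high nibble */
		nibbles++;
	}

	return nibbles == 32 ? 0 : -1;
}

int main(void)
{
	unsigned char uuid[16];
	int i;

	if (!parse_uuid("UUID=550e8400-e29b-41d4-a716-446655440000", uuid)) {
		for (i = 0; i < 16; i++)
			printf("%02x", uuid[i]);
		printf("\n");
	}
	return 0;
}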
++ */ ++static int try_to_open_resume_device(char *commandline, int quiet) ++{ ++ struct kstat stat; ++ int error = 0; ++ char *uuid = uuid_from_commandline(commandline); ++ int waited_for_device_probe = 0; ++ ++ resume_dev_t = MKDEV(0, 0); ++ ++ if (!strlen(commandline)) ++ retry_if_fails(toi_bio_scan_for_image(quiet)); ++ ++ if (uuid) { ++ struct fs_info seek; ++ strncpy((char *) &seek.uuid, uuid, 16); ++ seek.dev_t = resume_dev_t; ++ seek.last_mount_size = 0; ++ retry_if_fails(resume_dev_t = blk_lookup_fs_info(&seek)); ++ kfree(uuid); ++ } ++ ++ if (!resume_dev_t) ++ retry_if_fails(resume_dev_t = name_to_dev_t(commandline)); ++ ++ if (!resume_dev_t) { ++ struct file *file = filp_open(commandline, ++ O_RDONLY|O_LARGEFILE, 0); ++ ++ if (!IS_ERR(file) && file) { ++ vfs_getattr(file->f_vfsmnt, file->f_dentry, &stat); ++ filp_close(file, NULL); ++ } else ++ error = vfs_stat(commandline, &stat); ++ if (!error) ++ resume_dev_t = stat.rdev; ++ } ++ ++ if (!resume_dev_t) { ++ if (quiet) ++ return 1; ++ ++ if (test_toi_state(TOI_TRYING_TO_RESUME)) ++ toi_early_boot_message(1, toi_translate_err_default, ++ "Failed to translate \"%s\" into a device id.\n", ++ commandline); ++ else ++ printk("TuxOnIce: Can't translate \"%s\" into a device " ++ "id yet.\n", commandline); ++ return 1; ++ } ++ ++ return open_resume_dev_t(1, quiet); ++} ++ ++/* ++ * Parse Image Location ++ * ++ * Attempt to parse a resume= parameter. ++ * Swap Writer accepts: ++ * resume=[swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE] ++ * ++ * Where: ++ * DEVNAME is convertable to a dev_t by name_to_dev_t ++ * FIRSTBLOCK is the location of the first block in the swap file ++ * (specifying for a swap partition is nonsensical but not prohibited). ++ * Data is validated by attempting to read a swap header from the ++ * location given. Failure will result in toi_swap refusing to ++ * save an image, and a reboot with correct parameters will be ++ * necessary. ++ */ ++static int toi_bio_parse_sig_location(char *commandline, ++ int only_allocator, int quiet) ++{ ++ char *thischar, *devstart, *colon = NULL; ++ int signature_found, result = -EINVAL, temp_result = 0; ++ ++ if (strncmp(commandline, "swap:", 5) && ++ strncmp(commandline, "file:", 5)) { ++ /* ++ * Failing swap:, we'll take a simple resume=/dev/hda2, or a ++ * blank value (scan) but fall through to other allocators ++ * if /dev/ or UUID= isn't matched. ++ */ ++ if (strncmp(commandline, "/dev/", 5) && ++ strncmp(commandline, "UUID=", 5) && ++ strlen(commandline)) ++ return 1; ++ } else ++ commandline += 5; ++ ++ devstart = commandline; ++ thischar = commandline; ++ while ((*thischar != ':') && (*thischar != '@') && ++ ((thischar - commandline) < 250) && (*thischar)) ++ thischar++; ++ ++ if (*thischar == ':') { ++ colon = thischar; ++ *colon = 0; ++ thischar++; ++ } ++ ++ while ((thischar - commandline) < 250 && *thischar) ++ thischar++; ++ ++ if (colon) { ++ unsigned long block; ++ temp_result = strict_strtoul(colon + 1, 0, &block); ++ if (!temp_result) ++ resume_firstblock = (int) block; ++ } else ++ resume_firstblock = 0; ++ ++ clear_toi_state(TOI_CAN_HIBERNATE); ++ clear_toi_state(TOI_CAN_RESUME); ++ ++ if (!temp_result) ++ temp_result = try_to_open_resume_device(devstart, quiet); ++ ++ if (colon) ++ *colon = ':'; ++ ++ /* No error if we only scanned */ ++ if (temp_result) ++ return strlen(commandline) ? 
-EINVAL : 1; ++ ++ signature_found = toi_bio_image_exists(quiet); ++ ++ if (signature_found != -1) { ++ result = 0; ++ /* ++ * TODO: If only file storage, CAN_HIBERNATE should only be ++ * set if file allocator's target is valid. ++ */ ++ set_toi_state(TOI_CAN_HIBERNATE); ++ set_toi_state(TOI_CAN_RESUME); ++ } else ++ if (!quiet) ++ printk(KERN_ERR "TuxOnIce: Block I/O: No " ++ "signature found at %s.\n", devstart); ++ ++ return result; ++} ++ ++static void toi_bio_release_storage(void) ++{ ++ header_pages_reserved = 0; ++ raw_pages_allocd = 0; ++ ++ free_all_bdev_info(); ++} ++ ++/* toi_swap_remove_image ++ * ++ */ ++static int toi_bio_remove_image(void) ++{ ++ int result; ++ ++ toi_message(TOI_BIO, TOI_VERBOSE, 0, "toi_bio_remove_image."); ++ ++ result = toi_bio_restore_original_signature(); ++ ++ /* ++ * We don't do a sanity check here: we want to restore the swap ++ * whatever version of kernel made the hibernate image. ++ * ++ * We need to write swap, but swap may not be enabled so ++ * we write the device directly ++ * ++ * If we don't have an current_signature_page, we didn't ++ * read an image header, so don't change anything. ++ */ ++ ++ toi_bio_release_storage(); ++ ++ return result; ++} ++ ++struct toi_bio_ops toi_bio_ops = { ++ .bdev_page_io = toi_bdev_page_io, ++ .register_storage = toi_register_storage_chain, ++ .free_storage = toi_bio_release_storage, ++}; ++EXPORT_SYMBOL_GPL(toi_bio_ops); ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("target_outstanding_io", SYSFS_RW, &target_outstanding_io, ++ 0, 16384, 0, NULL), ++}; ++ ++struct toi_module_ops toi_blockwriter_ops = { ++ .type = WRITER_MODULE, ++ .name = "block i/o", ++ .directory = "block_io", ++ .module = THIS_MODULE, ++ .memory_needed = toi_bio_memory_needed, ++ .print_debug_info = toi_bio_print_debug_stats, ++ .storage_needed = toi_bio_storage_needed, ++ .save_config_info = toi_bio_save_config_info, ++ .load_config_info = toi_bio_load_config_info, ++ .initialise = toi_bio_initialise, ++ .cleanup = toi_bio_cleanup, ++ .post_atomic_restore = toi_bio_chains_post_atomic, ++ ++ .rw_init = toi_rw_init, ++ .rw_cleanup = toi_rw_cleanup, ++ .read_page = toi_bio_read_page, ++ .write_page = toi_bio_write_page, ++ .rw_header_chunk = toi_rw_header_chunk, ++ .rw_header_chunk_noreadahead = toi_rw_header_chunk_noreadahead, ++ .io_flusher = bio_io_flusher, ++ .update_throughput_throttle = update_throughput_throttle, ++ .finish_all_io = toi_finish_all_io, ++ ++ .noresume_reset = toi_bio_noresume_reset, ++ .storage_available = toi_bio_storage_available, ++ .storage_allocated = toi_bio_storage_allocated, ++ .reserve_header_space = toi_bio_reserve_header_space, ++ .allocate_storage = toi_bio_allocate_storage, ++ .image_exists = toi_bio_image_exists, ++ .mark_resume_attempted = toi_bio_mark_resume_attempted, ++ .write_header_init = toi_bio_write_header_init, ++ .write_header_cleanup = toi_bio_write_header_cleanup, ++ .read_header_init = toi_bio_read_header_init, ++ .read_header_cleanup = toi_bio_read_header_cleanup, ++ .get_header_version = toi_bio_get_header_version, ++ .remove_image = toi_bio_remove_image, ++ .parse_sig_location = toi_bio_parse_sig_location, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/** ++ * toi_block_io_load - load time routine for block I/O module ++ * ++ * Register block i/o ops and sysfs entries. 
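The resume= syntax documented above, [swap:|file:]DEVNAME[:FIRSTBLOCK][@BLOCKSIZE], amounts to stripping an optional allocator prefix and splitting off an optional first-block suffix before the device lookup. A minimal userspace illustration of that string handling; the @BLOCKSIZE part, the 250-character scan limit and the actual device resolution are left out.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int main(void)
{
	char cmdline[] = "swap:/dev/sda2:0x1000";
	char *dev = cmdline;
	char *colon;
	unsigned long first_block = 0;

	/* Strip a recognised allocator prefix, if present. */
	if (!strncmp(dev, "swap:", 5) || !strncmp(dev, "file:", 5))
		dev += 5;

	/* An optional :FIRSTBLOCK suffix follows the device name. */
	colon = strchr(dev, ':');
	if (colon) {
		*colon = '\0';
		first_block = strtoul(colon + 1, NULL, 0);
	}

	printf("device \"%s\", first block %lu\n", dev, first_block);
	return 0;
}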
++ **/ ++static __init int toi_block_io_load(void) ++{ ++ return toi_register_module(&toi_blockwriter_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_block_io_unload(void) ++{ ++ toi_unregister_module(&toi_blockwriter_ops); ++} ++ ++module_init(toi_block_io_load); ++module_exit(toi_block_io_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce block io functions"); ++#else ++late_initcall(toi_block_io_load); ++#endif +diff --git a/kernel/power/tuxonice_bio_internal.h b/kernel/power/tuxonice_bio_internal.h +new file mode 100644 +index 0000000..58c2481 +--- /dev/null ++++ b/kernel/power/tuxonice_bio_internal.h +@@ -0,0 +1,86 @@ ++/* ++ * kernel/power/tuxonice_bio_internal.h ++ * ++ * Copyright (C) 2009-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file contains declarations for functions exported from ++ * tuxonice_bio.c, which contains low level io functions. ++ */ ++ ++/* Extent chains */ ++void toi_extent_state_goto_start(void); ++void toi_extent_state_save(int slot); ++int go_next_page(int writing, int section_barrier); ++void toi_extent_state_restore(int slot); ++void free_all_bdev_info(void); ++int devices_of_same_priority(struct toi_bdev_info *this); ++int toi_register_storage_chain(struct toi_bdev_info *new); ++int toi_serialise_extent_chains(void); ++int toi_load_extent_chains(void); ++int toi_bio_rw_page(int writing, struct page *page, int is_readahead, ++ int free_group); ++int toi_bio_restore_original_signature(void); ++int toi_bio_devinfo_storage_needed(void); ++unsigned long get_headerblock(void); ++dev_t get_header_dev_t(void); ++struct block_device *get_header_bdev(void); ++int toi_bio_allocate_storage(unsigned long request); ++ ++/* Signature functions */ ++#define HaveImage "HaveImage" ++#define NoImage "TuxOnIce" ++#define sig_size (sizeof(HaveImage)) ++ ++struct sig_data { ++ char sig[sig_size]; ++ int have_image; ++ int resumed_before; ++ ++ char have_uuid; ++ char header_uuid[17]; ++ dev_t header_dev_t; ++ unsigned long first_header_block; ++ ++ /* Repeat the signature to be sure we have a header version */ ++ char sig2[sig_size]; ++ int header_version; ++}; ++ ++void forget_signature_page(void); ++int toi_check_for_signature(void); ++int toi_bio_image_exists(int quiet); ++int get_signature_page(void); ++int toi_bio_mark_resume_attempted(int); ++extern char *toi_cur_sig_page; ++extern char *toi_orig_sig_page; ++int toi_bio_mark_have_image(void); ++extern struct sig_data *toi_sig_data; ++extern dev_t resume_dev_t; ++extern struct block_device *resume_block_device; ++extern struct block_device *header_block_device; ++extern unsigned long resume_firstblock; ++ ++struct block_device *open_bdev(dev_t device, int display_errs); ++extern int current_stream; ++extern int more_readahead; ++int toi_do_io(int writing, struct block_device *bdev, long block0, ++ struct page *page, int is_readahead, int syncio, int free_group); ++int get_main_pool_phys_params(void); ++ ++void toi_close_bdev(struct block_device *bdev); ++struct block_device *toi_open_bdev(char *uuid, dev_t default_device, ++ int display_errs); ++ ++extern struct toi_module_ops toi_blockwriter_ops; ++void dump_block_chains(void); ++void debug_broken_header(void); ++extern unsigned long raw_pages_allocd, header_pages_reserved; ++int toi_bio_chains_debug_info(char *buffer, int size); ++void toi_bio_chains_post_atomic(struct toi_boot_kernel_data *bkd); ++int toi_bio_scan_for_image(int quiet); ++int 
toi_bio_get_header_version(void); ++ ++void close_resume_dev_t(int force); ++int open_resume_dev_t(int force, int quiet); +diff --git a/kernel/power/tuxonice_bio_signature.c b/kernel/power/tuxonice_bio_signature.c +new file mode 100644 +index 0000000..2ebee7e +--- /dev/null ++++ b/kernel/power/tuxonice_bio_signature.c +@@ -0,0 +1,404 @@ ++/* ++ * kernel/power/tuxonice_bio_signature.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ */ ++ ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_io.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_bio_internal.h" ++ ++struct sig_data *toi_sig_data; ++ ++/* Struct of swap header pages */ ++ ++struct old_sig_data { ++ dev_t device; ++ unsigned long sector; ++ int resume_attempted; ++ int orig_sig_type; ++}; ++ ++union diskpage { ++ union swap_header swh; /* swh.magic is the only member used */ ++ struct sig_data sig_data; ++ struct old_sig_data old_sig_data; ++}; ++ ++union p_diskpage { ++ union diskpage *pointer; ++ char *ptr; ++ unsigned long address; ++}; ++ ++char *toi_cur_sig_page; ++char *toi_orig_sig_page; ++int have_image; ++int have_old_image; ++ ++int get_signature_page(void) ++{ ++ if (!toi_cur_sig_page) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Allocating current signature page."); ++ toi_cur_sig_page = (char *) toi_get_zeroed_page(38, ++ TOI_ATOMIC_GFP); ++ if (!toi_cur_sig_page) { ++ printk(KERN_ERR "Failed to allocate memory for the " ++ "current image signature.\n"); ++ return -ENOMEM; ++ } ++ ++ toi_sig_data = (struct sig_data *) toi_cur_sig_page; ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Reading signature from dev %lx," ++ " sector %d.", ++ resume_block_device->bd_dev, resume_firstblock); ++ ++ return toi_bio_ops.bdev_page_io(READ, resume_block_device, ++ resume_firstblock, virt_to_page(toi_cur_sig_page)); ++} ++ ++void forget_signature_page(void) ++{ ++ if (toi_cur_sig_page) { ++ toi_sig_data = NULL; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_cur_sig_page" ++ " (%p).", toi_cur_sig_page); ++ toi_free_page(38, (unsigned long) toi_cur_sig_page); ++ toi_cur_sig_page = NULL; ++ } ++ ++ if (toi_orig_sig_page) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing toi_orig_sig_page" ++ " (%p).", toi_orig_sig_page); ++ toi_free_page(38, (unsigned long) toi_orig_sig_page); ++ toi_orig_sig_page = NULL; ++ } ++} ++ ++/* ++ * We need to ensure we use the signature page that's currently on disk, ++ * so as to not remove the image header. Post-atomic-restore, the orig sig ++ * page will be empty, so we can use that as our method of knowing that we ++ * need to load the on-disk signature and not use the non-image sig in ++ * memory. (We're going to powerdown after writing the change, so it's safe. 
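struct sig_data above is what get_signature_page() reads from the first resume block: a signature string, flags recording whether an image exists and whether it has been resumed before, and the location of the image header. The sketch below shows the shape of that check with a trimmed-down struct and a placeholder signature constant; the real value, tuxonice_signature, is defined elsewhere in the TuxOnIce sources.

#include <stdio.h>
#include <string.h>

#define EX_SIG      "EXAMPLE-SIG"	/* placeholder, not the real signature */
#define EX_SIG_SIZE (sizeof(EX_SIG))

struct ex_sig_data {
	char sig[EX_SIG_SIZE];
	int have_image;
	int resumed_before;
	unsigned long first_header_block;
};

static int ex_image_exists(const struct ex_sig_data *sd)
{
	/* No recognisable signature at all -> indeterminate. */
	if (memcmp(sd->sig, EX_SIG, EX_SIG_SIZE))
		return -1;

	/* Signature present: have_image says whether an image follows. */
	return sd->have_image;
}

int main(void)
{
	struct ex_sig_data sd = { EX_SIG, 1, 0, 42 };

	printf("image exists: %d (header starts at block %lu)\n",
	       ex_image_exists(&sd), sd.first_header_block);
	return 0;
}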
++ */ ++int toi_bio_mark_resume_attempted(int flag) ++{ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Make resume attempted = %d.", ++ flag); ++ if (!toi_orig_sig_page) { ++ forget_signature_page(); ++ get_signature_page(); ++ } ++ toi_sig_data->resumed_before = flag; ++ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(toi_cur_sig_page)); ++} ++ ++int toi_bio_mark_have_image(void) ++{ ++ int result = 0; ++ char buf[32]; ++ struct fs_info *fs_info; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that an image exists."); ++ memcpy(toi_sig_data->sig, tuxonice_signature, ++ sizeof(tuxonice_signature)); ++ toi_sig_data->have_image = 1; ++ toi_sig_data->resumed_before = 0; ++ toi_sig_data->header_dev_t = get_header_dev_t(); ++ toi_sig_data->have_uuid = 0; ++ ++ fs_info = fs_info_from_block_dev(get_header_bdev()); ++ if (fs_info && !IS_ERR(fs_info)) { ++ memcpy(toi_sig_data->header_uuid, &fs_info->uuid, 16); ++ free_fs_info(fs_info); ++ } else ++ result = (int) PTR_ERR(fs_info); ++ ++ if (!result) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Got uuid for dev_t %s.", ++ format_dev_t(buf, get_header_dev_t())); ++ toi_sig_data->have_uuid = 1; ++ } else ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Could not get uuid for " ++ "dev_t %s.", ++ format_dev_t(buf, get_header_dev_t())); ++ ++ toi_sig_data->first_header_block = get_headerblock(); ++ have_image = 1; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is %x. First block " ++ "is %d.", toi_sig_data->header_dev_t, ++ toi_sig_data->first_header_block); ++ ++ memcpy(toi_sig_data->sig2, tuxonice_signature, ++ sizeof(tuxonice_signature)); ++ toi_sig_data->header_version = TOI_HEADER_VERSION; ++ ++ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(toi_cur_sig_page)); ++} ++ ++int remove_old_signature(void) ++{ ++ union p_diskpage swap_header_page = (union p_diskpage) toi_cur_sig_page; ++ char *orig_sig, *no_image_signature_contents; ++ char *header_start = (char *) toi_get_zeroed_page(38, TOI_ATOMIC_GFP); ++ int result; ++ struct block_device *header_bdev; ++ struct old_sig_data *old_sig_data = ++ &swap_header_page.pointer->old_sig_data; ++ ++ header_bdev = toi_open_bdev(NULL, old_sig_data->device, 1); ++ result = toi_bio_ops.bdev_page_io(READ, header_bdev, ++ old_sig_data->sector, virt_to_page(header_start)); ++ ++ if (result) ++ goto out; ++ ++ /* ++ * TODO: Get the original contents of the first bytes of the swap ++ * header page. ++ */ ++ if (!old_sig_data->orig_sig_type) ++ orig_sig = "SWAP-SPACE"; ++ else ++ orig_sig = "SWAPSPACE2"; ++ ++ memcpy(swap_header_page.pointer->swh.magic.magic, orig_sig, 10); ++ memcpy(swap_header_page.ptr, header_start, ++ sizeof(no_image_signature_contents)); ++ ++ result = toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(swap_header_page.ptr)); ++ ++out: ++ toi_close_bdev(header_bdev); ++ have_old_image = 0; ++ toi_free_page(38, (unsigned long) header_start); ++ return result; ++} ++ ++/* ++ * toi_bio_restore_original_signature - restore the original signature ++ * ++ * At boot time (aborting pre atomic-restore), toi_orig_sig_page gets used. ++ * It will have the original signature page contents, stored in the image ++ * header. Post atomic-restore, we use :toi_cur_sig_page, which will contain ++ * the contents that were loaded when we started the cycle. ++ */ ++int toi_bio_restore_original_signature(void) ++{ ++ char *use = toi_orig_sig_page ? 
toi_orig_sig_page : toi_cur_sig_page; ++ ++ if (have_old_image) ++ return remove_old_signature(); ++ ++ if (!use) { ++ printk("toi_bio_restore_original_signature: No signature " ++ "page loaded.\n"); ++ return 0; ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Recording that no image exists."); ++ have_image = 0; ++ toi_sig_data->have_image = 0; ++ return toi_bio_ops.bdev_page_io(WRITE, resume_block_device, ++ resume_firstblock, virt_to_page(use)); ++} ++ ++/* ++ * check_for_signature - See whether we have an image. ++ * ++ * Returns 0 if no image, 1 if there is one, -1 if indeterminate. ++ */ ++int toi_check_for_signature(void) ++{ ++ union p_diskpage swap_header_page; ++ int type; ++ const char *normal_sigs[] = {"SWAP-SPACE", "SWAPSPACE2" }; ++ const char *swsusp_sigs[] = {"S1SUSP", "S2SUSP", "S1SUSPEND" }; ++ char *swap_header; ++ ++ if (!toi_cur_sig_page) { ++ int result = get_signature_page(); ++ ++ if (result) ++ return result; ++ } ++ ++ /* ++ * Start by looking for the binary header. ++ */ ++ if (!memcmp(tuxonice_signature, toi_cur_sig_page, ++ sizeof(tuxonice_signature))) { ++ have_image = toi_sig_data->have_image; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Have binary signature. " ++ "Have image is %d.", have_image); ++ if (have_image) ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "header dev_t is " ++ "%x. First block is %d.", ++ toi_sig_data->header_dev_t, ++ toi_sig_data->first_header_block); ++ return toi_sig_data->have_image; ++ } ++ ++ /* ++ * Failing that, try old file allocator headers. ++ */ ++ ++ if (!memcmp(HaveImage, toi_cur_sig_page, strlen(HaveImage))) { ++ have_image = 1; ++ return 1; ++ } ++ ++ have_image = 0; ++ ++ if (!memcmp(NoImage, toi_cur_sig_page, strlen(NoImage))) ++ return 0; ++ ++ /* ++ * Nope? How about swap? ++ */ ++ swap_header_page = (union p_diskpage) toi_cur_sig_page; ++ swap_header = swap_header_page.pointer->swh.magic.magic; ++ ++ /* Normal swapspace? */ ++ for (type = 0; type < 2; type++) ++ if (!memcmp(normal_sigs[type], swap_header, ++ strlen(normal_sigs[type]))) ++ return 0; ++ ++ /* Swsusp or uswsusp? */ ++ for (type = 0; type < 3; type++) ++ if (!memcmp(swsusp_sigs[type], swap_header, ++ strlen(swsusp_sigs[type]))) ++ return 2; ++ ++ /* Old TuxOnIce version? */ ++ if (!memcmp(tuxonice_signature, swap_header, ++ sizeof(tuxonice_signature) - 1)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Found old TuxOnIce " ++ "signature."); ++ have_old_image = 1; ++ return 3; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Image_exists ++ * ++ * Returns -1 if don't know, otherwise 0 (no) or 1 (yes). ++ */ ++int toi_bio_image_exists(int quiet) ++{ ++ int result; ++ char *msg = NULL; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_bio_image_exists."); ++ ++ if (!resume_dev_t) { ++ if (!quiet) ++ printk(KERN_INFO "Not even trying to read header " ++ "because resume_dev_t is not set.\n"); ++ return -1; ++ } ++ ++ if (open_resume_dev_t(0, quiet)) ++ return -1; ++ ++ result = toi_check_for_signature(); ++ ++ clear_toi_state(TOI_RESUMED_BEFORE); ++ if (toi_sig_data->resumed_before) ++ set_toi_state(TOI_RESUMED_BEFORE); ++ ++ if (quiet || result == -ENOMEM) ++ return result; ++ ++ if (result == -1) ++ msg = "TuxOnIce: Unable to find a signature." 
++ " Could you have moved a swap file?\n"; ++ else if (!result) ++ msg = "TuxOnIce: No image found.\n"; ++ else if (result == 1) ++ msg = "TuxOnIce: Image found.\n"; ++ else if (result == 2) ++ msg = "TuxOnIce: uswsusp or swsusp image found.\n"; ++ else if (result == 3) ++ msg = "TuxOnIce: Old implementation's signature found.\n"; ++ ++ printk(KERN_INFO "%s", msg); ++ ++ return result; ++} ++ ++int toi_bio_scan_for_image(int quiet) ++{ ++ struct block_device *bdev; ++ char default_name[255] = ""; ++ ++ if (!quiet) ++ printk(KERN_DEBUG "Scanning swap devices for TuxOnIce " ++ "signature...\n"); ++ for (bdev = next_bdev_of_type(NULL, "swap"); bdev; ++ bdev = next_bdev_of_type(bdev, "swap")) { ++ int result; ++ char name[255] = ""; ++ sprintf(name, "%u:%u", MAJOR(bdev->bd_dev), ++ MINOR(bdev->bd_dev)); ++ if (!quiet) ++ printk(KERN_DEBUG "- Trying %s.\n", name); ++ resume_block_device = bdev; ++ resume_dev_t = bdev->bd_dev; ++ ++ result = toi_check_for_signature(); ++ ++ resume_block_device = NULL; ++ resume_dev_t = MKDEV(0, 0); ++ ++ if (!default_name[0]) ++ strcpy(default_name, name); ++ ++ if (result == 1) { ++ /* Got one! */ ++ strcpy(resume_file, name); ++ next_bdev_of_type(bdev, NULL); ++ if (!quiet) ++ printk(KERN_DEBUG " ==> Image found on %s.\n", ++ resume_file); ++ return 1; ++ } ++ forget_signature_page(); ++ } ++ ++ if (!quiet) ++ printk(KERN_DEBUG "TuxOnIce scan: No image found.\n"); ++ strcpy(resume_file, default_name); ++ return 0; ++} ++ ++int toi_bio_get_header_version(void) ++{ ++ return (memcmp(toi_sig_data->sig2, tuxonice_signature, ++ sizeof(tuxonice_signature))) ? ++ 0 : toi_sig_data->header_version; ++ ++} +diff --git a/kernel/power/tuxonice_builtin.c b/kernel/power/tuxonice_builtin.c +new file mode 100644 +index 0000000..62b5d14 +--- /dev/null ++++ b/kernel/power/tuxonice_builtin.c +@@ -0,0 +1,445 @@ ++/* ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice_io.h" ++#include "tuxonice.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_pagedir.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_alloc.h" ++ ++unsigned long toi_bootflags_mask; ++EXPORT_SYMBOL_GPL(toi_bootflags_mask); ++ ++/* ++ * Highmem related functions (x86 only). ++ */ ++ ++#ifdef CONFIG_HIGHMEM ++ ++/** ++ * copyback_high: Restore highmem pages. ++ * ++ * Highmem data and pbe lists are/can be stored in highmem. ++ * The format is slightly different to the lowmem pbe lists ++ * used for the assembly code: the last pbe in each page is ++ * a struct page * instead of struct pbe *, pointing to the ++ * next page where pbes are stored (or NULL if happens to be ++ * the end of the list). Since we don't want to generate ++ * unnecessary deltas against swsusp code, we use a cast ++ * instead of a union. 
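The highmem layout described here, where the slot that would hold the last pbe in each page instead points at the next page of pbes, is a simple chained-block structure. A standalone sketch with simplified stand-in types (not the kernel's struct pbe or struct page) showing how such a chain is walked:

#include <stdio.h>

#define ENTRIES_PER_PAGE 4		/* kept tiny for the example */

struct ex_entry {
	void *payload;			/* in the kernel: original/copy addresses */
};

struct ex_page {
	struct ex_entry entries[ENTRIES_PER_PAGE - 1];
	struct ex_page *next;		/* occupies the final slot of the page */
};

static void walk(struct ex_page *page)
{
	while (page) {
		int i;

		for (i = 0; i < ENTRIES_PER_PAGE - 1; i++)
			if (page->entries[i].payload)
				printf("entry %p\n", page->entries[i].payload);

		page = page->next;	/* follow the link stored in the last slot */
	}
}

int main(void)
{
	struct ex_page second = { { { (void *) 0x3 } }, NULL };
	struct ex_page first = { { { (void *) 0x1 }, { (void *) 0x2 } }, &second };

	walk(&first);
	return 0;
}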
++ **/ ++ ++static void copyback_high(void) ++{ ++ struct page *pbe_page = (struct page *) restore_highmem_pblist; ++ struct pbe *this_pbe, *first_pbe; ++ unsigned long *origpage, *copypage; ++ int pbe_index = 1; ++ ++ if (!pbe_page) ++ return; ++ ++ this_pbe = (struct pbe *) kmap_atomic(pbe_page); ++ first_pbe = this_pbe; ++ ++ while (this_pbe) { ++ int loop = (PAGE_SIZE / sizeof(unsigned long)) - 1; ++ ++ origpage = kmap_atomic(pfn_to_page((unsigned long) this_pbe->orig_address)); ++ copypage = kmap_atomic((struct page *) this_pbe->address); ++ ++ while (loop >= 0) { ++ *(origpage + loop) = *(copypage + loop); ++ loop--; ++ } ++ ++ kunmap_atomic(origpage); ++ kunmap_atomic(copypage); ++ ++ if (!this_pbe->next) ++ break; ++ ++ if (pbe_index < PBES_PER_PAGE) { ++ this_pbe++; ++ pbe_index++; ++ } else { ++ pbe_page = (struct page *) this_pbe->next; ++ kunmap_atomic(first_pbe); ++ if (!pbe_page) ++ return; ++ this_pbe = (struct pbe *) kmap_atomic(pbe_page); ++ first_pbe = this_pbe; ++ pbe_index = 1; ++ } ++ } ++ kunmap_atomic(first_pbe); ++} ++ ++#else /* CONFIG_HIGHMEM */ ++static void copyback_high(void) { } ++#endif ++ ++char toi_wait_for_keypress_dev_console(int timeout) ++{ ++ int fd, this_timeout = 255; ++ char key = '\0'; ++ struct termios t, t_backup; ++ ++ /* We should be guaranteed /dev/console exists after populate_rootfs() ++ * in init/main.c. ++ */ ++ fd = sys_open("/dev/console", O_RDONLY, 0); ++ if (fd < 0) { ++ printk(KERN_INFO "Couldn't open /dev/console.\n"); ++ return key; ++ } ++ ++ if (sys_ioctl(fd, TCGETS, (long)&t) < 0) ++ goto out_close; ++ ++ memcpy(&t_backup, &t, sizeof(t)); ++ ++ t.c_lflag &= ~(ISIG|ICANON|ECHO); ++ t.c_cc[VMIN] = 0; ++ ++new_timeout: ++ if (timeout > 0) { ++ this_timeout = timeout < 26 ? timeout : 25; ++ timeout -= this_timeout; ++ this_timeout *= 10; ++ } ++ ++ t.c_cc[VTIME] = this_timeout; ++ ++ if (sys_ioctl(fd, TCSETS, (long)&t) < 0) ++ goto out_restore; ++ ++ while (1) { ++ if (sys_read(fd, &key, 1) <= 0) { ++ if (timeout) ++ goto new_timeout; ++ key = '\0'; ++ break; ++ } ++ key = tolower(key); ++ if (test_toi_state(TOI_SANITY_CHECK_PROMPT)) { ++ if (key == 'c') { ++ set_toi_state(TOI_CONTINUE_REQ); ++ break; ++ } else if (key == ' ') ++ break; ++ } else ++ break; ++ } ++ ++out_restore: ++ sys_ioctl(fd, TCSETS, (long)&t_backup); ++out_close: ++ sys_close(fd); ++ ++ return key; ++} ++EXPORT_SYMBOL_GPL(toi_wait_for_keypress_dev_console); ++ ++struct toi_boot_kernel_data toi_bkd __nosavedata ++ __attribute__((aligned(PAGE_SIZE))) = { ++ MY_BOOT_KERNEL_DATA_VERSION, ++ 0, ++#ifdef CONFIG_TOI_REPLACE_SWSUSP ++ (1 << TOI_REPLACE_SWSUSP) | ++#endif ++ (1 << TOI_NO_FLUSHER_THREAD) | ++ (1 << TOI_PAGESET2_FULL) | (1 << TOI_LATE_CPU_HOTPLUG), ++}; ++EXPORT_SYMBOL_GPL(toi_bkd); ++ ++struct block_device *toi_open_by_devnum(dev_t dev) ++{ ++ struct block_device *bdev = bdget(dev); ++ int err = -ENOMEM; ++ if (bdev) ++ err = blkdev_get(bdev, FMODE_READ | FMODE_NDELAY, NULL); ++ return err ? ERR_PTR(err) : bdev; ++} ++EXPORT_SYMBOL_GPL(toi_open_by_devnum); ++ ++/** ++ * toi_close_bdev: Close a swap bdev. ++ * ++ * int: The swap entry number to close. 
++ */ ++void toi_close_bdev(struct block_device *bdev) ++{ ++ blkdev_put(bdev, FMODE_READ | FMODE_NDELAY); ++} ++EXPORT_SYMBOL_GPL(toi_close_bdev); ++ ++int toi_wait = CONFIG_TOI_DEFAULT_WAIT; ++EXPORT_SYMBOL_GPL(toi_wait); ++ ++struct toi_core_fns *toi_core_fns; ++EXPORT_SYMBOL_GPL(toi_core_fns); ++ ++unsigned long toi_result; ++EXPORT_SYMBOL_GPL(toi_result); ++ ++struct pagedir pagedir1 = {1}; ++EXPORT_SYMBOL_GPL(pagedir1); ++ ++unsigned long toi_get_nonconflicting_page(void) ++{ ++ return toi_core_fns->get_nonconflicting_page(); ++} ++ ++int toi_post_context_save(void) ++{ ++ return toi_core_fns->post_context_save(); ++} ++ ++int try_tuxonice_hibernate(void) ++{ ++ if (!toi_core_fns) ++ return -ENODEV; ++ ++ return toi_core_fns->try_hibernate(); ++} ++ ++static int num_resume_calls; ++#ifdef CONFIG_TOI_IGNORE_LATE_INITCALL ++static int ignore_late_initcall = 1; ++#else ++static int ignore_late_initcall; ++#endif ++ ++int toi_translate_err_default = TOI_CONTINUE_REQ; ++EXPORT_SYMBOL_GPL(toi_translate_err_default); ++ ++void try_tuxonice_resume(void) ++{ ++ /* Don't let it wrap around eventually */ ++ if (num_resume_calls < 2) ++ num_resume_calls++; ++ ++ if (num_resume_calls == 1 && ignore_late_initcall) { ++ printk(KERN_INFO "TuxOnIce: Ignoring late initcall, as requested.\n"); ++ return; ++ } ++ ++ if (toi_core_fns) ++ toi_core_fns->try_resume(); ++ else ++ printk(KERN_INFO "TuxOnIce core not loaded yet.\n"); ++} ++ ++int toi_lowlevel_builtin(void) ++{ ++ int error = 0; ++ ++ save_processor_state(); ++ error = swsusp_arch_suspend(); ++ if (error) ++ printk(KERN_ERR "Error %d hibernating\n", error); ++ ++ /* Restore control flow appears here */ ++ if (!toi_in_hibernate) { ++ copyback_high(); ++ set_toi_state(TOI_NOW_RESUMING); ++ } ++ ++ restore_processor_state(); ++ return error; ++} ++EXPORT_SYMBOL_GPL(toi_lowlevel_builtin); ++ ++unsigned long toi_compress_bytes_in; ++EXPORT_SYMBOL_GPL(toi_compress_bytes_in); ++ ++unsigned long toi_compress_bytes_out; ++EXPORT_SYMBOL_GPL(toi_compress_bytes_out); ++ ++int toi_in_suspend(void) ++{ ++ return in_suspend; ++} ++EXPORT_SYMBOL_GPL(toi_in_suspend); ++ ++unsigned long toi_state = ((1 << TOI_BOOT_TIME) | ++ (1 << TOI_IGNORE_LOGLEVEL) | ++ (1 << TOI_IO_STOPPED)); ++EXPORT_SYMBOL_GPL(toi_state); ++ ++/* The number of hibernates we have started (some may have been cancelled) */ ++unsigned int nr_hibernates; ++EXPORT_SYMBOL_GPL(nr_hibernates); ++ ++int toi_running; ++EXPORT_SYMBOL_GPL(toi_running); ++ ++__nosavedata int toi_in_hibernate; ++EXPORT_SYMBOL_GPL(toi_in_hibernate); ++ ++__nosavedata struct pbe *restore_highmem_pblist; ++EXPORT_SYMBOL_GPL(restore_highmem_pblist); ++ ++int toi_trace_allocs; ++EXPORT_SYMBOL_GPL(toi_trace_allocs); ++ ++void toi_read_lock_tasklist(void) ++{ ++ read_lock(&tasklist_lock); ++} ++EXPORT_SYMBOL_GPL(toi_read_lock_tasklist); ++ ++void toi_read_unlock_tasklist(void) ++{ ++ read_unlock(&tasklist_lock); ++} ++EXPORT_SYMBOL_GPL(toi_read_unlock_tasklist); ++ ++#ifdef CONFIG_TOI_ZRAM_SUPPORT ++int (*toi_flag_zram_disks) (void); ++EXPORT_SYMBOL_GPL(toi_flag_zram_disks); ++ ++int toi_do_flag_zram_disks(void) ++{ ++ return toi_flag_zram_disks ? 
(*toi_flag_zram_disks)() : 0; ++} ++EXPORT_SYMBOL_GPL(toi_do_flag_zram_disks); ++#endif ++ ++static int __init toi_wait_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) { ++ if (value < -1 || value > 255) ++ printk(KERN_INFO "TuxOnIce_wait outside range -1 to " ++ "255.\n"); ++ else ++ toi_wait = value; ++ } ++ ++ return 1; ++} ++ ++__setup("toi_wait", toi_wait_setup); ++ ++static int __init toi_translate_retry_setup(char *str) ++{ ++ toi_translate_err_default = 0; ++ return 1; ++} ++ ++__setup("toi_translate_retry", toi_translate_retry_setup); ++ ++static int __init toi_debug_setup(char *str) ++{ ++ toi_bkd.toi_action |= (1 << TOI_LOGALL); ++ toi_bootflags_mask |= (1 << TOI_LOGALL); ++ toi_bkd.toi_debug_state = 255; ++ toi_bkd.toi_default_console_level = 7; ++ return 1; ++} ++ ++__setup("toi_debug_setup", toi_debug_setup); ++ ++static int __init toi_pause_setup(char *str) ++{ ++ toi_bkd.toi_action |= (1 << TOI_PAUSE); ++ toi_bootflags_mask |= (1 << TOI_PAUSE); ++ return 1; ++} ++ ++__setup("toi_pause", toi_pause_setup); ++ ++#ifdef CONFIG_PM_DEBUG ++static int __init toi_trace_allocs_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) ++ toi_trace_allocs = value; ++ ++ return 1; ++} ++__setup("toi_trace_allocs", toi_trace_allocs_setup); ++#endif ++ ++static int __init toi_ignore_late_initcall_setup(char *str) ++{ ++ int value; ++ ++ if (sscanf(str, "=%d", &value)) ++ ignore_late_initcall = value; ++ ++ return 1; ++} ++ ++__setup("toi_initramfs_resume_only", toi_ignore_late_initcall_setup); ++ ++static int __init toi_force_no_multithreaded_setup(char *str) ++{ ++ int value; ++ ++ toi_bkd.toi_action &= ~(1 << TOI_NO_MULTITHREADED_IO); ++ toi_bootflags_mask |= (1 << TOI_NO_MULTITHREADED_IO); ++ ++ if (sscanf(str, "=%d", &value) && value) ++ toi_bkd.toi_action |= (1 << TOI_NO_MULTITHREADED_IO); ++ ++ return 1; ++} ++ ++__setup("toi_no_multithreaded", toi_force_no_multithreaded_setup); ++ ++#ifdef CONFIG_KGDB ++static int __init toi_post_resume_breakpoint_setup(char *str) ++{ ++ int value; ++ ++ toi_bkd.toi_action &= ~(1 << TOI_POST_RESUME_BREAKPOINT); ++ toi_bootflags_mask |= (1 << TOI_POST_RESUME_BREAKPOINT); ++ if (sscanf(str, "=%d", &value) && value) ++ toi_bkd.toi_action |= (1 << TOI_POST_RESUME_BREAKPOINT); ++ ++ return 1; ++} ++ ++__setup("toi_post_resume_break", toi_post_resume_breakpoint_setup); ++#endif ++ ++static int __init toi_disable_readahead_setup(char *str) ++{ ++ int value; ++ ++ toi_bkd.toi_action &= ~(1 << TOI_NO_READAHEAD); ++ toi_bootflags_mask |= (1 << TOI_NO_READAHEAD); ++ if (sscanf(str, "=%d", &value) && value) ++ toi_bkd.toi_action |= (1 << TOI_NO_READAHEAD); ++ ++ return 1; ++} ++ ++__setup("toi_no_readahead", toi_disable_readahead_setup); +diff --git a/kernel/power/tuxonice_builtin.h b/kernel/power/tuxonice_builtin.h +new file mode 100644 +index 0000000..eea0155 +--- /dev/null ++++ b/kernel/power/tuxonice_builtin.h +@@ -0,0 +1,39 @@ ++/* ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++#include ++ ++extern struct toi_core_fns *toi_core_fns; ++extern unsigned long toi_compress_bytes_in, toi_compress_bytes_out; ++extern unsigned int nr_hibernates; ++extern int toi_in_hibernate; ++ ++extern __nosavedata struct pbe *restore_highmem_pblist; ++ ++int toi_lowlevel_builtin(void); ++ ++#ifdef CONFIG_HIGHMEM ++extern __nosavedata struct zone_data *toi_nosave_zone_list; ++extern __nosavedata unsigned long toi_nosave_max_pfn; ++#endif ++ ++extern unsigned long toi_get_nonconflicting_page(void); ++extern int toi_post_context_save(void); ++ ++extern char toi_wait_for_keypress_dev_console(int timeout); ++extern struct block_device *toi_open_by_devnum(dev_t dev); ++extern void toi_close_bdev(struct block_device *bdev); ++extern int toi_wait; ++extern int toi_translate_err_default; ++extern int toi_force_no_multithreaded; ++extern void toi_read_lock_tasklist(void); ++extern void toi_read_unlock_tasklist(void); ++extern int toi_in_suspend(void); ++ ++#ifdef CONFIG_TOI_ZRAM_SUPPORT ++extern int toi_do_flag_zram_disks(void); ++#else ++#define toi_do_flag_zram_disks() (0) ++#endif +diff --git a/kernel/power/tuxonice_checksum.c b/kernel/power/tuxonice_checksum.c +new file mode 100644 +index 0000000..006e68b +--- /dev/null ++++ b/kernel/power/tuxonice_checksum.c +@@ -0,0 +1,384 @@ ++/* ++ * kernel/power/tuxonice_checksum.c ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains data checksum routines for TuxOnIce, ++ * using cryptoapi. They are used to locate any modifications ++ * made to pageset 2 while we're saving it. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_pageflags.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_pagedir.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_ui.h" ++ ++static struct toi_module_ops toi_checksum_ops; ++ ++/* Constant at the mo, but I might allow tuning later */ ++static char toi_checksum_name[32] = "md4"; ++/* Bytes per checksum */ ++#define CHECKSUM_SIZE (16) ++ ++#define CHECKSUMS_PER_PAGE ((PAGE_SIZE - sizeof(void *)) / CHECKSUM_SIZE) ++ ++struct cpu_context { ++ struct crypto_hash *transform; ++ struct hash_desc desc; ++ struct scatterlist sg[2]; ++ char *buf; ++}; ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++static int pages_allocated; ++static unsigned long page_list; ++ ++static int toi_num_resaved; ++ ++static unsigned long this_checksum, next_page; ++static int checksum_index; ++ ++static inline int checksum_pages_needed(void) ++{ ++ return DIV_ROUND_UP(pagedir2.size, CHECKSUMS_PER_PAGE); ++} ++ ++/* ---- Local buffer management ---- */ ++ ++/* ++ * toi_checksum_cleanup ++ * ++ * Frees memory allocated for our labours. ++ */ ++static void toi_checksum_cleanup(int ending_cycle) ++{ ++ int cpu; ++ ++ if (ending_cycle) { ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->transform) { ++ crypto_free_hash(this->transform); ++ this->transform = NULL; ++ this->desc.tfm = NULL; ++ } ++ ++ if (this->buf) { ++ toi_free_page(27, (unsigned long) this->buf); ++ this->buf = NULL; ++ } ++ } ++ } ++} ++ ++/* ++ * toi_crypto_initialise ++ * ++ * Prepare to do some work by allocating buffers and transforms. ++ * Returns: Int: Zero. Even if we can't set up checksum, we still ++ * seek to hibernate. 
++ */ ++static int toi_checksum_initialise(int starting_cycle) ++{ ++ int cpu; ++ ++ if (!(starting_cycle & SYSFS_HIBERNATE) || !toi_checksum_ops.enabled) ++ return 0; ++ ++ if (!*toi_checksum_name) { ++ printk(KERN_INFO "TuxOnIce: No checksum algorithm name set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ struct page *page; ++ ++ this->transform = crypto_alloc_hash(toi_checksum_name, 0, 0); ++ if (IS_ERR(this->transform)) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s checksum algorithm: %ld.\n", ++ toi_checksum_name, (long) this->transform); ++ this->transform = NULL; ++ return 1; ++ } ++ ++ this->desc.tfm = this->transform; ++ this->desc.flags = 0; ++ ++ page = toi_alloc_page(27, GFP_KERNEL); ++ if (!page) ++ return 1; ++ this->buf = page_address(page); ++ sg_init_one(&this->sg[0], this->buf, PAGE_SIZE); ++ } ++ return 0; ++} ++ ++/* ++ * toi_checksum_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_checksum_print_debug_stats(char *buffer, int size) ++{ ++ int len; ++ ++ if (!toi_checksum_ops.enabled) ++ return scnprintf(buffer, size, ++ "- Checksumming disabled.\n"); ++ ++ len = scnprintf(buffer, size, "- Checksum method is '%s'.\n", ++ toi_checksum_name); ++ len += scnprintf(buffer + len, size - len, ++ " %d pages resaved in atomic copy.\n", toi_num_resaved); ++ return len; ++} ++ ++static int toi_checksum_memory_needed(void) ++{ ++ return toi_checksum_ops.enabled ? ++ checksum_pages_needed() << PAGE_SHIFT : 0; ++} ++ ++static int toi_checksum_storage_needed(void) ++{ ++ if (toi_checksum_ops.enabled) ++ return strlen(toi_checksum_name) + sizeof(int) + 1; ++ else ++ return 0; ++} ++ ++/* ++ * toi_checksum_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save informaton needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data. ++ */ ++static int toi_checksum_save_config_info(char *buffer) ++{ ++ int namelen = strlen(toi_checksum_name) + 1; ++ int total_len; ++ ++ *((unsigned int *) buffer) = namelen; ++ strncpy(buffer + sizeof(unsigned int), toi_checksum_name, namelen); ++ total_len = sizeof(unsigned int) + namelen; ++ return total_len; ++} ++ ++/* toi_checksum_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information needed for dechecksuming the image at ++ * resume time. 
++ */ ++static void toi_checksum_load_config_info(char *buffer, int size) ++{ ++ int namelen; ++ ++ namelen = *((unsigned int *) (buffer)); ++ strncpy(toi_checksum_name, buffer + sizeof(unsigned int), ++ namelen); ++ return; ++} ++ ++/* ++ * Free Checksum Memory ++ */ ++ ++void free_checksum_pages(void) ++{ ++ while (pages_allocated) { ++ unsigned long next = *((unsigned long *) page_list); ++ ClearPageNosave(virt_to_page(page_list)); ++ toi_free_page(15, (unsigned long) page_list); ++ page_list = next; ++ pages_allocated--; ++ } ++} ++ ++/* ++ * Allocate Checksum Memory ++ */ ++ ++int allocate_checksum_pages(void) ++{ ++ int pages_needed = checksum_pages_needed(); ++ ++ if (!toi_checksum_ops.enabled) ++ return 0; ++ ++ while (pages_allocated < pages_needed) { ++ unsigned long *new_page = ++ (unsigned long *) toi_get_zeroed_page(15, TOI_ATOMIC_GFP); ++ if (!new_page) { ++ printk(KERN_ERR "Unable to allocate checksum pages.\n"); ++ return -ENOMEM; ++ } ++ SetPageNosave(virt_to_page(new_page)); ++ (*new_page) = page_list; ++ page_list = (unsigned long) new_page; ++ pages_allocated++; ++ } ++ ++ next_page = (unsigned long) page_list; ++ checksum_index = 0; ++ ++ return 0; ++} ++ ++char *tuxonice_get_next_checksum(void) ++{ ++ if (!toi_checksum_ops.enabled) ++ return NULL; ++ ++ if (checksum_index % CHECKSUMS_PER_PAGE) ++ this_checksum += CHECKSUM_SIZE; ++ else { ++ this_checksum = next_page + sizeof(void *); ++ next_page = *((unsigned long *) next_page); ++ } ++ ++ checksum_index++; ++ return (char *) this_checksum; ++} ++ ++int tuxonice_calc_checksum(struct page *page, char *checksum_locn) ++{ ++ char *pa; ++ int result, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!toi_checksum_ops.enabled) ++ return 0; ++ ++ pa = kmap(page); ++ memcpy(ctx->buf, pa, PAGE_SIZE); ++ kunmap(page); ++ result = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, ++ checksum_locn); ++ if (result) ++ printk(KERN_ERR "TuxOnIce checksumming: crypto_hash_digest " ++ "returned %d.\n", result); ++ return result; ++} ++/* ++ * Calculate checksums ++ */ ++ ++void check_checksums(void) ++{ ++ int pfn, index = 0, cpu = smp_processor_id(); ++ char current_checksum[CHECKSUM_SIZE]; ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!toi_checksum_ops.enabled) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksumming disabled."); ++ return; ++ } ++ ++ next_page = (unsigned long) page_list; ++ ++ toi_num_resaved = 0; ++ this_checksum = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Verifying checksums."); ++ memory_bm_position_reset(pageset2_map); ++ for (pfn = memory_bm_next_pfn(pageset2_map); pfn != BM_END_OF_MAP; ++ pfn = memory_bm_next_pfn(pageset2_map)) { ++ int ret; ++ char *pa; ++ struct page *page = pfn_to_page(pfn); ++ ++ if (index % CHECKSUMS_PER_PAGE) { ++ this_checksum += CHECKSUM_SIZE; ++ } else { ++ this_checksum = next_page + sizeof(void *); ++ next_page = *((unsigned long *) next_page); ++ } ++ ++ /* Done when IRQs disabled so must be atomic */ ++ pa = kmap_atomic(page); ++ memcpy(ctx->buf, pa, PAGE_SIZE); ++ kunmap_atomic(pa); ++ ret = crypto_hash_digest(&ctx->desc, ctx->sg, PAGE_SIZE, ++ current_checksum); ++ ++ if (ret) { ++ printk(KERN_INFO "Digest failed. 
Returned %d.\n", ret); ++ return; ++ } ++ ++ if (memcmp(current_checksum, (char *) this_checksum, ++ CHECKSUM_SIZE)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Resaving %ld.", ++ pfn); ++ SetPageResave(pfn_to_page(pfn)); ++ toi_num_resaved++; ++ if (test_action_state(TOI_ABORT_ON_RESAVE_NEEDED)) ++ set_abort_result(TOI_RESAVE_NEEDED); ++ } ++ ++ index++; ++ } ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Checksum verification complete."); ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("enabled", SYSFS_RW, &toi_checksum_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_BIT("abort_if_resave_needed", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_ABORT_ON_RESAVE_NEEDED, 0) ++}; ++ ++/* ++ * Ops structure. ++ */ ++static struct toi_module_ops toi_checksum_ops = { ++ .type = MISC_MODULE, ++ .name = "checksumming", ++ .directory = "checksum", ++ .module = THIS_MODULE, ++ .initialise = toi_checksum_initialise, ++ .cleanup = toi_checksum_cleanup, ++ .print_debug_info = toi_checksum_print_debug_stats, ++ .save_config_info = toi_checksum_save_config_info, ++ .load_config_info = toi_checksum_load_config_info, ++ .memory_needed = toi_checksum_memory_needed, ++ .storage_needed = toi_checksum_storage_needed, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++int toi_checksum_init(void) ++{ ++ int result = toi_register_module(&toi_checksum_ops); ++ return result; ++} ++ ++void toi_checksum_exit(void) ++{ ++ toi_unregister_module(&toi_checksum_ops); ++} +diff --git a/kernel/power/tuxonice_checksum.h b/kernel/power/tuxonice_checksum.h +new file mode 100644 +index 0000000..0f2812e +--- /dev/null ++++ b/kernel/power/tuxonice_checksum.h +@@ -0,0 +1,31 @@ ++/* ++ * kernel/power/tuxonice_checksum.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains data checksum routines for TuxOnIce, ++ * using cryptoapi. They are used to locate any modifications ++ * made to pageset 2 while we're saving it. ++ */ ++ ++#if defined(CONFIG_TOI_CHECKSUM) ++extern int toi_checksum_init(void); ++extern void toi_checksum_exit(void); ++void check_checksums(void); ++int allocate_checksum_pages(void); ++void free_checksum_pages(void); ++char *tuxonice_get_next_checksum(void); ++int tuxonice_calc_checksum(struct page *page, char *checksum_locn); ++#else ++static inline int toi_checksum_init(void) { return 0; } ++static inline void toi_checksum_exit(void) { } ++static inline void check_checksums(void) { }; ++static inline int allocate_checksum_pages(void) { return 0; }; ++static inline void free_checksum_pages(void) { }; ++static inline char *tuxonice_get_next_checksum(void) { return NULL; }; ++static inline int tuxonice_calc_checksum(struct page *page, char *checksum_locn) ++ { return 0; } ++#endif ++ +diff --git a/kernel/power/tuxonice_cluster.c b/kernel/power/tuxonice_cluster.c +new file mode 100644 +index 0000000..0e5a262 +--- /dev/null ++++ b/kernel/power/tuxonice_cluster.c +@@ -0,0 +1,1069 @@ ++/* ++ * kernel/power/tuxonice_cluster.c ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains routines for cluster hibernation support. ++ * ++ * Based on ip autoconfiguration code in net/ipv4/ipconfig.c. ++ * ++ * How does it work? ++ * ++ * There is no 'master' node that tells everyone else what to do. 
All nodes ++ * send messages to the broadcast address/port, maintain a list of peers ++ * and figure out when to progress to the next step in hibernating or resuming. ++ * This makes us more fault tolerant when it comes to nodes coming and going ++ * (which may be more of an issue if we're hibernating when power supplies ++ * are being unreliable). ++ * ++ * At boot time, we start a ktuxonice thread that handles communication with ++ * other nodes. This node maintains a state machine that controls our progress ++ * through hibernating and resuming, keeping us in step with other nodes. Nodes ++ * are identified by their hw address. ++ * ++ * On startup, the node sends CLUSTER_PING on the configured interface's ++ * broadcast address, port $toi_cluster_port (see below) and begins to listen ++ * for other broadcast messages. CLUSTER_PING messages are repeated at ++ * intervals of 5 minutes, with a random offset to spread traffic out. ++ * ++ * A hibernation cycle is initiated from any node via ++ * ++ * echo > /sys/power/tuxonice/do_hibernate ++ * ++ * and (possibily) the hibernate script. At each step of the process, the node ++ * completes its work, and waits for all other nodes to signal completion of ++ * their work (or timeout) before progressing to the next step. ++ * ++ * Request/state Action before reply Possible reply Next state ++ * HIBERNATE capable, pre-script HIBERNATE|ACK NODE_PREP ++ * HIBERNATE|NACK INIT_0 ++ * ++ * PREP prepare_image PREP|ACK IMAGE_WRITE ++ * PREP|NACK INIT_0 ++ * ABORT RUNNING ++ * ++ * IO write image IO|ACK power off ++ * ABORT POST_RESUME ++ * ++ * (Boot time) check for image IMAGE|ACK RESUME_PREP ++ * (Note 1) ++ * IMAGE|NACK (Note 2) ++ * ++ * PREP prepare read image PREP|ACK IMAGE_READ ++ * PREP|NACK (As NACK_IMAGE) ++ * ++ * IO read image IO|ACK POST_RESUME ++ * ++ * POST_RESUME thaw, post-script RUNNING ++ * ++ * INIT_0 init 0 ++ * ++ * Other messages: ++ * ++ * - PING: Request for all other live nodes to send a PONG. Used at startup to ++ * announce presence, when a node is suspected dead and periodically, in case ++ * segments of the network are [un]plugged. ++ * ++ * - PONG: Response to a PING. ++ * ++ * - ABORT: Request to cancel writing an image. ++ * ++ * - BYE: Notification that this node is shutting down. ++ * ++ * Note 1: Repeated at 3s intervals until we continue to boot/resume, so that ++ * nodes which are slower to start up can get state synchronised. If a node ++ * starting up sees other nodes sending RESUME_PREP or IMAGE_READ, it may send ++ * ACK_IMAGE and they will wait for it to catch up. If it sees ACK_READ, it ++ * must invalidate its image (if any) and boot normally. ++ * ++ * Note 2: May occur when one node lost power or powered off while others ++ * hibernated. This node waits for others to complete resuming (ACK_READ) ++ * before completing its boot, so that it appears as a fail node restarting. ++ * ++ * If any node has an image, then it also has a list of nodes that hibernated ++ * in synchronisation with it. The node will wait for other nodes to appear ++ * or timeout before beginning its restoration. ++ * ++ * If a node has no image, it needs to wait, in case other nodes which do have ++ * an image are going to resume, but are taking longer to announce their ++ * presence. For this reason, the user can specify a timeout value and a number ++ * of nodes detected before we just continue. 
(We might want to assume in a ++ * cluster of, say, 15 nodes, if 8 others have booted without finding an image, ++ * the remaining nodes will too. This might help in situations where some nodes ++ * are much slower to boot, or more subject to hardware failures or such like). ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_io.h" ++ ++#if 1 ++#define PRINTK(a, b...) do { printk(a, ##b); } while (0) ++#else ++#define PRINTK(a, b...) do { } while (0) ++#endif ++ ++static int loopback_mode; ++static int num_local_nodes = 1; ++#define MAX_LOCAL_NODES 8 ++#define SADDR (loopback_mode ? b->sid : h->saddr) ++ ++#define MYNAME "TuxOnIce Clustering" ++ ++enum cluster_message { ++ MSG_ACK = 1, ++ MSG_NACK = 2, ++ MSG_PING = 4, ++ MSG_ABORT = 8, ++ MSG_BYE = 16, ++ MSG_HIBERNATE = 32, ++ MSG_IMAGE = 64, ++ MSG_IO = 128, ++ MSG_RUNNING = 256 ++}; ++ ++static char *str_message(int message) ++{ ++ switch (message) { ++ case 4: ++ return "Ping"; ++ case 8: ++ return "Abort"; ++ case 9: ++ return "Abort acked"; ++ case 10: ++ return "Abort nacked"; ++ case 16: ++ return "Bye"; ++ case 17: ++ return "Bye acked"; ++ case 18: ++ return "Bye nacked"; ++ case 32: ++ return "Hibernate request"; ++ case 33: ++ return "Hibernate ack"; ++ case 34: ++ return "Hibernate nack"; ++ case 64: ++ return "Image exists?"; ++ case 65: ++ return "Image does exist"; ++ case 66: ++ return "No image here"; ++ case 128: ++ return "I/O"; ++ case 129: ++ return "I/O okay"; ++ case 130: ++ return "I/O failed"; ++ case 256: ++ return "Running"; ++ default: ++ printk(KERN_ERR "Unrecognised message %d.\n", message); ++ return "Unrecognised message (see dmesg)"; ++ } ++} ++ ++#define MSG_ACK_MASK (MSG_ACK | MSG_NACK) ++#define MSG_STATE_MASK (~MSG_ACK_MASK) ++ ++struct node_info { ++ struct list_head member_list; ++ wait_queue_head_t member_events; ++ spinlock_t member_list_lock; ++ spinlock_t receive_lock; ++ int peer_count, ignored_peer_count; ++ struct toi_sysfs_data sysfs_data; ++ enum cluster_message current_message; ++}; ++ ++struct node_info node_array[MAX_LOCAL_NODES]; ++ ++struct cluster_member { ++ __be32 addr; ++ enum cluster_message message; ++ struct list_head list; ++ int ignore; ++}; ++ ++#define toi_cluster_port_send 3501 ++#define toi_cluster_port_recv 3502 ++ ++static struct net_device *net_dev; ++static struct toi_module_ops toi_cluster_ops; ++ ++static int toi_recv(struct sk_buff *skb, struct net_device *dev, ++ struct packet_type *pt, struct net_device *orig_dev); ++ ++static struct packet_type toi_cluster_packet_type = { ++ .type = __constant_htons(ETH_P_IP), ++ .func = toi_recv, ++}; ++ ++struct toi_pkt { /* BOOTP packet format */ ++ struct iphdr iph; /* IP header */ ++ struct udphdr udph; /* UDP header */ ++ u8 htype; /* HW address type */ ++ u8 hlen; /* HW address length */ ++ __be32 xid; /* Transaction ID */ ++ __be16 secs; /* Seconds since we started */ ++ __be16 flags; /* Just what it says */ ++ u8 hw_addr[16]; /* Sender's HW address */ ++ u16 message; /* Message */ ++ unsigned long sid; /* Source ID for loopback testing */ ++}; ++ ++static char toi_cluster_iface[IFNAMSIZ] = CONFIG_TOI_DEFAULT_CLUSTER_INTERFACE; ++ ++static int added_pack; ++ ++static int others_have_image; ++ ++/* Key used to allow multiple clusters on the same lan */ ++static char toi_cluster_key[32] = 
CONFIG_TOI_DEFAULT_CLUSTER_KEY; ++static char pre_hibernate_script[255] = ++ CONFIG_TOI_DEFAULT_CLUSTER_PRE_HIBERNATE; ++static char post_hibernate_script[255] = ++ CONFIG_TOI_DEFAULT_CLUSTER_POST_HIBERNATE; ++ ++/* List of cluster members */ ++static unsigned long continue_delay = 5 * HZ; ++static unsigned long cluster_message_timeout = 3 * HZ; ++ ++/* === Membership list === */ ++ ++static void print_member_info(int index) ++{ ++ struct cluster_member *this; ++ ++ printk(KERN_INFO "==> Dumping node %d.\n", index); ++ ++ list_for_each_entry(this, &node_array[index].member_list, list) ++ printk(KERN_INFO "%d.%d.%d.%d last message %s. %s\n", ++ NIPQUAD(this->addr), ++ str_message(this->message), ++ this->ignore ? "(Ignored)" : ""); ++ printk(KERN_INFO "== Done ==\n"); ++} ++ ++static struct cluster_member *__find_member(int index, __be32 addr) ++{ ++ struct cluster_member *this; ++ ++ list_for_each_entry(this, &node_array[index].member_list, list) { ++ if (this->addr != addr) ++ continue; ++ ++ return this; ++ } ++ ++ return NULL; ++} ++ ++static void set_ignore(int index, __be32 addr, struct cluster_member *this) ++{ ++ if (this->ignore) { ++ PRINTK("Node %d already ignoring %d.%d.%d.%d.\n", ++ index, NIPQUAD(addr)); ++ return; ++ } ++ ++ PRINTK("Node %d sees node %d.%d.%d.%d now being ignored.\n", ++ index, NIPQUAD(addr)); ++ this->ignore = 1; ++ node_array[index].ignored_peer_count++; ++} ++ ++static int __add_update_member(int index, __be32 addr, int message) ++{ ++ struct cluster_member *this; ++ ++ this = __find_member(index, addr); ++ if (this) { ++ if (this->message != message) { ++ this->message = message; ++ if ((message & MSG_NACK) && ++ (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) ++ set_ignore(index, addr, this); ++ PRINTK("Node %d sees node %d.%d.%d.%d now sending " ++ "%s.\n", index, NIPQUAD(addr), ++ str_message(message)); ++ wake_up(&node_array[index].member_events); ++ } ++ return 0; ++ } ++ ++ this = (struct cluster_member *) toi_kzalloc(36, ++ sizeof(struct cluster_member), GFP_KERNEL); ++ ++ if (!this) ++ return -1; ++ ++ this->addr = addr; ++ this->message = message; ++ this->ignore = 0; ++ INIT_LIST_HEAD(&this->list); ++ ++ node_array[index].peer_count++; ++ ++ PRINTK("Node %d sees node %d.%d.%d.%d sending %s.\n", index, ++ NIPQUAD(addr), str_message(message)); ++ ++ if ((message & MSG_NACK) && ++ (message & (MSG_HIBERNATE | MSG_IMAGE | MSG_IO))) ++ set_ignore(index, addr, this); ++ list_add_tail(&this->list, &node_array[index].member_list); ++ return 1; ++} ++ ++static int add_update_member(int index, __be32 addr, int message) ++{ ++ int result; ++ unsigned long flags; ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ result = __add_update_member(index, addr, message); ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++ ++ print_member_info(index); ++ ++ wake_up(&node_array[index].member_events); ++ ++ return result; ++} ++ ++static void del_member(int index, __be32 addr) ++{ ++ struct cluster_member *this; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ this = __find_member(index, addr); ++ ++ if (this) { ++ list_del_init(&this->list); ++ toi_kfree(36, this, sizeof(*this)); ++ node_array[index].peer_count--; ++ } ++ ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++} ++ ++/* === Message transmission === */ ++ ++static void toi_send_if(int message, unsigned long my_id); ++ ++/* ++ * Process received TOI packet. 
++ */ ++static int toi_recv(struct sk_buff *skb, struct net_device *dev, ++ struct packet_type *pt, struct net_device *orig_dev) ++{ ++ struct toi_pkt *b; ++ struct iphdr *h; ++ int len, result, index; ++ unsigned long addr, message, ack; ++ ++ /* Perform verifications before taking the lock. */ ++ if (skb->pkt_type == PACKET_OTHERHOST) ++ goto drop; ++ ++ if (dev != net_dev) ++ goto drop; ++ ++ skb = skb_share_check(skb, GFP_ATOMIC); ++ if (!skb) ++ return NET_RX_DROP; ++ ++ if (!pskb_may_pull(skb, ++ sizeof(struct iphdr) + ++ sizeof(struct udphdr))) ++ goto drop; ++ ++ b = (struct toi_pkt *)skb_network_header(skb); ++ h = &b->iph; ++ ++ if (h->ihl != 5 || h->version != 4 || h->protocol != IPPROTO_UDP) ++ goto drop; ++ ++ /* Fragments are not supported */ ++ if (h->frag_off & htons(IP_OFFSET | IP_MF)) { ++ if (net_ratelimit()) ++ printk(KERN_ERR "TuxOnIce: Ignoring fragmented " ++ "cluster message.\n"); ++ goto drop; ++ } ++ ++ if (skb->len < ntohs(h->tot_len)) ++ goto drop; ++ ++ if (ip_fast_csum((char *) h, h->ihl)) ++ goto drop; ++ ++ if (b->udph.source != htons(toi_cluster_port_send) || ++ b->udph.dest != htons(toi_cluster_port_recv)) ++ goto drop; ++ ++ if (ntohs(h->tot_len) < ntohs(b->udph.len) + sizeof(struct iphdr)) ++ goto drop; ++ ++ len = ntohs(b->udph.len) - sizeof(struct udphdr); ++ ++ /* Ok the front looks good, make sure we can get at the rest. */ ++ if (!pskb_may_pull(skb, skb->len)) ++ goto drop; ++ ++ b = (struct toi_pkt *)skb_network_header(skb); ++ h = &b->iph; ++ ++ addr = SADDR; ++ PRINTK(">>> Message %s received from " NIPQUAD_FMT ".\n", ++ str_message(b->message), NIPQUAD(addr)); ++ ++ message = b->message & MSG_STATE_MASK; ++ ack = b->message & MSG_ACK_MASK; ++ ++ for (index = 0; index < num_local_nodes; index++) { ++ int new_message = node_array[index].current_message, ++ old_message = new_message; ++ ++ if (index == SADDR || !old_message) { ++ PRINTK("Ignoring node %d (offline or self).\n", index); ++ continue; ++ } ++ ++ /* One message at a time, please. */ ++ spin_lock(&node_array[index].receive_lock); ++ ++ result = add_update_member(index, SADDR, b->message); ++ if (result == -1) { ++ printk(KERN_INFO "Failed to add new cluster member " ++ NIPQUAD_FMT ".\n", ++ NIPQUAD(addr)); ++ goto drop_unlock; ++ } ++ ++ switch (b->message & MSG_STATE_MASK) { ++ case MSG_PING: ++ break; ++ case MSG_ABORT: ++ break; ++ case MSG_BYE: ++ break; ++ case MSG_HIBERNATE: ++ /* Can I hibernate? */ ++ new_message = MSG_HIBERNATE | ++ ((index & 1) ? MSG_NACK : MSG_ACK); ++ break; ++ case MSG_IMAGE: ++ /* Can I resume? */ ++ new_message = MSG_IMAGE | ++ ((index & 1) ? MSG_NACK : MSG_ACK); ++ if (new_message != old_message) ++ printk(KERN_ERR "Setting whether I can resume " ++ "to %d.\n", new_message); ++ break; ++ case MSG_IO: ++ new_message = MSG_IO | MSG_ACK; ++ break; ++ case MSG_RUNNING: ++ break; ++ default: ++ if (net_ratelimit()) ++ printk(KERN_ERR "Unrecognised TuxOnIce cluster" ++ " message %d from " NIPQUAD_FMT ".\n", ++ b->message, NIPQUAD(addr)); ++ }; ++ ++ if (old_message != new_message) { ++ node_array[index].current_message = new_message; ++ printk(KERN_INFO ">>> Sending new message for node " ++ "%d.\n", index); ++ toi_send_if(new_message, index); ++ } else if (!ack) { ++ printk(KERN_INFO ">>> Resending message for node %d.\n", ++ index); ++ toi_send_if(new_message, index); ++ } ++drop_unlock: ++ spin_unlock(&node_array[index].receive_lock); ++ }; ++ ++drop: ++ /* Throw the packet out. 
*/ ++ kfree_skb(skb); ++ ++ return 0; ++} ++ ++/* ++ * Send cluster message to single interface. ++ */ ++static void toi_send_if(int message, unsigned long my_id) ++{ ++ struct sk_buff *skb; ++ struct toi_pkt *b; ++ int hh_len = LL_RESERVED_SPACE(net_dev); ++ struct iphdr *h; ++ ++ /* Allocate packet */ ++ skb = alloc_skb(sizeof(struct toi_pkt) + hh_len + 15, GFP_KERNEL); ++ if (!skb) ++ return; ++ skb_reserve(skb, hh_len); ++ b = (struct toi_pkt *) skb_put(skb, sizeof(struct toi_pkt)); ++ memset(b, 0, sizeof(struct toi_pkt)); ++ ++ /* Construct IP header */ ++ skb_reset_network_header(skb); ++ h = ip_hdr(skb); ++ h->version = 4; ++ h->ihl = 5; ++ h->tot_len = htons(sizeof(struct toi_pkt)); ++ h->frag_off = htons(IP_DF); ++ h->ttl = 64; ++ h->protocol = IPPROTO_UDP; ++ h->daddr = htonl(INADDR_BROADCAST); ++ h->check = ip_fast_csum((unsigned char *) h, h->ihl); ++ ++ /* Construct UDP header */ ++ b->udph.source = htons(toi_cluster_port_send); ++ b->udph.dest = htons(toi_cluster_port_recv); ++ b->udph.len = htons(sizeof(struct toi_pkt) - sizeof(struct iphdr)); ++ /* UDP checksum not calculated -- explicitly allowed in BOOTP RFC */ ++ ++ /* Construct message */ ++ b->message = message; ++ b->sid = my_id; ++ b->htype = net_dev->type; /* can cause undefined behavior */ ++ b->hlen = net_dev->addr_len; ++ memcpy(b->hw_addr, net_dev->dev_addr, net_dev->addr_len); ++ b->secs = htons(3); /* 3 seconds */ ++ ++ /* Chain packet down the line... */ ++ skb->dev = net_dev; ++ skb->protocol = htons(ETH_P_IP); ++ if ((dev_hard_header(skb, net_dev, ntohs(skb->protocol), ++ net_dev->broadcast, net_dev->dev_addr, skb->len) < 0) || ++ dev_queue_xmit(skb) < 0) ++ printk(KERN_INFO "E"); ++} ++ ++/* ========================================= */ ++ ++/* kTOICluster */ ++ ++static atomic_t num_cluster_threads; ++static DECLARE_WAIT_QUEUE_HEAD(clusterd_events); ++ ++static int kTOICluster(void *data) ++{ ++ unsigned long my_id; ++ ++ my_id = atomic_add_return(1, &num_cluster_threads) - 1; ++ node_array[my_id].current_message = (unsigned long) data; ++ ++ PRINTK("kTOICluster daemon %lu starting.\n", my_id); ++ ++ current->flags |= PF_NOFREEZE; ++ ++ while (node_array[my_id].current_message) { ++ toi_send_if(node_array[my_id].current_message, my_id); ++ sleep_on_timeout(&clusterd_events, ++ cluster_message_timeout); ++ PRINTK("Link state %lu is %d.\n", my_id, ++ node_array[my_id].current_message); ++ } ++ ++ toi_send_if(MSG_BYE, my_id); ++ atomic_dec(&num_cluster_threads); ++ wake_up(&clusterd_events); ++ ++ PRINTK("kTOICluster daemon %lu exiting.\n", my_id); ++ __set_current_state(TASK_RUNNING); ++ return 0; ++} ++ ++static void kill_clusterd(void) ++{ ++ int i; ++ ++ for (i = 0; i < num_local_nodes; i++) { ++ if (node_array[i].current_message) { ++ PRINTK("Seeking to kill clusterd %d.\n", i); ++ node_array[i].current_message = 0; ++ } ++ } ++ wait_event(clusterd_events, ++ !atomic_read(&num_cluster_threads)); ++ PRINTK("All cluster daemons have exited.\n"); ++} ++ ++static int peers_not_in_message(int index, int message, int precise) ++{ ++ struct cluster_member *this; ++ unsigned long flags; ++ int result = 0; ++ ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ list_for_each_entry(this, &node_array[index].member_list, list) { ++ if (this->ignore) ++ continue; ++ ++ PRINTK("Peer %d.%d.%d.%d sending %s. " ++ "Seeking %s.\n", ++ NIPQUAD(this->addr), ++ str_message(this->message), str_message(message)); ++ if ((precise ? 
this->message : ++ this->message & MSG_STATE_MASK) != ++ message) ++ result++; ++ } ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++ PRINTK("%d peers in sought message.\n", result); ++ return result; ++} ++ ++static void reset_ignored(int index) ++{ ++ struct cluster_member *this; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&node_array[index].member_list_lock, flags); ++ list_for_each_entry(this, &node_array[index].member_list, list) ++ this->ignore = 0; ++ node_array[index].ignored_peer_count = 0; ++ spin_unlock_irqrestore(&node_array[index].member_list_lock, flags); ++} ++ ++static int peers_in_message(int index, int message, int precise) ++{ ++ return node_array[index].peer_count - ++ node_array[index].ignored_peer_count - ++ peers_not_in_message(index, message, precise); ++} ++ ++static int time_to_continue(int index, unsigned long start, int message) ++{ ++ int first = peers_not_in_message(index, message, 0); ++ int second = peers_in_message(index, message, 1); ++ ++ PRINTK("First part returns %d, second returns %d.\n", first, second); ++ ++ if (!first && !second) { ++ PRINTK("All peers answered message %d.\n", ++ message); ++ return 1; ++ } ++ ++ if (time_after(jiffies, start + continue_delay)) { ++ PRINTK("Timeout reached.\n"); ++ return 1; ++ } ++ ++ PRINTK("Not time to continue yet (%lu < %lu).\n", jiffies, ++ start + continue_delay); ++ return 0; ++} ++ ++void toi_initiate_cluster_hibernate(void) ++{ ++ int result; ++ unsigned long start; ++ ++ result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); ++ if (result) ++ return; ++ ++ toi_send_if(MSG_HIBERNATE, 0); ++ ++ start = jiffies; ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_HIBERNATE)); ++ ++ if (test_action_state(TOI_FREEZER_TEST)) { ++ toi_send_if(MSG_ABORT, 0); ++ ++ start = jiffies; ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_RUNNING)); ++ ++ do_toi_step(STEP_QUIET_CLEANUP); ++ return; ++ } ++ ++ toi_send_if(MSG_IO, 0); ++ ++ result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); ++ if (result) ++ return; ++ ++ /* This code runs at resume time too! */ ++ if (toi_in_hibernate) ++ result = do_toi_step(STEP_HIBERNATE_POWERDOWN); ++} ++EXPORT_SYMBOL_GPL(toi_initiate_cluster_hibernate); ++ ++/* toi_cluster_print_debug_stats ++ * ++ * Description: Print information to be recorded for debugging purposes into a ++ * buffer. ++ * Arguments: buffer: Pointer to a buffer into which the debug info will be ++ * printed. ++ * size: Size of the buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++static int toi_cluster_print_debug_stats(char *buffer, int size) ++{ ++ int len; ++ ++ if (strlen(toi_cluster_iface)) ++ len = scnprintf(buffer, size, ++ "- Cluster interface is '%s'.\n", ++ toi_cluster_iface); ++ else ++ len = scnprintf(buffer, size, ++ "- Cluster support is disabled.\n"); ++ return len; ++} ++ ++/* cluster_memory_needed ++ * ++ * Description: Tell the caller how much memory we need to operate during ++ * hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_cluster_memory_needed(void) ++{ ++ return 0; ++} ++ ++static int toi_cluster_storage_needed(void) ++{ ++ return 1 + strlen(toi_cluster_iface); ++} ++ ++/* toi_cluster_save_config_info ++ * ++ * Description: Save informaton needed when reloading the image at resume time. ++ * Arguments: Buffer: Pointer to a buffer of size PAGE_SIZE. ++ * Returns: Number of bytes used for saving our data. 
++ */ ++static int toi_cluster_save_config_info(char *buffer) ++{ ++ strcpy(buffer, toi_cluster_iface); ++ return strlen(toi_cluster_iface + 1); ++} ++ ++/* toi_cluster_load_config_info ++ * ++ * Description: Reload information needed for declustering the image at ++ * resume time. ++ * Arguments: Buffer: Pointer to the start of the data. ++ * Size: Number of bytes that were saved. ++ */ ++static void toi_cluster_load_config_info(char *buffer, int size) ++{ ++ strncpy(toi_cluster_iface, buffer, size); ++ return; ++} ++ ++static void cluster_startup(void) ++{ ++ int have_image = do_check_can_resume(), i; ++ unsigned long start = jiffies, initial_message; ++ struct task_struct *p; ++ ++ initial_message = MSG_IMAGE; ++ ++ have_image = 1; ++ ++ for (i = 0; i < num_local_nodes; i++) { ++ PRINTK("Starting ktoiclusterd %d.\n", i); ++ p = kthread_create(kTOICluster, (void *) initial_message, ++ "ktoiclusterd/%d", i); ++ if (IS_ERR(p)) { ++ printk(KERN_ERR "Failed to start ktoiclusterd.\n"); ++ return; ++ } ++ ++ wake_up_process(p); ++ } ++ ++ /* Wait for delay or someone else sending first message */ ++ wait_event(node_array[0].member_events, time_to_continue(0, start, ++ MSG_IMAGE)); ++ ++ others_have_image = peers_in_message(0, MSG_IMAGE | MSG_ACK, 1); ++ ++ printk(KERN_INFO "Continuing. I %shave an image. Peers with image:" ++ " %d.\n", have_image ? "" : "don't ", others_have_image); ++ ++ if (have_image) { ++ int result; ++ ++ /* Start to resume */ ++ printk(KERN_INFO " === Starting to resume === \n"); ++ node_array[0].current_message = MSG_IO; ++ toi_send_if(MSG_IO, 0); ++ ++ /* result = do_toi_step(STEP_RESUME_LOAD_PS1); */ ++ result = 0; ++ ++ if (!result) { ++ /* ++ * Atomic restore - we'll come back in the hibernation ++ * path. ++ */ ++ ++ /* result = do_toi_step(STEP_RESUME_DO_RESTORE); */ ++ result = 0; ++ ++ /* do_toi_step(STEP_QUIET_CLEANUP); */ ++ } ++ ++ node_array[0].current_message |= MSG_NACK; ++ ++ /* For debugging - disable for real life? */ ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_IO)); ++ } ++ ++ if (others_have_image) { ++ /* Wait for them to resume */ ++ printk(KERN_INFO "Waiting for other nodes to resume.\n"); ++ start = jiffies; ++ wait_event(node_array[0].member_events, ++ time_to_continue(0, start, MSG_RUNNING)); ++ if (peers_not_in_message(0, MSG_RUNNING, 0)) ++ printk(KERN_INFO "Timed out while waiting for other " ++ "nodes to resume.\n"); ++ } ++ ++ /* Find out whether an image exists here. Send ACK_IMAGE or NACK_IMAGE ++ * as appropriate. ++ * ++ * If we don't have an image: ++ * - Wait until someone else says they have one, or conditions are met ++ * for continuing to boot (n machines or t seconds). ++ * - If anyone has an image, wait for them to resume before continuing ++ * to boot. ++ * ++ * If we have an image: ++ * - Wait until conditions are met before continuing to resume (n ++ * machines or t seconds). Send RESUME_PREP and freeze processes. ++ * NACK_PREP if freezing fails (shouldn't) and follow logic for ++ * us having no image above. On success, wait for [N]ACK_PREP from ++ * other machines. Read image (including atomic restore) until done. ++ * Wait for ACK_READ from others (should never fail). Thaw processes ++ * and do post-resume. (The section after the atomic restore is done ++ * via the code for hibernating). ++ */ ++ ++ node_array[0].current_message = MSG_RUNNING; ++} ++ ++/* toi_cluster_open_iface ++ * ++ * Description: Prepare to use an interface. 
++ */ ++ ++static int toi_cluster_open_iface(void) ++{ ++ struct net_device *dev; ++ ++ rtnl_lock(); ++ ++ for_each_netdev(&init_net, dev) { ++ if (/* dev == &init_net.loopback_dev || */ ++ strcmp(dev->name, toi_cluster_iface)) ++ continue; ++ ++ net_dev = dev; ++ break; ++ } ++ ++ rtnl_unlock(); ++ ++ if (!net_dev) { ++ printk(KERN_ERR MYNAME ": Device %s not found.\n", ++ toi_cluster_iface); ++ return -ENODEV; ++ } ++ ++ dev_add_pack(&toi_cluster_packet_type); ++ added_pack = 1; ++ ++ loopback_mode = (net_dev == init_net.loopback_dev); ++ num_local_nodes = loopback_mode ? 8 : 1; ++ ++ PRINTK("Loopback mode is %s. Number of local nodes is %d.\n", ++ loopback_mode ? "on" : "off", num_local_nodes); ++ ++ cluster_startup(); ++ return 0; ++} ++ ++/* toi_cluster_close_iface ++ * ++ * Description: Stop using an interface. ++ */ ++ ++static int toi_cluster_close_iface(void) ++{ ++ kill_clusterd(); ++ if (added_pack) { ++ dev_remove_pack(&toi_cluster_packet_type); ++ added_pack = 0; ++ } ++ return 0; ++} ++ ++static void write_side_effect(void) ++{ ++ if (toi_cluster_ops.enabled) { ++ toi_cluster_open_iface(); ++ set_toi_state(TOI_CLUSTER_MODE); ++ } else { ++ toi_cluster_close_iface(); ++ clear_toi_state(TOI_CLUSTER_MODE); ++ } ++} ++ ++static void node_write_side_effect(void) ++{ ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_STRING("interface", SYSFS_RW, toi_cluster_iface, IFNAMSIZ, 0, ++ NULL), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_cluster_ops.enabled, 0, 1, 0, ++ write_side_effect), ++ SYSFS_STRING("cluster_name", SYSFS_RW, toi_cluster_key, 32, 0, NULL), ++ SYSFS_STRING("pre-hibernate-script", SYSFS_RW, pre_hibernate_script, ++ 256, 0, NULL), ++ SYSFS_STRING("post-hibernate-script", SYSFS_RW, post_hibernate_script, ++ 256, 0, STRING), ++ SYSFS_UL("continue_delay", SYSFS_RW, &continue_delay, HZ / 2, 60 * HZ, ++ 0) ++}; ++ ++/* ++ * Ops structure. 
++ */ ++ ++static struct toi_module_ops toi_cluster_ops = { ++ .type = FILTER_MODULE, ++ .name = "Cluster", ++ .directory = "cluster", ++ .module = THIS_MODULE, ++ .memory_needed = toi_cluster_memory_needed, ++ .print_debug_info = toi_cluster_print_debug_stats, ++ .save_config_info = toi_cluster_save_config_info, ++ .load_config_info = toi_cluster_load_config_info, ++ .storage_needed = toi_cluster_storage_needed, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++#ifdef MODULE ++#define INIT static __init ++#define EXIT static __exit ++#else ++#define INIT ++#define EXIT ++#endif ++ ++INIT int toi_cluster_init(void) ++{ ++ int temp = toi_register_module(&toi_cluster_ops), i; ++ struct kobject *kobj = toi_cluster_ops.dir_kobj; ++ ++ for (i = 0; i < MAX_LOCAL_NODES; i++) { ++ node_array[i].current_message = 0; ++ INIT_LIST_HEAD(&node_array[i].member_list); ++ init_waitqueue_head(&node_array[i].member_events); ++ spin_lock_init(&node_array[i].member_list_lock); ++ spin_lock_init(&node_array[i].receive_lock); ++ ++ /* Set up sysfs entry */ ++ node_array[i].sysfs_data.attr.name = toi_kzalloc(8, ++ sizeof(node_array[i].sysfs_data.attr.name), ++ GFP_KERNEL); ++ sprintf((char *) node_array[i].sysfs_data.attr.name, "node_%d", ++ i); ++ node_array[i].sysfs_data.attr.mode = SYSFS_RW; ++ node_array[i].sysfs_data.type = TOI_SYSFS_DATA_INTEGER; ++ node_array[i].sysfs_data.flags = 0; ++ node_array[i].sysfs_data.data.integer.variable = ++ (int *) &node_array[i].current_message; ++ node_array[i].sysfs_data.data.integer.minimum = 0; ++ node_array[i].sysfs_data.data.integer.maximum = INT_MAX; ++ node_array[i].sysfs_data.write_side_effect = ++ node_write_side_effect; ++ toi_register_sysfs_file(kobj, &node_array[i].sysfs_data); ++ } ++ ++ toi_cluster_ops.enabled = (strlen(toi_cluster_iface) > 0); ++ ++ if (toi_cluster_ops.enabled) ++ toi_cluster_open_iface(); ++ ++ return temp; ++} ++ ++EXIT void toi_cluster_exit(void) ++{ ++ int i; ++ toi_cluster_close_iface(); ++ ++ for (i = 0; i < MAX_LOCAL_NODES; i++) ++ toi_unregister_sysfs_file(toi_cluster_ops.dir_kobj, ++ &node_array[i].sysfs_data); ++ toi_unregister_module(&toi_cluster_ops); ++} ++ ++static int __init toi_cluster_iface_setup(char *iface) ++{ ++ toi_cluster_ops.enabled = (*iface && ++ strcmp(iface, "off")); ++ ++ if (toi_cluster_ops.enabled) ++ strncpy(toi_cluster_iface, iface, strlen(iface)); ++} ++ ++__setup("toi_cluster=", toi_cluster_iface_setup); ++ ++#ifdef MODULE ++MODULE_LICENSE("GPL"); ++module_init(toi_cluster_init); ++module_exit(toi_cluster_exit); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Cluster Support for TuxOnIce"); ++#endif +diff --git a/kernel/power/tuxonice_cluster.h b/kernel/power/tuxonice_cluster.h +new file mode 100644 +index 0000000..051feb3 +--- /dev/null ++++ b/kernel/power/tuxonice_cluster.h +@@ -0,0 +1,18 @@ ++/* ++ * kernel/power/tuxonice_cluster.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++ ++#ifdef CONFIG_TOI_CLUSTER ++extern int toi_cluster_init(void); ++extern void toi_cluster_exit(void); ++extern void toi_initiate_cluster_hibernate(void); ++#else ++static inline int toi_cluster_init(void) { return 0; } ++static inline void toi_cluster_exit(void) { } ++static inline void toi_initiate_cluster_hibernate(void) { } ++#endif ++ +diff --git a/kernel/power/tuxonice_compress.c b/kernel/power/tuxonice_compress.c +new file mode 100644 +index 0000000..2d89c4c +--- /dev/null ++++ b/kernel/power/tuxonice_compress.c +@@ -0,0 +1,465 @@ ++/* ++ * kernel/power/compression.c ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains data compression routines for TuxOnIce, ++ * using cryptoapi. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_builtin.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++ ++static int toi_expected_compression; ++ ++static struct toi_module_ops toi_compression_ops; ++static struct toi_module_ops *next_driver; ++ ++static char toi_compressor_name[32] = "lzo"; ++ ++static DEFINE_MUTEX(stats_lock); ++ ++struct cpu_context { ++ u8 *page_buffer; ++ struct crypto_comp *transform; ++ unsigned int len; ++ u8 *buffer_start; ++ u8 *output_buffer; ++}; ++ ++#define OUT_BUF_SIZE (2 * PAGE_SIZE) ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++ ++/* ++ * toi_crypto_prepare ++ * ++ * Prepare to do some work by allocating buffers and transforms. ++ */ ++static int toi_compress_crypto_prepare(void) ++{ ++ int cpu; ++ ++ if (!*toi_compressor_name) { ++ printk(KERN_INFO "TuxOnIce: Compression enabled but no " ++ "compressor name set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ this->transform = crypto_alloc_comp(toi_compressor_name, 0, 0); ++ if (IS_ERR(this->transform)) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s compression transform.\n", ++ toi_compressor_name); ++ this->transform = NULL; ++ return 1; ++ } ++ ++ this->page_buffer = ++ (char *) toi_get_zeroed_page(16, TOI_ATOMIC_GFP); ++ ++ if (!this->page_buffer) { ++ printk(KERN_ERR ++ "Failed to allocate a page buffer for TuxOnIce " ++ "compression driver.\n"); ++ return -ENOMEM; ++ } ++ ++ this->output_buffer = ++ (char *) vmalloc_32(OUT_BUF_SIZE); ++ ++ if (!this->output_buffer) { ++ printk(KERN_ERR ++ "Failed to allocate a output buffer for TuxOnIce " ++ "compression driver.\n"); ++ return -ENOMEM; ++ } ++ } ++ ++ return 0; ++} ++ ++static int toi_compress_rw_cleanup(int writing) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->transform) { ++ crypto_free_comp(this->transform); ++ this->transform = NULL; ++ } ++ ++ if (this->page_buffer) ++ toi_free_page(16, (unsigned long) this->page_buffer); ++ ++ this->page_buffer = NULL; ++ ++ if (this->output_buffer) ++ vfree(this->output_buffer); ++ ++ this->output_buffer = NULL; ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_compress_init ++ */ ++ ++static int toi_compress_init(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return 0; ++ ++ toi_compress_bytes_in = 0; ++ toi_compress_bytes_out = 0; ++ ++ next_driver = toi_get_next_filter(&toi_compression_ops); ++ ++ return next_driver ? 
0 : -ECHILD; ++} ++ ++/* ++ * toi_compress_rw_init() ++ */ ++ ++static int toi_compress_rw_init(int rw, int stream_number) ++{ ++ if (toi_compress_crypto_prepare()) { ++ printk(KERN_ERR "Failed to initialise compression " ++ "algorithm.\n"); ++ if (rw == READ) { ++ printk(KERN_INFO "Unable to read the image.\n"); ++ return -ENODEV; ++ } else { ++ printk(KERN_INFO "Continuing without " ++ "compressing the image.\n"); ++ toi_compression_ops.enabled = 0; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_compress_write_page() ++ * ++ * Compress a page of data, buffering output and passing on filled ++ * pages to the next module in the pipeline. ++ * ++ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing ++ * data to be compressed. ++ * ++ * Returns: 0 on success. Otherwise the error is that returned by later ++ * modules, -ECHILD if we have a broken pipeline or -EIO if ++ * zlib errs. ++ */ ++static int toi_compress_write_page(unsigned long index, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ int ret = 0, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ u8* output_buffer = buffer_page; ++ int output_len = buf_size; ++ int out_buf_type = buf_type; ++ ++ if (ctx->transform) { ++ ++ ctx->buffer_start = TOI_MAP(buf_type, buffer_page); ++ ctx->len = OUT_BUF_SIZE; ++ ++ ret = crypto_comp_compress(ctx->transform, ++ ctx->buffer_start, buf_size, ++ ctx->output_buffer, &ctx->len); ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ ++ toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, ++ "CPU %d, index %lu: %d bytes", ++ cpu, index, ctx->len); ++ ++ if (!ret && ctx->len < buf_size) { /* some compression */ ++ output_buffer = ctx->output_buffer; ++ output_len = ctx->len; ++ out_buf_type = TOI_VIRT; ++ } ++ ++ } ++ ++ mutex_lock(&stats_lock); ++ ++ toi_compress_bytes_in += buf_size; ++ toi_compress_bytes_out += output_len; ++ ++ mutex_unlock(&stats_lock); ++ ++ if (!ret) ++ ret = next_driver->write_page(index, out_buf_type, ++ output_buffer, output_len); ++ ++ return ret; ++} ++ ++/* ++ * toi_compress_read_page() ++ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Retrieve data from later modules and decompress it until the input buffer ++ * is filled. ++ * Zero if successful. Error condition from me or from downstream on failure. ++ */ ++static int toi_compress_read_page(unsigned long *index, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ int ret, cpu = smp_processor_id(); ++ unsigned int len; ++ unsigned int outlen = PAGE_SIZE; ++ char *buffer_start; ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!ctx->transform) ++ return next_driver->read_page(index, TOI_PAGE, buffer_page, ++ buf_size); ++ ++ /* ++ * All our reads must be synchronous - we can't decompress ++ * data that hasn't been read yet. 
++ */ ++ ++ ret = next_driver->read_page(index, TOI_VIRT, ctx->page_buffer, &len); ++ ++ buffer_start = kmap(buffer_page); ++ ++ /* Error or uncompressed data */ ++ if (ret || len == PAGE_SIZE) { ++ memcpy(buffer_start, ctx->page_buffer, len); ++ goto out; ++ } ++ ++ ret = crypto_comp_decompress( ++ ctx->transform, ++ ctx->page_buffer, ++ len, buffer_start, &outlen); ++ ++ toi_message(TOI_COMPRESS, TOI_VERBOSE, 0, ++ "CPU %d, index %lu: %d=>%d (%d).", ++ cpu, *index, len, outlen, ret); ++ ++ if (ret) ++ abort_hibernate(TOI_FAILED_IO, ++ "Compress_read returned %d.\n", ret); ++ else if (outlen != PAGE_SIZE) { ++ abort_hibernate(TOI_FAILED_IO, ++ "Decompression yielded %d bytes instead of %ld.\n", ++ outlen, PAGE_SIZE); ++ printk(KERN_ERR "Decompression yielded %d bytes instead of " ++ "%ld.\n", outlen, PAGE_SIZE); ++ ret = -EIO; ++ *buf_size = outlen; ++ } ++out: ++ TOI_UNMAP(buf_type, buffer_page); ++ return ret; ++} ++ ++/* ++ * toi_compress_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_compress_print_debug_stats(char *buffer, int size) ++{ ++ unsigned long pages_in = toi_compress_bytes_in >> PAGE_SHIFT, ++ pages_out = toi_compress_bytes_out >> PAGE_SHIFT; ++ int len; ++ ++ /* Output the compression ratio achieved. */ ++ if (*toi_compressor_name) ++ len = scnprintf(buffer, size, "- Compressor is '%s'.\n", ++ toi_compressor_name); ++ else ++ len = scnprintf(buffer, size, "- Compressor is not set.\n"); ++ ++ if (pages_in) ++ len += scnprintf(buffer+len, size - len, " Compressed " ++ "%lu bytes into %lu (%ld percent compression).\n", ++ toi_compress_bytes_in, ++ toi_compress_bytes_out, ++ (pages_in - pages_out) * 100 / pages_in); ++ return len; ++} ++ ++/* ++ * toi_compress_compression_memory_needed ++ * ++ * Tell the caller how much memory we need to operate during hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_compress_memory_needed(void) ++{ ++ return 2 * PAGE_SIZE; ++} ++ ++static int toi_compress_storage_needed(void) ++{ ++ return 2 * sizeof(unsigned long) + 2 * sizeof(int) + ++ strlen(toi_compressor_name) + 1; ++} ++ ++/* ++ * toi_compress_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save informaton needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data. ++ */ ++static int toi_compress_save_config_info(char *buffer) ++{ ++ int len = strlen(toi_compressor_name) + 1, offset = 0; ++ ++ *((unsigned long *) buffer) = toi_compress_bytes_in; ++ offset += sizeof(unsigned long); ++ *((unsigned long *) (buffer + offset)) = toi_compress_bytes_out; ++ offset += sizeof(unsigned long); ++ *((int *) (buffer + offset)) = toi_expected_compression; ++ offset += sizeof(int); ++ *((int *) (buffer + offset)) = len; ++ offset += sizeof(int); ++ strncpy(buffer + offset, toi_compressor_name, len); ++ return offset + len; ++} ++ ++/* toi_compress_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information needed for decompressing the image at ++ * resume time. 
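++ * The buffer layout mirrors toi_compress_save_config_info() above: two
++ * unsigned longs (bytes in/out), an int (expected compression), an int
++ * (name length), then the NUL-terminated compressor name.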
++ */ ++static void toi_compress_load_config_info(char *buffer, int size) ++{ ++ int len, offset = 0; ++ ++ toi_compress_bytes_in = *((unsigned long *) buffer); ++ offset += sizeof(unsigned long); ++ toi_compress_bytes_out = *((unsigned long *) (buffer + offset)); ++ offset += sizeof(unsigned long); ++ toi_expected_compression = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ len = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ strncpy(toi_compressor_name, buffer + offset, len); ++} ++ ++static void toi_compress_pre_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ bkd->compress_bytes_in = toi_compress_bytes_in; ++ bkd->compress_bytes_out = toi_compress_bytes_out; ++} ++ ++static void toi_compress_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_compress_bytes_in = bkd->compress_bytes_in; ++ toi_compress_bytes_out = bkd->compress_bytes_out; ++} ++ ++/* ++ * toi_expected_compression_ratio ++ * ++ * Description: Returns the expected ratio between data passed into this module ++ * and the amount of data output when writing. ++ * Returns: 100 if the module is disabled. Otherwise the value set by the ++ * user via our sysfs entry. ++ */ ++ ++static int toi_compress_expected_ratio(void) ++{ ++ if (!toi_compression_ops.enabled) ++ return 100; ++ else ++ return 100 - toi_expected_compression; ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("expected_compression", SYSFS_RW, &toi_expected_compression, ++ 0, 99, 0, NULL), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_compression_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_STRING("algorithm", SYSFS_RW, toi_compressor_name, 31, 0, NULL), ++}; ++ ++/* ++ * Ops structure. ++ */ ++static struct toi_module_ops toi_compression_ops = { ++ .type = FILTER_MODULE, ++ .name = "compression", ++ .directory = "compression", ++ .module = THIS_MODULE, ++ .initialise = toi_compress_init, ++ .memory_needed = toi_compress_memory_needed, ++ .print_debug_info = toi_compress_print_debug_stats, ++ .save_config_info = toi_compress_save_config_info, ++ .load_config_info = toi_compress_load_config_info, ++ .storage_needed = toi_compress_storage_needed, ++ .expected_compression = toi_compress_expected_ratio, ++ ++ .pre_atomic_restore = toi_compress_pre_atomic_restore, ++ .post_atomic_restore = toi_compress_post_atomic_restore, ++ ++ .rw_init = toi_compress_rw_init, ++ .rw_cleanup = toi_compress_rw_cleanup, ++ ++ .write_page = toi_compress_write_page, ++ .read_page = toi_compress_read_page, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++static __init int toi_compress_load(void) ++{ ++ return toi_register_module(&toi_compression_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_compress_unload(void) ++{ ++ toi_unregister_module(&toi_compression_ops); ++} ++ ++module_init(toi_compress_load); ++module_exit(toi_compress_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Compression Support for TuxOnIce"); ++#else ++late_initcall(toi_compress_load); ++#endif +diff --git a/kernel/power/tuxonice_extent.c b/kernel/power/tuxonice_extent.c +new file mode 100644 +index 0000000..e84572c +--- /dev/null ++++ b/kernel/power/tuxonice_extent.c +@@ -0,0 +1,123 @@ ++/* ++ * kernel/power/tuxonice_extent.c ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. 
++ * ++ * These functions encapsulate the manipulation of storage metadata. ++ */ ++ ++#include ++#include "tuxonice_modules.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_ui.h" ++#include "tuxonice.h" ++ ++/** ++ * toi_get_extent - return a free extent ++ * ++ * May fail, returning NULL instead. ++ **/ ++static struct hibernate_extent *toi_get_extent(void) ++{ ++ return (struct hibernate_extent *) toi_kzalloc(2, ++ sizeof(struct hibernate_extent), TOI_ATOMIC_GFP); ++} ++ ++/** ++ * toi_put_extent_chain - free a whole chain of extents ++ * @chain: Chain to free. ++ **/ ++void toi_put_extent_chain(struct hibernate_extent_chain *chain) ++{ ++ struct hibernate_extent *this; ++ ++ this = chain->first; ++ ++ while (this) { ++ struct hibernate_extent *next = this->next; ++ toi_kfree(2, this, sizeof(*this)); ++ chain->num_extents--; ++ this = next; ++ } ++ ++ chain->first = NULL; ++ chain->last_touched = NULL; ++ chain->current_extent = NULL; ++ chain->size = 0; ++} ++EXPORT_SYMBOL_GPL(toi_put_extent_chain); ++ ++/** ++ * toi_add_to_extent_chain - add an extent to an existing chain ++ * @chain: Chain to which the extend should be added ++ * @start: Start of the extent (first physical block) ++ * @end: End of the extent (last physical block) ++ * ++ * The chain information is updated if the insertion is successful. ++ **/ ++int toi_add_to_extent_chain(struct hibernate_extent_chain *chain, ++ unsigned long start, unsigned long end) ++{ ++ struct hibernate_extent *new_ext = NULL, *cur_ext = NULL; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Adding extent %lu-%lu to chain %p.\n", start, end, chain); ++ ++ /* Find the right place in the chain */ ++ if (chain->last_touched && chain->last_touched->start < start) ++ cur_ext = chain->last_touched; ++ else if (chain->first && chain->first->start < start) ++ cur_ext = chain->first; ++ ++ if (cur_ext) { ++ while (cur_ext->next && cur_ext->next->start < start) ++ cur_ext = cur_ext->next; ++ ++ if (cur_ext->end == (start - 1)) { ++ struct hibernate_extent *next_ext = cur_ext->next; ++ cur_ext->end = end; ++ ++ /* Merge with the following one? */ ++ if (next_ext && cur_ext->end + 1 == next_ext->start) { ++ cur_ext->end = next_ext->end; ++ cur_ext->next = next_ext->next; ++ toi_kfree(2, next_ext, sizeof(*next_ext)); ++ chain->num_extents--; ++ } ++ ++ chain->last_touched = cur_ext; ++ chain->size += (end - start + 1); ++ ++ return 0; ++ } ++ } ++ ++ new_ext = toi_get_extent(); ++ if (!new_ext) { ++ printk(KERN_INFO "Error unable to append a new extent to the " ++ "chain.\n"); ++ return -ENOMEM; ++ } ++ ++ chain->num_extents++; ++ chain->size += (end - start + 1); ++ new_ext->start = start; ++ new_ext->end = end; ++ ++ chain->last_touched = new_ext; ++ ++ if (cur_ext) { ++ new_ext->next = cur_ext->next; ++ cur_ext->next = new_ext; ++ } else { ++ if (chain->first) ++ new_ext->next = chain->first; ++ chain->first = new_ext; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_add_to_extent_chain); +diff --git a/kernel/power/tuxonice_extent.h b/kernel/power/tuxonice_extent.h +new file mode 100644 +index 0000000..157446cf +--- /dev/null ++++ b/kernel/power/tuxonice_extent.h +@@ -0,0 +1,44 @@ ++/* ++ * kernel/power/tuxonice_extent.h ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains declarations related to extents. Extents are ++ * TuxOnIce's method of storing some of the metadata for the image. ++ * See tuxonice_extent.c for more info. 
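++ *
++ * Illustrative sketch of walking a chain with toi_extent_for_each()
++ * (walk_chain() and handle_block() are hypothetical helpers, not part of
++ * this patch):
++ *
++ *	static void walk_chain(struct hibernate_extent_chain *chain)
++ *	{
++ *		struct hibernate_extent *ext;
++ *		unsigned long block;
++ *
++ *		toi_extent_for_each(chain, ext, block)
++ *			handle_block(block);
++ *	}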
++ * ++ */ ++ ++#include "tuxonice_modules.h" ++ ++#ifndef EXTENT_H ++#define EXTENT_H ++ ++struct hibernate_extent { ++ unsigned long start, end; ++ struct hibernate_extent *next; ++}; ++ ++struct hibernate_extent_chain { ++ unsigned long size; /* size of the chain ie sum (max-min+1) */ ++ int num_extents; ++ struct hibernate_extent *first, *last_touched; ++ struct hibernate_extent *current_extent; ++ unsigned long current_offset; ++}; ++ ++/* Simplify iterating through all the values in an extent chain */ ++#define toi_extent_for_each(extent_chain, extentpointer, value) \ ++if ((extent_chain)->first) \ ++ for ((extentpointer) = (extent_chain)->first, (value) = \ ++ (extentpointer)->start; \ ++ ((extentpointer) && ((extentpointer)->next || (value) <= \ ++ (extentpointer)->end)); \ ++ (((value) == (extentpointer)->end) ? \ ++ ((extentpointer) = (extentpointer)->next, (value) = \ ++ ((extentpointer) ? (extentpointer)->start : 0)) : \ ++ (value)++)) ++ ++#endif +diff --git a/kernel/power/tuxonice_file.c b/kernel/power/tuxonice_file.c +new file mode 100644 +index 0000000..4b817c4 +--- /dev/null ++++ b/kernel/power/tuxonice_file.c +@@ -0,0 +1,497 @@ ++/* ++ * kernel/power/tuxonice_file.c ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file encapsulates functions for usage of a simple file as a ++ * backing store. It is based upon the swapallocator, and shares the ++ * same basic working. Here, though, we have nothing to do with ++ * swapspace, and only one device to worry about. ++ * ++ * The user can just ++ * ++ * echo TuxOnIce > /path/to/my_file ++ * ++ * dd if=/dev/zero bs=1M count= >> /path/to/my_file ++ * ++ * and ++ * ++ * echo /path/to/my_file > /sys/power/tuxonice/file/target ++ * ++ * then put what they find in /sys/power/tuxonice/resume ++ * as their resume= parameter in lilo.conf (and rerun lilo if using it). ++ * ++ * Having done this, they're ready to hibernate and resume. ++ * ++ * TODO: ++ * - File resizing. 
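++ *
++ * Note: writing the target path to /sys/power/tuxonice/file/target (the
++ * sysfs entry registered below) runs test_toi_file_target(), which probes
++ * the storage and, when it is usable, composes a suitable resume= string
++ * in resume_file (exposed as /sys/power/tuxonice/resume).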
++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_io.h" ++ ++#define target_is_normal_file() (S_ISREG(target_inode->i_mode)) ++ ++static struct toi_module_ops toi_fileops; ++ ++static struct file *target_file; ++static struct block_device *toi_file_target_bdev; ++static unsigned long pages_available, pages_allocated; ++static char toi_file_target[256]; ++static struct inode *target_inode; ++static int file_target_priority; ++static int used_devt; ++static int target_claim; ++static dev_t toi_file_dev_t; ++static int sig_page_index; ++ ++/* For test_toi_file_target */ ++static struct toi_bdev_info *file_chain; ++ ++static int has_contiguous_blocks(struct toi_bdev_info *dev_info, int page_num) ++{ ++ int j; ++ sector_t last = 0; ++ ++ for (j = 0; j < dev_info->blocks_per_page; j++) { ++ sector_t this = bmap(target_inode, ++ page_num * dev_info->blocks_per_page + j); ++ ++ if (!this || (last && (last + 1) != this)) ++ break; ++ ++ last = this; ++ } ++ ++ return j == dev_info->blocks_per_page; ++} ++ ++static unsigned long get_usable_pages(struct toi_bdev_info *dev_info) ++{ ++ unsigned long result = 0; ++ struct block_device *bdev = dev_info->bdev; ++ int i; ++ ++ switch (target_inode->i_mode & S_IFMT) { ++ case S_IFSOCK: ++ case S_IFCHR: ++ case S_IFIFO: /* Socket, Char, Fifo */ ++ return -1; ++ case S_IFREG: /* Regular file: current size - holes + free ++ space on part */ ++ for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT) ; i++) { ++ if (has_contiguous_blocks(dev_info, i)) ++ result++; ++ } ++ break; ++ case S_IFBLK: /* Block device */ ++ if (!bdev->bd_disk) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "bdev->bd_disk null."); ++ return 0; ++ } ++ ++ result = (bdev->bd_part ? ++ bdev->bd_part->nr_sects : ++ get_capacity(bdev->bd_disk)) >> (PAGE_SHIFT - 9); ++ } ++ ++ ++ return result; ++} ++ ++static int toi_file_register_storage(void) ++{ ++ struct toi_bdev_info *devinfo; ++ int result = 0; ++ struct fs_info *fs_info; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_file_register_storage."); ++ if (!strlen(toi_file_target)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Register file storage: " ++ "No target filename set."); ++ return 0; ++ } ++ ++ target_file = filp_open(toi_file_target, O_RDONLY|O_LARGEFILE, 0); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "filp_open %s returned %p.", ++ toi_file_target, target_file); ++ ++ if (IS_ERR(target_file) || !target_file) { ++ target_file = NULL; ++ toi_file_dev_t = name_to_dev_t(toi_file_target); ++ if (!toi_file_dev_t) { ++ struct kstat stat; ++ int error = vfs_stat(toi_file_target, &stat); ++ printk(KERN_INFO "Open file %s returned %p and " ++ "name_to_devt failed.\n", ++ toi_file_target, target_file); ++ if (error) { ++ printk(KERN_INFO "Stating the file also failed." 
++ " Nothing more we can do.\n"); ++ return 0; ++ } else ++ toi_file_dev_t = stat.rdev; ++ } ++ ++ toi_file_target_bdev = toi_open_by_devnum(toi_file_dev_t); ++ if (IS_ERR(toi_file_target_bdev)) { ++ printk(KERN_INFO "Got a dev_num (%lx) but failed to " ++ "open it.\n", ++ (unsigned long) toi_file_dev_t); ++ toi_file_target_bdev = NULL; ++ return 0; ++ } ++ used_devt = 1; ++ target_inode = toi_file_target_bdev->bd_inode; ++ } else ++ target_inode = target_file->f_mapping->host; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Succeeded in opening the target."); ++ if (S_ISLNK(target_inode->i_mode) || S_ISDIR(target_inode->i_mode) || ++ S_ISSOCK(target_inode->i_mode) || S_ISFIFO(target_inode->i_mode)) { ++ printk(KERN_INFO "File support works with regular files," ++ " character files and block devices.\n"); ++ /* Cleanup routine will undo the above */ ++ return 0; ++ } ++ ++ if (!used_devt) { ++ if (S_ISBLK(target_inode->i_mode)) { ++ toi_file_target_bdev = I_BDEV(target_inode); ++ if (!blkdev_get(toi_file_target_bdev, FMODE_WRITE | ++ FMODE_READ, NULL)) ++ target_claim = 1; ++ } else ++ toi_file_target_bdev = target_inode->i_sb->s_bdev; ++ if (!toi_file_target_bdev) { ++ printk(KERN_INFO "%s is not a valid file allocator " ++ "target.\n", toi_file_target); ++ return 0; ++ } ++ toi_file_dev_t = toi_file_target_bdev->bd_dev; ++ } ++ ++ devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), GFP_ATOMIC); ++ if (!devinfo) { ++ printk("Failed to allocate a toi_bdev_info struct for the file allocator.\n"); ++ return -ENOMEM; ++ } ++ ++ devinfo->bdev = toi_file_target_bdev; ++ devinfo->allocator = &toi_fileops; ++ devinfo->allocator_index = 0; ++ ++ fs_info = fs_info_from_block_dev(toi_file_target_bdev); ++ if (fs_info && !IS_ERR(fs_info)) { ++ memcpy(devinfo->uuid, &fs_info->uuid, 16); ++ free_fs_info(fs_info); ++ } else ++ result = (int) PTR_ERR(fs_info); ++ ++ /* Unlike swap code, only complain if fs_info_from_block_dev returned ++ * -ENOMEM. The 'file' might be a full partition, so might validly not ++ * have an identifiable type, UUID etc. ++ */ ++ if (result) ++ printk(KERN_DEBUG "Failed to get fs_info for file device (%d).\n", ++ result); ++ devinfo->dev_t = toi_file_dev_t; ++ devinfo->prio = file_target_priority; ++ devinfo->bmap_shift = target_inode->i_blkbits - 9; ++ devinfo->blocks_per_page = ++ (1 << (PAGE_SHIFT - target_inode->i_blkbits)); ++ sprintf(devinfo->name, "file %s", toi_file_target); ++ file_chain = devinfo; ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Dev_t is %lx. Prio is %d. Bmap " ++ "shift is %d. Blocks per page %d.", ++ devinfo->dev_t, devinfo->prio, devinfo->bmap_shift, ++ devinfo->blocks_per_page); ++ ++ /* Keep one aside for the signature */ ++ pages_available = get_usable_pages(devinfo) - 1; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering file storage, %lu " ++ "pages.", pages_available); ++ ++ toi_bio_ops.register_storage(devinfo); ++ return 0; ++} ++ ++static unsigned long toi_file_storage_available(void) ++{ ++ return pages_available; ++} ++ ++static int toi_file_allocate_storage(struct toi_bdev_info *chain, ++ unsigned long request) ++{ ++ unsigned long available = pages_available - pages_allocated; ++ unsigned long to_add = min(available, request); ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Pages available is %lu. Allocated " ++ "is %lu. 
Allocating %lu pages from file.", ++ pages_available, pages_allocated, to_add); ++ pages_allocated += to_add; ++ ++ return to_add; ++} ++ ++/** ++ * __populate_block_list - add an extent to the chain ++ * @min: Start of the extent (first physical block = sector) ++ * @max: End of the extent (last physical block = sector) ++ * ++ * If TOI_TEST_BIO is set, print a debug message, outputting the min and max ++ * fs block numbers. ++ **/ ++static int __populate_block_list(struct toi_bdev_info *chain, int min, int max) ++{ ++ if (test_action_state(TOI_TEST_BIO)) ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %d-%d.", ++ min << chain->bmap_shift, ++ ((max + 1) << chain->bmap_shift) - 1); ++ ++ return toi_add_to_extent_chain(&chain->blocks, min, max); ++} ++ ++static int get_main_pool_phys_params(struct toi_bdev_info *chain) ++{ ++ int i, extent_min = -1, extent_max = -1, result = 0, have_sig_page = 0; ++ unsigned long pages_mapped = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Getting file allocator blocks."); ++ ++ if (chain->blocks.first) ++ toi_put_extent_chain(&chain->blocks); ++ ++ if (!target_is_normal_file()) { ++ result = (pages_available > 0) ? ++ __populate_block_list(chain, chain->blocks_per_page, ++ (pages_allocated + 1) * ++ chain->blocks_per_page - 1) : 0; ++ return result; ++ } ++ ++ /* ++ * FIXME: We are assuming the first page is contiguous. Is that ++ * assumption always right? ++ */ ++ ++ for (i = 0; i < (target_inode->i_size >> PAGE_SHIFT); i++) { ++ sector_t new_sector; ++ ++ if (!has_contiguous_blocks(chain, i)) ++ continue; ++ ++ if (!have_sig_page) { ++ have_sig_page = 1; ++ sig_page_index = i; ++ continue; ++ } ++ ++ pages_mapped++; ++ ++ /* Ignore first page - it has the header */ ++ if (pages_mapped == 1) ++ continue; ++ ++ new_sector = bmap(target_inode, (i * chain->blocks_per_page)); ++ ++ /* ++ * I'd love to be able to fill in holes and resize ++ * files, but not yet... 
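++		 * Until then only fully-contiguous pages are used: each run
++		 * of contiguous pages is coalesced into a single extent via
++		 * __populate_block_list(), after the signature page
++		 * (sig_page_index) and the header page have been set aside.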
++		 */
++
++		if (new_sector == extent_max + 1)
++			extent_max += chain->blocks_per_page;
++		else {
++			if (extent_min > -1) {
++				result = __populate_block_list(chain,
++						extent_min, extent_max);
++				if (result)
++					return result;
++			}
++
++			extent_min = new_sector;
++			extent_max = extent_min +
++				chain->blocks_per_page - 1;
++		}
++
++		if (pages_mapped == pages_allocated)
++			break;
++	}
++
++	if (extent_min > -1) {
++		result = __populate_block_list(chain, extent_min, extent_max);
++		if (result)
++			return result;
++	}
++
++	return 0;
++}
++
++static void toi_file_free_storage(struct toi_bdev_info *chain)
++{
++	pages_allocated = 0;
++	file_chain = NULL;
++}
++
++/**
++ * toi_file_print_debug_stats - print debug info
++ * @buffer: Buffer to populate with data
++ * @size: Size of the buffer
++ **/
++static int toi_file_print_debug_stats(char *buffer, int size)
++{
++	int len = scnprintf(buffer, size, "- File Allocator active.\n");
++
++	len += scnprintf(buffer+len, size-len, " Storage available for "
++			"image: %lu pages.\n", pages_available);
++
++	return len;
++}
++
++static void toi_file_cleanup(int finishing_cycle)
++{
++	if (toi_file_target_bdev) {
++		if (target_claim) {
++			blkdev_put(toi_file_target_bdev, FMODE_WRITE | FMODE_READ);
++			target_claim = 0;
++		}
++
++		if (used_devt) {
++			blkdev_put(toi_file_target_bdev,
++					FMODE_READ | FMODE_NDELAY);
++			used_devt = 0;
++		}
++		toi_file_target_bdev = NULL;
++		target_inode = NULL;
++	}
++
++	if (target_file) {
++		filp_close(target_file, NULL);
++		target_file = NULL;
++	}
++
++	pages_available = 0;
++}
++
++/**
++ * test_toi_file_target - sysfs callback for /sys/power/tuxonice/file/target
++ *
++ * Test whether the target file is valid for hibernating.
++ **/
++static void test_toi_file_target(void)
++{
++	int result = toi_file_register_storage();
++	sector_t sector;
++	char buf[50];
++	struct fs_info *fs_info;
++
++	if (result || !file_chain)
++		return;
++
++	/* This doesn't mean we're in business. Is any storage available?
*/ ++ if (!pages_available) ++ goto out; ++ ++ toi_file_allocate_storage(file_chain, 1); ++ result = get_main_pool_phys_params(file_chain); ++ if (result) ++ goto out; ++ ++ ++ sector = bmap(target_inode, sig_page_index * ++ file_chain->blocks_per_page) << file_chain->bmap_shift; ++ ++ /* Use the uuid, or the dev_t if that fails */ ++ fs_info = fs_info_from_block_dev(toi_file_target_bdev); ++ if (!fs_info || IS_ERR(fs_info)) { ++ bdevname(toi_file_target_bdev, buf); ++ sprintf(resume_file, "/dev/%s:%llu", buf, ++ (unsigned long long) sector); ++ } else { ++ int i; ++ hex_dump_to_buffer(fs_info->uuid, 16, 32, 1, buf, 50, 0); ++ ++ /* Remove the spaces */ ++ for (i = 1; i < 16; i++) { ++ buf[2 * i] = buf[3 * i]; ++ buf[2 * i + 1] = buf[3 * i + 1]; ++ } ++ buf[32] = 0; ++ sprintf(resume_file, "UUID=%s:0x%llx", buf, ++ (unsigned long long) sector); ++ free_fs_info(fs_info); ++ } ++ ++ toi_attempt_to_parse_resume_device(0); ++out: ++ toi_file_free_storage(file_chain); ++ toi_bio_ops.free_storage(); ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_STRING("target", SYSFS_RW, toi_file_target, 256, ++ SYSFS_NEEDS_SM_FOR_WRITE, test_toi_file_target), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_fileops.enabled, 0, 1, 0, NULL), ++ SYSFS_INT("priority", SYSFS_RW, &file_target_priority, -4095, ++ 4096, 0, NULL), ++}; ++ ++static struct toi_bio_allocator_ops toi_bio_fileops = { ++ .register_storage = toi_file_register_storage, ++ .storage_available = toi_file_storage_available, ++ .allocate_storage = toi_file_allocate_storage, ++ .bmap = get_main_pool_phys_params, ++ .free_storage = toi_file_free_storage, ++}; ++ ++static struct toi_module_ops toi_fileops = { ++ .type = BIO_ALLOCATOR_MODULE, ++ .name = "file storage", ++ .directory = "file", ++ .module = THIS_MODULE, ++ .print_debug_info = toi_file_print_debug_stats, ++ .cleanup = toi_file_cleanup, ++ .bio_allocator_ops = &toi_bio_fileops, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++static __init int toi_file_load(void) ++{ ++ return toi_register_module(&toi_fileops); ++} ++ ++#ifdef MODULE ++static __exit void toi_file_unload(void) ++{ ++ toi_unregister_module(&toi_fileops); ++} ++ ++module_init(toi_file_load); ++module_exit(toi_file_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce FileAllocator"); ++#else ++late_initcall(toi_file_load); ++#endif +diff --git a/kernel/power/tuxonice_highlevel.c b/kernel/power/tuxonice_highlevel.c +new file mode 100644 +index 0000000..4e8f4b6 +--- /dev/null ++++ b/kernel/power/tuxonice_highlevel.c +@@ -0,0 +1,1343 @@ ++/* ++ * kernel/power/tuxonice_highlevel.c ++ */ ++/** \mainpage TuxOnIce. ++ * ++ * TuxOnIce provides support for saving and restoring an image of ++ * system memory to an arbitrary storage device, either on the local computer, ++ * or across some network. The support is entirely OS based, so TuxOnIce ++ * works without requiring BIOS, APM or ACPI support. The vast majority of the ++ * code is also architecture independant, so it should be very easy to port ++ * the code to new architectures. TuxOnIce includes support for SMP, 4G HighMem ++ * and preemption. Initramfses and initrds are also supported. 
++ * ++ * TuxOnIce uses a modular design, in which the method of storing the image is ++ * completely abstracted from the core code, as are transformations on the data ++ * such as compression and/or encryption (multiple 'modules' can be used to ++ * provide arbitrary combinations of functionality). The user interface is also ++ * modular, so that arbitrarily simple or complex interfaces can be used to ++ * provide anything from debugging information through to eye candy. ++ * ++ * \section Copyright ++ * ++ * TuxOnIce is released under the GPLv2. ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti
++ * Copyright (C) 1998,2001,2002 Pavel Machek
++ * Copyright (C) 2002-2003 Florent Chabaud
++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net)
++ * ++ * \section Credits ++ * ++ * Nigel would like to thank the following people for their work: ++ * ++ * Bernard Blackham
++ * Web page & Wiki administration, some coding. A person without whom ++ * TuxOnIce would not be where it is. ++ * ++ * Michael Frank
++ * Extensive testing and help with improving stability. I was constantly ++ * amazed by the quality and quantity of Michael's help. ++ * ++ * Pavel Machek
++ * Modifications, defectiveness pointing, being with Gabor at the very ++ * beginning, suspend to swap space, stop all tasks. Port to 2.4.18-ac and ++ * 2.5.17. Even though Pavel and I disagree on the direction suspend to ++ * disk should take, I appreciate the valuable work he did in helping Gabor ++ * get the concept working. ++ * ++ * ..and of course the myriads of TuxOnIce users who have helped diagnose ++ * and fix bugs, made suggestions on how to improve the code, proofread ++ * documentation, and donated time and money. ++ * ++ * Thanks also to corporate sponsors: ++ * ++ * Redhat.Sometime employer from May 2006 (my fault, not Redhat's!). ++ * ++ * Cyclades.com. Nigel's employers from Dec 2004 until May 2006, who ++ * allowed him to work on TuxOnIce and PM related issues on company time. ++ * ++ * LinuxFund.org. Sponsored Nigel's work on TuxOnIce for four months Oct ++ * 2003 to Jan 2004. ++ * ++ * LAC Linux. Donated P4 hardware that enabled development and ongoing ++ * maintenance of SMP and Highmem support. ++ * ++ * OSDL. Provided access to various hardware configurations, make ++ * occasional small donations to the project. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for get/set_fs & KERNEL_DS on i386 */ ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_atomic_copy.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_cluster.h" ++ ++/*! Pageset metadata. */ ++struct pagedir pagedir2 = {2}; ++EXPORT_SYMBOL_GPL(pagedir2); ++ ++static mm_segment_t oldfs; ++static DEFINE_MUTEX(tuxonice_in_use); ++static int block_dump_save; ++ ++/* Binary signature if an image is present */ ++char tuxonice_signature[9] = "\xed\xc3\x02\xe9\x98\x56\xe5\x0c"; ++EXPORT_SYMBOL_GPL(tuxonice_signature); ++ ++unsigned long boot_kernel_data_buffer; ++ ++static char *result_strings[] = { ++ "Hibernation was aborted", ++ "The user requested that we cancel the hibernation", ++ "No storage was available", ++ "Insufficient storage was available", ++ "Freezing filesystems and/or tasks failed", ++ "A pre-existing image was used", ++ "We would free memory, but image size limit doesn't allow this", ++ "Unable to free enough memory to hibernate", ++ "Unable to obtain the Power Management Semaphore", ++ "A device suspend/resume returned an error", ++ "A system device suspend/resume returned an error", ++ "The extra pages allowance is too small", ++ "We were unable to successfully prepare an image", ++ "TuxOnIce module initialisation failed", ++ "TuxOnIce module cleanup failed", ++ "I/O errors were encountered", ++ "Ran out of memory", ++ "An error was encountered while reading the image", ++ "Platform preparation failed", ++ "CPU Hotplugging failed", ++ "Architecture specific preparation failed", ++ "Pages needed resaving, but we were told to abort if this happens", ++ "We can't hibernate at the moment (invalid resume= or filewriter " ++ "target?)", ++ "A hibernation preparation notifier chain member cancelled the " ++ "hibernation", ++ "Pre-snapshot preparation failed", ++ "Pre-restore preparation failed", ++ "Failed to disable usermode helpers", ++ "Can't resume from alternate image", ++ "Header reservation too small", ++ "Device Power Management Preparation failed", 
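++	/*
++	 * Indexed by result-state number: get_toi_debug_info() below prints
++	 * result_strings[i] for each result state that is set, so the order
++	 * here is assumed to match the TOI result-state numbering.
++	 */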
++}; ++ ++/** ++ * toi_finish_anything - cleanup after doing anything ++ * @hibernate_or_resume: Whether finishing a cycle or attempt at ++ * resuming. ++ * ++ * This is our basic clean-up routine, matching start_anything below. We ++ * call cleanup routines, drop module references and restore process fs and ++ * cpus allowed masks, together with the global block_dump variable's value. ++ **/ ++void toi_finish_anything(int hibernate_or_resume) ++{ ++ toi_cleanup_modules(hibernate_or_resume); ++ toi_put_modules(); ++ if (hibernate_or_resume) { ++ block_dump = block_dump_save; ++ set_cpus_allowed_ptr(current, cpu_all_mask); ++ toi_alloc_print_debug_stats(); ++ atomic_inc(&snapshot_device_available); ++ unlock_system_sleep(); ++ } ++ ++ set_fs(oldfs); ++ mutex_unlock(&tuxonice_in_use); ++} ++ ++/** ++ * toi_start_anything - basic initialisation for TuxOnIce ++ * @toi_or_resume: Whether starting a cycle or attempt at resuming. ++ * ++ * Our basic initialisation routine. Take references on modules, use the ++ * kernel segment, recheck resume= if no active allocator is set, initialise ++ * modules, save and reset block_dump and ensure we're running on CPU0. ++ **/ ++int toi_start_anything(int hibernate_or_resume) ++{ ++ mutex_lock(&tuxonice_in_use); ++ ++ oldfs = get_fs(); ++ set_fs(KERNEL_DS); ++ ++ if (hibernate_or_resume) { ++ lock_system_sleep(); ++ ++ if (!atomic_add_unless(&snapshot_device_available, -1, 0)) ++ goto snapshotdevice_unavailable; ++ } ++ ++ if (hibernate_or_resume == SYSFS_HIBERNATE) ++ toi_print_modules(); ++ ++ if (toi_get_modules()) { ++ printk(KERN_INFO "TuxOnIce: Get modules failed!\n"); ++ goto prehibernate_err; ++ } ++ ++ if (hibernate_or_resume) { ++ block_dump_save = block_dump; ++ block_dump = 0; ++ set_cpus_allowed_ptr(current, ++ cpumask_of(cpumask_first(cpu_online_mask))); ++ } ++ ++ if (toi_initialise_modules_early(hibernate_or_resume)) ++ goto early_init_err; ++ ++ if (!toiActiveAllocator) ++ toi_attempt_to_parse_resume_device(!hibernate_or_resume); ++ ++ if (!toi_initialise_modules_late(hibernate_or_resume)) ++ return 0; ++ ++ toi_cleanup_modules(hibernate_or_resume); ++early_init_err: ++ if (hibernate_or_resume) { ++ block_dump_save = block_dump; ++ set_cpus_allowed_ptr(current, cpu_all_mask); ++ } ++ toi_put_modules(); ++prehibernate_err: ++ if (hibernate_or_resume) ++ atomic_inc(&snapshot_device_available); ++snapshotdevice_unavailable: ++ if (hibernate_or_resume) ++ mutex_unlock(&pm_mutex); ++ set_fs(oldfs); ++ mutex_unlock(&tuxonice_in_use); ++ return -EBUSY; ++} ++ ++/* ++ * Nosave page tracking. ++ * ++ * Here rather than in prepare_image because we want to do it once only at the ++ * start of a cycle. ++ */ ++ ++/** ++ * mark_nosave_pages - set up our Nosave bitmap ++ * ++ * Build a bitmap of Nosave pages from the list. The bitmap allows faster ++ * use when preparing the image. 
++ **/ ++static void mark_nosave_pages(void) ++{ ++ struct nosave_region *region; ++ ++ list_for_each_entry(region, &nosave_regions, list) { ++ unsigned long pfn; ++ ++ for (pfn = region->start_pfn; pfn < region->end_pfn; pfn++) ++ if (pfn_valid(pfn)) ++ SetPageNosave(pfn_to_page(pfn)); ++ } ++} ++ ++static int toi_alloc_bitmap(struct memory_bitmap **bm) ++{ ++ int result = 0; ++ ++ *bm = kzalloc(sizeof(struct memory_bitmap), GFP_KERNEL); ++ if (!*bm) { ++ printk(KERN_ERR "Failed to kzalloc memory for a bitmap.\n"); ++ return -ENOMEM; ++ } ++ ++ result = memory_bm_create(*bm, GFP_KERNEL, 0); ++ ++ if (result) { ++ printk(KERN_ERR "Failed to create a bitmap.\n"); ++ kfree(*bm); ++ *bm = NULL; ++ } ++ ++ return result; ++} ++ ++/** ++ * allocate_bitmaps - allocate bitmaps used to record page states ++ * ++ * Allocate the bitmaps we use to record the various TuxOnIce related ++ * page states. ++ **/ ++static int allocate_bitmaps(void) ++{ ++ if (toi_alloc_bitmap(&pageset1_map) || ++ toi_alloc_bitmap(&pageset1_copy_map) || ++ toi_alloc_bitmap(&pageset2_map) || ++ toi_alloc_bitmap(&io_map) || ++ toi_alloc_bitmap(&nosave_map) || ++ toi_alloc_bitmap(&free_map) || ++ toi_alloc_bitmap(&page_resave_map)) ++ return 1; ++ ++ return 0; ++} ++ ++static void toi_free_bitmap(struct memory_bitmap **bm) ++{ ++ if (!*bm) ++ return; ++ ++ memory_bm_free(*bm, 0); ++ kfree(*bm); ++ *bm = NULL; ++} ++ ++/** ++ * free_bitmaps - free the bitmaps used to record page states ++ * ++ * Free the bitmaps allocated above. It is not an error to call ++ * memory_bm_free on a bitmap that isn't currently allocated. ++ **/ ++static void free_bitmaps(void) ++{ ++ toi_free_bitmap(&pageset1_map); ++ toi_free_bitmap(&pageset1_copy_map); ++ toi_free_bitmap(&pageset2_map); ++ toi_free_bitmap(&io_map); ++ toi_free_bitmap(&nosave_map); ++ toi_free_bitmap(&free_map); ++ toi_free_bitmap(&page_resave_map); ++} ++ ++/** ++ * io_MB_per_second - return the number of MB/s read or written ++ * @write: Whether to return the speed at which we wrote. ++ * ++ * Calculate the number of megabytes per second that were read or written. ++ **/ ++static int io_MB_per_second(int write) ++{ ++ return (toi_bkd.toi_io_time[write][1]) ? ++ MB((unsigned long) toi_bkd.toi_io_time[write][0]) * HZ / ++ toi_bkd.toi_io_time[write][1] : 0; ++} ++ ++#define SNPRINTF(a...) do { len += scnprintf(((char *) buffer) + len, \ ++ count - len - 1, ## a); } while (0) ++ ++/** ++ * get_debug_info - fill a buffer with debugging information ++ * @buffer: The buffer to be filled. ++ * @count: The size of the buffer, in bytes. ++ * ++ * Fill a (usually PAGE_SIZEd) buffer with the debugging info that we will ++ * either printk or return via sysfs. ++ **/ ++static int get_toi_debug_info(const char *buffer, int count) ++{ ++ int len = 0, i, first_result = 1; ++ ++ SNPRINTF("TuxOnIce debugging info:\n"); ++ SNPRINTF("- TuxOnIce core : " TOI_CORE_VERSION "\n"); ++ SNPRINTF("- Kernel Version : " UTS_RELEASE "\n"); ++ SNPRINTF("- Compiler vers. 
: %d.%d\n", __GNUC__, __GNUC_MINOR__); ++ SNPRINTF("- Attempt number : %d\n", nr_hibernates); ++ SNPRINTF("- Parameters : %ld %ld %ld %d %ld %ld\n", ++ toi_result, ++ toi_bkd.toi_action, ++ toi_bkd.toi_debug_state, ++ toi_bkd.toi_default_console_level, ++ image_size_limit, ++ toi_poweroff_method); ++ SNPRINTF("- Overall expected compression percentage: %d.\n", ++ 100 - toi_expected_compression_ratio()); ++ len += toi_print_module_debug_info(((char *) buffer) + len, ++ count - len - 1); ++ if (toi_bkd.toi_io_time[0][1]) { ++ if ((io_MB_per_second(0) < 5) || (io_MB_per_second(1) < 5)) { ++ SNPRINTF("- I/O speed: Write %ld KB/s", ++ (KB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / ++ toi_bkd.toi_io_time[0][1])); ++ if (toi_bkd.toi_io_time[1][1]) ++ SNPRINTF(", Read %ld KB/s", ++ (KB((unsigned long) ++ toi_bkd.toi_io_time[1][0]) * HZ / ++ toi_bkd.toi_io_time[1][1])); ++ } else { ++ SNPRINTF("- I/O speed: Write %ld MB/s", ++ (MB((unsigned long) toi_bkd.toi_io_time[0][0]) * HZ / ++ toi_bkd.toi_io_time[0][1])); ++ if (toi_bkd.toi_io_time[1][1]) ++ SNPRINTF(", Read %ld MB/s", ++ (MB((unsigned long) ++ toi_bkd.toi_io_time[1][0]) * HZ / ++ toi_bkd.toi_io_time[1][1])); ++ } ++ SNPRINTF(".\n"); ++ } else ++ SNPRINTF("- No I/O speed stats available.\n"); ++ SNPRINTF("- Extra pages : %lu used/%lu.\n", ++ extra_pd1_pages_used, extra_pd1_pages_allowance); ++ ++ for (i = 0; i < TOI_NUM_RESULT_STATES; i++) ++ if (test_result_state(i)) { ++ SNPRINTF("%s: %s.\n", first_result ? ++ "- Result " : ++ " ", ++ result_strings[i]); ++ first_result = 0; ++ } ++ if (first_result) ++ SNPRINTF("- Result : %s.\n", nr_hibernates ? ++ "Succeeded" : ++ "No hibernation attempts so far"); ++ return len; ++} ++ ++/** ++ * do_cleanup - cleanup after attempting to hibernate or resume ++ * @get_debug_info: Whether to allocate and return debugging info. ++ * ++ * Cleanup after attempting to hibernate or resume, possibly getting ++ * debugging info as we do so. ++ **/ ++static void do_cleanup(int get_debug_info, int restarting) ++{ ++ int i = 0; ++ char *buffer = NULL; ++ ++ trap_non_toi_io = 0; ++ ++ if (get_debug_info) ++ toi_prepare_status(DONT_CLEAR_BAR, "Cleaning up..."); ++ ++ free_checksum_pages(); ++ ++ if (get_debug_info) ++ buffer = (char *) toi_get_zeroed_page(20, TOI_ATOMIC_GFP); ++ ++ if (buffer) ++ i = get_toi_debug_info(buffer, PAGE_SIZE); ++ ++ toi_free_extra_pagedir_memory(); ++ ++ pagedir1.size = 0; ++ pagedir2.size = 0; ++ set_highmem_size(pagedir1, 0); ++ set_highmem_size(pagedir2, 0); ++ ++ if (boot_kernel_data_buffer) { ++ if (!test_toi_state(TOI_BOOT_KERNEL)) ++ toi_free_page(37, boot_kernel_data_buffer); ++ boot_kernel_data_buffer = 0; ++ } ++ ++ clear_toi_state(TOI_BOOT_KERNEL); ++ thaw_processes(); ++ ++ if (!restarting) ++ toi_stop_other_threads(); ++ ++ if (test_action_state(TOI_KEEP_IMAGE) && ++ !test_result_state(TOI_ABORTED)) { ++ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, ++ "TuxOnIce: Not invalidating the image due " ++ "to Keep Image being enabled."); ++ set_result_state(TOI_KEPT_IMAGE); ++ } else ++ if (toiActiveAllocator) ++ toiActiveAllocator->remove_image(); ++ ++ free_bitmaps(); ++ usermodehelper_enable(); ++ ++ if (test_toi_state(TOI_NOTIFIERS_PREPARE)) { ++ pm_notifier_call_chain(PM_POST_HIBERNATION); ++ clear_toi_state(TOI_NOTIFIERS_PREPARE); ++ } ++ ++ if (buffer && i) { ++ /* Printk can only handle 1023 bytes, including ++ * its level mangling. 
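++		 * The debug buffer is a full page, so it is emitted in
++		 * 1023-byte chunks below.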
*/ ++ for (i = 0; i < 3; i++) ++ printk(KERN_ERR "%s", buffer + (1023 * i)); ++ toi_free_page(20, (unsigned long) buffer); ++ } ++ ++ if (!test_action_state(TOI_LATE_CPU_HOTPLUG)) ++ enable_nonboot_cpus(); ++ ++ if (!restarting) ++ toi_cleanup_console(); ++ ++ free_attention_list(); ++ ++ if (!restarting) ++ toi_deactivate_storage(0); ++ ++ clear_toi_state(TOI_IGNORE_LOGLEVEL); ++ clear_toi_state(TOI_TRYING_TO_RESUME); ++ clear_toi_state(TOI_NOW_RESUMING); ++} ++ ++/** ++ * check_still_keeping_image - we kept an image; check whether to reuse it. ++ * ++ * We enter this routine when we have kept an image. If the user has said they ++ * want to still keep it, all we need to do is powerdown. If powering down ++ * means hibernating to ram and the power doesn't run out, we'll return 1. ++ * If we do power off properly or the battery runs out, we'll resume via the ++ * normal paths. ++ * ++ * If the user has said they want to remove the previously kept image, we ++ * remove it, and return 0. We'll then store a new image. ++ **/ ++static int check_still_keeping_image(void) ++{ ++ if (test_action_state(TOI_KEEP_IMAGE)) { ++ printk(KERN_INFO "Image already stored: powering down " ++ "immediately."); ++ do_toi_step(STEP_HIBERNATE_POWERDOWN); ++ return 1; /* Just in case we're using S3 */ ++ } ++ ++ printk(KERN_INFO "Invalidating previous image.\n"); ++ toiActiveAllocator->remove_image(); ++ ++ return 0; ++} ++ ++/** ++ * toi_init - prepare to hibernate to disk ++ * ++ * Initialise variables & data structures, in preparation for ++ * hibernating to disk. ++ **/ ++static int toi_init(int restarting) ++{ ++ int result, i, j; ++ ++ toi_result = 0; ++ ++ printk(KERN_INFO "Initiating a hibernation cycle.\n"); ++ ++ nr_hibernates++; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 2; j++) ++ toi_bkd.toi_io_time[i][j] = 0; ++ ++ if (!test_toi_state(TOI_CAN_HIBERNATE) || ++ allocate_bitmaps()) ++ return 1; ++ ++ mark_nosave_pages(); ++ ++ if (!restarting) ++ toi_prepare_console(); ++ ++ result = pm_notifier_call_chain(PM_HIBERNATION_PREPARE); ++ if (result) { ++ set_result_state(TOI_NOTIFIERS_PREPARE_FAILED); ++ return 1; ++ } ++ set_toi_state(TOI_NOTIFIERS_PREPARE); ++ ++ if (!restarting) { ++ printk(KERN_ERR "Starting other threads."); ++ toi_start_other_threads(); ++ } ++ ++ result = usermodehelper_disable(); ++ if (result) { ++ printk(KERN_ERR "TuxOnIce: Failed to disable usermode " ++ "helpers\n"); ++ set_result_state(TOI_USERMODE_HELPERS_ERR); ++ return 1; ++ } ++ ++ boot_kernel_data_buffer = toi_get_zeroed_page(37, TOI_ATOMIC_GFP); ++ if (!boot_kernel_data_buffer) { ++ printk(KERN_ERR "TuxOnIce: Failed to allocate " ++ "boot_kernel_data_buffer.\n"); ++ set_result_state(TOI_OUT_OF_MEMORY); ++ return 1; ++ } ++ ++ if (!test_action_state(TOI_LATE_CPU_HOTPLUG) && ++ disable_nonboot_cpus()) { ++ set_abort_result(TOI_CPU_HOTPLUG_FAILED); ++ return 1; ++ } ++ ++ return 0; ++} ++ ++/** ++ * can_hibernate - perform basic 'Can we hibernate?' tests ++ * ++ * Perform basic tests that must pass if we're going to be able to hibernate: ++ * Can we get the pm_mutex? Is resume= valid (we need to know where to write ++ * the image header). ++ **/ ++static int can_hibernate(void) ++{ ++ if (!test_toi_state(TOI_CAN_HIBERNATE)) ++ toi_attempt_to_parse_resume_device(0); ++ ++ if (!test_toi_state(TOI_CAN_HIBERNATE)) { ++ printk(KERN_INFO "TuxOnIce: Hibernation is disabled.\n" ++ "This may be because you haven't put something along " ++ "the lines of\n\nresume=swap:/dev/hda1\n\n" ++ "in lilo.conf or equivalent. 
(Where /dev/hda1 is your " ++ "swap partition).\n"); ++ set_abort_result(TOI_CANT_SUSPEND); ++ return 0; ++ } ++ ++ if (strlen(alt_resume_param)) { ++ attempt_to_parse_alt_resume_param(); ++ ++ if (!strlen(alt_resume_param)) { ++ printk(KERN_INFO "Alternate resume parameter now " ++ "invalid. Aborting.\n"); ++ set_abort_result(TOI_CANT_USE_ALT_RESUME); ++ return 0; ++ } ++ } ++ ++ return 1; ++} ++ ++/** ++ * do_post_image_write - having written an image, figure out what to do next ++ * ++ * After writing an image, we might load an alternate image or power down. ++ * Powering down might involve hibernating to ram, in which case we also ++ * need to handle reloading pageset2. ++ **/ ++static int do_post_image_write(void) ++{ ++ /* If switching images fails, do normal powerdown */ ++ if (alt_resume_param[0]) ++ do_toi_step(STEP_RESUME_ALT_IMAGE); ++ ++ toi_power_down(); ++ ++ barrier(); ++ mb(); ++ return 0; ++} ++ ++/** ++ * __save_image - do the hard work of saving the image ++ * ++ * High level routine for getting the image saved. The key assumptions made ++ * are that processes have been frozen and sufficient memory is available. ++ * ++ * We also exit through here at resume time, coming back from toi_hibernate ++ * after the atomic restore. This is the reason for the toi_in_hibernate ++ * test. ++ **/ ++static int __save_image(void) ++{ ++ int temp_result, did_copy = 0; ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Starting to save the image.."); ++ ++ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, ++ " - Final values: %d and %d.", ++ pagedir1.size, pagedir2.size); ++ ++ toi_cond_pause(1, "About to write pagedir2."); ++ ++ temp_result = write_pageset(&pagedir2); ++ ++ if (temp_result == -1 || test_result_state(TOI_ABORTED)) ++ return 1; ++ ++ toi_cond_pause(1, "About to copy pageset 1."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ return 1; ++ ++ toi_deactivate_storage(1); ++ ++ toi_prepare_status(DONT_CLEAR_BAR, "Doing atomic copy/restore."); ++ ++ toi_in_hibernate = 1; ++ ++ if (toi_go_atomic(PMSG_FREEZE, 1)) ++ goto Failed; ++ ++ temp_result = toi_hibernate(); ++ ++#ifdef CONFIG_KGDB ++ if (test_action_state(TOI_POST_RESUME_BREAKPOINT)) ++ kgdb_breakpoint(); ++#endif ++ ++ if (!temp_result) ++ did_copy = 1; ++ ++ /* We return here at resume time too! */ ++ toi_end_atomic(ATOMIC_ALL_STEPS, toi_in_hibernate, temp_result); ++ ++Failed: ++ if (toi_activate_storage(1)) ++ panic("Failed to reactivate our storage."); ++ ++ /* Resume time? */ ++ if (!toi_in_hibernate) { ++ copyback_post(); ++ return 0; ++ } ++ ++ /* Nope. Hibernating. So, see if we can save the image... */ ++ ++ if (temp_result || test_result_state(TOI_ABORTED)) { ++ if (did_copy) ++ goto abort_reloading_pagedir_two; ++ else ++ return 1; ++ } ++ ++ toi_update_status(pagedir2.size, pagedir1.size + pagedir2.size, ++ NULL); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto abort_reloading_pagedir_two; ++ ++ toi_cond_pause(1, "About to write pageset1."); ++ ++ toi_message(TOI_ANY_SECTION, TOI_LOW, 1, "-- Writing pageset1"); ++ ++ temp_result = write_pageset(&pagedir1); ++ ++ /* We didn't overwrite any memory, so no reread needs to be done. 
*/ ++ if (test_action_state(TOI_TEST_FILTER_SPEED) || ++ test_action_state(TOI_TEST_BIO)) ++ return 1; ++ ++ if (temp_result == 1 || test_result_state(TOI_ABORTED)) ++ goto abort_reloading_pagedir_two; ++ ++ toi_cond_pause(1, "About to write header."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto abort_reloading_pagedir_two; ++ ++ temp_result = write_image_header(); ++ ++ if (!temp_result && !test_result_state(TOI_ABORTED)) ++ return 0; ++ ++abort_reloading_pagedir_two: ++ temp_result = read_pageset2(1); ++ ++ /* If that failed, we're sunk. Panic! */ ++ if (temp_result) ++ panic("Attempt to reload pagedir 2 while aborting " ++ "a hibernate failed."); ++ ++ return 1; ++} ++ ++static void map_ps2_pages(int enable) ++{ ++ unsigned long pfn = 0; ++ ++ pfn = memory_bm_next_pfn(pageset2_map); ++ ++ while (pfn != BM_END_OF_MAP) { ++ struct page *page = pfn_to_page(pfn); ++ kernel_map_pages(page, 1, enable); ++ pfn = memory_bm_next_pfn(pageset2_map); ++ } ++} ++ ++/** ++ * do_save_image - save the image and handle the result ++ * ++ * Save the prepared image. If we fail or we're in the path returning ++ * from the atomic restore, cleanup. ++ **/ ++static int do_save_image(void) ++{ ++ int result; ++ map_ps2_pages(0); ++ result = __save_image(); ++ map_ps2_pages(1); ++ return result; ++} ++ ++/** ++ * do_prepare_image - try to prepare an image ++ * ++ * Seek to initialise and prepare an image to be saved. On failure, ++ * cleanup. ++ **/ ++static int do_prepare_image(void) ++{ ++ int restarting = test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL); ++ ++ if (!restarting && toi_activate_storage(0)) ++ return 1; ++ ++ /* ++ * If kept image and still keeping image and hibernating to RAM, we will ++ * return 1 after hibernating and resuming (provided the power doesn't ++ * run out. In that case, we skip directly to cleaning up and exiting. ++ */ ++ ++ if (!can_hibernate() || ++ (test_result_state(TOI_KEPT_IMAGE) && ++ check_still_keeping_image())) ++ return 1; ++ ++ if (toi_init(restarting) || toi_prepare_image() || ++ test_result_state(TOI_ABORTED)) ++ return 1; ++ ++ trap_non_toi_io = 1; ++ ++ return 0; ++} ++ ++/** ++ * do_check_can_resume - find out whether an image has been stored ++ * ++ * Read whether an image exists. We use the same routine as the ++ * image_exists sysfs entry, and just look to see whether the ++ * first character in the resulting buffer is a '1'. ++ **/ ++int do_check_can_resume(void) ++{ ++ int result = -1; ++ ++ if (toi_activate_storage(0)) ++ return -1; ++ ++ if (!test_toi_state(TOI_RESUME_DEVICE_OK)) ++ toi_attempt_to_parse_resume_device(1); ++ ++ if (toiActiveAllocator) ++ result = toiActiveAllocator->image_exists(1); ++ ++ toi_deactivate_storage(0); ++ return result; ++} ++EXPORT_SYMBOL_GPL(do_check_can_resume); ++ ++/** ++ * do_load_atomic_copy - load the first part of an image, if it exists ++ * ++ * Check whether we have an image. If one exists, do sanity checking ++ * (possibly invalidating the image or even rebooting if the user ++ * requests that) before loading it into memory in preparation for the ++ * atomic restore. ++ * ++ * If and only if we have an image loaded and ready to restore, we return 1. ++ **/ ++static int do_load_atomic_copy(void) ++{ ++ int read_image_result = 0; ++ ++ if (sizeof(swp_entry_t) != sizeof(long)) { ++ printk(KERN_WARNING "TuxOnIce: The size of swp_entry_t != size" ++ " of long. 
Please report this!\n"); ++ return 1; ++ } ++ ++ if (!resume_file[0]) ++ printk(KERN_WARNING "TuxOnIce: " ++ "You need to use a resume= command line parameter to " ++ "tell TuxOnIce where to look for an image.\n"); ++ ++ toi_activate_storage(0); ++ ++ if (!(test_toi_state(TOI_RESUME_DEVICE_OK)) && ++ !toi_attempt_to_parse_resume_device(0)) { ++ /* ++ * Without a usable storage device we can do nothing - ++ * even if noresume is given ++ */ ++ ++ if (!toiNumAllocators) ++ printk(KERN_ALERT "TuxOnIce: " ++ "No storage allocators have been registered.\n"); ++ else ++ printk(KERN_ALERT "TuxOnIce: " ++ "Missing or invalid storage location " ++ "(resume= parameter). Please correct and " ++ "rerun lilo (or equivalent) before " ++ "hibernating.\n"); ++ toi_deactivate_storage(0); ++ return 1; ++ } ++ ++ if (allocate_bitmaps()) ++ return 1; ++ ++ read_image_result = read_pageset1(); /* non fatal error ignored */ ++ ++ if (test_toi_state(TOI_NORESUME_SPECIFIED)) ++ clear_toi_state(TOI_NORESUME_SPECIFIED); ++ ++ toi_deactivate_storage(0); ++ ++ if (read_image_result) ++ return 1; ++ ++ return 0; ++} ++ ++/** ++ * prepare_restore_load_alt_image - save & restore alt image variables ++ * ++ * Save and restore the pageset1 maps, when loading an alternate image. ++ **/ ++static void prepare_restore_load_alt_image(int prepare) ++{ ++ static struct memory_bitmap *pageset1_map_save, *pageset1_copy_map_save; ++ ++ if (prepare) { ++ pageset1_map_save = pageset1_map; ++ pageset1_map = NULL; ++ pageset1_copy_map_save = pageset1_copy_map; ++ pageset1_copy_map = NULL; ++ set_toi_state(TOI_LOADING_ALT_IMAGE); ++ toi_reset_alt_image_pageset2_pfn(); ++ } else { ++ memory_bm_free(pageset1_map, 0); ++ pageset1_map = pageset1_map_save; ++ memory_bm_free(pageset1_copy_map, 0); ++ pageset1_copy_map = pageset1_copy_map_save; ++ clear_toi_state(TOI_NOW_RESUMING); ++ clear_toi_state(TOI_LOADING_ALT_IMAGE); ++ } ++} ++ ++/** ++ * do_toi_step - perform a step in hibernating or resuming ++ * ++ * Perform a step in hibernating or resuming an image. This abstraction ++ * is in preparation for implementing cluster support, and perhaps replacing ++ * uswsusp too (haven't looked whether that's possible yet). ++ **/ ++int do_toi_step(int step) ++{ ++ switch (step) { ++ case STEP_HIBERNATE_PREPARE_IMAGE: ++ return do_prepare_image(); ++ case STEP_HIBERNATE_SAVE_IMAGE: ++ return do_save_image(); ++ case STEP_HIBERNATE_POWERDOWN: ++ return do_post_image_write(); ++ case STEP_RESUME_CAN_RESUME: ++ return do_check_can_resume(); ++ case STEP_RESUME_LOAD_PS1: ++ return do_load_atomic_copy(); ++ case STEP_RESUME_DO_RESTORE: ++ /* ++ * If we succeed, this doesn't return. ++ * Instead, we return from do_save_image() in the ++ * hibernated kernel. 
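++		 * (toi_atomic_restore() only returns here on failure.)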
++ */ ++ return toi_atomic_restore(); ++ case STEP_RESUME_ALT_IMAGE: ++ printk(KERN_INFO "Trying to resume alternate image.\n"); ++ toi_in_hibernate = 0; ++ save_restore_alt_param(SAVE, NOQUIET); ++ prepare_restore_load_alt_image(1); ++ if (!do_check_can_resume()) { ++ printk(KERN_INFO "Nothing to resume from.\n"); ++ goto out; ++ } ++ if (!do_load_atomic_copy()) ++ toi_atomic_restore(); ++ ++ printk(KERN_INFO "Failed to load image.\n"); ++out: ++ prepare_restore_load_alt_image(0); ++ save_restore_alt_param(RESTORE, NOQUIET); ++ break; ++ case STEP_CLEANUP: ++ do_cleanup(1, 0); ++ break; ++ case STEP_QUIET_CLEANUP: ++ do_cleanup(0, 0); ++ break; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(do_toi_step); ++ ++/* -- Functions for kickstarting a hibernate or resume --- */ ++ ++/** ++ * toi_try_resume - try to do the steps in resuming ++ * ++ * Check if we have an image and if so try to resume. Clear the status ++ * flags too. ++ **/ ++void toi_try_resume(void) ++{ ++ set_toi_state(TOI_TRYING_TO_RESUME); ++ resume_attempted = 1; ++ ++ current->flags |= PF_MEMALLOC; ++ toi_start_other_threads(); ++ ++ if (do_toi_step(STEP_RESUME_CAN_RESUME) && ++ !do_toi_step(STEP_RESUME_LOAD_PS1)) ++ do_toi_step(STEP_RESUME_DO_RESTORE); ++ ++ toi_stop_other_threads(); ++ do_cleanup(0, 0); ++ ++ current->flags &= ~PF_MEMALLOC; ++ ++ clear_toi_state(TOI_IGNORE_LOGLEVEL); ++ clear_toi_state(TOI_TRYING_TO_RESUME); ++ clear_toi_state(TOI_NOW_RESUMING); ++} ++ ++/** ++ * toi_sys_power_disk_try_resume - wrapper calling toi_try_resume ++ * ++ * Wrapper for when __toi_try_resume is called from swsusp resume path, ++ * rather than from echo > /sys/power/tuxonice/do_resume. ++ **/ ++static void toi_sys_power_disk_try_resume(void) ++{ ++ resume_attempted = 1; ++ ++ /* ++ * There's a comment in kernel/power/disk.c that indicates ++ * we should be able to use mutex_lock_nested below. That ++ * doesn't seem to cut it, though, so let's just turn lockdep ++ * off for now. ++ */ ++ lockdep_off(); ++ ++ if (toi_start_anything(SYSFS_RESUMING)) ++ goto out; ++ ++ toi_try_resume(); ++ ++ /* ++ * For initramfs, we have to clear the boot time ++ * flag after trying to resume ++ */ ++ clear_toi_state(TOI_BOOT_TIME); ++ ++ toi_finish_anything(SYSFS_RESUMING); ++out: ++ lockdep_on(); ++} ++ ++/** ++ * toi_try_hibernate - try to start a hibernation cycle ++ * ++ * Start a hibernation cycle, coming in from either ++ * echo > /sys/power/tuxonice/do_suspend ++ * ++ * or ++ * ++ * echo disk > /sys/power/state ++ * ++ * In the later case, we come in without pm_sem taken; in the ++ * former, it has been taken. 
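++ * The two paths are told apart below by checking whether tuxonice_in_use
++ * is already held.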
++ **/ ++int toi_try_hibernate(void) ++{ ++ int result = 0, sys_power_disk = 0, retries = 0; ++ ++ if (!mutex_is_locked(&tuxonice_in_use)) { ++ /* Came in via /sys/power/disk */ ++ if (toi_start_anything(SYSFS_HIBERNATING)) ++ return -EBUSY; ++ sys_power_disk = 1; ++ } ++ ++ current->flags |= PF_MEMALLOC; ++ ++ if (test_toi_state(TOI_CLUSTER_MODE)) { ++ toi_initiate_cluster_hibernate(); ++ goto out; ++ } ++ ++prepare: ++ result = do_toi_step(STEP_HIBERNATE_PREPARE_IMAGE); ++ ++ if (result) ++ goto out; ++ ++ if (test_action_state(TOI_FREEZER_TEST)) ++ goto out_restore_gfp_mask; ++ ++ result = do_toi_step(STEP_HIBERNATE_SAVE_IMAGE); ++ ++ if (test_result_state(TOI_EXTRA_PAGES_ALLOW_TOO_SMALL)) { ++ if (retries < 2) { ++ do_cleanup(0, 1); ++ retries++; ++ clear_result_state(TOI_ABORTED); ++ extra_pd1_pages_allowance = extra_pd1_pages_used + 500; ++ printk(KERN_INFO "Automatically adjusting the extra" ++ " pages allowance to %ld and restarting.\n", ++ extra_pd1_pages_allowance); ++ pm_restore_gfp_mask(); ++ goto prepare; ++ } ++ ++ printk(KERN_INFO "Adjusted extra pages allowance twice and " ++ "still couldn't hibernate successfully. Giving up."); ++ } ++ ++ /* This code runs at resume time too! */ ++ if (!result && toi_in_hibernate) ++ result = do_toi_step(STEP_HIBERNATE_POWERDOWN); ++ ++out_restore_gfp_mask: ++ pm_restore_gfp_mask(); ++out: ++ do_cleanup(1, 0); ++ current->flags &= ~PF_MEMALLOC; ++ ++ if (sys_power_disk) ++ toi_finish_anything(SYSFS_HIBERNATING); ++ ++ return result; ++} ++ ++/* ++ * channel_no: If !0, -c is added to args (userui). ++ */ ++int toi_launch_userspace_program(char *command, int channel_no, ++ int wait, int debug) ++{ ++ int retval; ++ static char *envp[] = { ++ "HOME=/", ++ "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", ++ NULL }; ++ static char *argv[] = { NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL ++ }; ++ char *channel = NULL; ++ int arg = 0, size; ++ char test_read[255]; ++ char *orig_posn = command; ++ ++ if (!strlen(orig_posn)) ++ return 1; ++ ++ if (channel_no) { ++ channel = toi_kzalloc(4, 6, GFP_KERNEL); ++ if (!channel) { ++ printk(KERN_INFO "Failed to allocate memory in " ++ "preparing to launch userspace program.\n"); ++ return 1; ++ } ++ } ++ ++ /* Up to 6 args supported */ ++ while (arg < 6) { ++ sscanf(orig_posn, "%s", test_read); ++ size = strlen(test_read); ++ if (!(size)) ++ break; ++ argv[arg] = toi_kzalloc(5, size + 1, TOI_ATOMIC_GFP); ++ strcpy(argv[arg], test_read); ++ orig_posn += size + 1; ++ *test_read = 0; ++ arg++; ++ } ++ ++ if (channel_no) { ++ sprintf(channel, "-c%d", channel_no); ++ argv[arg] = channel; ++ } else ++ arg--; ++ ++ if (debug) { ++ argv[++arg] = toi_kzalloc(5, 8, TOI_ATOMIC_GFP); ++ strcpy(argv[arg], "--debug"); ++ } ++ ++ retval = call_usermodehelper(argv[0], argv, envp, wait); ++ ++ /* ++ * If the program reports an error, retval = 256. Don't complain ++ * about that here. ++ */ ++ if (retval && retval != 256) ++ printk(KERN_ERR "Failed to launch userspace program '%s': " ++ "Error %d\n", command, retval); ++ ++ { ++ int i; ++ for (i = 0; i < arg; i++) ++ if (argv[i] && argv[i] != channel) ++ toi_kfree(5, argv[i], sizeof(*argv[i])); ++ } ++ ++ toi_kfree(4, channel, sizeof(*channel)); ++ ++ return retval; ++} ++ ++/* ++ * This array contains entries that are automatically registered at ++ * boot. Modules and the console code register their own entries separately. 
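++ * Each entry is hooked up in core_load() below via
++ * toi_register_sysfs_file().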
++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_LONG("extra_pages_allowance", SYSFS_RW, ++ &extra_pd1_pages_allowance, 0, LONG_MAX, 0), ++ SYSFS_CUSTOM("image_exists", SYSFS_RW, image_exists_read, ++ image_exists_write, SYSFS_NEEDS_SM_FOR_BOTH, NULL), ++ SYSFS_STRING("resume", SYSFS_RW, resume_file, 255, ++ SYSFS_NEEDS_SM_FOR_WRITE, ++ attempt_to_parse_resume_device2), ++ SYSFS_STRING("alt_resume_param", SYSFS_RW, alt_resume_param, 255, ++ SYSFS_NEEDS_SM_FOR_WRITE, ++ attempt_to_parse_alt_resume_param), ++ SYSFS_CUSTOM("debug_info", SYSFS_READONLY, get_toi_debug_info, NULL, 0, ++ NULL), ++ SYSFS_BIT("ignore_rootfs", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_IGNORE_ROOTFS, 0), ++ SYSFS_LONG("image_size_limit", SYSFS_RW, &image_size_limit, -2, ++ INT_MAX, 0), ++ SYSFS_UL("last_result", SYSFS_RW, &toi_result, 0, 0, 0), ++ SYSFS_BIT("no_multithreaded_io", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_MULTITHREADED_IO, 0), ++ SYSFS_BIT("no_flusher_thread", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_FLUSHER_THREAD, 0), ++ SYSFS_BIT("full_pageset2", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_PAGESET2_FULL, 0), ++ SYSFS_BIT("reboot", SYSFS_RW, &toi_bkd.toi_action, TOI_REBOOT, 0), ++ SYSFS_BIT("replace_swsusp", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_REPLACE_SWSUSP, 0), ++ SYSFS_STRING("resume_commandline", SYSFS_RW, ++ toi_bkd.toi_nosave_commandline, COMMAND_LINE_SIZE, 0, ++ NULL), ++ SYSFS_STRING("version", SYSFS_READONLY, TOI_CORE_VERSION, 0, 0, NULL), ++ SYSFS_BIT("freezer_test", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_FREEZER_TEST, 0), ++ SYSFS_BIT("test_bio", SYSFS_RW, &toi_bkd.toi_action, TOI_TEST_BIO, 0), ++ SYSFS_BIT("test_filter_speed", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_TEST_FILTER_SPEED, 0), ++ SYSFS_BIT("no_pageset2", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_PAGESET2, 0), ++ SYSFS_BIT("no_pageset2_if_unneeded", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_PS2_IF_UNNEEDED, 0), ++ SYSFS_BIT("late_cpu_hotplug", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_LATE_CPU_HOTPLUG, 0), ++ SYSFS_STRING("binary_signature", SYSFS_READONLY, ++ tuxonice_signature, 9, 0, NULL), ++ SYSFS_INT("max_workers", SYSFS_RW, &toi_max_workers, 0, NR_CPUS, 0, ++ NULL), ++#ifdef CONFIG_KGDB ++ SYSFS_BIT("post_resume_breakpoint", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_POST_RESUME_BREAKPOINT, 0), ++#endif ++ SYSFS_BIT("no_readahead", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_NO_READAHEAD, 0), ++#ifdef CONFIG_TOI_KEEP_IMAGE ++ SYSFS_BIT("keep_image", SYSFS_RW , &toi_bkd.toi_action, TOI_KEEP_IMAGE, ++ 0), ++#endif ++}; ++ ++static struct toi_core_fns my_fns = { ++ .get_nonconflicting_page = __toi_get_nonconflicting_page, ++ .post_context_save = __toi_post_context_save, ++ .try_hibernate = toi_try_hibernate, ++ .try_resume = toi_sys_power_disk_try_resume, ++}; ++ ++/** ++ * core_load - initialisation of TuxOnIce core ++ * ++ * Initialise the core, beginning with sysfs. Checksum and so on are part of ++ * the core, but have their own initialisation routines because they either ++ * aren't compiled in all the time or have their own subdirectories. 
++ **/ ++static __init int core_load(void) ++{ ++ int i, ++ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ++ " (http://tuxonice.net)\n"); ++ ++ if (toi_sysfs_init()) ++ return 1; ++ ++ for (i = 0; i < numfiles; i++) ++ toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++ ++ toi_core_fns = &my_fns; ++ ++ if (toi_alloc_init()) ++ return 1; ++ if (toi_checksum_init()) ++ return 1; ++ if (toi_usm_init()) ++ return 1; ++ if (toi_ui_init()) ++ return 1; ++ if (toi_poweroff_init()) ++ return 1; ++ if (toi_cluster_init()) ++ return 1; ++ ++ return 0; ++} ++ ++#ifdef MODULE ++/** ++ * core_unload: Prepare to unload the core code. ++ **/ ++static __exit void core_unload(void) ++{ ++ int i, ++ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ toi_alloc_exit(); ++ toi_checksum_exit(); ++ toi_poweroff_exit(); ++ toi_ui_exit(); ++ toi_usm_exit(); ++ toi_cluster_exit(); ++ ++ for (i = 0; i < numfiles; i++) ++ toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++ ++ toi_core_fns = NULL; ++ ++ toi_sysfs_exit(); ++} ++MODULE_LICENSE("GPL"); ++module_init(core_load); ++module_exit(core_unload); ++#else ++late_initcall(core_load); ++#endif +diff --git a/kernel/power/tuxonice_incremental.c b/kernel/power/tuxonice_incremental.c +new file mode 100644 +index 0000000..16d58fb +--- /dev/null ++++ b/kernel/power/tuxonice_incremental.c +@@ -0,0 +1,383 @@ ++/* ++ * kernel/power/incremental.c ++ * ++ * Copyright (C) 2012 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains routines related to storing incremental images - that ++ * is, retaining an image after an initial cycle and then storing incremental ++ * changes on subsequent hibernations. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_builtin.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++ ++static struct toi_module_ops toi_incremental_ops; ++static struct toi_module_ops *next_driver; ++static unsigned long toi_incremental_bytes_in, toi_incremental_bytes_out; ++ ++static char toi_incremental_slow_cmp_name[32] = "sha1"; ++static int toi_incremental_digestsize; ++ ++static DEFINE_MUTEX(stats_lock); ++ ++struct cpu_context { ++ u8 *buffer_start; ++ struct hash_desc desc; ++ struct scatterlist sg[1]; ++ unsigned char *digest; ++}; ++ ++#define OUT_BUF_SIZE (2 * PAGE_SIZE) ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++ ++/* ++ * toi_crypto_prepare ++ * ++ * Prepare to do some work by allocating buffers and transforms. 
++ */ ++static int toi_incremental_crypto_prepare(void) ++{ ++ int cpu, digestsize = toi_incremental_digestsize; ++ ++ if (!*toi_incremental_slow_cmp_name) { ++ printk(KERN_INFO "TuxOnIce: Incremental image support enabled but no " ++ "hash algorithm set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ this->desc.tfm = crypto_alloc_hash(toi_incremental_slow_cmp_name, 0, 0); ++ if (IS_ERR(this->desc.tfm)) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s hashing transform.\n", ++ toi_incremental_slow_cmp_name); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ ++ if (!digestsize) { ++ digestsize = crypto_hash_digestsize(this->desc.tfm); ++ toi_incremental_digestsize = digestsize; ++ } ++ ++ this->digest = toi_kzalloc(16, digestsize, GFP_KERNEL); ++ if (!this->digest) ++ return -ENOMEM; ++ ++ this->desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP; ++ } ++ ++ return 0; ++} ++ ++static int toi_incremental_rw_cleanup(int writing) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->desc.tfm) { ++ crypto_free_hash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ } ++ ++ if (this->digest) { ++ toi_kfree(16, this->digest, toi_incremental_digestsize); ++ this->digest = NULL; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_incremental_init ++ */ ++ ++static int toi_incremental_init(int hibernate_or_resume) ++{ ++ if (!hibernate_or_resume) ++ return 0; ++ ++ next_driver = toi_get_next_filter(&toi_incremental_ops); ++ ++ return next_driver ? 0 : -ECHILD; ++} ++ ++/* ++ * toi_incremental_rw_init() ++ */ ++ ++static int toi_incremental_rw_init(int rw, int stream_number) ++{ ++ if (rw == WRITE && toi_incremental_crypto_prepare()) { ++ printk(KERN_ERR "Failed to initialise hashing " ++ "algorithm.\n"); ++ if (rw == READ) { ++ printk(KERN_INFO "Unable to read the image.\n"); ++ return -ENODEV; ++ } else { ++ printk(KERN_INFO "Continuing without " ++ " calculating an incremental image.\n"); ++ toi_incremental_ops.enabled = 0; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_incremental_write_page() ++ * ++ * Decide whether to write a page to the image. Calculate the SHA1 (or something ++ * else if the user changes the hashing algo) of the page and compare it to the ++ * previous value (if any). If there was no previous value or the values are ++ * different, write the page. Otherwise, skip the write. ++ * ++ * @TODO: Clear hashes for pages that are no longer in the image! ++ * ++ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing ++ * data to be written. ++ * ++ * Returns: 0 on success. Otherwise the error is that returned by later ++ * modules, -ECHILD if we have a broken pipeline or -EIO if ++ * zlib errs. 
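++ *
++ * A rough sketch of the intended flow (get_old_hash() and store_hash()
++ * appear below only in commented-out form, so those names are
++ * assumptions about the eventual implementation):
++ *
++ *   crypto_hash_digest(&ctx->desc, sg, length, new_hash);
++ *   old_hash = get_old_hash(index);
++ *   if (old_hash && !memcmp(new_hash, old_hash, digestsize))
++ *     to_write = false;              (page unchanged - skip the write)
++ *   else {
++ *     store_hash(ctx, index, new_hash);
++ *     next_driver->write_page(index, buf_type, buffer_page, buf_size);
++ *   }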
++ */ ++static int toi_incremental_write_page(unsigned long index, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ int ret = 0, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ int to_write = true; ++ ++ if (ctx->desc.tfm) { ++ // char *old_hash; ++ ++ ctx->buffer_start = TOI_MAP(buf_type, buffer_page); ++ ++ sg_init_one(&ctx->sg[0], ctx->buffer_start, buf_size); ++ ++ ret = crypto_hash_digest(&ctx->desc, &ctx->sg[0], ctx->sg[0].length, ctx->digest); ++ // old_hash = get_old_hash(index); ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ ++#if 0 ++ if (!ret && new_hash == old_hash) { ++ to_write = false; ++ } else ++ store_hash(ctx, index, new_hash); ++#endif ++ } ++ ++ mutex_lock(&stats_lock); ++ ++ toi_incremental_bytes_in += buf_size; ++ if (ret || to_write) ++ toi_incremental_bytes_out += buf_size; ++ ++ mutex_unlock(&stats_lock); ++ ++ if (ret || to_write) { ++ int ret2 = next_driver->write_page(index, buf_type, ++ buffer_page, buf_size); ++ if (!ret) ++ ret = ret2; ++ } ++ ++ return ret; ++} ++ ++/* ++ * toi_incremental_read_page() ++ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Nothing extra to do here. ++ */ ++static int toi_incremental_read_page(unsigned long *index, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ return next_driver->read_page(index, TOI_PAGE, buffer_page, buf_size); ++} ++ ++/* ++ * toi_incremental_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_incremental_print_debug_stats(char *buffer, int size) ++{ ++ unsigned long pages_in = toi_incremental_bytes_in >> PAGE_SHIFT, ++ pages_out = toi_incremental_bytes_out >> PAGE_SHIFT; ++ int len; ++ ++ /* Output the size of the incremental image. */ ++ if (*toi_incremental_slow_cmp_name) ++ len = scnprintf(buffer, size, "- Hash algorithm is '%s'.\n", ++ toi_incremental_slow_cmp_name); ++ else ++ len = scnprintf(buffer, size, "- Hash algorithm is not set.\n"); ++ ++ if (pages_in) ++ len += scnprintf(buffer+len, size - len, " Incremental image " ++ "%lu of %lu bytes (%ld percent).\n", ++ toi_incremental_bytes_out, ++ toi_incremental_bytes_in, ++ pages_out * 100 / pages_in); ++ return len; ++} ++ ++/* ++ * toi_incremental_memory_needed ++ * ++ * Tell the caller how much memory we need to operate during hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_incremental_memory_needed(void) ++{ ++ return 2 * PAGE_SIZE; ++} ++ ++static int toi_incremental_storage_needed(void) ++{ ++ return 2 * sizeof(unsigned long) + sizeof(int) + ++ strlen(toi_incremental_slow_cmp_name) + 1; ++} ++ ++/* ++ * toi_incremental_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save informaton needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data. 
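++ *
++ * For reference, the buffer written here (and read back by
++ * toi_incremental_load_config_info below) is laid out as:
++ *
++ *   unsigned long  toi_incremental_bytes_in
++ *   unsigned long  toi_incremental_bytes_out
++ *   int            len (strlen of the algorithm name + 1)
++ *   char[len]      hash algorithm name, NUL-terminated
++ *
++ * matching the size reported by toi_incremental_storage_needed().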
++ */ ++static int toi_incremental_save_config_info(char *buffer) ++{ ++ int len = strlen(toi_incremental_slow_cmp_name) + 1, offset = 0; ++ ++ *((unsigned long *) buffer) = toi_incremental_bytes_in; ++ offset += sizeof(unsigned long); ++ *((unsigned long *) (buffer + offset)) = toi_incremental_bytes_out; ++ offset += sizeof(unsigned long); ++ *((int *) (buffer + offset)) = len; ++ offset += sizeof(int); ++ strncpy(buffer + offset, toi_incremental_slow_cmp_name, len); ++ return offset + len; ++} ++ ++/* toi_incremental_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information to be retained for debugging info. ++ */ ++static void toi_incremental_load_config_info(char *buffer, int size) ++{ ++ int len, offset = 0; ++ ++ toi_incremental_bytes_in = *((unsigned long *) buffer); ++ offset += sizeof(unsigned long); ++ toi_incremental_bytes_out = *((unsigned long *) (buffer + offset)); ++ offset += sizeof(unsigned long); ++ len = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ strncpy(toi_incremental_slow_cmp_name, buffer + offset, len); ++} ++ ++static void toi_incremental_pre_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ bkd->incremental_bytes_in = toi_incremental_bytes_in; ++ bkd->incremental_bytes_out = toi_incremental_bytes_out; ++} ++ ++static void toi_incremental_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_incremental_bytes_in = bkd->incremental_bytes_in; ++ toi_incremental_bytes_out = bkd->incremental_bytes_out; ++} ++ ++static void toi_incremental_algo_change(void) ++{ ++ /* Reset so it's gotten from crypto_hash_digestsize afresh */ ++ toi_incremental_digestsize = 0; ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("enabled", SYSFS_RW, &toi_incremental_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_STRING("algorithm", SYSFS_RW, toi_incremental_slow_cmp_name, 31, 0, toi_incremental_algo_change), ++}; ++ ++/* ++ * Ops structure. 
++ */ ++static struct toi_module_ops toi_incremental_ops = { ++ .type = FILTER_MODULE, ++ .name = "incremental", ++ .directory = "incremental", ++ .module = THIS_MODULE, ++ .initialise = toi_incremental_init, ++ .memory_needed = toi_incremental_memory_needed, ++ .print_debug_info = toi_incremental_print_debug_stats, ++ .save_config_info = toi_incremental_save_config_info, ++ .load_config_info = toi_incremental_load_config_info, ++ .storage_needed = toi_incremental_storage_needed, ++ ++ .pre_atomic_restore = toi_incremental_pre_atomic_restore, ++ .post_atomic_restore = toi_incremental_post_atomic_restore, ++ ++ .rw_init = toi_incremental_rw_init, ++ .rw_cleanup = toi_incremental_rw_cleanup, ++ ++ .write_page = toi_incremental_write_page, ++ .read_page = toi_incremental_read_page, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++static __init int toi_incremental_load(void) ++{ ++ return toi_register_module(&toi_incremental_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_incremental_unload(void) ++{ ++ toi_unregister_module(&toi_incremental_ops); ++} ++ ++module_init(toi_incremental_load); ++module_exit(toi_incremental_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Incremental Image Support for TuxOnIce"); ++#else ++late_initcall(toi_incremental_load); ++#endif +diff --git a/kernel/power/tuxonice_io.c b/kernel/power/tuxonice_io.c +new file mode 100644 +index 0000000..901f1c9 +--- /dev/null ++++ b/kernel/power/tuxonice_io.c +@@ -0,0 +1,1936 @@ ++/* ++ * kernel/power/tuxonice_io.c ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti ++ * Copyright (C) 1998,2001,2002 Pavel Machek ++ * Copyright (C) 2002-2003 Florent Chabaud ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains high level IO routines for hibernating. 
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_pageflags.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_alloc.h" ++char alt_resume_param[256]; ++ ++/* Version read from image header at resume */ ++static int toi_image_header_version; ++ ++#define read_if_version(VERS, VAR, DESC, ERR_ACT) do { \ ++ if (likely(toi_image_header_version >= VERS)) \ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, \ ++ (char *) &VAR, sizeof(VAR))) { \ ++ abort_hibernate(TOI_FAILED_IO, "Failed to read DESC."); \ ++ ERR_ACT; \ ++ } \ ++} while(0) \ ++ ++/* Variables shared between threads and updated under the mutex */ ++static int io_write, io_finish_at, io_base, io_barmax, io_pageset, io_result; ++static int io_index, io_nextupdate, io_pc, io_pc_step; ++static DEFINE_MUTEX(io_mutex); ++static DEFINE_PER_CPU(struct page *, last_sought); ++static DEFINE_PER_CPU(struct page *, last_high_page); ++static DEFINE_PER_CPU(char *, checksum_locn); ++static DEFINE_PER_CPU(struct pbe *, last_low_page); ++static atomic_t io_count; ++atomic_t toi_io_workers; ++EXPORT_SYMBOL_GPL(toi_io_workers); ++ ++static int using_flusher; ++ ++DECLARE_WAIT_QUEUE_HEAD(toi_io_queue_flusher); ++EXPORT_SYMBOL_GPL(toi_io_queue_flusher); ++ ++int toi_bio_queue_flusher_should_finish; ++EXPORT_SYMBOL_GPL(toi_bio_queue_flusher_should_finish); ++ ++int toi_max_workers; ++ ++static char *image_version_error = "The image header version is newer than " \ ++ "this kernel supports."; ++ ++struct toi_module_ops *first_filter; ++ ++static atomic_t toi_num_other_threads; ++static DECLARE_WAIT_QUEUE_HEAD(toi_worker_wait_queue); ++enum toi_worker_commands { ++ TOI_IO_WORKER_STOP, ++ TOI_IO_WORKER_RUN, ++ TOI_IO_WORKER_EXIT ++}; ++static enum toi_worker_commands toi_worker_command; ++ ++/** ++ * toi_attempt_to_parse_resume_device - determine if we can hibernate ++ * ++ * Can we hibernate, using the current resume= parameter? ++ **/ ++int toi_attempt_to_parse_resume_device(int quiet) ++{ ++ struct list_head *Allocator; ++ struct toi_module_ops *thisAllocator; ++ int result, returning = 0; ++ ++ if (toi_activate_storage(0)) ++ return 0; ++ ++ toiActiveAllocator = NULL; ++ clear_toi_state(TOI_RESUME_DEVICE_OK); ++ clear_toi_state(TOI_CAN_RESUME); ++ clear_result_state(TOI_ABORTED); ++ ++ if (!toiNumAllocators) { ++ if (!quiet) ++ printk(KERN_INFO "TuxOnIce: No storage allocators have " ++ "been registered. Hibernating will be " ++ "disabled.\n"); ++ goto cleanup; ++ } ++ ++ list_for_each(Allocator, &toiAllocators) { ++ thisAllocator = list_entry(Allocator, struct toi_module_ops, ++ type_list); ++ ++ /* ++ * Not sure why you'd want to disable an allocator, but ++ * we should honour the flag if we're providing it ++ */ ++ if (!thisAllocator->enabled) ++ continue; ++ ++ result = thisAllocator->parse_sig_location( ++ resume_file, (toiNumAllocators == 1), ++ quiet); ++ ++ switch (result) { ++ case -EINVAL: ++ /* For this allocator, but not a valid ++ * configuration. Error already printed. */ ++ goto cleanup; ++ ++ case 0: ++ /* For this allocator and valid. 
*/ ++ toiActiveAllocator = thisAllocator; ++ ++ set_toi_state(TOI_RESUME_DEVICE_OK); ++ set_toi_state(TOI_CAN_RESUME); ++ returning = 1; ++ goto cleanup; ++ } ++ } ++ if (!quiet) ++ printk(KERN_INFO "TuxOnIce: No matching enabled allocator " ++ "found. Resuming disabled.\n"); ++cleanup: ++ toi_deactivate_storage(0); ++ return returning; ++} ++EXPORT_SYMBOL_GPL(toi_attempt_to_parse_resume_device); ++ ++void attempt_to_parse_resume_device2(void) ++{ ++ toi_prepare_usm(); ++ toi_attempt_to_parse_resume_device(0); ++ toi_cleanup_usm(); ++} ++EXPORT_SYMBOL_GPL(attempt_to_parse_resume_device2); ++ ++void save_restore_alt_param(int replace, int quiet) ++{ ++ static char resume_param_save[255]; ++ static unsigned long toi_state_save; ++ ++ if (replace) { ++ toi_state_save = toi_state; ++ strcpy(resume_param_save, resume_file); ++ strcpy(resume_file, alt_resume_param); ++ } else { ++ strcpy(resume_file, resume_param_save); ++ toi_state = toi_state_save; ++ } ++ toi_attempt_to_parse_resume_device(quiet); ++} ++ ++void attempt_to_parse_alt_resume_param(void) ++{ ++ int ok = 0; ++ ++ /* Temporarily set resume_param to the poweroff value */ ++ if (!strlen(alt_resume_param)) ++ return; ++ ++ printk(KERN_INFO "=== Trying Poweroff Resume2 ===\n"); ++ save_restore_alt_param(SAVE, NOQUIET); ++ if (test_toi_state(TOI_CAN_RESUME)) ++ ok = 1; ++ ++ printk(KERN_INFO "=== Done ===\n"); ++ save_restore_alt_param(RESTORE, QUIET); ++ ++ /* If not ok, clear the string */ ++ if (ok) ++ return; ++ ++ printk(KERN_INFO "Can't resume from that location; clearing " ++ "alt_resume_param.\n"); ++ alt_resume_param[0] = '\0'; ++} ++ ++/** ++ * noresume_reset_modules - reset data structures in case of non resuming ++ * ++ * When we read the start of an image, modules (and especially the ++ * active allocator) might need to reset data structures if we ++ * decide to remove the image rather than resuming from it. ++ **/ ++static void noresume_reset_modules(void) ++{ ++ struct toi_module_ops *this_filter; ++ ++ list_for_each_entry(this_filter, &toi_filters, type_list) ++ if (this_filter->noresume_reset) ++ this_filter->noresume_reset(); ++ ++ if (toiActiveAllocator && toiActiveAllocator->noresume_reset) ++ toiActiveAllocator->noresume_reset(); ++} ++ ++/** ++ * fill_toi_header - fill the hibernate header structure ++ * @struct toi_header: Header data structure to be filled. ++ **/ ++static int fill_toi_header(struct toi_header *sh) ++{ ++ int i, error; ++ ++ error = init_header((struct swsusp_info *) sh); ++ if (error) ++ return error; ++ ++ sh->pagedir = pagedir1; ++ sh->pageset_2_size = pagedir2.size; ++ sh->param0 = toi_result; ++ sh->param1 = toi_bkd.toi_action; ++ sh->param2 = toi_bkd.toi_debug_state; ++ sh->param3 = toi_bkd.toi_default_console_level; ++ sh->root_fs = current->fs->root.mnt->mnt_sb->s_dev; ++ for (i = 0; i < 4; i++) ++ sh->io_time[i/2][i%2] = toi_bkd.toi_io_time[i/2][i%2]; ++ sh->bkd = boot_kernel_data_buffer; ++ return 0; ++} ++ ++/** ++ * rw_init_modules - initialize modules ++ * @rw: Whether we are reading of writing an image. ++ * @which: Section of the image being processed. ++ * ++ * Iterate over modules, preparing the ones that will be used to read or write ++ * data. 
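++ *
++ * The order used below is: page transformers (filters) first, then the
++ * active allocator, then the remaining modules. As a sketch, with the
++ * incremental filter from this patch enabled (the module names are only
++ * examples of what might be registered):
++ *
++ *   toi_incremental_ops.rw_init(rw, which)    (FILTER_MODULE)
++ *   toiActiveAllocator->rw_init(rw, which)
++ *   checksum, ui, ... ->rw_init(rw, which)    (other modules)
++ *
++ * Any failure aborts the hibernation or resume cycle.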
++ **/ ++static int rw_init_modules(int rw, int which) ++{ ++ struct toi_module_ops *this_module; ++ /* Initialise page transformers */ ++ list_for_each_entry(this_module, &toi_filters, type_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->rw_init && this_module->rw_init(rw, which)) { ++ abort_hibernate(TOI_FAILED_MODULE_INIT, ++ "Failed to initialize the %s filter.", ++ this_module->name); ++ return 1; ++ } ++ } ++ ++ /* Initialise allocator */ ++ if (toiActiveAllocator->rw_init(rw, which)) { ++ abort_hibernate(TOI_FAILED_MODULE_INIT, ++ "Failed to initialise the allocator."); ++ return 1; ++ } ++ ++ /* Initialise other modules */ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type == FILTER_MODULE || ++ this_module->type == WRITER_MODULE) ++ continue; ++ if (this_module->rw_init && this_module->rw_init(rw, which)) { ++ set_abort_result(TOI_FAILED_MODULE_INIT); ++ printk(KERN_INFO "Setting aborted flag due to module " ++ "init failure.\n"); ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/** ++ * rw_cleanup_modules - cleanup modules ++ * @rw: Whether we are reading of writing an image. ++ * ++ * Cleanup components after reading or writing a set of pages. ++ * Only the allocator may fail. ++ **/ ++static int rw_cleanup_modules(int rw) ++{ ++ struct toi_module_ops *this_module; ++ int result = 0; ++ ++ /* Cleanup other modules */ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ this_module->type == FILTER_MODULE || ++ this_module->type == WRITER_MODULE) ++ continue; ++ if (this_module->rw_cleanup) ++ result |= this_module->rw_cleanup(rw); ++ } ++ ++ /* Flush data and cleanup */ ++ list_for_each_entry(this_module, &toi_filters, type_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->rw_cleanup) ++ result |= this_module->rw_cleanup(rw); ++ } ++ ++ result |= toiActiveAllocator->rw_cleanup(rw); ++ ++ return result; ++} ++ ++static struct page *copy_page_from_orig_page(struct page *orig_page, int is_high) ++{ ++ int index, min, max; ++ struct page *high_page = NULL, ++ **my_last_high_page = &__get_cpu_var(last_high_page), ++ **my_last_sought = &__get_cpu_var(last_sought); ++ struct pbe *this, **my_last_low_page = &__get_cpu_var(last_low_page); ++ void *compare; ++ ++ if (is_high) { ++ if (*my_last_sought && *my_last_high_page && ++ *my_last_sought < orig_page) ++ high_page = *my_last_high_page; ++ else ++ high_page = (struct page *) restore_highmem_pblist; ++ this = (struct pbe *) kmap(high_page); ++ compare = orig_page; ++ } else { ++ if (*my_last_sought && *my_last_low_page && ++ *my_last_sought < orig_page) ++ this = *my_last_low_page; ++ else ++ this = restore_pblist; ++ compare = page_address(orig_page); ++ } ++ ++ *my_last_sought = orig_page; ++ ++ /* Locate page containing pbe */ ++ while (this[PBES_PER_PAGE - 1].next && ++ this[PBES_PER_PAGE - 1].orig_address < compare) { ++ if (is_high) { ++ struct page *next_high_page = (struct page *) ++ this[PBES_PER_PAGE - 1].next; ++ kunmap(high_page); ++ this = kmap(next_high_page); ++ high_page = next_high_page; ++ } else ++ this = this[PBES_PER_PAGE - 1].next; ++ } ++ ++ /* Do a binary search within the page */ ++ min = 0; ++ max = PBES_PER_PAGE; ++ index = PBES_PER_PAGE / 2; ++ while (max - min) { ++ if (!this[index].orig_address || ++ this[index].orig_address > compare) ++ max = index; ++ else if (this[index].orig_address == compare) { ++ if (is_high) { ++ struct page *page = 
this[index].address; ++ *my_last_high_page = high_page; ++ kunmap(high_page); ++ return page; ++ } ++ *my_last_low_page = this; ++ return virt_to_page(this[index].address); ++ } else ++ min = index; ++ index = ((max + min) / 2); ++ }; ++ ++ if (is_high) ++ kunmap(high_page); ++ ++ abort_hibernate(TOI_FAILED_IO, "Failed to get destination page for" ++ " orig page %p. This[min].orig_address=%p.\n", orig_page, ++ this[index].orig_address); ++ return NULL; ++} ++ ++/** ++ * write_next_page - write the next page in a pageset ++ * @data_pfn: The pfn where the next data to write is located. ++ * @my_io_index: The index of the page in the pageset. ++ * @write_pfn: The pfn number to write in the image (where the data belongs). ++ * ++ * Get the pfn of the next page to write, map the page if necessary and do the ++ * write. ++ **/ ++static int write_next_page(unsigned long *data_pfn, int *my_io_index, ++ unsigned long *write_pfn) ++{ ++ struct page *page; ++ char **my_checksum_locn = &__get_cpu_var(checksum_locn); ++ int result = 0, was_present; ++ ++ *data_pfn = memory_bm_next_pfn(io_map); ++ ++ /* Another thread could have beaten us to it. */ ++ if (*data_pfn == BM_END_OF_MAP) { ++ if (atomic_read(&io_count)) { ++ printk(KERN_INFO "Ran out of pfns but io_count is " ++ "still %d.\n", atomic_read(&io_count)); ++ BUG(); ++ } ++ mutex_unlock(&io_mutex); ++ return -ENODATA; ++ } ++ ++ *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); ++ ++ memory_bm_clear_bit(io_map, *data_pfn); ++ page = pfn_to_page(*data_pfn); ++ ++ was_present = kernel_page_present(page); ++ if (!was_present) ++ kernel_map_pages(page, 1, 1); ++ ++ if (io_pageset == 1) ++ *write_pfn = memory_bm_next_pfn(pageset1_map); ++ else { ++ *write_pfn = *data_pfn; ++ *my_checksum_locn = tuxonice_get_next_checksum(); ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Write %d:%ld.", *my_io_index, *write_pfn); ++ ++ mutex_unlock(&io_mutex); ++ ++ if (io_pageset == 2 && tuxonice_calc_checksum(page, *my_checksum_locn)) ++ return 1; ++ ++ result = first_filter->write_page(*write_pfn, TOI_PAGE, page, ++ PAGE_SIZE); ++ ++ if (!was_present) ++ kernel_map_pages(page, 1, 0); ++ ++ return result; ++} ++ ++/** ++ * read_next_page - read the next page in a pageset ++ * @my_io_index: The index of the page in the pageset. ++ * @write_pfn: The pfn in which the data belongs. ++ * ++ * Read a page of the image into our buffer. It can happen (here and in the ++ * write routine) that threads don't get run until after other CPUs have done ++ * all the work. This was the cause of the long standing issue with ++ * occasionally getting -ENODATA errors at the end of reading the image. We ++ * therefore need to check there's actually a page to read before trying to ++ * retrieve one. ++ **/ ++ ++static int read_next_page(int *my_io_index, unsigned long *write_pfn, ++ struct page *buffer) ++{ ++ unsigned int buf_size = PAGE_SIZE; ++ unsigned long left = atomic_read(&io_count); ++ ++ if (!left) ++ return -ENODATA; ++ ++ /* Start off assuming the page we read isn't resaved */ ++ *my_io_index = io_finish_at - atomic_sub_return(1, &io_count); ++ ++ mutex_unlock(&io_mutex); ++ ++ /* ++ * Are we aborting? If so, don't submit any more I/O as ++ * resetting the resume_attempted flag (from ui.c) will ++ * clear the bdev flags, making this thread oops. ++ */ ++ if (unlikely(test_toi_state(TOI_STOP_RESUME))) { ++ atomic_dec(&toi_io_workers); ++ if (!atomic_read(&toi_io_workers)) { ++ /* ++ * So we can be sure we'll have memory for ++ * marking that we haven't resumed. 
++ */ ++ rw_cleanup_modules(READ); ++ set_toi_state(TOI_IO_STOPPED); ++ } ++ while (1) ++ schedule(); ++ } ++ ++ /* ++ * See toi_bio_read_page in tuxonice_bio.c: ++ * read the next page in the image. ++ */ ++ return first_filter->read_page(write_pfn, TOI_PAGE, buffer, &buf_size); ++} ++ ++static void use_read_page(unsigned long write_pfn, struct page *buffer) ++{ ++ struct page *final_page = pfn_to_page(write_pfn), ++ *copy_page = final_page; ++ char *virt, *buffer_virt; ++ int was_present, cpu = smp_processor_id(); ++ unsigned long idx = 0; ++ ++ if (io_pageset == 1 && (!pageset1_copy_map || ++ !memory_bm_test_bit_index(pageset1_copy_map, write_pfn, cpu))) { ++ int is_high = PageHighMem(final_page); ++ copy_page = copy_page_from_orig_page(is_high ? (void *) write_pfn : final_page, is_high); ++ } ++ ++ if (!memory_bm_test_bit_index(io_map, write_pfn, cpu)) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Discard %ld.", write_pfn); ++ mutex_lock(&io_mutex); ++ idx = atomic_add_return(1, &io_count); ++ mutex_unlock(&io_mutex); ++ return; ++ } ++ ++ virt = kmap(copy_page); ++ buffer_virt = kmap(buffer); ++ was_present = kernel_page_present(copy_page); ++ if (!was_present) ++ kernel_map_pages(copy_page, 1, 1); ++ memcpy(virt, buffer_virt, PAGE_SIZE); ++ if (!was_present) ++ kernel_map_pages(copy_page, 1, 0); ++ kunmap(copy_page); ++ kunmap(buffer); ++ memory_bm_clear_bit_index(io_map, write_pfn, cpu); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Read %d:%ld", idx, write_pfn); ++} ++ ++static unsigned long status_update(int writing, unsigned long done, ++ unsigned long ticks) ++{ ++ int cs_index = writing ? 0 : 1; ++ unsigned long ticks_so_far = toi_bkd.toi_io_time[cs_index][1] + ticks; ++ unsigned long msec = jiffies_to_msecs(abs(ticks_so_far)); ++ unsigned long pgs_per_s, estimate = 0, pages_left; ++ ++ if (msec) { ++ pages_left = io_barmax - done; ++ pgs_per_s = 1000 * done / msec; ++ if (pgs_per_s) ++ estimate = DIV_ROUND_UP(pages_left, pgs_per_s); ++ } ++ ++ if (estimate && ticks > HZ / 2) ++ return toi_update_status(done, io_barmax, ++ " %d/%d MB (%lu sec left)", ++ MB(done+1), MB(io_barmax), estimate); ++ ++ return toi_update_status(done, io_barmax, " %d/%d MB", ++ MB(done+1), MB(io_barmax)); ++} ++ ++/** ++ * worker_rw_loop - main loop to read/write pages ++ * ++ * The main I/O loop for reading or writing pages. The io_map bitmap is used to ++ * track the pages to read/write. ++ * If we are reading, the pages are loaded to their final (mapped) pfn. ++ * Data is non zero iff this is a thread started via start_other_threads. ++ * In that case, we stay in here until told to quit. 
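++ *
++ * A simplified sketch of the loop below (not additional code):
++ *
++ *   atomic_inc(&toi_io_workers);
++ *   while (pages left && !aborted &&
++ *          toi_worker_command == TOI_IO_WORKER_RUN)
++ *     io_write ? write_next_page(...) : read_next_page(...);
++ *   atomic_dec(&toi_io_workers);
++ *   if (started by toi_start_other_threads)
++ *     wait on toi_worker_wait_queue for the next RUN (loop again)
++ *     or EXIT;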
++ **/ ++static int worker_rw_loop(void *data) ++{ ++ unsigned long data_pfn, write_pfn, next_jiffies = jiffies + HZ / 4, ++ jif_index = 1, start_time = jiffies, thread_num; ++ int result = 0, my_io_index = 0, last_worker; ++ struct page *buffer = toi_alloc_page(28, TOI_ATOMIC_GFP); ++ cpumask_var_t orig_mask; ++ ++ if (!alloc_cpumask_var(&orig_mask, GFP_KERNEL)) { ++ printk(KERN_EMERG "Failed to allocate cpumask for TuxOnIce I/O thread %ld.\n", (unsigned long) data); ++ return -ENOMEM; ++ } ++ ++ cpumask_copy(orig_mask, tsk_cpus_allowed(current)); ++ ++ current->flags |= PF_NOFREEZE; ++ ++top: ++ mutex_lock(&io_mutex); ++ thread_num = atomic_read(&toi_io_workers); ++ ++ cpumask_copy(tsk_cpus_allowed(current), orig_mask); ++ schedule(); ++ ++ atomic_inc(&toi_io_workers); ++ ++ while (atomic_read(&io_count) >= atomic_read(&toi_io_workers) && ++ !(io_write && test_result_state(TOI_ABORTED)) && ++ toi_worker_command == TOI_IO_WORKER_RUN) { ++ if (!thread_num && jiffies > next_jiffies) { ++ next_jiffies += HZ / 4; ++ if (toiActiveAllocator->update_throughput_throttle) ++ toiActiveAllocator->update_throughput_throttle( ++ jif_index); ++ jif_index++; ++ } ++ ++ /* ++ * What page to use? If reading, don't know yet which page's ++ * data will be read, so always use the buffer. If writing, ++ * use the copy (Pageset1) or original page (Pageset2), but ++ * always write the pfn of the original page. ++ */ ++ if (io_write) ++ result = write_next_page(&data_pfn, &my_io_index, ++ &write_pfn); ++ else /* Reading */ ++ result = read_next_page(&my_io_index, &write_pfn, ++ buffer); ++ ++ if (result) { ++ mutex_lock(&io_mutex); ++ /* Nothing to do? */ ++ if (result == -ENODATA) { ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Thread %d has no more work.", ++ smp_processor_id()); ++ break; ++ } ++ ++ io_result = result; ++ ++ if (io_write) { ++ printk(KERN_INFO "Write chunk returned %d.\n", ++ result); ++ abort_hibernate(TOI_FAILED_IO, ++ "Failed to write a chunk of the " ++ "image."); ++ break; ++ } ++ ++ if (io_pageset == 1) { ++ printk(KERN_ERR "\nBreaking out of I/O loop " ++ "because of result code %d.\n", result); ++ break; ++ } ++ panic("Read chunk returned (%d)", result); ++ } ++ ++ /* ++ * Discard reads of resaved pages while reading ps2 ++ * and unwanted pages while rereading ps2 when aborting. ++ */ ++ if (!io_write) { ++ if (!PageResave(pfn_to_page(write_pfn))) ++ use_read_page(write_pfn, buffer); ++ else { ++ mutex_lock(&io_mutex); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, ++ "Resaved %ld.", write_pfn); ++ atomic_inc(&io_count); ++ mutex_unlock(&io_mutex); ++ } ++ } ++ ++ if (!thread_num) { ++ if(my_io_index + io_base > io_nextupdate) ++ io_nextupdate = status_update(io_write, ++ my_io_index + io_base, ++ jiffies - start_time); ++ ++ if (my_io_index > io_pc) { ++ printk(KERN_CONT "...%d%%", 20 * io_pc_step); ++ io_pc_step++; ++ io_pc = io_finish_at * io_pc_step / 5; ++ } ++ } ++ ++ toi_cond_pause(0, NULL); ++ ++ /* ++ * Subtle: If there's less I/O still to be done than threads ++ * running, quit. This stops us doing I/O beyond the end of ++ * the image when reading. ++ * ++ * Possible race condition. Two threads could do the test at ++ * the same time; one should exit and one should continue. ++ * Therefore we take the mutex before comparing and exiting. 
++ */ ++ ++ mutex_lock(&io_mutex); ++ } ++ ++ last_worker = atomic_dec_and_test(&toi_io_workers); ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "%d workers left.", atomic_read(&toi_io_workers)); ++ mutex_unlock(&io_mutex); ++ ++ if ((unsigned long) data && toi_worker_command != TOI_IO_WORKER_EXIT) { ++ /* Were we the last thread and we're using a flusher thread? */ ++ if (last_worker && using_flusher) { ++ toiActiveAllocator->finish_all_io(); ++ } ++ /* First, if we're doing I/O, wait for it to finish */ ++ wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_RUN); ++ /* Then wait to be told what to do next */ ++ wait_event(toi_worker_wait_queue, toi_worker_command != TOI_IO_WORKER_STOP); ++ if (toi_worker_command == TOI_IO_WORKER_RUN) ++ goto top; ++ } ++ ++ if (thread_num) ++ atomic_dec(&toi_num_other_threads); ++ ++ toi_message(TOI_IO, TOI_LOW, 0, "Thread %d exiting.", thread_num); ++ toi__free_page(28, buffer); ++ free_cpumask_var(orig_mask); ++ ++ return result; ++} ++ ++int toi_start_other_threads(void) ++{ ++ int cpu; ++ struct task_struct *p; ++ int to_start = (toi_max_workers ? toi_max_workers : num_online_cpus()) - 1; ++ unsigned long num_started = 0; ++ ++ if (test_action_state(TOI_NO_MULTITHREADED_IO)) ++ return 0; ++ ++ toi_worker_command = TOI_IO_WORKER_STOP; ++ ++ for_each_online_cpu(cpu) { ++ if (num_started == to_start) ++ break; ++ ++ if (cpu == smp_processor_id()) ++ continue; ++ ++ p = kthread_create_on_node(worker_rw_loop, (void *) num_started + 1, ++ cpu_to_node(cpu), "ktoi_io/%d", cpu); ++ if (IS_ERR(p)) { ++ printk(KERN_ERR "ktoi_io for %i failed\n", cpu); ++ continue; ++ } ++ kthread_bind(p, cpu); ++ p->flags |= PF_MEMALLOC; ++ wake_up_process(p); ++ num_started++; ++ atomic_inc(&toi_num_other_threads); ++ } ++ ++ toi_message(TOI_IO, TOI_LOW, 0, "Started %d threads.", num_started); ++ return num_started; ++} ++ ++void toi_stop_other_threads(void) ++{ ++ toi_message(TOI_IO, TOI_LOW, 0, "Stopping other threads."); ++ toi_worker_command = TOI_IO_WORKER_EXIT; ++ wake_up(&toi_worker_wait_queue); ++} ++ ++/** ++ * do_rw_loop - main highlevel function for reading or writing pages ++ * ++ * Create the io_map bitmap and call worker_rw_loop to perform I/O operations. 
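++ *
++ * In outline (a condensed sketch of the body below):
++ *
++ *   io_write = write; io_finish_at = finish_at; ...   (per-run globals)
++ *   memory_bm_clear(io_map);
++ *   copy up to finish_at set bits from pageflags into io_map;
++ *   atomic_set(&io_count, finish_at);
++ *   toi_worker_command = TOI_IO_WORKER_RUN;
++ *   wake_up(&toi_worker_wait_queue);
++ *   using_flusher ? toiActiveAllocator->io_flusher(write)
++ *                 : worker_rw_loop(NULL);
++ *   wait for toi_io_workers to drop to zero;
++ *   toi_worker_command = TOI_IO_WORKER_STOP;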
++ **/ ++static int do_rw_loop(int write, int finish_at, struct memory_bitmap *pageflags, ++ int base, int barmax, int pageset) ++{ ++ int index = 0, cpu, result = 0, workers_started; ++ unsigned long pfn; ++ ++ first_filter = toi_get_next_filter(NULL); ++ ++ if (!finish_at) ++ return 0; ++ ++ io_write = write; ++ io_finish_at = finish_at; ++ io_base = base; ++ io_barmax = barmax; ++ io_pageset = pageset; ++ io_index = 0; ++ io_pc = io_finish_at / 5; ++ io_pc_step = 1; ++ io_result = 0; ++ io_nextupdate = base + 1; ++ toi_bio_queue_flusher_should_finish = 0; ++ ++ for_each_online_cpu(cpu) { ++ per_cpu(last_sought, cpu) = NULL; ++ per_cpu(last_low_page, cpu) = NULL; ++ per_cpu(last_high_page, cpu) = NULL; ++ } ++ ++ /* Ensure all bits clear */ ++ memory_bm_clear(io_map); ++ ++ /* Set the bits for the pages to write */ ++ memory_bm_position_reset(pageflags); ++ ++ pfn = memory_bm_next_pfn(pageflags); ++ ++ while (pfn != BM_END_OF_MAP && index < finish_at) { ++ memory_bm_set_bit(io_map, pfn); ++ pfn = memory_bm_next_pfn(pageflags); ++ index++; ++ } ++ ++ BUG_ON(index < finish_at); ++ ++ atomic_set(&io_count, finish_at); ++ ++ memory_bm_position_reset(pageset1_map); ++ ++ mutex_lock(&io_mutex); ++ ++ clear_toi_state(TOI_IO_STOPPED); ++ ++ using_flusher = (atomic_read(&toi_num_other_threads) && ++ toiActiveAllocator->io_flusher && ++ !test_action_state(TOI_NO_FLUSHER_THREAD)); ++ ++ workers_started = atomic_read(&toi_num_other_threads); ++ ++ memory_bm_set_iterators(io_map, atomic_read(&toi_num_other_threads) + 1); ++ memory_bm_position_reset(io_map); ++ ++ memory_bm_set_iterators(pageset1_copy_map, atomic_read(&toi_num_other_threads) + 1); ++ memory_bm_position_reset(pageset1_copy_map); ++ ++ toi_worker_command = TOI_IO_WORKER_RUN; ++ wake_up(&toi_worker_wait_queue); ++ ++ mutex_unlock(&io_mutex); ++ ++ if (using_flusher) ++ result = toiActiveAllocator->io_flusher(write); ++ else ++ worker_rw_loop(NULL); ++ ++ while (atomic_read(&toi_io_workers)) ++ schedule(); ++ ++ printk(KERN_CONT "\n"); ++ ++ toi_worker_command = TOI_IO_WORKER_STOP; ++ wake_up(&toi_worker_wait_queue); ++ ++ if (unlikely(test_toi_state(TOI_STOP_RESUME))) { ++ if (!atomic_read(&toi_io_workers)) { ++ rw_cleanup_modules(READ); ++ set_toi_state(TOI_IO_STOPPED); ++ } ++ while (1) ++ schedule(); ++ } ++ set_toi_state(TOI_IO_STOPPED); ++ ++ if (!io_result && !result && !test_result_state(TOI_ABORTED)) { ++ unsigned long next; ++ ++ toi_update_status(io_base + io_finish_at, io_barmax, ++ " %d/%d MB ", ++ MB(io_base + io_finish_at), MB(io_barmax)); ++ ++ memory_bm_position_reset(io_map); ++ next = memory_bm_next_pfn(io_map); ++ if (next != BM_END_OF_MAP) { ++ printk(KERN_INFO "Finished I/O loop but still work to " ++ "do?\nFinish at = %d. io_count = %d.\n", ++ finish_at, atomic_read(&io_count)); ++ printk(KERN_INFO "I/O bitmap still records work to do." ++ "%ld.\n", next); ++ BUG(); ++ do { ++ cpu_relax(); ++ } while (0); ++ } ++ } ++ ++ return io_result ? io_result : result; ++} ++ ++/** ++ * write_pageset - write a pageset to disk. ++ * @pagedir: Which pagedir to write. ++ * ++ * Returns: ++ * Zero on success or -1 on failure. ++ **/ ++int write_pageset(struct pagedir *pagedir) ++{ ++ int finish_at, base = 0; ++ int barmax = pagedir1.size + pagedir2.size; ++ long error = 0; ++ struct memory_bitmap *pageflags; ++ unsigned long start_time, end_time; ++ ++ /* ++ * Even if there is nothing to read or write, the allocator ++ * may need the init/cleanup for it's housekeeping. 
(eg: ++ * Pageset1 may start where pageset2 ends when writing). ++ */ ++ finish_at = pagedir->size; ++ ++ if (pagedir->id == 1) { ++ toi_prepare_status(DONT_CLEAR_BAR, ++ "Writing kernel & process data..."); ++ base = pagedir2.size; ++ if (test_action_state(TOI_TEST_FILTER_SPEED) || ++ test_action_state(TOI_TEST_BIO)) ++ pageflags = pageset1_map; ++ else ++ pageflags = pageset1_copy_map; ++ } else { ++ toi_prepare_status(DONT_CLEAR_BAR, "Writing caches..."); ++ pageflags = pageset2_map; ++ } ++ ++ start_time = jiffies; ++ ++ if (rw_init_modules(1, pagedir->id)) { ++ abort_hibernate(TOI_FAILED_MODULE_INIT, ++ "Failed to initialise modules for writing."); ++ error = 1; ++ } ++ ++ if (!error) ++ error = do_rw_loop(1, finish_at, pageflags, base, barmax, ++ pagedir->id); ++ ++ if (rw_cleanup_modules(WRITE) && !error) { ++ abort_hibernate(TOI_FAILED_MODULE_CLEANUP, ++ "Failed to cleanup after writing."); ++ error = 1; ++ } ++ ++ end_time = jiffies; ++ ++ if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { ++ toi_bkd.toi_io_time[0][0] += finish_at, ++ toi_bkd.toi_io_time[0][1] += (end_time - start_time); ++ } ++ ++ return error; ++} ++ ++/** ++ * read_pageset - highlevel function to read a pageset from disk ++ * @pagedir: pageset to read ++ * @overwrittenpagesonly: Whether to read the whole pageset or ++ * only part of it. ++ * ++ * Returns: ++ * Zero on success or -1 on failure. ++ **/ ++static int read_pageset(struct pagedir *pagedir, int overwrittenpagesonly) ++{ ++ int result = 0, base = 0; ++ int finish_at = pagedir->size; ++ int barmax = pagedir1.size + pagedir2.size; ++ struct memory_bitmap *pageflags; ++ unsigned long start_time, end_time; ++ ++ if (pagedir->id == 1) { ++ toi_prepare_status(DONT_CLEAR_BAR, ++ "Reading kernel & process data..."); ++ pageflags = pageset1_map; ++ } else { ++ toi_prepare_status(DONT_CLEAR_BAR, "Reading caches..."); ++ if (overwrittenpagesonly) { ++ barmax = min(pagedir1.size, pagedir2.size); ++ finish_at = min(pagedir1.size, pagedir2.size); ++ } else ++ base = pagedir1.size; ++ pageflags = pageset2_map; ++ } ++ ++ start_time = jiffies; ++ ++ if (rw_init_modules(0, pagedir->id)) { ++ toiActiveAllocator->remove_image(); ++ result = 1; ++ } else ++ result = do_rw_loop(0, finish_at, pageflags, base, barmax, ++ pagedir->id); ++ ++ if (rw_cleanup_modules(READ) && !result) { ++ abort_hibernate(TOI_FAILED_MODULE_CLEANUP, ++ "Failed to cleanup after reading."); ++ result = 1; ++ } ++ ++ /* Statistics */ ++ end_time = jiffies; ++ ++ if ((end_time - start_time) && (!test_result_state(TOI_ABORTED))) { ++ toi_bkd.toi_io_time[1][0] += finish_at, ++ toi_bkd.toi_io_time[1][1] += (end_time - start_time); ++ } ++ ++ return result; ++} ++ ++/** ++ * write_module_configs - store the modules configuration ++ * ++ * The configuration for each module is stored in the image header. ++ * Returns: Int ++ * Zero on success, Error value otherwise. ++ **/ ++static int write_module_configs(void) ++{ ++ struct toi_module_ops *this_module; ++ char *buffer = (char *) toi_get_zeroed_page(22, TOI_ATOMIC_GFP); ++ int len, index = 1; ++ struct toi_module_header toi_module_header; ++ ++ if (!buffer) { ++ printk(KERN_INFO "Failed to allocate a buffer for saving " ++ "module configuration info.\n"); ++ return -ENOMEM; ++ } ++ ++ /* ++ * We have to know which data goes with which module, so we at ++ * least write a length of zero for a module. Note that we are ++ * also assuming every module's config data takes <= PAGE_SIZE. 
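++ *
++ * The resulting header stream therefore looks like, for each enabled
++ * module in registration order:
++ *
++ *   struct toi_module_header   (enabled, type, index, name)
++ *   int len                    (0 if the module has no config data)
++ *   char[len]                  the module's config data, if any
++ *
++ * and is terminated by a toi_module_header with name[0] == '\0', which
++ * is the condition read_module_configs() loops on when reloading.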
++ */ ++ ++ /* For each module (in registration order) */ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || !this_module->storage_needed || ++ (this_module->type == WRITER_MODULE && ++ toiActiveAllocator != this_module)) ++ continue; ++ ++ /* Get the data from the module */ ++ len = 0; ++ if (this_module->save_config_info) ++ len = this_module->save_config_info(buffer); ++ ++ /* Save the details of the module */ ++ toi_module_header.enabled = this_module->enabled; ++ toi_module_header.type = this_module->type; ++ toi_module_header.index = index++; ++ strncpy(toi_module_header.name, this_module->name, ++ sizeof(toi_module_header.name)); ++ toiActiveAllocator->rw_header_chunk(WRITE, ++ this_module, ++ (char *) &toi_module_header, ++ sizeof(toi_module_header)); ++ ++ /* Save the size of the data and any data returned */ ++ toiActiveAllocator->rw_header_chunk(WRITE, ++ this_module, ++ (char *) &len, sizeof(int)); ++ if (len) ++ toiActiveAllocator->rw_header_chunk( ++ WRITE, this_module, buffer, len); ++ } ++ ++ /* Write a blank header to terminate the list */ ++ toi_module_header.name[0] = '\0'; ++ toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ (char *) &toi_module_header, sizeof(toi_module_header)); ++ ++ toi_free_page(22, (unsigned long) buffer); ++ return 0; ++} ++ ++/** ++ * read_one_module_config - read and configure one module ++ * ++ * Read the configuration for one module, and configure the module ++ * to match if it is loaded. ++ * ++ * Returns: Int ++ * Zero on success, Error value otherwise. ++ **/ ++static int read_one_module_config(struct toi_module_header *header) ++{ ++ struct toi_module_ops *this_module; ++ int result, len; ++ char *buffer; ++ ++ /* Find the module */ ++ this_module = toi_find_module_given_name(header->name); ++ ++ if (!this_module) { ++ if (header->enabled) { ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ "It looks like we need module %s for reading " ++ "the image but it hasn't been registered.\n", ++ header->name); ++ if (!(test_toi_state(TOI_CONTINUE_REQ))) ++ return -EINVAL; ++ } else ++ printk(KERN_INFO "Module %s configuration data found, " ++ "but the module hasn't registered. Looks like " ++ "it was disabled, so we're ignoring its data.", ++ header->name); ++ } ++ ++ /* Get the length of the data (if any) */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &len, ++ sizeof(int)); ++ if (result) { ++ printk(KERN_ERR "Failed to read the length of the module %s's" ++ " configuration data.\n", ++ header->name); ++ return -EINVAL; ++ } ++ ++ /* Read any data and pass to the module (if we found one) */ ++ if (!len) ++ return 0; ++ ++ buffer = (char *) toi_get_zeroed_page(23, TOI_ATOMIC_GFP); ++ ++ if (!buffer) { ++ printk(KERN_ERR "Failed to allocate a buffer for reloading " ++ "module configuration info.\n"); ++ return -ENOMEM; ++ } ++ ++ toiActiveAllocator->rw_header_chunk(READ, NULL, buffer, len); ++ ++ if (!this_module) ++ goto out; ++ ++ if (!this_module->save_config_info) ++ printk(KERN_ERR "Huh? Module %s appears to have a " ++ "save_config_info, but not a load_config_info " ++ "function!\n", this_module->name); ++ else ++ this_module->load_config_info(buffer, len); ++ ++ /* ++ * Now move this module to the tail of its lists. This will put it in ++ * order. Any new modules will end up at the top of the lists. They ++ * should have been set to disabled when loaded (people will ++ * normally not edit an initrd to load a new module and then hibernate ++ * without using it!). 
++ */ ++ ++ toi_move_module_tail(this_module); ++ ++ this_module->enabled = header->enabled; ++ ++out: ++ toi_free_page(23, (unsigned long) buffer); ++ return 0; ++} ++ ++/** ++ * read_module_configs - reload module configurations from the image header. ++ * ++ * Returns: Int ++ * Zero on success or an error code. ++ **/ ++static int read_module_configs(void) ++{ ++ int result = 0; ++ struct toi_module_header toi_module_header; ++ struct toi_module_ops *this_module; ++ ++ /* All modules are initially disabled. That way, if we have a module ++ * loaded now that wasn't loaded when we hibernated, it won't be used ++ * in trying to read the data. ++ */ ++ list_for_each_entry(this_module, &toi_modules, module_list) ++ this_module->enabled = 0; ++ ++ /* Get the first module header */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, ++ (char *) &toi_module_header, ++ sizeof(toi_module_header)); ++ if (result) { ++ printk(KERN_ERR "Failed to read the next module header.\n"); ++ return -EINVAL; ++ } ++ ++ /* For each module (in registration order) */ ++ while (toi_module_header.name[0]) { ++ result = read_one_module_config(&toi_module_header); ++ ++ if (result) ++ return -EINVAL; ++ ++ /* Get the next module header */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, ++ (char *) &toi_module_header, ++ sizeof(toi_module_header)); ++ ++ if (result) { ++ printk(KERN_ERR "Failed to read the next module " ++ "header.\n"); ++ return -EINVAL; ++ } ++ } ++ ++ return 0; ++} ++ ++static inline int save_fs_info(struct fs_info *fs, struct block_device *bdev) ++{ ++ return (!fs || IS_ERR(fs) || !fs->last_mount_size) ? 0 : 1; ++} ++ ++int fs_info_space_needed(void) ++{ ++ const struct super_block *sb; ++ int result = sizeof(int); ++ ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ struct fs_info *fs; ++ ++ if (!sb->s_bdev) ++ continue; ++ ++ fs = fs_info_from_block_dev(sb->s_bdev); ++ if (save_fs_info(fs, sb->s_bdev)) ++ result += 16 + sizeof(dev_t) + sizeof(int) + ++ fs->last_mount_size; ++ free_fs_info(fs); ++ } ++ return result; ++} ++ ++static int fs_info_num_to_save(void) ++{ ++ const struct super_block *sb; ++ int to_save = 0; ++ ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ struct fs_info *fs; ++ ++ if (!sb->s_bdev) ++ continue; ++ ++ fs = fs_info_from_block_dev(sb->s_bdev); ++ if (save_fs_info(fs, sb->s_bdev)) ++ to_save++; ++ free_fs_info(fs); ++ } ++ ++ return to_save; ++} ++ ++static int fs_info_save(void) ++{ ++ const struct super_block *sb; ++ int to_save = fs_info_num_to_save(); ++ ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, (char *) &to_save, ++ sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to write num fs_info" ++ " to save."); ++ return -EIO; ++ } ++ ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ struct fs_info *fs; ++ ++ if (!sb->s_bdev) ++ continue; ++ ++ fs = fs_info_from_block_dev(sb->s_bdev); ++ if (save_fs_info(fs, sb->s_bdev)) { ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ &fs->uuid[0], 16)) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write uuid."); ++ return -EIO; ++ } ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ (char *) &fs->dev_t, sizeof(dev_t))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write dev_t."); ++ return -EIO; ++ } ++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ (char *) &fs->last_mount_size, sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write last mount length."); ++ return -EIO; ++ } ++ if 
(toiActiveAllocator->rw_header_chunk(WRITE, NULL, ++ fs->last_mount, fs->last_mount_size)) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to " ++ "write uuid."); ++ return -EIO; ++ } ++ } ++ free_fs_info(fs); ++ } ++ return 0; ++} ++ ++static int fs_info_load_and_check_one(void) ++{ ++ char uuid[16], *last_mount; ++ int result = 0, ln; ++ dev_t dev_t; ++ struct block_device *dev; ++ struct fs_info *fs_info, seek; ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, uuid, 16)) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to read uuid."); ++ return -EIO; ++ } ++ ++ read_if_version(3, dev_t, "uuid dev_t field", return -EIO); ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &ln, ++ sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, ++ "Failed to read last mount size."); ++ return -EIO; ++ } ++ ++ last_mount = kzalloc(ln, GFP_KERNEL); ++ ++ if (!last_mount) ++ return -ENOMEM; ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, last_mount, ln)) { ++ abort_hibernate(TOI_FAILED_IO, ++ "Failed to read last mount timestamp."); ++ result = -EIO; ++ goto out_lmt; ++ } ++ ++ strncpy((char *) &seek.uuid, uuid, 16); ++ seek.dev_t = dev_t; ++ seek.last_mount_size = ln; ++ seek.last_mount = last_mount; ++ dev_t = blk_lookup_fs_info(&seek); ++ if (!dev_t) ++ goto out_lmt; ++ ++ dev = toi_open_by_devnum(dev_t); ++ ++ fs_info = fs_info_from_block_dev(dev); ++ if (fs_info && !IS_ERR(fs_info)) { ++ if (ln != fs_info->last_mount_size) { ++ printk(KERN_EMERG "Found matching uuid but last mount " ++ "time lengths differ?! " ++ "(%d vs %d).\n", ln, ++ fs_info->last_mount_size); ++ result = -EINVAL; ++ } else { ++ char buf[BDEVNAME_SIZE]; ++ result = !!memcmp(fs_info->last_mount, last_mount, ln); ++ if (result) ++ printk(KERN_EMERG "Last mount time for %s has " ++ "changed!\n", bdevname(dev, buf)); ++ } ++ } ++ toi_close_bdev(dev); ++ free_fs_info(fs_info); ++out_lmt: ++ kfree(last_mount); ++ return result; ++} ++ ++static int fs_info_load_and_check(void) ++{ ++ int to_do, result = 0; ++ ++ if (toiActiveAllocator->rw_header_chunk(READ, NULL, (char *) &to_do, ++ sizeof(int))) { ++ abort_hibernate(TOI_FAILED_IO, "Failed to read num fs_info " ++ "to load."); ++ return -EIO; ++ } ++ ++ while(to_do--) ++ result |= fs_info_load_and_check_one(); ++ ++ return result; ++} ++ ++/** ++ * write_image_header - write the image header after write the image proper ++ * ++ * Returns: Int ++ * Zero on success, error value otherwise. 
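++ *
++ * After write_header_init(), the header chunks are written in this
++ * order (mirrored by __read_pageset1() at resume time):
++ *
++ *   struct toi_header        filled by fill_toi_header()
++ *   toi_max_workers          read back only if header version >= 1
++ *   filesystem info          fs_info_save()
++ *   module configurations    write_module_configs()
++ *   pageset1 bitmap          memory_bm_write(pageset1_map, ...)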
++ **/
++int write_image_header(void)
++{
++ int ret;
++ int total = pagedir1.size + pagedir2.size+2;
++ char *header_buffer = NULL;
++
++ /* Now prepare to write the header */
++ ret = toiActiveAllocator->write_header_init();
++ if (ret) {
++ abort_hibernate(TOI_FAILED_MODULE_INIT,
++ "Active allocator's write_header_init"
++ " function failed.");
++ goto write_image_header_abort;
++ }
++
++ /* Get a buffer */
++ header_buffer = (char *) toi_get_zeroed_page(24, TOI_ATOMIC_GFP);
++ if (!header_buffer) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Out of memory when trying to get page for header!");
++ goto write_image_header_abort;
++ }
++
++ /* Write hibernate header */
++ if (fill_toi_header((struct toi_header *) header_buffer)) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Failure to fill header information!");
++ goto write_image_header_abort;
++ }
++
++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
++ header_buffer, sizeof(struct toi_header))) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Failure to write header info.");
++ goto write_image_header_abort;
++ }
++
++ if (toiActiveAllocator->rw_header_chunk(WRITE, NULL,
++ (char *) &toi_max_workers, sizeof(toi_max_workers))) {
++ abort_hibernate(TOI_OUT_OF_MEMORY,
++ "Failure to write the number of workers to use.");
++ goto write_image_header_abort;
++ }
++
++ /* Write filesystem info */
++ if (fs_info_save())
++ goto write_image_header_abort;
++
++ /* Write module configurations */
++ ret = write_module_configs();
++ if (ret) {
++ abort_hibernate(TOI_FAILED_IO,
++ "Failed to write module configs.");
++ goto write_image_header_abort;
++ }
++
++ if (memory_bm_write(pageset1_map,
++ toiActiveAllocator->rw_header_chunk)) {
++ abort_hibernate(TOI_FAILED_IO,
++ "Failed to write bitmaps.");
++ goto write_image_header_abort;
++ }
++
++ /* Flush data and let allocator cleanup */
++ if (toiActiveAllocator->write_header_cleanup()) {
++ abort_hibernate(TOI_FAILED_IO,
++ "Failed to cleanup writing header.");
++ goto write_image_header_abort_no_cleanup;
++ }
++
++ if (test_result_state(TOI_ABORTED))
++ goto write_image_header_abort_no_cleanup;
++
++ toi_update_status(total, total, NULL);
++
++out:
++ if (header_buffer)
++ toi_free_page(24, (unsigned long) header_buffer);
++ return ret;
++
++write_image_header_abort:
++ toiActiveAllocator->write_header_cleanup();
++write_image_header_abort_no_cleanup:
++ ret = -1;
++ goto out;
++}
++
++/**
++ * sanity_check - check the header
++ * @sh: the header which was saved at hibernate time.
++ *
++ * Perform a few checks, seeking to ensure that the kernel being
++ * booted matches the one hibernated. They need to match so we can
++ * be _sure_ things will work. It is not absolutely impossible for
++ * resuming from a different kernel to work, just not assured.
++ **/ ++static char *sanity_check(struct toi_header *sh) ++{ ++ char *reason = check_image_kernel((struct swsusp_info *) sh); ++ ++ if (reason) ++ return reason; ++ ++ if (!test_action_state(TOI_IGNORE_ROOTFS)) { ++ const struct super_block *sb; ++ list_for_each_entry(sb, &super_blocks, s_list) { ++ if ((!(sb->s_flags & MS_RDONLY)) && ++ (sb->s_type->fs_flags & FS_REQUIRES_DEV)) ++ return "Device backed fs has been mounted " ++ "rw prior to resume or initrd/ramfs " ++ "is mounted rw."; ++ } ++ } ++ ++ return NULL; ++} ++ ++static DECLARE_WAIT_QUEUE_HEAD(freeze_wait); ++ ++#define FREEZE_IN_PROGRESS (~0) ++ ++static int freeze_result; ++ ++static void do_freeze(struct work_struct *dummy) ++{ ++ freeze_result = freeze_processes(); ++ wake_up(&freeze_wait); ++ trap_non_toi_io = 1; ++} ++ ++static DECLARE_WORK(freeze_work, do_freeze); ++ ++/** ++ * __read_pageset1 - test for the existence of an image and attempt to load it ++ * ++ * Returns: Int ++ * Zero if image found and pageset1 successfully loaded. ++ * Error if no image found or loaded. ++ **/ ++static int __read_pageset1(void) ++{ ++ int i, result = 0; ++ char *header_buffer = (char *) toi_get_zeroed_page(25, TOI_ATOMIC_GFP), ++ *sanity_error = NULL; ++ struct toi_header *toi_header; ++ ++ if (!header_buffer) { ++ printk(KERN_INFO "Unable to allocate a page for reading the " ++ "signature.\n"); ++ return -ENOMEM; ++ } ++ ++ /* Check for an image */ ++ result = toiActiveAllocator->image_exists(1); ++ if (result == 3) { ++ result = -ENODATA; ++ toi_early_boot_message(1, 0, "The signature from an older " ++ "version of TuxOnIce has been detected."); ++ goto out_remove_image; ++ } ++ ++ if (result != 1) { ++ result = -ENODATA; ++ noresume_reset_modules(); ++ printk(KERN_INFO "TuxOnIce: No image found.\n"); ++ goto out; ++ } ++ ++ /* ++ * Prepare the active allocator for reading the image header. The ++ * activate allocator might read its own configuration. ++ * ++ * NB: This call may never return because there might be a signature ++ * for a different image such that we warn the user and they choose ++ * to reboot. (If the device ids look erroneous (2.4 vs 2.6) or the ++ * location of the image might be unavailable if it was stored on a ++ * network connection). ++ */ ++ ++ result = toiActiveAllocator->read_header_init(); ++ if (result) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise, reading the " ++ "image header.\n"); ++ goto out_remove_image; ++ } ++ ++ /* Check for noresume command line option */ ++ if (test_toi_state(TOI_NORESUME_SPECIFIED)) { ++ printk(KERN_INFO "TuxOnIce: Noresume on command line. 
Removed " ++ "image.\n"); ++ goto out_remove_image; ++ } ++ ++ /* Check whether we've resumed before */ ++ if (test_toi_state(TOI_RESUMED_BEFORE)) { ++ toi_early_boot_message(1, 0, NULL); ++ if (!(test_toi_state(TOI_CONTINUE_REQ))) { ++ printk(KERN_INFO "TuxOnIce: Tried to resume before: " ++ "Invalidated image.\n"); ++ goto out_remove_image; ++ } ++ } ++ ++ clear_toi_state(TOI_CONTINUE_REQ); ++ ++ toi_image_header_version = toiActiveAllocator->get_header_version(); ++ ++ if (unlikely(toi_image_header_version > TOI_HEADER_VERSION)) { ++ toi_early_boot_message(1, 0, image_version_error); ++ if (!(test_toi_state(TOI_CONTINUE_REQ))) { ++ printk(KERN_INFO "TuxOnIce: Header version too new: " ++ "Invalidated image.\n"); ++ goto out_remove_image; ++ } ++ } ++ ++ /* Read hibernate header */ ++ result = toiActiveAllocator->rw_header_chunk(READ, NULL, ++ header_buffer, sizeof(struct toi_header)); ++ if (result < 0) { ++ printk(KERN_ERR "TuxOnIce: Failed to read the image " ++ "signature.\n"); ++ goto out_remove_image; ++ } ++ ++ toi_header = (struct toi_header *) header_buffer; ++ ++ /* ++ * NB: This call may also result in a reboot rather than returning. ++ */ ++ ++ sanity_error = sanity_check(toi_header); ++ if (sanity_error) { ++ toi_early_boot_message(1, TOI_CONTINUE_REQ, ++ sanity_error); ++ printk(KERN_INFO "TuxOnIce: Sanity check failed.\n"); ++ goto out_remove_image; ++ } ++ ++ /* ++ * We have an image and it looks like it will load okay. ++ * ++ * Get metadata from header. Don't override commandline parameters. ++ * ++ * We don't need to save the image size limit because it's not used ++ * during resume and will be restored with the image anyway. ++ */ ++ ++ memcpy((char *) &pagedir1, ++ (char *) &toi_header->pagedir, sizeof(pagedir1)); ++ toi_result = toi_header->param0; ++ if (!toi_bkd.toi_debug_state) { ++ toi_bkd.toi_action = ++ (toi_header->param1 & ~toi_bootflags_mask) | ++ (toi_bkd.toi_action & toi_bootflags_mask); ++ toi_bkd.toi_debug_state = toi_header->param2; ++ toi_bkd.toi_default_console_level = toi_header->param3; ++ } ++ clear_toi_state(TOI_IGNORE_LOGLEVEL); ++ pagedir2.size = toi_header->pageset_2_size; ++ for (i = 0; i < 4; i++) ++ toi_bkd.toi_io_time[i/2][i%2] = ++ toi_header->io_time[i/2][i%2]; ++ ++ set_toi_state(TOI_BOOT_KERNEL); ++ boot_kernel_data_buffer = toi_header->bkd; ++ ++ read_if_version(1, toi_max_workers, "TuxOnIce max workers", ++ goto out_remove_image); ++ ++ /* Read filesystem info */ ++ if (fs_info_load_and_check()) { ++ printk(KERN_EMERG "TuxOnIce: File system mount time checks " ++ "failed. 
Refusing to corrupt your filesystems!\n"); ++ goto out_remove_image; ++ } ++ ++ /* Read module configurations */ ++ result = read_module_configs(); ++ if (result) { ++ pagedir1.size = 0; ++ pagedir2.size = 0; ++ printk(KERN_INFO "TuxOnIce: Failed to read TuxOnIce module " ++ "configurations.\n"); ++ clear_action_state(TOI_KEEP_IMAGE); ++ goto out_remove_image; ++ } ++ ++ toi_prepare_console(); ++ ++ set_toi_state(TOI_NOW_RESUMING); ++ ++ if (!test_action_state(TOI_LATE_CPU_HOTPLUG)) { ++ toi_prepare_status(DONT_CLEAR_BAR, "Disable nonboot cpus."); ++ if (disable_nonboot_cpus()) { ++ set_abort_result(TOI_CPU_HOTPLUG_FAILED); ++ goto out_reset_console; ++ } ++ } ++ ++ result = pm_notifier_call_chain(PM_RESTORE_PREPARE); ++ if (result) ++ goto out_notifier_call_chain;; ++ ++ if (usermodehelper_disable()) ++ goto out_enable_nonboot_cpus; ++ ++ current->flags |= PF_NOFREEZE; ++ freeze_result = FREEZE_IN_PROGRESS; ++ ++ schedule_work_on(cpumask_first(cpu_online_mask), &freeze_work); ++ ++ toi_cond_pause(1, "About to read original pageset1 locations."); ++ ++ /* ++ * See _toi_rw_header_chunk in tuxonice_bio.c: ++ * Initialize pageset1_map by reading the map from the image. ++ */ ++ if (memory_bm_read(pageset1_map, toiActiveAllocator->rw_header_chunk)) ++ goto out_thaw; ++ ++ /* ++ * See toi_rw_cleanup in tuxonice_bio.c: ++ * Clean up after reading the header. ++ */ ++ result = toiActiveAllocator->read_header_cleanup(); ++ if (result) { ++ printk(KERN_ERR "TuxOnIce: Failed to cleanup after reading the " ++ "image header.\n"); ++ goto out_thaw; ++ } ++ ++ toi_cond_pause(1, "About to read pagedir."); ++ ++ /* ++ * Get the addresses of pages into which we will load the kernel to ++ * be copied back and check if they conflict with the ones we are using. ++ */ ++ if (toi_get_pageset1_load_addresses()) { ++ printk(KERN_INFO "TuxOnIce: Failed to get load addresses for " ++ "pageset1.\n"); ++ goto out_thaw; ++ } ++ ++ /* Read the original kernel back */ ++ toi_cond_pause(1, "About to read pageset 1."); ++ ++ /* Given the pagemap, read back the data from disk */ ++ if (read_pageset(&pagedir1, 0)) { ++ toi_prepare_status(DONT_CLEAR_BAR, "Failed to read pageset 1."); ++ result = -EIO; ++ goto out_thaw; ++ } ++ ++ toi_cond_pause(1, "About to restore original kernel."); ++ result = 0; ++ ++ if (!test_action_state(TOI_KEEP_IMAGE) && ++ toiActiveAllocator->mark_resume_attempted) ++ toiActiveAllocator->mark_resume_attempted(1); ++ ++ wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); ++out: ++ current->flags &= ~PF_NOFREEZE; ++ toi_free_page(25, (unsigned long) header_buffer); ++ return result; ++ ++out_thaw: ++ wait_event(freeze_wait, freeze_result != FREEZE_IN_PROGRESS); ++ trap_non_toi_io = 0; ++ thaw_processes(); ++ usermodehelper_enable(); ++out_enable_nonboot_cpus: ++ enable_nonboot_cpus(); ++out_notifier_call_chain: ++ pm_notifier_call_chain(PM_POST_RESTORE); ++out_reset_console: ++ toi_cleanup_console(); ++out_remove_image: ++ result = -EINVAL; ++ if (!test_action_state(TOI_KEEP_IMAGE)) ++ toiActiveAllocator->remove_image(); ++ toiActiveAllocator->read_header_cleanup(); ++ noresume_reset_modules(); ++ goto out; ++} ++ ++/** ++ * read_pageset1 - highlevel function to read the saved pages ++ * ++ * Attempt to read the header and pageset1 of a hibernate image. ++ * Handle the outcome, complaining where appropriate. 
++ **/ ++int read_pageset1(void) ++{ ++ int error; ++ ++ error = __read_pageset1(); ++ ++ if (error && error != -ENODATA && error != -EINVAL && ++ !test_result_state(TOI_ABORTED)) ++ abort_hibernate(TOI_IMAGE_ERROR, ++ "TuxOnIce: Error %d resuming\n", error); ++ ++ return error; ++} ++ ++/** ++ * get_have_image_data - check the image header ++ **/ ++static char *get_have_image_data(void) ++{ ++ char *output_buffer = (char *) toi_get_zeroed_page(26, TOI_ATOMIC_GFP); ++ struct toi_header *toi_header; ++ ++ if (!output_buffer) { ++ printk(KERN_INFO "Output buffer null.\n"); ++ return NULL; ++ } ++ ++ /* Check for an image */ ++ if (!toiActiveAllocator->image_exists(1) || ++ toiActiveAllocator->read_header_init() || ++ toiActiveAllocator->rw_header_chunk(READ, NULL, ++ output_buffer, sizeof(struct toi_header))) { ++ sprintf(output_buffer, "0\n"); ++ /* ++ * From an initrd/ramfs, catting have_image and ++ * getting a result of 0 is sufficient. ++ */ ++ clear_toi_state(TOI_BOOT_TIME); ++ goto out; ++ } ++ ++ toi_header = (struct toi_header *) output_buffer; ++ ++ sprintf(output_buffer, "1\n%s\n%s\n", ++ toi_header->uts.machine, ++ toi_header->uts.version); ++ ++ /* Check whether we've resumed before */ ++ if (test_toi_state(TOI_RESUMED_BEFORE)) ++ strcat(output_buffer, "Resumed before.\n"); ++ ++out: ++ noresume_reset_modules(); ++ return output_buffer; ++} ++ ++/** ++ * read_pageset2 - read second part of the image ++ * @overwrittenpagesonly: Read only pages which would have been ++ * verwritten by pageset1? ++ * ++ * Read in part or all of pageset2 of an image, depending upon ++ * whether we are hibernating and have only overwritten a portion ++ * with pageset1 pages, or are resuming and need to read them ++ * all. ++ * ++ * Returns: Int ++ * Zero if no error, otherwise the error value. ++ **/ ++int read_pageset2(int overwrittenpagesonly) ++{ ++ int result = 0; ++ ++ if (!pagedir2.size) ++ return 0; ++ ++ result = read_pageset(&pagedir2, overwrittenpagesonly); ++ ++ toi_cond_pause(1, "Pagedir 2 read."); ++ ++ return result; ++} ++ ++/** ++ * image_exists_read - has an image been found? ++ * @page: Output buffer ++ * ++ * Store 0 or 1 in page, depending on whether an image is found. ++ * Incoming buffer is PAGE_SIZE and result is guaranteed ++ * to be far less than that, so we don't worry about ++ * overflow. 
++ **/ ++int image_exists_read(const char *page, int count) ++{ ++ int len = 0; ++ char *result; ++ ++ if (toi_activate_storage(0)) ++ return count; ++ ++ if (!test_toi_state(TOI_RESUME_DEVICE_OK)) ++ toi_attempt_to_parse_resume_device(0); ++ ++ if (!toiActiveAllocator) { ++ len = sprintf((char *) page, "-1\n"); ++ } else { ++ result = get_have_image_data(); ++ if (result) { ++ len = sprintf((char *) page, "%s", result); ++ toi_free_page(26, (unsigned long) result); ++ } ++ } ++ ++ toi_deactivate_storage(0); ++ ++ return len; ++} ++ ++/** ++ * image_exists_write - invalidate an image if one exists ++ **/ ++int image_exists_write(const char *buffer, int count) ++{ ++ if (toi_activate_storage(0)) ++ return count; ++ ++ if (toiActiveAllocator && toiActiveAllocator->image_exists(1)) ++ toiActiveAllocator->remove_image(); ++ ++ toi_deactivate_storage(0); ++ ++ clear_result_state(TOI_KEPT_IMAGE); ++ ++ return count; ++} +diff --git a/kernel/power/tuxonice_io.h b/kernel/power/tuxonice_io.h +new file mode 100644 +index 0000000..fe37713 +--- /dev/null ++++ b/kernel/power/tuxonice_io.h +@@ -0,0 +1,74 @@ ++/* ++ * kernel/power/tuxonice_io.h ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains high level IO routines for hibernating. ++ * ++ */ ++ ++#include ++#include "tuxonice_pagedir.h" ++ ++/* Non-module data saved in our image header */ ++struct toi_header { ++ /* ++ * Mirror struct swsusp_info, but without ++ * the page aligned attribute ++ */ ++ struct new_utsname uts; ++ u32 version_code; ++ unsigned long num_physpages; ++ int cpus; ++ unsigned long image_pages; ++ unsigned long pages; ++ unsigned long size; ++ ++ /* Our own data */ ++ unsigned long orig_mem_free; ++ int page_size; ++ int pageset_2_size; ++ int param0; ++ int param1; ++ int param2; ++ int param3; ++ int progress0; ++ int progress1; ++ int progress2; ++ int progress3; ++ int io_time[2][2]; ++ struct pagedir pagedir; ++ dev_t root_fs; ++ unsigned long bkd; /* Boot kernel data locn */ ++}; ++ ++extern int write_pageset(struct pagedir *pagedir); ++extern int write_image_header(void); ++extern int read_pageset1(void); ++extern int read_pageset2(int overwrittenpagesonly); ++ ++extern int toi_attempt_to_parse_resume_device(int quiet); ++extern void attempt_to_parse_resume_device2(void); ++extern void attempt_to_parse_alt_resume_param(void); ++int image_exists_read(const char *page, int count); ++int image_exists_write(const char *buffer, int count); ++extern void save_restore_alt_param(int replace, int quiet); ++extern atomic_t toi_io_workers; ++ ++/* Args to save_restore_alt_param */ ++#define RESTORE 0 ++#define SAVE 1 ++ ++#define NOQUIET 0 ++#define QUIET 1 ++ ++extern dev_t name_to_dev_t(char *line); ++ ++extern wait_queue_head_t toi_io_queue_flusher; ++extern int toi_bio_queue_flusher_should_finish; ++ ++int fs_info_space_needed(void); ++ ++extern int toi_max_workers; +diff --git a/kernel/power/tuxonice_modules.c b/kernel/power/tuxonice_modules.c +new file mode 100644 +index 0000000..4cc24a9 +--- /dev/null ++++ b/kernel/power/tuxonice_modules.c +@@ -0,0 +1,522 @@ ++/* ++ * kernel/power/tuxonice_modules.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ */ ++ ++#include ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_ui.h" ++ ++LIST_HEAD(toi_filters); ++LIST_HEAD(toiAllocators); ++ ++LIST_HEAD(toi_modules); ++EXPORT_SYMBOL_GPL(toi_modules); ++ 
++struct toi_module_ops *toiActiveAllocator; ++EXPORT_SYMBOL_GPL(toiActiveAllocator); ++ ++static int toi_num_filters; ++int toiNumAllocators, toi_num_modules; ++ ++/* ++ * toi_header_storage_for_modules ++ * ++ * Returns the amount of space needed to store configuration ++ * data needed by the modules prior to copying back the original ++ * kernel. We can exclude data for pageset2 because it will be ++ * available anyway once the kernel is copied back. ++ */ ++long toi_header_storage_for_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ int bytes = 0; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ (this_module->type == WRITER_MODULE && ++ toiActiveAllocator != this_module)) ++ continue; ++ if (this_module->storage_needed) { ++ int this = this_module->storage_needed() + ++ sizeof(struct toi_module_header) + ++ sizeof(int); ++ this_module->header_requested = this; ++ bytes += this; ++ } ++ } ++ ++ /* One more for the empty terminator */ ++ return bytes + sizeof(struct toi_module_header); ++} ++ ++void print_toi_header_storage_for_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ int bytes = 0; ++ ++ printk(KERN_DEBUG "Header storage:\n"); ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || ++ (this_module->type == WRITER_MODULE && ++ toiActiveAllocator != this_module)) ++ continue; ++ if (this_module->storage_needed) { ++ int this = this_module->storage_needed() + ++ sizeof(struct toi_module_header) + ++ sizeof(int); ++ this_module->header_requested = this; ++ bytes += this; ++ printk(KERN_DEBUG "+ %16s : %-4d/%d.\n", ++ this_module->name, ++ this_module->header_used, this); ++ } ++ } ++ ++ printk(KERN_DEBUG "+ empty terminator : %zu.\n", ++ sizeof(struct toi_module_header)); ++ printk(KERN_DEBUG " ====\n"); ++ printk(KERN_DEBUG " %zu\n", ++ bytes + sizeof(struct toi_module_header)); ++} ++EXPORT_SYMBOL_GPL(print_toi_header_storage_for_modules); ++ ++/* ++ * toi_memory_for_modules ++ * ++ * Returns the amount of memory requested by modules for ++ * doing their work during the cycle. ++ */ ++ ++long toi_memory_for_modules(int print_parts) ++{ ++ long bytes = 0, result; ++ struct toi_module_ops *this_module; ++ ++ if (print_parts) ++ printk(KERN_INFO "Memory for modules:\n===================\n"); ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ int this; ++ if (!this_module->enabled) ++ continue; ++ if (this_module->memory_needed) { ++ this = this_module->memory_needed(); ++ if (print_parts) ++ printk(KERN_INFO "%10d bytes (%5ld pages) for " ++ "module '%s'.\n", this, ++ DIV_ROUND_UP(this, PAGE_SIZE), ++ this_module->name); ++ bytes += this; ++ } ++ } ++ ++ result = DIV_ROUND_UP(bytes, PAGE_SIZE); ++ if (print_parts) ++ printk(KERN_INFO " => %ld bytes, %ld pages.\n", bytes, result); ++ ++ return result; ++} ++ ++/* ++ * toi_expected_compression_ratio ++ * ++ * Returns the compression ratio expected when saving the image. 
++ */ ++ ++int toi_expected_compression_ratio(void) ++{ ++ int ratio = 100; ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->expected_compression) ++ ratio = ratio * this_module->expected_compression() ++ / 100; ++ } ++ ++ return ratio; ++} ++ ++/* toi_find_module_given_dir ++ * Functionality : Return a module (if found), given a pointer ++ * to its directory name ++ */ ++ ++static struct toi_module_ops *toi_find_module_given_dir(char *name) ++{ ++ struct toi_module_ops *this_module, *found_module = NULL; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!strcmp(name, this_module->directory)) { ++ found_module = this_module; ++ break; ++ } ++ } ++ ++ return found_module; ++} ++ ++/* toi_find_module_given_name ++ * Functionality : Return a module (if found), given a pointer ++ * to its name ++ */ ++ ++struct toi_module_ops *toi_find_module_given_name(char *name) ++{ ++ struct toi_module_ops *this_module, *found_module = NULL; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!strcmp(name, this_module->name)) { ++ found_module = this_module; ++ break; ++ } ++ } ++ ++ return found_module; ++} ++ ++/* ++ * toi_print_module_debug_info ++ * Functionality : Get debugging info from modules into a buffer. ++ */ ++int toi_print_module_debug_info(char *buffer, int buffer_size) ++{ ++ struct toi_module_ops *this_module; ++ int len = 0; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled) ++ continue; ++ if (this_module->print_debug_info) { ++ int result; ++ result = this_module->print_debug_info(buffer + len, ++ buffer_size - len); ++ len += result; ++ } ++ } ++ ++ /* Ensure null terminated */ ++ buffer[buffer_size] = 0; ++ ++ return len; ++} ++ ++/* ++ * toi_register_module ++ * ++ * Register a module. ++ */ ++int toi_register_module(struct toi_module_ops *module) ++{ ++ int i; ++ struct kobject *kobj; ++ ++ module->enabled = 1; ++ ++ if (toi_find_module_given_name(module->name)) { ++ printk(KERN_INFO "TuxOnIce: Trying to load module %s," ++ " which is already registered.\n", ++ module->name); ++ return -EBUSY; ++ } ++ ++ switch (module->type) { ++ case FILTER_MODULE: ++ list_add_tail(&module->type_list, &toi_filters); ++ toi_num_filters++; ++ break; ++ case WRITER_MODULE: ++ list_add_tail(&module->type_list, &toiAllocators); ++ toiNumAllocators++; ++ break; ++ case MISC_MODULE: ++ case MISC_HIDDEN_MODULE: ++ case BIO_ALLOCATOR_MODULE: ++ break; ++ default: ++ printk(KERN_ERR "Hmmm. Module '%s' has an invalid type." ++ " It has been ignored.\n", module->name); ++ return -EINVAL; ++ } ++ list_add_tail(&module->module_list, &toi_modules); ++ toi_num_modules++; ++ ++ if ((!module->directory && !module->shared_directory) || ++ !module->sysfs_data || !module->num_sysfs_entries) ++ return 0; ++ ++ /* ++ * Modules may share a directory, but those with shared_dir ++ * set must be loaded (via symbol dependencies) after parents ++ * and unloaded beforehand. 
++ */ ++ if (module->shared_directory) { ++ struct toi_module_ops *shared = ++ toi_find_module_given_dir(module->shared_directory); ++ if (!shared) { ++ printk(KERN_ERR "TuxOnIce: Module %s wants to share " ++ "%s's directory but %s isn't loaded.\n", ++ module->name, module->shared_directory, ++ module->shared_directory); ++ toi_unregister_module(module); ++ return -ENODEV; ++ } ++ kobj = shared->dir_kobj; ++ } else { ++ if (!strncmp(module->directory, "[ROOT]", 6)) ++ kobj = tuxonice_kobj; ++ else ++ kobj = make_toi_sysdir(module->directory); ++ } ++ module->dir_kobj = kobj; ++ for (i = 0; i < module->num_sysfs_entries; i++) { ++ int result = toi_register_sysfs_file(kobj, ++ &module->sysfs_data[i]); ++ if (result) ++ return result; ++ } ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_register_module); ++ ++/* ++ * toi_unregister_module ++ * ++ * Remove a module. ++ */ ++void toi_unregister_module(struct toi_module_ops *module) ++{ ++ int i; ++ ++ if (module->dir_kobj) ++ for (i = 0; i < module->num_sysfs_entries; i++) ++ toi_unregister_sysfs_file(module->dir_kobj, ++ &module->sysfs_data[i]); ++ ++ if (!module->shared_directory && module->directory && ++ strncmp(module->directory, "[ROOT]", 6)) ++ remove_toi_sysdir(module->dir_kobj); ++ ++ switch (module->type) { ++ case FILTER_MODULE: ++ list_del(&module->type_list); ++ toi_num_filters--; ++ break; ++ case WRITER_MODULE: ++ list_del(&module->type_list); ++ toiNumAllocators--; ++ if (toiActiveAllocator == module) { ++ toiActiveAllocator = NULL; ++ clear_toi_state(TOI_CAN_RESUME); ++ clear_toi_state(TOI_CAN_HIBERNATE); ++ } ++ break; ++ case MISC_MODULE: ++ case MISC_HIDDEN_MODULE: ++ case BIO_ALLOCATOR_MODULE: ++ break; ++ default: ++ printk(KERN_ERR "Module '%s' has an invalid type." ++ " It has been ignored.\n", module->name); ++ return; ++ } ++ list_del(&module->module_list); ++ toi_num_modules--; ++} ++EXPORT_SYMBOL_GPL(toi_unregister_module); ++ ++/* ++ * toi_move_module_tail ++ * ++ * Rearrange modules when reloading the config. ++ */ ++void toi_move_module_tail(struct toi_module_ops *module) ++{ ++ switch (module->type) { ++ case FILTER_MODULE: ++ if (toi_num_filters > 1) ++ list_move_tail(&module->type_list, &toi_filters); ++ break; ++ case WRITER_MODULE: ++ if (toiNumAllocators > 1) ++ list_move_tail(&module->type_list, &toiAllocators); ++ break; ++ case MISC_MODULE: ++ case MISC_HIDDEN_MODULE: ++ case BIO_ALLOCATOR_MODULE: ++ break; ++ default: ++ printk(KERN_ERR "Module '%s' has an invalid type." ++ " It has been ignored.\n", module->name); ++ return; ++ } ++ if ((toi_num_filters + toiNumAllocators) > 1) ++ list_move_tail(&module->module_list, &toi_modules); ++} ++ ++/* ++ * toi_initialise_modules ++ * ++ * Get ready to do some work! ++ */ ++int toi_initialise_modules(int starting_cycle, int early) ++{ ++ struct toi_module_ops *this_module; ++ int result; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ this_module->header_requested = 0; ++ this_module->header_used = 0; ++ if (!this_module->enabled) ++ continue; ++ if (this_module->early != early) ++ continue; ++ if (this_module->initialise) { ++ result = this_module->initialise(starting_cycle); ++ if (result) { ++ toi_cleanup_modules(starting_cycle); ++ return result; ++ } ++ this_module->initialised = 1; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_cleanup_modules ++ * ++ * Tell modules the work is done. 
++ */ ++void toi_cleanup_modules(int finishing_cycle) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (!this_module->enabled || !this_module->initialised) ++ continue; ++ if (this_module->cleanup) ++ this_module->cleanup(finishing_cycle); ++ this_module->initialised = 0; ++ } ++} ++ ++/* ++ * toi_pre_atomic_restore_modules ++ * ++ * Get ready to do some work! ++ */ ++void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (this_module->enabled && this_module->pre_atomic_restore) ++ this_module->pre_atomic_restore(bkd); ++ } ++} ++ ++/* ++ * toi_post_atomic_restore_modules ++ * ++ * Get ready to do some work! ++ */ ++void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (this_module->enabled && this_module->post_atomic_restore) ++ this_module->post_atomic_restore(bkd); ++ } ++} ++ ++/* ++ * toi_get_next_filter ++ * ++ * Get the next filter in the pipeline. ++ */ ++struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *filter_sought) ++{ ++ struct toi_module_ops *last_filter = NULL, *this_filter = NULL; ++ ++ list_for_each_entry(this_filter, &toi_filters, type_list) { ++ if (!this_filter->enabled) ++ continue; ++ if ((last_filter == filter_sought) || (!filter_sought)) ++ return this_filter; ++ last_filter = this_filter; ++ } ++ ++ return toiActiveAllocator; ++} ++EXPORT_SYMBOL_GPL(toi_get_next_filter); ++ ++/** ++ * toi_show_modules: Printk what support is loaded. ++ */ ++void toi_print_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ int prev = 0; ++ ++ printk(KERN_INFO "TuxOnIce " TOI_CORE_VERSION ", with support for"); ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ if (this_module->type == MISC_HIDDEN_MODULE) ++ continue; ++ printk("%s %s%s%s", prev ? "," : "", ++ this_module->enabled ? "" : "[", ++ this_module->name, ++ this_module->enabled ? "" : "]"); ++ prev = 1; ++ } ++ ++ printk(".\n"); ++} ++ ++/* toi_get_modules ++ * ++ * Take a reference to modules so they can't go away under us. ++ */ ++ ++int toi_get_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) { ++ struct toi_module_ops *this_module2; ++ ++ if (try_module_get(this_module->module)) ++ continue; ++ ++ /* Failed! Reverse gets and return error */ ++ list_for_each_entry(this_module2, &toi_modules, ++ module_list) { ++ if (this_module == this_module2) ++ return -EINVAL; ++ module_put(this_module2->module); ++ } ++ } ++ return 0; ++} ++ ++/* toi_put_modules ++ * ++ * Release our references to modules we used. ++ */ ++ ++void toi_put_modules(void) ++{ ++ struct toi_module_ops *this_module; ++ ++ list_for_each_entry(this_module, &toi_modules, module_list) ++ module_put(this_module->module); ++} +diff --git a/kernel/power/tuxonice_modules.h b/kernel/power/tuxonice_modules.h +new file mode 100644 +index 0000000..bf5d749 +--- /dev/null ++++ b/kernel/power/tuxonice_modules.h +@@ -0,0 +1,211 @@ ++/* ++ * kernel/power/tuxonice_modules.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * It contains declarations for modules. 
Modules are additions to ++ * TuxOnIce that provide facilities such as image compression or ++ * encryption, backends for storage of the image and user interfaces. ++ * ++ */ ++ ++#ifndef TOI_MODULES_H ++#define TOI_MODULES_H ++ ++/* This is the maximum size we store in the image header for a module name */ ++#define TOI_MAX_MODULE_NAME_LENGTH 30 ++ ++struct toi_boot_kernel_data; ++ ++/* Per-module metadata */ ++struct toi_module_header { ++ char name[TOI_MAX_MODULE_NAME_LENGTH]; ++ int enabled; ++ int type; ++ int index; ++ int data_length; ++ unsigned long signature; ++}; ++ ++enum { ++ FILTER_MODULE, ++ WRITER_MODULE, ++ BIO_ALLOCATOR_MODULE, ++ MISC_MODULE, ++ MISC_HIDDEN_MODULE, ++}; ++ ++enum { ++ TOI_ASYNC, ++ TOI_SYNC ++}; ++ ++enum { ++ TOI_VIRT, ++ TOI_PAGE, ++}; ++ ++#define TOI_MAP(type, addr) \ ++ (type == TOI_PAGE ? kmap(addr) : addr) ++ ++#define TOI_UNMAP(type, addr) \ ++ do { \ ++ if (type == TOI_PAGE) \ ++ kunmap(addr); \ ++ } while(0) ++ ++struct toi_module_ops { ++ /* Functions common to all modules */ ++ int type; ++ char *name; ++ char *directory; ++ char *shared_directory; ++ struct kobject *dir_kobj; ++ struct module *module; ++ int enabled, early, initialised; ++ struct list_head module_list; ++ ++ /* List of filters or allocators */ ++ struct list_head list, type_list; ++ ++ /* ++ * Requirements for memory and storage in ++ * the image header.. ++ */ ++ int (*memory_needed) (void); ++ int (*storage_needed) (void); ++ ++ int header_requested, header_used; ++ ++ int (*expected_compression) (void); ++ ++ /* ++ * Debug info ++ */ ++ int (*print_debug_info) (char *buffer, int size); ++ int (*save_config_info) (char *buffer); ++ void (*load_config_info) (char *buffer, int len); ++ ++ /* ++ * Initialise & cleanup - general routines called ++ * at the start and end of a cycle. ++ */ ++ int (*initialise) (int starting_cycle); ++ void (*cleanup) (int finishing_cycle); ++ ++ void (*pre_atomic_restore) (struct toi_boot_kernel_data *bkd); ++ void (*post_atomic_restore) (struct toi_boot_kernel_data *bkd); ++ ++ /* ++ * Calls for allocating storage (allocators only). ++ * ++ * Header space is requested separately and cannot fail, but the ++ * reservation is only applied when main storage is allocated. ++ * The header space reservation is thus always set prior to ++ * requesting the allocation of storage - and prior to querying ++ * how much storage is available. ++ */ ++ ++ unsigned long (*storage_available) (void); ++ void (*reserve_header_space) (unsigned long space_requested); ++ int (*register_storage) (void); ++ int (*allocate_storage) (unsigned long space_requested); ++ unsigned long (*storage_allocated) (void); ++ ++ /* ++ * Routines used in image I/O. 
++ */ ++ int (*rw_init) (int rw, int stream_number); ++ int (*rw_cleanup) (int rw); ++ int (*write_page) (unsigned long index, int buf_type, void *buf, ++ unsigned int buf_size); ++ int (*read_page) (unsigned long *index, int buf_type, void *buf, ++ unsigned int *buf_size); ++ int (*io_flusher) (int rw); ++ ++ /* Reset module if image exists but reading aborted */ ++ void (*noresume_reset) (void); ++ ++ /* Read and write the metadata */ ++ int (*write_header_init) (void); ++ int (*write_header_cleanup) (void); ++ ++ int (*read_header_init) (void); ++ int (*read_header_cleanup) (void); ++ ++ /* To be called after read_header_init */ ++ int (*get_header_version) (void); ++ ++ int (*rw_header_chunk) (int rw, struct toi_module_ops *owner, ++ char *buffer_start, int buffer_size); ++ ++ int (*rw_header_chunk_noreadahead) (int rw, ++ struct toi_module_ops *owner, char *buffer_start, ++ int buffer_size); ++ ++ /* Attempt to parse an image location */ ++ int (*parse_sig_location) (char *buffer, int only_writer, int quiet); ++ ++ /* Throttle I/O according to throughput */ ++ void (*update_throughput_throttle) (int jif_index); ++ ++ /* Flush outstanding I/O */ ++ int (*finish_all_io) (void); ++ ++ /* Determine whether image exists that we can restore */ ++ int (*image_exists) (int quiet); ++ ++ /* Mark the image as having tried to resume */ ++ int (*mark_resume_attempted) (int); ++ ++ /* Destroy image if one exists */ ++ int (*remove_image) (void); ++ ++ /* Sysfs Data */ ++ struct toi_sysfs_data *sysfs_data; ++ int num_sysfs_entries; ++ ++ /* Block I/O allocator */ ++ struct toi_bio_allocator_ops *bio_allocator_ops; ++}; ++ ++extern int toi_num_modules, toiNumAllocators; ++ ++extern struct toi_module_ops *toiActiveAllocator; ++extern struct list_head toi_filters, toiAllocators, toi_modules; ++ ++extern void toi_prepare_console_modules(void); ++extern void toi_cleanup_console_modules(void); ++ ++extern struct toi_module_ops *toi_find_module_given_name(char *name); ++extern struct toi_module_ops *toi_get_next_filter(struct toi_module_ops *); ++ ++extern int toi_register_module(struct toi_module_ops *module); ++extern void toi_move_module_tail(struct toi_module_ops *module); ++ ++extern long toi_header_storage_for_modules(void); ++extern long toi_memory_for_modules(int print_parts); ++extern void print_toi_header_storage_for_modules(void); ++extern int toi_expected_compression_ratio(void); ++ ++extern int toi_print_module_debug_info(char *buffer, int buffer_size); ++extern int toi_register_module(struct toi_module_ops *module); ++extern void toi_unregister_module(struct toi_module_ops *module); ++ ++extern int toi_initialise_modules(int starting_cycle, int early); ++#define toi_initialise_modules_early(starting) \ ++ toi_initialise_modules(starting, 1) ++#define toi_initialise_modules_late(starting) \ ++ toi_initialise_modules(starting, 0) ++extern void toi_cleanup_modules(int finishing_cycle); ++ ++extern void toi_post_atomic_restore_modules(struct toi_boot_kernel_data *bkd); ++extern void toi_pre_atomic_restore_modules(struct toi_boot_kernel_data *bkd); ++ ++extern void toi_print_modules(void); ++ ++int toi_get_modules(void); ++void toi_put_modules(void); ++#endif +diff --git a/kernel/power/tuxonice_netlink.c b/kernel/power/tuxonice_netlink.c +new file mode 100644 +index 0000000..75b4aa9 +--- /dev/null ++++ b/kernel/power/tuxonice_netlink.c +@@ -0,0 +1,329 @@ ++/* ++ * kernel/power/tuxonice_netlink.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is 
released under the GPLv2. ++ * ++ * Functions for communicating with a userspace helper via netlink. ++ */ ++ ++ ++#include ++#include ++#include "tuxonice_netlink.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_builtin.h" ++ ++static struct user_helper_data *uhd_list; ++ ++/* ++ * Refill our pool of SKBs for use in emergencies (eg, when eating memory and ++ * none can be allocated). ++ */ ++static void toi_fill_skb_pool(struct user_helper_data *uhd) ++{ ++ while (uhd->pool_level < uhd->pool_limit) { ++ struct sk_buff *new_skb = ++ alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); ++ ++ if (!new_skb) ++ break; ++ ++ new_skb->next = uhd->emerg_skbs; ++ uhd->emerg_skbs = new_skb; ++ uhd->pool_level++; ++ } ++} ++ ++/* ++ * Try to allocate a single skb. If we can't get one, try to use one from ++ * our pool. ++ */ ++static struct sk_buff *toi_get_skb(struct user_helper_data *uhd) ++{ ++ struct sk_buff *skb = ++ alloc_skb(NLMSG_SPACE(uhd->skb_size), TOI_ATOMIC_GFP); ++ ++ if (skb) ++ return skb; ++ ++ skb = uhd->emerg_skbs; ++ if (skb) { ++ uhd->pool_level--; ++ uhd->emerg_skbs = skb->next; ++ skb->next = NULL; ++ } ++ ++ return skb; ++} ++ ++void toi_send_netlink_message(struct user_helper_data *uhd, ++ int type, void *params, size_t len) ++{ ++ struct sk_buff *skb; ++ struct nlmsghdr *nlh; ++ void *dest; ++ struct task_struct *t; ++ ++ if (uhd->pid == -1) ++ return; ++ ++ if (uhd->debug) ++ printk(KERN_ERR "toi_send_netlink_message: Send " ++ "message type %d.\n", type); ++ ++ skb = toi_get_skb(uhd); ++ if (!skb) { ++ printk(KERN_INFO "toi_netlink: Can't allocate skb!\n"); ++ return; ++ } ++ ++ nlh = nlmsg_put(skb, 0, uhd->sock_seq, type, len, 0); ++ uhd->sock_seq++; ++ ++ dest = NLMSG_DATA(nlh); ++ if (params && len > 0) ++ memcpy(dest, params, len); ++ ++ netlink_unicast(uhd->nl, skb, uhd->pid, 0); ++ ++ toi_read_lock_tasklist(); ++ t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); ++ if (!t) { ++ toi_read_unlock_tasklist(); ++ if (uhd->pid > -1) ++ printk(KERN_INFO "Hmm. Can't find the userspace task" ++ " %d.\n", uhd->pid); ++ return; ++ } ++ wake_up_process(t); ++ toi_read_unlock_tasklist(); ++ ++ yield(); ++} ++EXPORT_SYMBOL_GPL(toi_send_netlink_message); ++ ++static void send_whether_debugging(struct user_helper_data *uhd) ++{ ++ static u8 is_debugging = 1; ++ ++ toi_send_netlink_message(uhd, NETLINK_MSG_IS_DEBUGGING, ++ &is_debugging, sizeof(u8)); ++} ++ ++/* ++ * Set the PF_NOFREEZE flag on the given process to ensure it can run whilst we ++ * are hibernating. ++ */ ++static int nl_set_nofreeze(struct user_helper_data *uhd, __u32 pid) ++{ ++ struct task_struct *t; ++ ++ if (uhd->debug) ++ printk(KERN_ERR "nl_set_nofreeze for pid %d.\n", pid); ++ ++ toi_read_lock_tasklist(); ++ t = find_task_by_pid_ns(pid, &init_pid_ns); ++ if (!t) { ++ toi_read_unlock_tasklist(); ++ printk(KERN_INFO "Strange. Can't find the userspace task %d.\n", ++ pid); ++ return -EINVAL; ++ } ++ ++ t->flags |= PF_NOFREEZE; ++ ++ toi_read_unlock_tasklist(); ++ uhd->pid = pid; ++ ++ toi_send_netlink_message(uhd, NETLINK_MSG_NOFREEZE_ACK, NULL, 0); ++ ++ return 0; ++} ++ ++/* ++ * Called when the userspace process has informed us that it's ready to roll. ++ */ ++static int nl_ready(struct user_helper_data *uhd, u32 version) ++{ ++ if (version != uhd->interface_version) { ++ printk(KERN_INFO "%s userspace process using invalid interface" ++ " version (%d - kernel wants %d). 
Trying to " ++ "continue without it.\n", ++ uhd->name, version, uhd->interface_version); ++ if (uhd->not_ready) ++ uhd->not_ready(); ++ return -EINVAL; ++ } ++ ++ complete(&uhd->wait_for_process); ++ ++ return 0; ++} ++ ++void toi_netlink_close_complete(struct user_helper_data *uhd) ++{ ++ if (uhd->nl) { ++ netlink_kernel_release(uhd->nl); ++ uhd->nl = NULL; ++ } ++ ++ while (uhd->emerg_skbs) { ++ struct sk_buff *next = uhd->emerg_skbs->next; ++ kfree_skb(uhd->emerg_skbs); ++ uhd->emerg_skbs = next; ++ } ++ ++ uhd->pid = -1; ++} ++EXPORT_SYMBOL_GPL(toi_netlink_close_complete); ++ ++static int toi_nl_gen_rcv_msg(struct user_helper_data *uhd, ++ struct sk_buff *skb, struct nlmsghdr *nlh) ++{ ++ int type = nlh->nlmsg_type; ++ int *data; ++ int err; ++ ++ if (uhd->debug) ++ printk(KERN_ERR "toi_user_rcv_skb: Received message %d.\n", ++ type); ++ ++ /* Let the more specific handler go first. It returns ++ * 1 for valid messages that it doesn't know. */ ++ err = uhd->rcv_msg(skb, nlh); ++ if (err != 1) ++ return err; ++ ++ /* Only allow one task to receive NOFREEZE privileges */ ++ if (type == NETLINK_MSG_NOFREEZE_ME && uhd->pid != -1) { ++ printk(KERN_INFO "Received extra nofreeze me requests.\n"); ++ return -EBUSY; ++ } ++ ++ data = NLMSG_DATA(nlh); ++ ++ switch (type) { ++ case NETLINK_MSG_NOFREEZE_ME: ++ return nl_set_nofreeze(uhd, nlh->nlmsg_pid); ++ case NETLINK_MSG_GET_DEBUGGING: ++ send_whether_debugging(uhd); ++ return 0; ++ case NETLINK_MSG_READY: ++ if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(u32))) { ++ printk(KERN_INFO "Invalid ready mesage.\n"); ++ if (uhd->not_ready) ++ uhd->not_ready(); ++ return -EINVAL; ++ } ++ return nl_ready(uhd, (u32) *data); ++ case NETLINK_MSG_CLEANUP: ++ toi_netlink_close_complete(uhd); ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ ++static void toi_user_rcv_skb(struct sk_buff *skb) ++{ ++ int err; ++ struct nlmsghdr *nlh; ++ struct user_helper_data *uhd = uhd_list; ++ ++ while (uhd && uhd->netlink_id != skb->sk->sk_protocol) ++ uhd = uhd->next; ++ ++ if (!uhd) ++ return; ++ ++ while (skb->len >= NLMSG_SPACE(0)) { ++ u32 rlen; ++ ++ nlh = (struct nlmsghdr *) skb->data; ++ if (nlh->nlmsg_len < sizeof(*nlh) || skb->len < nlh->nlmsg_len) ++ return; ++ ++ rlen = NLMSG_ALIGN(nlh->nlmsg_len); ++ if (rlen > skb->len) ++ rlen = skb->len; ++ ++ err = toi_nl_gen_rcv_msg(uhd, skb, nlh); ++ if (err) ++ netlink_ack(skb, nlh, err); ++ else if (nlh->nlmsg_flags & NLM_F_ACK) ++ netlink_ack(skb, nlh, 0); ++ skb_pull(skb, rlen); ++ } ++} ++ ++static int netlink_prepare(struct user_helper_data *uhd) ++{ ++ struct netlink_kernel_cfg cfg = { ++ .groups = 0, ++ .input = toi_user_rcv_skb, ++ }; ++ ++ uhd->next = uhd_list; ++ uhd_list = uhd; ++ ++ uhd->sock_seq = 0x42c0ffee; ++ uhd->nl = netlink_kernel_create(&init_net, uhd->netlink_id, &cfg); ++ if (!uhd->nl) { ++ printk(KERN_INFO "Failed to allocate netlink socket for %s.\n", ++ uhd->name); ++ return -ENOMEM; ++ } ++ ++ toi_fill_skb_pool(uhd); ++ ++ return 0; ++} ++ ++void toi_netlink_close(struct user_helper_data *uhd) ++{ ++ struct task_struct *t; ++ ++ toi_read_lock_tasklist(); ++ t = find_task_by_pid_ns(uhd->pid, &init_pid_ns); ++ if (t) ++ t->flags &= ~PF_NOFREEZE; ++ toi_read_unlock_tasklist(); ++ ++ toi_send_netlink_message(uhd, NETLINK_MSG_CLEANUP, NULL, 0); ++} ++EXPORT_SYMBOL_GPL(toi_netlink_close); ++ ++int toi_netlink_setup(struct user_helper_data *uhd) ++{ ++ /* In case userui didn't cleanup properly on us */ ++ toi_netlink_close_complete(uhd); ++ ++ if (netlink_prepare(uhd) < 0) { ++ printk(KERN_INFO "Netlink 
prepare failed.\n"); ++ return 1; ++ } ++ ++ if (toi_launch_userspace_program(uhd->program, uhd->netlink_id, ++ UMH_WAIT_EXEC, uhd->debug) < 0) { ++ printk(KERN_INFO "Launch userspace program failed.\n"); ++ toi_netlink_close_complete(uhd); ++ return 1; ++ } ++ ++ /* Wait 2 seconds for the userspace process to make contact */ ++ wait_for_completion_timeout(&uhd->wait_for_process, 2*HZ); ++ ++ if (uhd->pid == -1) { ++ printk(KERN_INFO "%s: Failed to contact userspace process.\n", ++ uhd->name); ++ toi_netlink_close_complete(uhd); ++ return 1; ++ } ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_netlink_setup); +diff --git a/kernel/power/tuxonice_netlink.h b/kernel/power/tuxonice_netlink.h +new file mode 100644 +index 0000000..b8ef06e +--- /dev/null ++++ b/kernel/power/tuxonice_netlink.h +@@ -0,0 +1,62 @@ ++/* ++ * kernel/power/tuxonice_netlink.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Declarations for functions for communicating with a userspace helper ++ * via netlink. ++ */ ++ ++#include ++#include ++ ++#define NETLINK_MSG_BASE 0x10 ++ ++#define NETLINK_MSG_READY 0x10 ++#define NETLINK_MSG_NOFREEZE_ME 0x16 ++#define NETLINK_MSG_GET_DEBUGGING 0x19 ++#define NETLINK_MSG_CLEANUP 0x24 ++#define NETLINK_MSG_NOFREEZE_ACK 0x27 ++#define NETLINK_MSG_IS_DEBUGGING 0x28 ++ ++struct user_helper_data { ++ int (*rcv_msg) (struct sk_buff *skb, struct nlmsghdr *nlh); ++ void (*not_ready) (void); ++ struct sock *nl; ++ u32 sock_seq; ++ pid_t pid; ++ char *comm; ++ char program[256]; ++ int pool_level; ++ int pool_limit; ++ struct sk_buff *emerg_skbs; ++ int skb_size; ++ int netlink_id; ++ char *name; ++ struct user_helper_data *next; ++ struct completion wait_for_process; ++ u32 interface_version; ++ int must_init; ++ int debug; ++}; ++ ++#ifdef CONFIG_NET ++int toi_netlink_setup(struct user_helper_data *uhd); ++void toi_netlink_close(struct user_helper_data *uhd); ++void toi_send_netlink_message(struct user_helper_data *uhd, ++ int type, void *params, size_t len); ++void toi_netlink_close_complete(struct user_helper_data *uhd); ++#else ++static inline int toi_netlink_setup(struct user_helper_data *uhd) ++{ ++ return 0; ++} ++ ++static inline void toi_netlink_close(struct user_helper_data *uhd) { }; ++static inline void toi_send_netlink_message(struct user_helper_data *uhd, ++ int type, void *params, size_t len) { }; ++static inline void toi_netlink_close_complete(struct user_helper_data *uhd) ++ { }; ++#endif +diff --git a/kernel/power/tuxonice_pagedir.c b/kernel/power/tuxonice_pagedir.c +new file mode 100644 +index 0000000..ce0d38c +--- /dev/null ++++ b/kernel/power/tuxonice_pagedir.c +@@ -0,0 +1,346 @@ ++/* ++ * kernel/power/tuxonice_pagedir.c ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti ++ * Copyright (C) 1998,2001,2002 Pavel Machek ++ * Copyright (C) 2002-2003 Florent Chabaud ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for handling pagesets. ++ * Note that pbes aren't actually stored as such. They're stored as ++ * bitmaps and extents. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_pageflags.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_pagedir.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice.h" ++#include "tuxonice_builtin.h" ++#include "tuxonice_alloc.h" ++ ++static int ptoi_pfn; ++static struct pbe *this_low_pbe; ++static struct pbe **last_low_pbe_ptr; ++ ++void toi_reset_alt_image_pageset2_pfn(void) ++{ ++ memory_bm_position_reset(pageset2_map); ++} ++ ++static struct page *first_conflicting_page; ++ ++/* ++ * free_conflicting_pages ++ */ ++ ++static void free_conflicting_pages(void) ++{ ++ while (first_conflicting_page) { ++ struct page *next = ++ *((struct page **) kmap(first_conflicting_page)); ++ kunmap(first_conflicting_page); ++ toi__free_page(29, first_conflicting_page); ++ first_conflicting_page = next; ++ } ++} ++ ++/* __toi_get_nonconflicting_page ++ * ++ * Description: Gets order zero pages that won't be overwritten ++ * while copying the original pages. ++ */ ++ ++struct page *___toi_get_nonconflicting_page(int can_be_highmem) ++{ ++ struct page *page; ++ gfp_t flags = TOI_ATOMIC_GFP; ++ if (can_be_highmem) ++ flags |= __GFP_HIGHMEM; ++ ++ ++ if (test_toi_state(TOI_LOADING_ALT_IMAGE) && ++ pageset2_map && ++ (ptoi_pfn != BM_END_OF_MAP)) { ++ do { ++ ptoi_pfn = memory_bm_next_pfn(pageset2_map); ++ if (ptoi_pfn != BM_END_OF_MAP) { ++ page = pfn_to_page(ptoi_pfn); ++ if (!PagePageset1(page) && ++ (can_be_highmem || !PageHighMem(page))) ++ return page; ++ } ++ } while (ptoi_pfn != BM_END_OF_MAP); ++ } ++ ++ do { ++ page = toi_alloc_page(29, flags); ++ if (!page) { ++ printk(KERN_INFO "Failed to get nonconflicting " ++ "page.\n"); ++ return NULL; ++ } ++ if (PagePageset1(page)) { ++ struct page **next = (struct page **) kmap(page); ++ *next = first_conflicting_page; ++ first_conflicting_page = page; ++ kunmap(page); ++ } ++ } while (PagePageset1(page)); ++ ++ return page; ++} ++ ++unsigned long __toi_get_nonconflicting_page(void) ++{ ++ struct page *page = ___toi_get_nonconflicting_page(0); ++ return page ? (unsigned long) page_address(page) : 0; ++} ++ ++static struct pbe *get_next_pbe(struct page **page_ptr, struct pbe *this_pbe, ++ int highmem) ++{ ++ if (((((unsigned long) this_pbe) & (PAGE_SIZE - 1)) ++ + 2 * sizeof(struct pbe)) > PAGE_SIZE) { ++ struct page *new_page = ++ ___toi_get_nonconflicting_page(highmem); ++ if (!new_page) ++ return ERR_PTR(-ENOMEM); ++ this_pbe = (struct pbe *) kmap(new_page); ++ memset(this_pbe, 0, PAGE_SIZE); ++ *page_ptr = new_page; ++ } else ++ this_pbe++; ++ ++ return this_pbe; ++} ++ ++/** ++ * get_pageset1_load_addresses - generate pbes for conflicting pages ++ * ++ * We check here that pagedir & pages it points to won't collide ++ * with pages where we're going to restore from the loaded pages ++ * later. ++ * ++ * Returns: ++ * Zero on success, one if couldn't find enough pages (shouldn't ++ * happen). 
++ **/ ++int toi_get_pageset1_load_addresses(void) ++{ ++ int pfn, highallocd = 0, lowallocd = 0; ++ int low_needed = pagedir1.size - get_highmem_size(pagedir1); ++ int high_needed = get_highmem_size(pagedir1); ++ int low_pages_for_highmem = 0; ++ gfp_t flags = GFP_ATOMIC | __GFP_NOWARN | __GFP_HIGHMEM; ++ struct page *page, *high_pbe_page = NULL, *last_high_pbe_page = NULL, ++ *low_pbe_page, *last_low_pbe_page = NULL; ++ struct pbe **last_high_pbe_ptr = &restore_highmem_pblist, ++ *this_high_pbe = NULL; ++ unsigned long orig_low_pfn, orig_high_pfn; ++ int high_pbes_done = 0, low_pbes_done = 0; ++ int low_direct = 0, high_direct = 0, result = 0, i; ++ int high_page = 1, high_offset = 0, low_page = 1, low_offset = 0; ++ ++ memory_bm_set_iterators(pageset1_map, 3); ++ memory_bm_position_reset(pageset1_map); ++ ++ memory_bm_set_iterators(pageset1_copy_map, 2); ++ memory_bm_position_reset(pageset1_copy_map); ++ ++ last_low_pbe_ptr = &restore_pblist; ++ ++ /* First, allocate pages for the start of our pbe lists. */ ++ if (high_needed) { ++ high_pbe_page = ___toi_get_nonconflicting_page(1); ++ if (!high_pbe_page) { ++ result = -ENOMEM; ++ goto out; ++ } ++ this_high_pbe = (struct pbe *) kmap(high_pbe_page); ++ memset(this_high_pbe, 0, PAGE_SIZE); ++ } ++ ++ low_pbe_page = ___toi_get_nonconflicting_page(0); ++ if (!low_pbe_page) { ++ result = -ENOMEM; ++ goto out; ++ } ++ this_low_pbe = (struct pbe *) page_address(low_pbe_page); ++ ++ /* ++ * Next, allocate the number of pages we need. ++ */ ++ ++ i = low_needed + high_needed; ++ ++ do { ++ int is_high; ++ ++ if (i == low_needed) ++ flags &= ~__GFP_HIGHMEM; ++ ++ page = toi_alloc_page(30, flags); ++ BUG_ON(!page); ++ ++ SetPagePageset1Copy(page); ++ is_high = PageHighMem(page); ++ ++ if (PagePageset1(page)) { ++ if (is_high) ++ high_direct++; ++ else ++ low_direct++; ++ } else { ++ if (is_high) ++ highallocd++; ++ else ++ lowallocd++; ++ } ++ } while (--i); ++ ++ high_needed -= high_direct; ++ low_needed -= low_direct; ++ ++ /* ++ * Do we need to use some lowmem pages for the copies of highmem ++ * pages? ++ */ ++ if (high_needed > highallocd) { ++ low_pages_for_highmem = high_needed - highallocd; ++ high_needed -= low_pages_for_highmem; ++ low_needed += low_pages_for_highmem; ++ } ++ ++ /* ++ * Now generate our pbes (which will be used for the atomic restore), ++ * and free unneeded pages. ++ */ ++ memory_bm_position_reset(pageset1_copy_map); ++ for (pfn = memory_bm_next_pfn_index(pageset1_copy_map, 1); pfn != BM_END_OF_MAP; ++ pfn = memory_bm_next_pfn_index(pageset1_copy_map, 1)) { ++ int is_high; ++ page = pfn_to_page(pfn); ++ is_high = PageHighMem(page); ++ ++ if (PagePageset1(page)) ++ continue; ++ ++ /* Nope. We're going to use this page. Add a pbe. 
*/ ++ if (is_high || low_pages_for_highmem) { ++ struct page *orig_page; ++ high_pbes_done++; ++ if (!is_high) ++ low_pages_for_highmem--; ++ do { ++ orig_high_pfn = memory_bm_next_pfn_index(pageset1_map, 1); ++ BUG_ON(orig_high_pfn == BM_END_OF_MAP); ++ orig_page = pfn_to_page(orig_high_pfn); ++ } while (!PageHighMem(orig_page) || ++ PagePageset1Copy(orig_page)); ++ ++ this_high_pbe->orig_address = (void *) orig_high_pfn; ++ this_high_pbe->address = page; ++ this_high_pbe->next = NULL; ++ toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "High pbe %d/%d: %p(%d)=>%p", ++ high_page, high_offset, page, orig_high_pfn, orig_page); ++ if (last_high_pbe_page != high_pbe_page) { ++ *last_high_pbe_ptr = ++ (struct pbe *) high_pbe_page; ++ if (last_high_pbe_page) { ++ kunmap(last_high_pbe_page); ++ high_page++; ++ high_offset = 0; ++ } else ++ high_offset++; ++ last_high_pbe_page = high_pbe_page; ++ } else { ++ *last_high_pbe_ptr = this_high_pbe; ++ high_offset++; ++ } ++ last_high_pbe_ptr = &this_high_pbe->next; ++ this_high_pbe = get_next_pbe(&high_pbe_page, ++ this_high_pbe, 1); ++ if (IS_ERR(this_high_pbe)) { ++ printk(KERN_INFO ++ "This high pbe is an error.\n"); ++ return -ENOMEM; ++ } ++ } else { ++ struct page *orig_page; ++ low_pbes_done++; ++ do { ++ orig_low_pfn = memory_bm_next_pfn_index(pageset1_map, 2); ++ BUG_ON(orig_low_pfn == BM_END_OF_MAP); ++ orig_page = pfn_to_page(orig_low_pfn); ++ } while (PageHighMem(orig_page) || ++ PagePageset1Copy(orig_page)); ++ ++ this_low_pbe->orig_address = page_address(orig_page); ++ this_low_pbe->address = page_address(page); ++ this_low_pbe->next = NULL; ++ toi_message(TOI_PAGEDIR, TOI_VERBOSE, 0, "Low pbe %d/%d: %p(%d)=>%p", ++ low_page, low_offset, this_low_pbe->orig_address, ++ orig_low_pfn, this_low_pbe->address); ++ *last_low_pbe_ptr = this_low_pbe; ++ last_low_pbe_ptr = &this_low_pbe->next; ++ this_low_pbe = get_next_pbe(&low_pbe_page, ++ this_low_pbe, 0); ++ if (low_pbe_page != last_low_pbe_page) { ++ if (last_low_pbe_page) { ++ low_page++; ++ low_offset = 0; ++ } ++ last_low_pbe_page = low_pbe_page; ++ } else ++ low_offset++; ++ if (IS_ERR(this_low_pbe)) { ++ printk(KERN_INFO "this_low_pbe is an error.\n"); ++ return -ENOMEM; ++ } ++ } ++ } ++ ++ if (high_pbe_page) ++ kunmap(high_pbe_page); ++ ++ if (last_high_pbe_page != high_pbe_page) { ++ if (last_high_pbe_page) ++ kunmap(last_high_pbe_page); ++ toi__free_page(29, high_pbe_page); ++ } ++ ++ free_conflicting_pages(); ++ ++out: ++ memory_bm_set_iterators(pageset1_map, 1); ++ memory_bm_set_iterators(pageset1_copy_map, 1); ++ return result; ++} ++ ++int add_boot_kernel_data_pbe(void) ++{ ++ this_low_pbe->address = (char *) __toi_get_nonconflicting_page(); ++ if (!this_low_pbe->address) { ++ printk(KERN_INFO "Failed to get bkd atomic restore buffer."); ++ return -ENOMEM; ++ } ++ ++ toi_bkd.size = sizeof(toi_bkd); ++ memcpy(this_low_pbe->address, &toi_bkd, sizeof(toi_bkd)); ++ ++ *last_low_pbe_ptr = this_low_pbe; ++ this_low_pbe->orig_address = (char *) boot_kernel_data_buffer; ++ this_low_pbe->next = NULL; ++ return 0; ++} +diff --git a/kernel/power/tuxonice_pagedir.h b/kernel/power/tuxonice_pagedir.h +new file mode 100644 +index 0000000..d08e4b1 +--- /dev/null ++++ b/kernel/power/tuxonice_pagedir.h +@@ -0,0 +1,50 @@ ++/* ++ * kernel/power/tuxonice_pagedir.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Declarations for routines for handling pagesets. 
++ */ ++ ++#ifndef KERNEL_POWER_PAGEDIR_H ++#define KERNEL_POWER_PAGEDIR_H ++ ++/* Pagedir ++ * ++ * Contains the metadata for a set of pages saved in the image. ++ */ ++ ++struct pagedir { ++ int id; ++ unsigned long size; ++#ifdef CONFIG_HIGHMEM ++ unsigned long size_high; ++#endif ++}; ++ ++#ifdef CONFIG_HIGHMEM ++#define get_highmem_size(pagedir) (pagedir.size_high) ++#define set_highmem_size(pagedir, sz) do { pagedir.size_high = sz; } while (0) ++#define inc_highmem_size(pagedir) do { pagedir.size_high++; } while (0) ++#define get_lowmem_size(pagedir) (pagedir.size - pagedir.size_high) ++#else ++#define get_highmem_size(pagedir) (0) ++#define set_highmem_size(pagedir, sz) do { } while (0) ++#define inc_highmem_size(pagedir) do { } while (0) ++#define get_lowmem_size(pagedir) (pagedir.size) ++#endif ++ ++extern struct pagedir pagedir1, pagedir2; ++ ++extern void toi_copy_pageset1(void); ++ ++extern int toi_get_pageset1_load_addresses(void); ++ ++extern unsigned long __toi_get_nonconflicting_page(void); ++struct page *___toi_get_nonconflicting_page(int can_be_highmem); ++ ++extern void toi_reset_alt_image_pageset2_pfn(void); ++extern int add_boot_kernel_data_pbe(void); ++#endif +diff --git a/kernel/power/tuxonice_pageflags.c b/kernel/power/tuxonice_pageflags.c +new file mode 100644 +index 0000000..77fab4f +--- /dev/null ++++ b/kernel/power/tuxonice_pageflags.c +@@ -0,0 +1,29 @@ ++/* ++ * kernel/power/tuxonice_pageflags.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for serialising and relocating pageflags in which we ++ * store our image metadata. ++ */ ++ ++#include ++#include ++#include "tuxonice_pageflags.h" ++#include "power.h" ++ ++int toi_pageflags_space_needed(void) ++{ ++ int total = 0; ++ struct bm_block *bb; ++ ++ total = sizeof(unsigned int); ++ ++ list_for_each_entry(bb, &pageset1_map->blocks, hook) ++ total += 2 * sizeof(unsigned long) + PAGE_SIZE; ++ ++ return total; ++} ++EXPORT_SYMBOL_GPL(toi_pageflags_space_needed); +diff --git a/kernel/power/tuxonice_pageflags.h b/kernel/power/tuxonice_pageflags.h +new file mode 100644 +index 0000000..d5aa7b1 +--- /dev/null ++++ b/kernel/power/tuxonice_pageflags.h +@@ -0,0 +1,72 @@ ++/* ++ * kernel/power/tuxonice_pageflags.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++ ++#ifndef KERNEL_POWER_TUXONICE_PAGEFLAGS_H ++#define KERNEL_POWER_TUXONICE_PAGEFLAGS_H ++ ++extern struct memory_bitmap *pageset1_map; ++extern struct memory_bitmap *pageset1_copy_map; ++extern struct memory_bitmap *pageset2_map; ++extern struct memory_bitmap *page_resave_map; ++extern struct memory_bitmap *io_map; ++extern struct memory_bitmap *nosave_map; ++extern struct memory_bitmap *free_map; ++ ++#define PagePageset1(page) \ ++ (memory_bm_test_bit(pageset1_map, page_to_pfn(page))) ++#define SetPagePageset1(page) \ ++ (memory_bm_set_bit(pageset1_map, page_to_pfn(page))) ++#define ClearPagePageset1(page) \ ++ (memory_bm_clear_bit(pageset1_map, page_to_pfn(page))) ++ ++#define PagePageset1Copy(page) \ ++ (memory_bm_test_bit(pageset1_copy_map, page_to_pfn(page))) ++#define SetPagePageset1Copy(page) \ ++ (memory_bm_set_bit(pageset1_copy_map, page_to_pfn(page))) ++#define ClearPagePageset1Copy(page) \ ++ (memory_bm_clear_bit(pageset1_copy_map, page_to_pfn(page))) ++ ++#define PagePageset2(page) \ ++ (memory_bm_test_bit(pageset2_map, page_to_pfn(page))) ++#define SetPagePageset2(page) \ ++ (memory_bm_set_bit(pageset2_map, page_to_pfn(page))) ++#define ClearPagePageset2(page) \ ++ (memory_bm_clear_bit(pageset2_map, page_to_pfn(page))) ++ ++#define PageWasRW(page) \ ++ (memory_bm_test_bit(pageset2_map, page_to_pfn(page))) ++#define SetPageWasRW(page) \ ++ (memory_bm_set_bit(pageset2_map, page_to_pfn(page))) ++#define ClearPageWasRW(page) \ ++ (memory_bm_clear_bit(pageset2_map, page_to_pfn(page))) ++ ++#define PageResave(page) (page_resave_map ? \ ++ memory_bm_test_bit(page_resave_map, page_to_pfn(page)) : 0) ++#define SetPageResave(page) \ ++ (memory_bm_set_bit(page_resave_map, page_to_pfn(page))) ++#define ClearPageResave(page) \ ++ (memory_bm_clear_bit(page_resave_map, page_to_pfn(page))) ++ ++#define PageNosave(page) (nosave_map ? \ ++ memory_bm_test_bit(nosave_map, page_to_pfn(page)) : 0) ++#define SetPageNosave(page) \ ++ (memory_bm_set_bit(nosave_map, page_to_pfn(page))) ++#define ClearPageNosave(page) \ ++ (memory_bm_clear_bit(nosave_map, page_to_pfn(page))) ++ ++#define PageNosaveFree(page) (free_map ? \ ++ memory_bm_test_bit(free_map, page_to_pfn(page)) : 0) ++#define SetPageNosaveFree(page) \ ++ (memory_bm_set_bit(free_map, page_to_pfn(page))) ++#define ClearPageNosaveFree(page) \ ++ (memory_bm_clear_bit(free_map, page_to_pfn(page))) ++ ++extern void save_pageflags(struct memory_bitmap *pagemap); ++extern int load_pageflags(struct memory_bitmap *pagemap); ++extern int toi_pageflags_space_needed(void); ++#endif +diff --git a/kernel/power/tuxonice_power_off.c b/kernel/power/tuxonice_power_off.c +new file mode 100644 +index 0000000..1604a95 +--- /dev/null ++++ b/kernel/power/tuxonice_power_off.c +@@ -0,0 +1,287 @@ ++/* ++ * kernel/power/tuxonice_power_off.c ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Support for powering down. 
++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "tuxonice.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_io.h" ++ ++unsigned long toi_poweroff_method; /* 0 - Kernel power off */ ++EXPORT_SYMBOL_GPL(toi_poweroff_method); ++ ++static int wake_delay; ++static char lid_state_file[256], wake_alarm_dir[256]; ++static struct file *lid_file, *alarm_file, *epoch_file; ++static int post_wake_state = -1; ++ ++static int did_suspend_to_both; ++ ++/* ++ * __toi_power_down ++ * Functionality : Powers down or reboots the computer once the image ++ * has been written to disk. ++ * Key Assumptions : Able to reboot/power down via code called or that ++ * the warning emitted if the calls fail will be visible ++ * to the user (ie printk resumes devices). ++ */ ++ ++static void __toi_power_down(int method) ++{ ++ int error; ++ ++ toi_cond_pause(1, test_action_state(TOI_REBOOT) ? "Ready to reboot." : ++ "Powering down."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto out; ++ ++ if (test_action_state(TOI_REBOOT)) ++ kernel_restart(NULL); ++ ++ switch (method) { ++ case 0: ++ break; ++ case 3: ++ /* ++ * Re-read the overwritten part of pageset2 to make post-resume ++ * faster. ++ */ ++ if (read_pageset2(1)) ++ panic("Attempt to reload pagedir 2 failed. " ++ "Try rebooting."); ++ ++ pm_prepare_console(); ++ ++ error = pm_notifier_call_chain(PM_SUSPEND_PREPARE); ++ if (!error) { ++ pm_restore_gfp_mask(); ++ error = suspend_devices_and_enter(PM_SUSPEND_MEM); ++ pm_restrict_gfp_mask(); ++ if (!error) ++ did_suspend_to_both = 1; ++ } ++ pm_notifier_call_chain(PM_POST_SUSPEND); ++ pm_restore_console(); ++ ++ /* Success - we're now post-resume-from-ram */ ++ if (did_suspend_to_both) ++ return; ++ ++ /* Failed to suspend to ram - do normal power off */ ++ break; ++ case 4: ++ /* ++ * If succeeds, doesn't return. If fails, do a simple ++ * powerdown. ++ */ ++ hibernation_platform_enter(); ++ break; ++ case 5: ++ /* Historic entry only now */ ++ break; ++ } ++ ++ if (method && method != 5) ++ toi_cond_pause(1, ++ "Falling back to alternate power off method."); ++ ++ if (test_result_state(TOI_ABORTED)) ++ goto out; ++ ++ kernel_power_off(); ++ kernel_halt(); ++ toi_cond_pause(1, "Powerdown failed."); ++ while (1) ++ cpu_relax(); ++ ++out: ++ if (read_pageset2(1)) ++ panic("Attempt to reload pagedir 2 failed. 
Try rebooting."); ++ return; ++} ++ ++#define CLOSE_FILE(file) \ ++ if (file) { \ ++ filp_close(file, NULL); file = NULL; \ ++ } ++ ++static void powerdown_cleanup(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return; ++ ++ CLOSE_FILE(lid_file); ++ CLOSE_FILE(alarm_file); ++ CLOSE_FILE(epoch_file); ++} ++ ++static void open_file(char *format, char *arg, struct file **var, int mode, ++ char *desc) ++{ ++ char buf[256]; ++ ++ if (strlen(arg)) { ++ sprintf(buf, format, arg); ++ *var = filp_open(buf, mode, 0); ++ if (IS_ERR(*var) || !*var) { ++ printk(KERN_INFO "Failed to open %s file '%s' (%p).\n", ++ desc, buf, *var); ++ *var = NULL; ++ } ++ } ++} ++ ++static int powerdown_init(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return 0; ++ ++ did_suspend_to_both = 0; ++ ++ open_file("/proc/acpi/button/%s/state", lid_state_file, &lid_file, ++ O_RDONLY, "lid"); ++ ++ if (strlen(wake_alarm_dir)) { ++ open_file("/sys/class/rtc/%s/wakealarm", wake_alarm_dir, ++ &alarm_file, O_WRONLY, "alarm"); ++ ++ open_file("/sys/class/rtc/%s/since_epoch", wake_alarm_dir, ++ &epoch_file, O_RDONLY, "epoch"); ++ } ++ ++ return 0; ++} ++ ++static int lid_closed(void) ++{ ++ char array[25]; ++ ssize_t size; ++ loff_t pos = 0; ++ ++ if (!lid_file) ++ return 0; ++ ++ size = vfs_read(lid_file, (char __user *) array, 25, &pos); ++ if ((int) size < 1) { ++ printk(KERN_INFO "Failed to read lid state file (%d).\n", ++ (int) size); ++ return 0; ++ } ++ ++ if (!strcmp(array, "state: closed\n")) ++ return 1; ++ ++ return 0; ++} ++ ++static void write_alarm_file(int value) ++{ ++ ssize_t size; ++ char buf[40]; ++ loff_t pos = 0; ++ ++ if (!alarm_file) ++ return; ++ ++ sprintf(buf, "%d\n", value); ++ ++ size = vfs_write(alarm_file, (char __user *)buf, strlen(buf), &pos); ++ ++ if (size < 0) ++ printk(KERN_INFO "Error %d writing alarm value %s.\n", ++ (int) size, buf); ++} ++ ++/** ++ * toi_check_resleep: See whether to powerdown again after waking. ++ * ++ * After waking, check whether we should powerdown again in a (usually ++ * different) way. We only do this if the lid switch is still closed. ++ */ ++void toi_check_resleep(void) ++{ ++ /* We only return if we suspended to ram and woke. */ ++ if (lid_closed() && post_wake_state >= 0) ++ __toi_power_down(post_wake_state); ++} ++ ++void toi_power_down(void) ++{ ++ if (alarm_file && wake_delay) { ++ char array[25]; ++ loff_t pos = 0; ++ size_t size = vfs_read(epoch_file, (char __user *) array, 25, ++ &pos); ++ ++ if (((int) size) < 1) ++ printk(KERN_INFO "Failed to read epoch file (%d).\n", ++ (int) size); ++ else { ++ unsigned long since_epoch; ++ if (!strict_strtoul(array, 0, &since_epoch)) { ++ /* Clear any wakeup time. */ ++ write_alarm_file(0); ++ ++ /* Set new wakeup time. 
*/ ++ write_alarm_file(since_epoch + wake_delay); ++ } ++ } ++ } ++ ++ __toi_power_down(toi_poweroff_method); ++ ++ toi_check_resleep(); ++} ++EXPORT_SYMBOL_GPL(toi_power_down); ++ ++static struct toi_sysfs_data sysfs_params[] = { ++#if defined(CONFIG_ACPI) ++ SYSFS_STRING("lid_file", SYSFS_RW, lid_state_file, 256, 0, NULL), ++ SYSFS_INT("wake_delay", SYSFS_RW, &wake_delay, 0, INT_MAX, 0, NULL), ++ SYSFS_STRING("wake_alarm_dir", SYSFS_RW, wake_alarm_dir, 256, 0, NULL), ++ SYSFS_INT("post_wake_state", SYSFS_RW, &post_wake_state, -1, 5, 0, ++ NULL), ++ SYSFS_UL("powerdown_method", SYSFS_RW, &toi_poweroff_method, 0, 5, 0), ++ SYSFS_INT("did_suspend_to_both", SYSFS_READONLY, &did_suspend_to_both, ++ 0, 0, 0, NULL) ++#endif ++}; ++ ++static struct toi_module_ops powerdown_ops = { ++ .type = MISC_HIDDEN_MODULE, ++ .name = "poweroff", ++ .initialise = powerdown_init, ++ .cleanup = powerdown_cleanup, ++ .directory = "[ROOT]", ++ .module = THIS_MODULE, ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++int toi_poweroff_init(void) ++{ ++ return toi_register_module(&powerdown_ops); ++} ++ ++void toi_poweroff_exit(void) ++{ ++ toi_unregister_module(&powerdown_ops); ++} +diff --git a/kernel/power/tuxonice_power_off.h b/kernel/power/tuxonice_power_off.h +new file mode 100644 +index 0000000..9aa0ea8 +--- /dev/null ++++ b/kernel/power/tuxonice_power_off.h +@@ -0,0 +1,24 @@ ++/* ++ * kernel/power/tuxonice_power_off.h ++ * ++ * Copyright (C) 2006-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Support for powering down. ++ */ ++ ++int toi_pm_state_finish(void); ++void toi_power_down(void); ++extern unsigned long toi_poweroff_method; ++int toi_poweroff_init(void); ++void toi_poweroff_exit(void); ++void toi_check_resleep(void); ++ ++extern int platform_begin(int platform_mode); ++extern int platform_pre_snapshot(int platform_mode); ++extern void platform_leave(int platform_mode); ++extern void platform_end(int platform_mode); ++extern void platform_finish(int platform_mode); ++extern int platform_pre_restore(int platform_mode); ++extern void platform_restore_cleanup(int platform_mode); +diff --git a/kernel/power/tuxonice_prepare_image.c b/kernel/power/tuxonice_prepare_image.c +new file mode 100644 +index 0000000..a2d4259 +--- /dev/null ++++ b/kernel/power/tuxonice_prepare_image.c +@@ -0,0 +1,1115 @@ ++/* ++ * kernel/power/tuxonice_prepare_image.c ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * We need to eat memory until we can: ++ * 1. Perform the save without changing anything (RAM_NEEDED < #pages) ++ * 2. Fit it all in available space (toiActiveAllocator->available_space() >= ++ * main_storage_needed()) ++ * 3. Reload the pagedir and pageset1 to places that don't collide with their ++ * final destinations, not knowing to what extent the resumed kernel will ++ * overlap with the one loaded at boot time. I think the resumed kernel ++ * should overlap completely, but I don't want to rely on this as it is ++ * an unproven assumption. We therefore assume there will be no overlap at ++ * all (worst case). ++ * 4. Meet the user's requested limit (if any) on the size of the image. ++ * The limit is in MB, so pages/256 (assuming 4K pages).
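++ *    For instance (illustrative numbers only): an image_size_limit of
++ *    500 (MB) becomes 500 << 8 = 128000 4K pages, which is how
++ *    any_to_free() below derives its soft limit.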
++ * ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_pageflags.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_prepare_image.h" ++#include "tuxonice.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_checksum.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_atomic_copy.h" ++#include "tuxonice_builtin.h" ++ ++static unsigned long num_nosave, main_storage_allocated, storage_limit, ++ header_storage_needed; ++unsigned long extra_pd1_pages_allowance = ++ CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE; ++long image_size_limit = CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT; ++static int no_ps2_needed; ++ ++struct attention_list { ++ struct task_struct *task; ++ struct attention_list *next; ++}; ++ ++static struct attention_list *attention_list; ++ ++#define PAGESET1 0 ++#define PAGESET2 1 ++ ++void free_attention_list(void) ++{ ++ struct attention_list *last = NULL; ++ ++ while (attention_list) { ++ last = attention_list; ++ attention_list = attention_list->next; ++ toi_kfree(6, last, sizeof(*last)); ++ } ++} ++ ++static int build_attention_list(void) ++{ ++ int i, task_count = 0; ++ struct task_struct *p; ++ struct attention_list *next; ++ ++ /* ++ * Count all userspace process (with task->mm) marked PF_NOFREEZE. ++ */ ++ toi_read_lock_tasklist(); ++ for_each_process(p) ++ if ((p->flags & PF_NOFREEZE) || p == current) ++ task_count++; ++ toi_read_unlock_tasklist(); ++ ++ /* ++ * Allocate attention list structs. ++ */ ++ for (i = 0; i < task_count; i++) { ++ struct attention_list *this = ++ toi_kzalloc(6, sizeof(struct attention_list), ++ TOI_WAIT_GFP); ++ if (!this) { ++ printk(KERN_INFO "Failed to allocate slab for " ++ "attention list.\n"); ++ free_attention_list(); ++ return 1; ++ } ++ this->next = NULL; ++ if (attention_list) ++ this->next = attention_list; ++ attention_list = this; ++ } ++ ++ next = attention_list; ++ toi_read_lock_tasklist(); ++ for_each_process(p) ++ if ((p->flags & PF_NOFREEZE) || p == current) { ++ next->task = p; ++ next = next->next; ++ } ++ toi_read_unlock_tasklist(); ++ return 0; ++} ++ ++static void pageset2_full(void) ++{ ++ struct zone *zone; ++ struct page *page; ++ unsigned long flags; ++ int i; ++ ++ for_each_populated_zone(zone) { ++ spin_lock_irqsave(&zone->lru_lock, flags); ++ for_each_lru(i) { ++ if (!zone_page_state(zone, NR_LRU_BASE + i)) ++ continue; ++ ++ list_for_each_entry(page, &zone->lruvec.lists[i], lru) { ++ struct address_space *mapping; ++ ++ mapping = page_mapping(page); ++ if (!mapping || !mapping->host || ++ !(mapping->host->i_flags & S_ATOMIC_COPY)) ++ SetPagePageset2(page); ++ } ++ } ++ spin_unlock_irqrestore(&zone->lru_lock, flags); ++ } ++} ++ ++/* ++ * toi_mark_task_as_pageset ++ * Functionality : Marks all the saveable pages belonging to a given process ++ * as belonging to a particular pageset. 
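++ *
++ * Typical call pattern (a sketch based on mark_tasks() and the
++ * attention list handling in toi_mark_pages_for_pageset2() below):
++ *
++ *	toi_mark_task_as_pageset(p, PAGESET2);	most userspace tasks
++ *	toi_mark_task_as_pageset(p, PAGESET1);	tasks on the attention list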
++ */ ++ ++static void toi_mark_task_as_pageset(struct task_struct *t, int pageset2) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ ++ mm = t->active_mm; ++ ++ if (!mm || !mm->mmap) ++ return; ++ ++ if (!irqs_disabled()) ++ down_read(&mm->mmap_sem); ++ ++ for (vma = mm->mmap; vma; vma = vma->vm_next) { ++ unsigned long posn; ++ ++ if (!vma->vm_start || ++ vma->vm_flags & (VM_IO | VM_DONTDUMP | VM_PFNMAP)) ++ continue; ++ ++ for (posn = vma->vm_start; posn < vma->vm_end; ++ posn += PAGE_SIZE) { ++ struct page *page = follow_page(vma, posn, 0); ++ struct address_space *mapping; ++ ++ if (!page || !pfn_valid(page_to_pfn(page))) ++ continue; ++ ++ mapping = page_mapping(page); ++ if (mapping && mapping->host && ++ mapping->host->i_flags & S_ATOMIC_COPY) ++ continue; ++ ++ if (pageset2) ++ SetPagePageset2(page); ++ else { ++ ClearPagePageset2(page); ++ SetPagePageset1(page); ++ } ++ } ++ } ++ ++ if (!irqs_disabled()) ++ up_read(&mm->mmap_sem); ++} ++ ++static void mark_tasks(int pageset) ++{ ++ struct task_struct *p; ++ ++ toi_read_lock_tasklist(); ++ for_each_process(p) { ++ if (!p->mm) ++ continue; ++ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ toi_mark_task_as_pageset(p, pageset); ++ } ++ toi_read_unlock_tasklist(); ++ ++} ++ ++/* mark_pages_for_pageset2 ++ * ++ * Description: Mark unshared pages in processes not needed for hibernate as ++ * being able to be written out in a separate pagedir. ++ * HighMem pages are simply marked as pageset2. They won't be ++ * needed during hibernate. ++ */ ++ ++static void toi_mark_pages_for_pageset2(void) ++{ ++ struct attention_list *this = attention_list; ++ ++ memory_bm_clear(pageset2_map); ++ ++ if (test_action_state(TOI_NO_PAGESET2) || no_ps2_needed) ++ return; ++ ++ if (test_action_state(TOI_PAGESET2_FULL)) ++ pageset2_full(); ++ else ++ mark_tasks(PAGESET2); ++ ++ /* ++ * Because the tasks in attention_list are ones related to hibernating, ++ * we know that they won't go away under us. ++ */ ++ ++ while (this) { ++ if (!test_result_state(TOI_ABORTED)) ++ toi_mark_task_as_pageset(this->task, PAGESET1); ++ this = this->next; ++ } ++} ++ ++/* ++ * The atomic copy of pageset1 is stored in pageset2 pages. ++ * But if pageset1 is larger (normally only just after boot), ++ * we need to allocate extra pages to store the atomic copy. ++ * The following data struct and functions are used to handle ++ * the allocation and freeing of that memory. ++ */ ++ ++static unsigned long extra_pages_allocated; ++ ++struct extras { ++ struct page *page; ++ int order; ++ struct extras *next; ++}; ++ ++static struct extras *extras_list; ++ ++/* toi_free_extra_pagedir_memory ++ * ++ * Description: Free previously allocated extra pagedir memory. ++ */ ++void toi_free_extra_pagedir_memory(void) ++{ ++ /* Free allocated pages */ ++ while (extras_list) { ++ struct extras *this = extras_list; ++ int i; ++ ++ extras_list = this->next; ++ ++ for (i = 0; i < (1 << this->order); i++) ++ ClearPageNosave(this->page + i); ++ ++ toi_free_pages(9, this->page, this->order); ++ toi_kfree(7, this, sizeof(*this)); ++ } ++ ++ extra_pages_allocated = 0; ++} ++ ++/* toi_allocate_extra_pagedir_memory ++ * ++ * Description: Allocate memory for making the atomic copy of pagedir1 in the ++ * case where it is bigger than pagedir2. ++ * Arguments: int num_to_alloc: Number of extra pages needed. ++ * Result: int. Number of extra pages we now have allocated. 
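++ *
++ * Worked example (illustrative numbers only): for num_to_alloc = 5,
++ * fls(5) = 3, so we first consider order 3 (8 pages), step down to
++ * order 2 (4 pages), allocate those, then satisfy the remaining page
++ * with an order 0 allocation.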
++ */ ++static int toi_allocate_extra_pagedir_memory(int extra_pages_needed) ++{ ++ int j, order, num_to_alloc = extra_pages_needed - extra_pages_allocated; ++ gfp_t flags = TOI_ATOMIC_GFP; ++ ++ if (num_to_alloc < 1) ++ return 0; ++ ++ order = fls(num_to_alloc); ++ if (order >= MAX_ORDER) ++ order = MAX_ORDER - 1; ++ ++ while (num_to_alloc) { ++ struct page *newpage; ++ unsigned long virt; ++ struct extras *extras_entry; ++ ++ while ((1 << order) > num_to_alloc) ++ order--; ++ ++ extras_entry = (struct extras *) toi_kzalloc(7, ++ sizeof(struct extras), TOI_ATOMIC_GFP); ++ ++ if (!extras_entry) ++ return extra_pages_allocated; ++ ++ virt = toi_get_free_pages(9, flags, order); ++ while (!virt && order) { ++ order--; ++ virt = toi_get_free_pages(9, flags, order); ++ } ++ ++ if (!virt) { ++ toi_kfree(7, extras_entry, sizeof(*extras_entry)); ++ return extra_pages_allocated; ++ } ++ ++ newpage = virt_to_page(virt); ++ ++ extras_entry->page = newpage; ++ extras_entry->order = order; ++ extras_entry->next = extras_list; ++ ++ extras_list = extras_entry; ++ ++ for (j = 0; j < (1 << order); j++) { ++ SetPageNosave(newpage + j); ++ SetPagePageset1Copy(newpage + j); ++ } ++ ++ extra_pages_allocated += (1 << order); ++ num_to_alloc -= (1 << order); ++ } ++ ++ return extra_pages_allocated; ++} ++ ++/* ++ * real_nr_free_pages: Count pcp pages for a zone type or all zones ++ * (-1 for all, otherwise zone_idx() result desired). ++ */ ++unsigned long real_nr_free_pages(unsigned long zone_idx_mask) ++{ ++ struct zone *zone; ++ int result = 0, cpu; ++ ++ /* PCP lists */ ++ for_each_populated_zone(zone) { ++ if (!(zone_idx_mask & (1 << zone_idx(zone)))) ++ continue; ++ ++ for_each_online_cpu(cpu) { ++ struct per_cpu_pageset *pset = ++ per_cpu_ptr(zone->pageset, cpu); ++ struct per_cpu_pages *pcp = &pset->pcp; ++ result += pcp->count; ++ } ++ ++ result += zone_page_state(zone, NR_FREE_PAGES); ++ } ++ return result; ++} ++EXPORT_SYMBOL_GPL(real_nr_free_pages); ++ ++/* ++ * Discover how much extra memory will be required by the drivers ++ * when they're asked to hibernate. We can then ensure that amount ++ * of memory is available when we really want it. ++ */ ++static void get_extra_pd1_allowance(void) ++{ ++ unsigned long orig_num_free = real_nr_free_pages(all_zones_mask), final; ++ ++ toi_prepare_status(CLEAR_BAR, "Finding allowance for drivers."); ++ ++ if (toi_go_atomic(PMSG_FREEZE, 1)) ++ return; ++ ++ final = real_nr_free_pages(all_zones_mask); ++ toi_end_atomic(ATOMIC_ALL_STEPS, 1, 0); ++ ++ extra_pd1_pages_allowance = (orig_num_free > final) ? ++ orig_num_free - final + MIN_EXTRA_PAGES_ALLOWANCE : ++ MIN_EXTRA_PAGES_ALLOWANCE; ++} ++ ++/* ++ * Amount of storage needed, possibly taking into account the ++ * expected compression ratio and possibly also ignoring our ++ * allowance for extra pages. ++ */ ++static unsigned long main_storage_needed(int use_ecr, ++ int ignore_extra_pd1_allow) ++{ ++ return (pagedir1.size + pagedir2.size + ++ (ignore_extra_pd1_allow ? 0 : extra_pd1_pages_allowance)) * ++ (use_ecr ? toi_expected_compression_ratio() : 100) / 100; ++} ++ ++/* ++ * Storage needed for the image header, in bytes until the return. 
++ */ ++unsigned long get_header_storage_needed(void) ++{ ++ unsigned long bytes = sizeof(struct toi_header) + ++ toi_header_storage_for_modules() + ++ toi_pageflags_space_needed() + ++ fs_info_space_needed(); ++ ++ return DIV_ROUND_UP(bytes, PAGE_SIZE); ++} ++EXPORT_SYMBOL_GPL(get_header_storage_needed); ++ ++/* ++ * When freeing memory, pages from either pageset might be freed. ++ * ++ * When seeking to free memory to be able to hibernate, for every ps1 page ++ * freed, we need 2 less pages for the atomic copy because there is one less ++ * page to copy and one more page into which data can be copied. ++ * ++ * Freeing ps2 pages saves us nothing directly. No more memory is available ++ * for the atomic copy. Indirectly, a ps1 page might be freed (slab?), but ++ * that's too much work to figure out. ++ * ++ * => ps1_to_free functions ++ * ++ * Of course if we just want to reduce the image size, because of storage ++ * limitations or an image size limit either ps will do. ++ * ++ * => any_to_free function ++ */ ++ ++static unsigned long lowpages_usable_for_highmem_copy(void) ++{ ++ unsigned long needed = get_lowmem_size(pagedir1) + ++ extra_pd1_pages_allowance + MIN_FREE_RAM + ++ toi_memory_for_modules(0), ++ available = get_lowmem_size(pagedir2) + ++ real_nr_free_low_pages() + extra_pages_allocated; ++ ++ return available > needed ? available - needed : 0; ++} ++ ++static unsigned long highpages_ps1_to_free(void) ++{ ++ unsigned long need = get_highmem_size(pagedir1), ++ available = get_highmem_size(pagedir2) + ++ real_nr_free_high_pages() + ++ lowpages_usable_for_highmem_copy(); ++ ++ return need > available ? DIV_ROUND_UP(need - available, 2) : 0; ++} ++ ++static unsigned long lowpages_ps1_to_free(void) ++{ ++ unsigned long needed = get_lowmem_size(pagedir1) + ++ extra_pd1_pages_allowance + MIN_FREE_RAM + ++ toi_memory_for_modules(0), ++ available = get_lowmem_size(pagedir2) + ++ real_nr_free_low_pages() + extra_pages_allocated; ++ ++ return needed > available ? DIV_ROUND_UP(needed - available, 2) : 0; ++} ++ ++static unsigned long current_image_size(void) ++{ ++ return pagedir1.size + pagedir2.size + header_storage_needed; ++} ++ ++static unsigned long storage_still_required(void) ++{ ++ unsigned long needed = main_storage_needed(1, 1); ++ return needed > storage_limit ? needed - storage_limit : 0; ++} ++ ++static unsigned long ram_still_required(void) ++{ ++ unsigned long needed = MIN_FREE_RAM + toi_memory_for_modules(0) + ++ 2 * extra_pd1_pages_allowance, ++ available = real_nr_free_low_pages() + extra_pages_allocated; ++ return needed > available ? needed - available : 0; ++} ++ ++unsigned long any_to_free(int use_image_size_limit) ++{ ++ int use_soft_limit = use_image_size_limit && image_size_limit > 0; ++ unsigned long current_size = current_image_size(), ++ soft_limit = use_soft_limit ? (image_size_limit << 8) : 0, ++ to_free = use_soft_limit ? (current_size > soft_limit ? ++ current_size - soft_limit : 0) : 0, ++ storage_limit = storage_still_required(), ++ ram_limit = ram_still_required(), ++ first_max = max(to_free, storage_limit); ++ ++ return max(first_max, ram_limit); ++} ++ ++static int need_pageset2(void) ++{ ++ return (real_nr_free_low_pages() + extra_pages_allocated - ++ 2 * extra_pd1_pages_allowance - MIN_FREE_RAM - ++ toi_memory_for_modules(0) - pagedir1.size) < pagedir2.size; ++} ++ ++/* amount_needed ++ * ++ * Calculates the amount by which the image size needs to be reduced to meet ++ * our constraints. 
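++ *
++ * E.g. (made-up numbers): if 1000 highmem + 200 lowmem pageset1 pages
++ * must be freed but the storage/RAM constraints only ask for 800
++ * pages, the result is max(1200, 800) = 1200.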
++ */ ++static unsigned long amount_needed(int use_image_size_limit) ++{ ++ return max(highpages_ps1_to_free() + lowpages_ps1_to_free(), ++ any_to_free(use_image_size_limit)); ++} ++ ++static int image_not_ready(int use_image_size_limit) ++{ ++ toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, ++ "Amount still needed (%lu) > 0:%u," ++ " Storage allocd: %lu < %lu: %u.\n", ++ amount_needed(use_image_size_limit), ++ (amount_needed(use_image_size_limit) > 0), ++ main_storage_allocated, ++ main_storage_needed(1, 1), ++ main_storage_allocated < main_storage_needed(1, 1)); ++ ++ toi_cond_pause(0, NULL); ++ ++ return (amount_needed(use_image_size_limit) > 0) || ++ main_storage_allocated < main_storage_needed(1, 1); ++} ++ ++static void display_failure_reason(int tries_exceeded) ++{ ++ unsigned long storage_required = storage_still_required(), ++ ram_required = ram_still_required(), ++ high_ps1 = highpages_ps1_to_free(), ++ low_ps1 = lowpages_ps1_to_free(); ++ ++ printk(KERN_INFO "Failed to prepare the image because...\n"); ++ ++ if (!storage_limit) { ++ printk(KERN_INFO "- You need some storage available to be " ++ "able to hibernate.\n"); ++ return; ++ } ++ ++ if (tries_exceeded) ++ printk(KERN_INFO "- The maximum number of iterations was " ++ "reached without successfully preparing the " ++ "image.\n"); ++ ++ if (storage_required) { ++ printk(KERN_INFO " - We need at least %lu pages of storage " ++ "(ignoring the header), but only have %lu.\n", ++ main_storage_needed(1, 1), ++ main_storage_allocated); ++ set_abort_result(TOI_INSUFFICIENT_STORAGE); ++ } ++ ++ if (ram_required) { ++ printk(KERN_INFO " - We need %lu more free pages of low " ++ "memory.\n", ram_required); ++ printk(KERN_INFO " Minimum free : %8d\n", MIN_FREE_RAM); ++ printk(KERN_INFO " + Reqd. by modules : %8lu\n", ++ toi_memory_for_modules(0)); ++ printk(KERN_INFO " + 2 * extra allow : %8lu\n", ++ 2 * extra_pd1_pages_allowance); ++ printk(KERN_INFO " - Currently free : %8lu\n", ++ real_nr_free_low_pages()); ++ printk(KERN_INFO " - Pages allocd : %8lu\n", ++ extra_pages_allocated); ++ printk(KERN_INFO " : ========\n"); ++ printk(KERN_INFO " Still needed : %8lu\n", ++ ram_required); ++ ++ /* Print breakdown of memory needed for modules */ ++ toi_memory_for_modules(1); ++ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); ++ } ++ ++ if (high_ps1) { ++ printk(KERN_INFO "- We need to free %lu highmem pageset 1 " ++ "pages.\n", high_ps1); ++ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); ++ } ++ ++ if (low_ps1) { ++ printk(KERN_INFO " - We need to free %ld lowmem pageset 1 " ++ "pages.\n", low_ps1); ++ set_abort_result(TOI_UNABLE_TO_FREE_ENOUGH_MEMORY); ++ } ++} ++ ++static void display_stats(int always, int sub_extra_pd1_allow) ++{ ++ char buffer[255]; ++ snprintf(buffer, 254, ++ "Free:%lu(%lu). Sets:%lu(%lu),%lu(%lu). " ++ "Nosave:%lu-%lu=%lu. Storage:%lu/%lu(%lu=>%lu). 
" ++ "Needed:%lu,%lu,%lu(%u,%lu,%lu,%ld) (PS2:%s)\n", ++ ++ /* Free */ ++ real_nr_free_pages(all_zones_mask), ++ real_nr_free_low_pages(), ++ ++ /* Sets */ ++ pagedir1.size, pagedir1.size - get_highmem_size(pagedir1), ++ pagedir2.size, pagedir2.size - get_highmem_size(pagedir2), ++ ++ /* Nosave */ ++ num_nosave, extra_pages_allocated, ++ num_nosave - extra_pages_allocated, ++ ++ /* Storage */ ++ main_storage_allocated, ++ storage_limit, ++ main_storage_needed(1, sub_extra_pd1_allow), ++ main_storage_needed(1, 1), ++ ++ /* Needed */ ++ lowpages_ps1_to_free(), highpages_ps1_to_free(), ++ any_to_free(1), ++ MIN_FREE_RAM, toi_memory_for_modules(0), ++ extra_pd1_pages_allowance, ++ image_size_limit, ++ ++ need_pageset2() ? "yes" : "no"); ++ ++ if (always) ++ printk("%s", buffer); ++ else ++ toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 1, buffer); ++} ++ ++/* generate_free_page_map ++ * ++ * Description: This routine generates a bitmap of free pages from the ++ * lists used by the memory manager. We then use the bitmap ++ * to quickly calculate which pages to save and in which ++ * pagesets. ++ */ ++static void generate_free_page_map(void) ++{ ++ int order, cpu, t; ++ unsigned long flags, i; ++ struct zone *zone; ++ struct list_head *curr; ++ unsigned long pfn; ++ struct page *page; ++ ++ for_each_populated_zone(zone) { ++ ++ if (!zone->spanned_pages) ++ continue; ++ ++ spin_lock_irqsave(&zone->lock, flags); ++ ++ for (i = 0; i < zone->spanned_pages; i++) { ++ pfn = zone->zone_start_pfn + i; ++ ++ if (!pfn_valid(pfn)) ++ continue; ++ ++ page = pfn_to_page(pfn); ++ ++ ClearPageNosaveFree(page); ++ } ++ ++ for_each_migratetype_order(order, t) { ++ list_for_each(curr, ++ &zone->free_area[order].free_list[t]) { ++ unsigned long j; ++ ++ pfn = page_to_pfn(list_entry(curr, struct page, ++ lru)); ++ for (j = 0; j < (1UL << order); j++) ++ SetPageNosaveFree(pfn_to_page(pfn + j)); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct per_cpu_pageset *pset = ++ per_cpu_ptr(zone->pageset, cpu); ++ struct per_cpu_pages *pcp = &pset->pcp; ++ struct page *page; ++ int t; ++ ++ for (t = 0; t < MIGRATE_PCPTYPES; t++) ++ list_for_each_entry(page, &pcp->lists[t], lru) ++ SetPageNosaveFree(page); ++ } ++ ++ spin_unlock_irqrestore(&zone->lock, flags); ++ } ++} ++ ++/* size_of_free_region ++ * ++ * Description: Return the number of pages that are free, beginning with and ++ * including this one. ++ */ ++static int size_of_free_region(struct zone *zone, unsigned long start_pfn) ++{ ++ unsigned long this_pfn = start_pfn, ++ end_pfn = zone->zone_start_pfn + zone->spanned_pages - 1; ++ ++ while (pfn_valid(this_pfn) && this_pfn <= end_pfn && PageNosaveFree(pfn_to_page(this_pfn))) ++ this_pfn++; ++ ++ return this_pfn - start_pfn; ++} ++ ++/* flag_image_pages ++ * ++ * This routine generates our lists of pages to be stored in each ++ * pageset. Since we store the data using extents, and adding new ++ * extents might allocate a new extent page, this routine may well ++ * be called more than once. ++ */ ++static void flag_image_pages(int atomic_copy) ++{ ++ int num_free = 0; ++ unsigned long loop; ++ struct zone *zone; ++ ++ pagedir1.size = 0; ++ pagedir2.size = 0; ++ ++ set_highmem_size(pagedir1, 0); ++ set_highmem_size(pagedir2, 0); ++ ++ num_nosave = 0; ++ ++ memory_bm_clear(pageset1_map); ++ ++ generate_free_page_map(); ++ ++ /* ++ * Pages not to be saved are marked Nosave irrespective of being ++ * reserved. 
++ */ ++ for_each_populated_zone(zone) { ++ int highmem = is_highmem(zone); ++ ++ for (loop = 0; loop < zone->spanned_pages; loop++) { ++ unsigned long pfn = zone->zone_start_pfn + loop; ++ struct page *page; ++ int chunk_size; ++ ++ if (!pfn_valid(pfn)) ++ continue; ++ ++ chunk_size = size_of_free_region(zone, pfn); ++ if (chunk_size) { ++ num_free += chunk_size; ++ loop += chunk_size - 1; ++ continue; ++ } ++ ++ page = pfn_to_page(pfn); ++ ++ if (PageNosave(page)) { ++ num_nosave++; ++ continue; ++ } ++ ++ page = highmem ? saveable_highmem_page(zone, pfn) : ++ saveable_page(zone, pfn); ++ ++ if (!page) { ++ num_nosave++; ++ continue; ++ } ++ ++ if (PagePageset2(page)) { ++ pagedir2.size++; ++ if (PageHighMem(page)) ++ inc_highmem_size(pagedir2); ++ else ++ SetPagePageset1Copy(page); ++ if (PageResave(page)) { ++ SetPagePageset1(page); ++ ClearPagePageset1Copy(page); ++ pagedir1.size++; ++ if (PageHighMem(page)) ++ inc_highmem_size(pagedir1); ++ } ++ } else { ++ pagedir1.size++; ++ SetPagePageset1(page); ++ if (PageHighMem(page)) ++ inc_highmem_size(pagedir1); ++ } ++ } ++ } ++ ++ if (!atomic_copy) ++ toi_message(TOI_EAT_MEMORY, TOI_MEDIUM, 0, ++ "Count data pages: Set1 (%d) + Set2 (%d) + Nosave (%ld)" ++ " + NumFree (%d) = %d.\n", ++ pagedir1.size, pagedir2.size, num_nosave, num_free, ++ pagedir1.size + pagedir2.size + num_nosave + num_free); ++} ++ ++void toi_recalculate_image_contents(int atomic_copy) ++{ ++ memory_bm_clear(pageset1_map); ++ if (!atomic_copy) { ++ unsigned long pfn; ++ memory_bm_position_reset(pageset2_map); ++ for (pfn = memory_bm_next_pfn(pageset2_map); ++ pfn != BM_END_OF_MAP; ++ pfn = memory_bm_next_pfn(pageset2_map)) ++ ClearPagePageset1Copy(pfn_to_page(pfn)); ++ /* Need to call this before getting pageset1_size! */ ++ toi_mark_pages_for_pageset2(); ++ } ++ flag_image_pages(atomic_copy); ++ ++ if (!atomic_copy) { ++ storage_limit = toiActiveAllocator->storage_available(); ++ display_stats(0, 0); ++ } ++} ++ ++int try_allocate_extra_memory(void) ++{ ++ unsigned long wanted = pagedir1.size + extra_pd1_pages_allowance - ++ get_lowmem_size(pagedir2); ++ if (wanted > extra_pages_allocated) { ++ unsigned long got = toi_allocate_extra_pagedir_memory(wanted); ++ if (wanted < got) { ++ toi_message(TOI_EAT_MEMORY, TOI_LOW, 1, ++ "Want %d extra pages for pageset1, got %d.\n", ++ wanted, got); ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++ ++/* update_image ++ * ++ * Allocate [more] memory and storage for the image. ++ */ ++static void update_image(int ps2_recalc) ++{ ++ int old_header_req; ++ unsigned long seek; ++ ++ if (try_allocate_extra_memory()) ++ return; ++ ++ if (ps2_recalc) ++ goto recalc; ++ ++ thaw_kernel_threads(); ++ ++ /* ++ * Allocate remaining storage space, if possible, up to the ++ * maximum we know we'll need. It's okay to allocate the ++ * maximum if the writer is the swapwriter, but ++ * we don't want to grab all available space on an NFS share. ++ * We therefore ignore the expected compression ratio here, ++ * thereby trying to allocate the maximum image size we could ++ * need (assuming compression doesn't expand the image), but ++ * don't complain if we can't get the full amount we're after. ++ */ ++ ++ do { ++ int result; ++ ++ old_header_req = header_storage_needed; ++ toiActiveAllocator->reserve_header_space(header_storage_needed); ++ ++ /* How much storage is free with the reservation applied? 
*/ ++ storage_limit = toiActiveAllocator->storage_available(); ++ seek = min(storage_limit, main_storage_needed(0, 0)); ++ ++ result = toiActiveAllocator->allocate_storage(seek); ++ if (result) ++ printk("Failed to allocate storage (%d).\n", result); ++ ++ main_storage_allocated = ++ toiActiveAllocator->storage_allocated(); ++ ++ /* Need more header because more storage allocated? */ ++ header_storage_needed = get_header_storage_needed(); ++ ++ } while (header_storage_needed > old_header_req); ++ ++ if (freeze_kernel_threads()) ++ set_abort_result(TOI_FREEZING_FAILED); ++ ++recalc: ++ toi_recalculate_image_contents(0); ++} ++ ++/* attempt_to_freeze ++ * ++ * Try to freeze processes. ++ */ ++ ++static int attempt_to_freeze(void) ++{ ++ int result; ++ ++ /* Stop processes before checking again */ ++ toi_prepare_status(CLEAR_BAR, "Freezing processes & syncing " ++ "filesystems."); ++ result = freeze_processes(); ++ ++ if (result) ++ set_abort_result(TOI_FREEZING_FAILED); ++ ++ result = freeze_kernel_threads(); ++ ++ if (result) ++ set_abort_result(TOI_FREEZING_FAILED); ++ ++ return result; ++} ++ ++/* eat_memory ++ * ++ * Try to free some memory, either to meet hard or soft constraints on the image ++ * characteristics. ++ * ++ * Hard constraints: ++ * - Pageset1 must be < half of memory; ++ * - We must have enough memory free at resume time to have pageset1 ++ * be able to be loaded in pages that don't conflict with where it has to ++ * be restored. ++ * Soft constraints: ++ * - User specified image size limit. ++ */ ++static void eat_memory(void) ++{ ++ unsigned long amount_wanted = 0; ++ int did_eat_memory = 0; ++ ++ /* ++ * Note that if we have enough storage space and enough free memory, we ++ * may exit without eating anything. We give up when the last 10 ++ * iterations ate no extra pages because we're not going to get much ++ * more anyway, but the few pages we get will take a lot of time. ++ * ++ * We freeze processes before beginning, and then unfreeze them if we ++ * need to eat memory until we think we have enough. If our attempts ++ * to freeze fail, we give up and abort. ++ */ ++ ++ amount_wanted = amount_needed(1); ++ ++ switch (image_size_limit) { ++ case -1: /* Don't eat any memory */ ++ if (amount_wanted > 0) { ++ set_abort_result(TOI_WOULD_EAT_MEMORY); ++ return; ++ } ++ break; ++ case -2: /* Free caches only */ ++ drop_pagecache(); ++ toi_recalculate_image_contents(0); ++ amount_wanted = amount_needed(1); ++ break; ++ default: ++ break; ++ } ++ ++ if (amount_wanted > 0 && !test_result_state(TOI_ABORTED) && ++ image_size_limit != -1) { ++ unsigned long request = amount_wanted; ++ unsigned long high_req = max(highpages_ps1_to_free(), ++ any_to_free(1)); ++ unsigned long low_req = lowpages_ps1_to_free(); ++ unsigned long got = 0; ++ ++ toi_prepare_status(CLEAR_BAR, ++ "Seeking to free %ldMB of memory.", ++ MB(amount_wanted)); ++ ++ thaw_kernel_threads(); ++ ++ /* ++ * Ask for too many because shrink_memory_mask doesn't ++ * currently return enough most of the time.
++ */ ++ ++ if (low_req) ++ got = shrink_memory_mask(low_req, GFP_KERNEL); ++ if (high_req) ++ shrink_memory_mask(high_req - got, GFP_HIGHUSER); ++ ++ did_eat_memory = 1; ++ ++ toi_recalculate_image_contents(0); ++ ++ amount_wanted = amount_needed(1); ++ ++ printk(KERN_DEBUG "Asked shrink_memory_mask for %ld low pages &" ++ " %ld pages from anywhere, got %ld.\n", ++ high_req, low_req, ++ request - amount_wanted); ++ ++ toi_cond_pause(0, NULL); ++ ++ if (freeze_kernel_threads()) ++ set_abort_result(TOI_FREEZING_FAILED); ++ } ++ ++ if (did_eat_memory) ++ toi_recalculate_image_contents(0); ++} ++ ++/* toi_prepare_image ++ * ++ * Entry point to the whole image preparation section. ++ * ++ * We do four things: ++ * - Freeze processes; ++ * - Ensure image size constraints are met; ++ * - Complete all the preparation for saving the image, ++ * including allocation of storage. The only memory ++ * that should be needed when we're finished is that ++ * for actually storing the image (and we know how ++ * much is needed for that because the modules tell ++ * us). ++ * - Make sure that all dirty buffers are written out. ++ */ ++#define MAX_TRIES 2 ++int toi_prepare_image(void) ++{ ++ int result = 1, tries = 1; ++ ++ main_storage_allocated = 0; ++ no_ps2_needed = 0; ++ ++ if (attempt_to_freeze()) ++ return 1; ++ ++ if (!extra_pd1_pages_allowance) ++ get_extra_pd1_allowance(); ++ ++ storage_limit = toiActiveAllocator->storage_available(); ++ ++ if (!storage_limit) { ++ printk(KERN_INFO "No storage available. Didn't try to prepare " ++ "an image.\n"); ++ display_failure_reason(0); ++ set_abort_result(TOI_NOSTORAGE_AVAILABLE); ++ return 1; ++ } ++ ++ if (build_attention_list()) { ++ abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, ++ "Unable to successfully prepare the image.\n"); ++ return 1; ++ } ++ ++ toi_recalculate_image_contents(0); ++ ++ do { ++ toi_prepare_status(CLEAR_BAR, ++ "Preparing Image. Try %d.", tries); ++ ++ eat_memory(); ++ ++ if (test_result_state(TOI_ABORTED)) ++ break; ++ ++ update_image(0); ++ ++ tries++; ++ ++ } while (image_not_ready(1) && tries <= MAX_TRIES && ++ !test_result_state(TOI_ABORTED)); ++ ++ result = image_not_ready(0); ++ ++ if (!test_result_state(TOI_ABORTED)) { ++ if (result) { ++ display_stats(1, 0); ++ display_failure_reason(tries > MAX_TRIES); ++ abort_hibernate(TOI_UNABLE_TO_PREPARE_IMAGE, ++ "Unable to successfully prepare the image.\n"); ++ } else { ++ /* Pageset 2 needed? */ ++ if (!need_pageset2() && ++ test_action_state(TOI_NO_PS2_IF_UNNEEDED)) { ++ no_ps2_needed = 1; ++ toi_recalculate_image_contents(0); ++ update_image(1); ++ } ++ ++ toi_cond_pause(1, "Image preparation complete."); ++ } ++ } ++ ++ return result ? result : allocate_checksum_pages(); ++} +diff --git a/kernel/power/tuxonice_prepare_image.h b/kernel/power/tuxonice_prepare_image.h +new file mode 100644 +index 0000000..2a2ca0b +--- /dev/null ++++ b/kernel/power/tuxonice_prepare_image.h +@@ -0,0 +1,38 @@ ++/* ++ * kernel/power/tuxonice_prepare_image.h ++ * ++ * Copyright (C) 2003-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ * ++ */ ++ ++#include ++ ++extern int toi_prepare_image(void); ++extern void toi_recalculate_image_contents(int storage_available); ++extern unsigned long real_nr_free_pages(unsigned long zone_idx_mask); ++extern long image_size_limit; ++extern void toi_free_extra_pagedir_memory(void); ++extern unsigned long extra_pd1_pages_allowance; ++extern void free_attention_list(void); ++ ++#define MIN_FREE_RAM 100 ++#define MIN_EXTRA_PAGES_ALLOWANCE 500 ++ ++#define all_zones_mask ((unsigned long) ((1 << MAX_NR_ZONES) - 1)) ++#ifdef CONFIG_HIGHMEM ++#define real_nr_free_high_pages() (real_nr_free_pages(1 << ZONE_HIGHMEM)) ++#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask - \ ++ (1 << ZONE_HIGHMEM))) ++#else ++#define real_nr_free_high_pages() (0) ++#define real_nr_free_low_pages() (real_nr_free_pages(all_zones_mask)) ++ ++/* For eat_memory function */ ++#define ZONE_HIGHMEM (MAX_NR_ZONES + 1) ++#endif ++ ++unsigned long get_header_storage_needed(void); ++unsigned long any_to_free(int use_image_size_limit); ++int try_allocate_extra_memory(void); +diff --git a/kernel/power/tuxonice_prune.c b/kernel/power/tuxonice_prune.c +new file mode 100644 +index 0000000..9a9444d +--- /dev/null ++++ b/kernel/power/tuxonice_prune.c +@@ -0,0 +1,419 @@ ++/* ++ * kernel/power/tuxonice_prune.c ++ * ++ * Copyright (C) 2012 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file implements a TuxOnIce module that seeks to prune the ++ * amount of data written to disk. It builds a table of hashes ++ * of the uncompressed data, and writes the pfn of the previous page ++ * with the same contents instead of repeating the data when a match ++ * is found. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_builtin.h" ++#include "tuxonice.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_alloc.h" ++ ++/* ++ * We never write a page bigger than PAGE_SIZE, so use a large number ++ * to indicate that data is a PFN. ++ */ ++#define PRUNE_DATA_IS_PFN (PAGE_SIZE + 100) ++ ++static unsigned long toi_pruned_pages; ++ ++static struct toi_module_ops toi_prune_ops; ++static struct toi_module_ops *next_driver; ++ ++static char toi_prune_hash_algo_name[32] = "sha1"; ++ ++static DEFINE_MUTEX(stats_lock); ++ ++struct cpu_context { ++ struct shash_desc desc; ++ char *digest; ++}; ++ ++#define OUT_BUF_SIZE (2 * PAGE_SIZE) ++ ++static DEFINE_PER_CPU(struct cpu_context, contexts); ++ ++/* ++ * toi_crypto_prepare ++ * ++ * Prepare to do some work by allocating buffers and transforms. 
++ */ ++static int toi_prune_crypto_prepare(void) ++{ ++ int cpu, ret, digestsize = 0; ++ ++ if (!*toi_prune_hash_algo_name) { ++ printk(KERN_INFO "TuxOnIce: Pruning enabled but no " ++ "hash algorithm set.\n"); ++ return 1; ++ } ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ this->desc.tfm = crypto_alloc_shash(toi_prune_hash_algo_name, 0, 0); ++ if (IS_ERR(this->desc.tfm)) { ++ printk(KERN_INFO "TuxOnIce: Failed to allocate the " ++ "%s prune hash algorithm.\n", ++ toi_prune_hash_algo_name); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ ++ if (!digestsize) ++ digestsize = crypto_shash_digestsize(this->desc.tfm); ++ ++ this->digest = kmalloc(digestsize, GFP_KERNEL); ++ if (!this->digest) { ++ printk(KERN_INFO "TuxOnIce: Failed to allocate space " ++ "for digest output.\n"); ++ crypto_free_shash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ ++ this->desc.flags = 0; ++ ++ ret = crypto_shash_init(&this->desc); ++ if (ret < 0) { ++ printk(KERN_INFO "TuxOnIce: Failed to initialise the " ++ "%s prune hash algorithm.\n", ++ toi_prune_hash_algo_name); ++ kfree(this->digest); ++ this->digest = NULL; ++ crypto_free_shash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ return 1; ++ } ++ } ++ ++ return 0; ++} ++ ++static int toi_prune_rw_cleanup(int writing) ++{ ++ int cpu; ++ ++ for_each_online_cpu(cpu) { ++ struct cpu_context *this = &per_cpu(contexts, cpu); ++ if (this->desc.tfm) { ++ crypto_free_shash(this->desc.tfm); ++ this->desc.tfm = NULL; ++ } ++ ++ if (this->digest) { ++ kfree(this->digest); ++ this->digest = NULL; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_prune_init ++ */ ++ ++static int toi_prune_init(int toi_or_resume) ++{ ++ if (!toi_or_resume) ++ return 0; ++ ++ toi_pruned_pages = 0; ++ ++ next_driver = toi_get_next_filter(&toi_prune_ops); ++ ++ return next_driver ? 0 : -ECHILD; ++} ++ ++/* ++ * toi_prune_rw_init() ++ */ ++ ++static int toi_prune_rw_init(int rw, int stream_number) ++{ ++ if (toi_prune_crypto_prepare()) { ++ printk(KERN_ERR "Failed to initialise prune " ++ "algorithm.\n"); ++ if (rw == READ) { ++ printk(KERN_INFO "Unable to read the image.\n"); ++ return -ENODEV; ++ } else { ++ printk(KERN_INFO "Continuing without " ++ "pruning the image.\n"); ++ toi_prune_ops.enabled = 0; ++ } ++ } ++ ++ return 0; ++} ++ ++/* ++ * toi_prune_write_page() ++ * ++ * Hash a page of data and pass it on to the next module in the ++ * pipeline. ++ * ++ * Buffer_page: Pointer to a buffer of size PAGE_SIZE, containing ++ * data to be checked. ++ * ++ * Returns: 0 on success. Otherwise the error is that returned by later ++ * modules, or -ECHILD if we have a broken pipeline.
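++ *
++ * Intended flow, going by the description at the top of this file (the
++ * code below currently only computes the digest and updates the
++ * statistics, so treat this as a sketch of the design):
++ *
++ *	1. hash the page with the configured shash algorithm;
++ *	2. if an earlier page had the same hash, write PRUNE_DATA_IS_PFN
++ *	   as the length and the pfn of that earlier page instead of the
++ *	   data;
++ *	3. otherwise remember the hash and write the page as usual.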
++ */ ++static int toi_prune_write_page(unsigned long index, int buf_type, ++ void *buffer_page, unsigned int buf_size) ++{ ++ int ret = 0, cpu = smp_processor_id(); ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ u8 *output_buffer = buffer_page; ++ int output_len = buf_size; ++ int out_buf_type = buf_type; ++ void *buffer_start; ++ ++ if (ctx->desc.tfm) { ++ buffer_start = TOI_MAP(buf_type, buffer_page); ++ ++ ret = crypto_shash_digest(&ctx->desc, buffer_start, buf_size, ctx->digest); ++ if (ret) { ++ printk(KERN_INFO "TuxOnIce: Failed to calculate digest (%d).\n", ret); ++ } else { ++ mutex_lock(&stats_lock); ++ toi_pruned_pages++; ++ mutex_unlock(&stats_lock); ++ } ++ ++ TOI_UNMAP(buf_type, buffer_page); ++ } ++ ++ ret = next_driver->write_page(index, out_buf_type, ++ output_buffer, output_len); ++ ++ return ret; ++} ++ ++/* ++ * toi_prune_read_page() ++ * @buffer_page: struct page *. Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Retrieve data from later modules or from a previously loaded page and ++ * fill the input buffer. ++ * Zero if successful. Error condition from me or from downstream on failure. ++ */ ++static int toi_prune_read_page(unsigned long *index, int buf_type, ++ void *buffer_page, unsigned int *buf_size) ++{ ++ int ret, cpu = smp_processor_id(); ++ unsigned int len; ++ char *buffer_start; ++ struct cpu_context *ctx = &per_cpu(contexts, cpu); ++ ++ if (!ctx->desc.tfm) ++ return next_driver->read_page(index, TOI_PAGE, buffer_page, ++ buf_size); ++ ++ /* ++ * All our reads must be synchronous - we can't handle ++ * data that hasn't been read yet. ++ */ ++ ++ ret = next_driver->read_page(index, buf_type, buffer_page, &len); ++ ++ if (len == PRUNE_DATA_IS_PFN) { ++ buffer_start = kmap(buffer_page); ++ } ++ ++ return ret; ++} ++ ++/* ++ * toi_prune_print_debug_stats ++ * @buffer: Pointer to a buffer into which the debug info will be printed. ++ * @size: Size of the buffer. ++ * ++ * Print information to be recorded for debugging purposes into a buffer. ++ * Returns: Number of characters written to the buffer. ++ */ ++ ++static int toi_prune_print_debug_stats(char *buffer, int size) ++{ ++ int len; ++ ++ /* Output the number of pages pruned. */ ++ if (*toi_prune_hash_algo_name) ++ len = scnprintf(buffer, size, "- Hash algorithm is '%s'.\n", ++ toi_prune_hash_algo_name); ++ else ++ len = scnprintf(buffer, size, "- Hash algorithm is not set.\n"); ++ ++ if (toi_pruned_pages) ++ len += scnprintf(buffer+len, size - len, " Pruned " ++ "%lu pages.\n", ++ toi_pruned_pages); ++ return len; ++} ++ ++/* ++ * toi_prune_memory_needed ++ * ++ * Tell the caller how much memory we need to operate during hibernate/resume. ++ * Returns: Unsigned long. Maximum number of bytes of memory required for ++ * operation. ++ */ ++static int toi_prune_memory_needed(void) ++{ ++ return 2 * PAGE_SIZE; ++} ++ ++static int toi_prune_storage_needed(void) ++{ ++ return 2 * sizeof(unsigned long) + 2 * sizeof(int) + ++ strlen(toi_prune_hash_algo_name) + 1; ++} ++ ++/* ++ * toi_prune_save_config_info ++ * @buffer: Pointer to a buffer of size PAGE_SIZE. ++ * ++ * Save information needed when reloading the image at resume time. ++ * Returns: Number of bytes used for saving our data.
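++ *
++ * Layout written below (see toi_prune_save_config_info()):
++ * [unsigned long toi_pruned_pages][int name_len][hash algorithm name,
++ * name_len bytes including the trailing NUL].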
++ */ ++static int toi_prune_save_config_info(char *buffer) ++{ ++ int len = strlen(toi_prune_hash_algo_name) + 1, offset = 0; ++ ++ *((unsigned long *) buffer) = toi_pruned_pages; ++ offset += sizeof(unsigned long); ++ *((int *) (buffer + offset)) = len; ++ offset += sizeof(int); ++ strncpy(buffer + offset, toi_prune_hash_algo_name, len); ++ return offset + len; ++} ++ ++/* toi_prune_load_config_info ++ * @buffer: Pointer to the start of the data. ++ * @size: Number of bytes that were saved. ++ * ++ * Description: Reload information needed for passing back to the ++ * resumed kernel. ++ */ ++static void toi_prune_load_config_info(char *buffer, int size) ++{ ++ int len, offset = 0; ++ ++ toi_pruned_pages = *((unsigned long *) buffer); ++ offset += sizeof(unsigned long); ++ len = *((int *) (buffer + offset)); ++ offset += sizeof(int); ++ strncpy(toi_prune_hash_algo_name, buffer + offset, len); ++} ++ ++static void toi_prune_pre_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ bkd->pruned_pages = toi_pruned_pages; ++} ++ ++static void toi_prune_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_pruned_pages = bkd->pruned_pages; ++} ++ ++/* ++ * toi_expected_ratio ++ * ++ * Description: Returns the expected ratio between data passed into this module ++ * and the amount of data output when writing. ++ * Returns: 100 - we have no idea how many pages will be pruned. ++ */ ++ ++static int toi_prune_expected_ratio(void) ++{ ++ return 100; ++} ++ ++/* ++ * data for our sysfs entries. ++ */ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_INT("enabled", SYSFS_RW, &toi_prune_ops.enabled, 0, 1, 0, ++ NULL), ++ SYSFS_STRING("algorithm", SYSFS_RW, toi_prune_hash_algo_name, 31, 0, NULL), ++}; ++ ++/* ++ * Ops structure. ++ */ ++static struct toi_module_ops toi_prune_ops = { ++ .type = FILTER_MODULE, ++ .name = "prune", ++ .directory = "prune", ++ .module = THIS_MODULE, ++ .initialise = toi_prune_init, ++ .memory_needed = toi_prune_memory_needed, ++ .print_debug_info = toi_prune_print_debug_stats, ++ .save_config_info = toi_prune_save_config_info, ++ .load_config_info = toi_prune_load_config_info, ++ .storage_needed = toi_prune_storage_needed, ++ .expected_compression = toi_prune_expected_ratio, ++ ++ .pre_atomic_restore = toi_prune_pre_atomic_restore, ++ .post_atomic_restore = toi_prune_post_atomic_restore, ++ ++ .rw_init = toi_prune_rw_init, ++ .rw_cleanup = toi_prune_rw_cleanup, ++ ++ .write_page = toi_prune_write_page, ++ .read_page = toi_prune_read_page, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++ ++static __init int toi_prune_load(void) ++{ ++ return toi_register_module(&toi_prune_ops); ++} ++ ++#ifdef MODULE ++static __exit void toi_prune_unload(void) ++{ ++ toi_unregister_module(&toi_prune_ops); ++} ++ ++module_init(toi_prune_load); ++module_exit(toi_prune_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("Image Pruning Support for TuxOnIce"); ++#else ++late_initcall(toi_prune_load); ++#endif +diff --git a/kernel/power/tuxonice_storage.c b/kernel/power/tuxonice_storage.c +new file mode 100644 +index 0000000..dcf83f4 +--- /dev/null ++++ b/kernel/power/tuxonice_storage.c +@@ -0,0 +1,283 @@ ++/* ++ * kernel/power/tuxonice_storage.c ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ * ++ * Routines for talking to a userspace program that manages storage. ++ * ++ * The kernel side: ++ * - starts the userspace program; ++ * - sends messages telling it when to open and close the connection; ++ * - tells it when to quit; ++ * ++ * The user space side: ++ * - passes messages regarding status; ++ * ++ */ ++ ++#include ++#include ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_ui.h" ++ ++static struct user_helper_data usm_helper_data; ++static struct toi_module_ops usm_ops; ++static int message_received, usm_prepare_count; ++static int storage_manager_last_action, storage_manager_action; ++ ++static int usm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ++{ ++ int type; ++ int *data; ++ ++ type = nlh->nlmsg_type; ++ ++ /* A control message: ignore them */ ++ if (type < NETLINK_MSG_BASE) ++ return 0; ++ ++ /* Unknown message: reply with EINVAL */ ++ if (type >= USM_MSG_MAX) ++ return -EINVAL; ++ ++ /* All operations require privileges, even GET */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ /* Only allow one task to receive NOFREEZE privileges */ ++ if (type == NETLINK_MSG_NOFREEZE_ME && usm_helper_data.pid != -1) ++ return -EBUSY; ++ ++ data = (int *) NLMSG_DATA(nlh); ++ ++ switch (type) { ++ case USM_MSG_SUCCESS: ++ case USM_MSG_FAILED: ++ message_received = type; ++ complete(&usm_helper_data.wait_for_process); ++ break; ++ default: ++ printk(KERN_INFO "Storage manager doesn't recognise " ++ "message %d.\n", type); ++ } ++ ++ return 1; ++} ++ ++#ifdef CONFIG_NET ++static int activations; ++ ++int toi_activate_storage(int force) ++{ ++ int tries = 1; ++ ++ if (usm_helper_data.pid == -1 || !usm_ops.enabled) ++ return 0; ++ ++ message_received = 0; ++ activations++; ++ ++ if (activations > 1 && !force) ++ return 0; ++ ++ while ((!message_received || message_received == USM_MSG_FAILED) && ++ tries < 2) { ++ toi_prepare_status(DONT_CLEAR_BAR, "Activate storage attempt " ++ "%d.\n", tries); ++ ++ init_completion(&usm_helper_data.wait_for_process); ++ ++ toi_send_netlink_message(&usm_helper_data, ++ USM_MSG_CONNECT, ++ NULL, 0); ++ ++ /* Wait 2 seconds for the userspace process to make contact */ ++ wait_for_completion_timeout(&usm_helper_data.wait_for_process, ++ 2*HZ); ++ ++ tries++; ++ } ++ ++ return 0; ++} ++ ++int toi_deactivate_storage(int force) ++{ ++ if (usm_helper_data.pid == -1 || !usm_ops.enabled) ++ return 0; ++ ++ message_received = 0; ++ activations--; ++ ++ if (activations && !force) ++ return 0; ++ ++ init_completion(&usm_helper_data.wait_for_process); ++ ++ toi_send_netlink_message(&usm_helper_data, ++ USM_MSG_DISCONNECT, ++ NULL, 0); ++ ++ wait_for_completion_timeout(&usm_helper_data.wait_for_process, 2*HZ); ++ ++ if (!message_received || message_received == USM_MSG_FAILED) { ++ printk(KERN_INFO "Returning failure disconnecting storage.\n"); ++ return 1; ++ } ++ ++ return 0; ++} ++#endif ++ ++static void storage_manager_simulate(void) ++{ ++ printk(KERN_INFO "--- Storage manager simulate ---\n"); ++ toi_prepare_usm(); ++ schedule(); ++ printk(KERN_INFO "--- Activate storage 1 ---\n"); ++ toi_activate_storage(1); ++ schedule(); ++ printk(KERN_INFO "--- Deactivate storage 1 ---\n"); ++ toi_deactivate_storage(1); ++ schedule(); ++ printk(KERN_INFO "--- Cleanup usm ---\n"); ++ toi_cleanup_usm(); ++ schedule(); ++ printk(KERN_INFO "--- Storage manager simulate ends ---\n"); ++} ++ ++static int usm_storage_needed(void) ++{ ++ return 
sizeof(int) + strlen(usm_helper_data.program) + 1; ++} ++ ++static int usm_save_config_info(char *buf) ++{ ++ int len = strlen(usm_helper_data.program); ++ memcpy(buf, usm_helper_data.program, len + 1); ++ return sizeof(int) + len + 1; ++} ++ ++static void usm_load_config_info(char *buf, int size) ++{ ++ /* Don't load the saved path if one has already been set */ ++ if (usm_helper_data.program[0]) ++ return; ++ ++ memcpy(usm_helper_data.program, buf + sizeof(int), *((int *) buf)); ++} ++ ++static int usm_memory_needed(void) ++{ ++ /* ball park figure of 32 pages */ ++ return 32 * PAGE_SIZE; ++} ++ ++/* toi_prepare_usm ++ */ ++int toi_prepare_usm(void) ++{ ++ usm_prepare_count++; ++ ++ if (usm_prepare_count > 1 || !usm_ops.enabled) ++ return 0; ++ ++ usm_helper_data.pid = -1; ++ ++ if (!*usm_helper_data.program) ++ return 0; ++ ++ toi_netlink_setup(&usm_helper_data); ++ ++ if (usm_helper_data.pid == -1) ++ printk(KERN_INFO "TuxOnIce Storage Manager wanted, but couldn't" ++ " start it.\n"); ++ ++ toi_activate_storage(0); ++ ++ return usm_helper_data.pid != -1; ++} ++ ++void toi_cleanup_usm(void) ++{ ++ usm_prepare_count--; ++ ++ if (usm_helper_data.pid > -1 && !usm_prepare_count) { ++ toi_deactivate_storage(0); ++ toi_netlink_close(&usm_helper_data); ++ } ++} ++ ++static void storage_manager_activate(void) ++{ ++ if (storage_manager_action == storage_manager_last_action) ++ return; ++ ++ if (storage_manager_action) ++ toi_prepare_usm(); ++ else ++ toi_cleanup_usm(); ++ ++ storage_manager_last_action = storage_manager_action; ++} ++ ++/* ++ * User interface specific /sys/power/tuxonice entries. ++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_NONE("simulate_atomic_copy", storage_manager_simulate), ++ SYSFS_INT("enabled", SYSFS_RW, &usm_ops.enabled, 0, 1, 0, NULL), ++ SYSFS_STRING("program", SYSFS_RW, usm_helper_data.program, 254, 0, ++ NULL), ++ SYSFS_INT("activate_storage", SYSFS_RW , &storage_manager_action, 0, 1, ++ 0, storage_manager_activate) ++}; ++ ++static struct toi_module_ops usm_ops = { ++ .type = MISC_MODULE, ++ .name = "usm", ++ .directory = "storage_manager", ++ .module = THIS_MODULE, ++ .storage_needed = usm_storage_needed, ++ .save_config_info = usm_save_config_info, ++ .load_config_info = usm_load_config_info, ++ .memory_needed = usm_memory_needed, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* toi_usm_sysfs_init ++ * Description: Boot time initialisation for user interface. ++ */ ++int toi_usm_init(void) ++{ ++ usm_helper_data.nl = NULL; ++ usm_helper_data.program[0] = '\0'; ++ usm_helper_data.pid = -1; ++ usm_helper_data.skb_size = 0; ++ usm_helper_data.pool_limit = 6; ++ usm_helper_data.netlink_id = NETLINK_TOI_USM; ++ usm_helper_data.name = "userspace storage manager"; ++ usm_helper_data.rcv_msg = usm_user_rcv_msg; ++ usm_helper_data.interface_version = 2; ++ usm_helper_data.must_init = 0; ++ init_completion(&usm_helper_data.wait_for_process); ++ ++ return toi_register_module(&usm_ops); ++} ++ ++void toi_usm_exit(void) ++{ ++ toi_netlink_close_complete(&usm_helper_data); ++ toi_unregister_module(&usm_ops); ++} +diff --git a/kernel/power/tuxonice_storage.h b/kernel/power/tuxonice_storage.h +new file mode 100644 +index 0000000..8c6b5a7 +--- /dev/null ++++ b/kernel/power/tuxonice_storage.h +@@ -0,0 +1,45 @@ ++/* ++ * kernel/power/tuxonice_storage.h ++ * ++ * Copyright (C) 2005-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. 
++ */ ++ ++#ifdef CONFIG_NET ++int toi_prepare_usm(void); ++void toi_cleanup_usm(void); ++ ++int toi_activate_storage(int force); ++int toi_deactivate_storage(int force); ++extern int toi_usm_init(void); ++extern void toi_usm_exit(void); ++#else ++static inline int toi_usm_init(void) { return 0; } ++static inline void toi_usm_exit(void) { } ++ ++static inline int toi_activate_storage(int force) ++{ ++ return 0; ++} ++ ++static inline int toi_deactivate_storage(int force) ++{ ++ return 0; ++} ++ ++static inline int toi_prepare_usm(void) { return 0; } ++static inline void toi_cleanup_usm(void) { } ++#endif ++ ++enum { ++ USM_MSG_BASE = 0x10, ++ ++ /* Kernel -> Userspace */ ++ USM_MSG_CONNECT = 0x30, ++ USM_MSG_DISCONNECT = 0x31, ++ USM_MSG_SUCCESS = 0x40, ++ USM_MSG_FAILED = 0x41, ++ ++ USM_MSG_MAX, ++}; +diff --git a/kernel/power/tuxonice_swap.c b/kernel/power/tuxonice_swap.c +new file mode 100644 +index 0000000..a6c0d76 +--- /dev/null ++++ b/kernel/power/tuxonice_swap.c +@@ -0,0 +1,463 @@ ++/* ++ * kernel/power/tuxonice_swap.c ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * Distributed under GPLv2. ++ * ++ * This file encapsulates functions for usage of swap space as a ++ * backing store. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice.h" ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice_io.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_extent.h" ++#include "tuxonice_bio.h" ++#include "tuxonice_alloc.h" ++#include "tuxonice_builtin.h" ++ ++static struct toi_module_ops toi_swapops; ++ ++/* For swapfile automatically swapon/off'd. */ ++static char swapfilename[255] = ""; ++static int toi_swapon_status; ++ ++/* Swap Pages */ ++static unsigned long swap_allocated; ++ ++static struct sysinfo swapinfo; ++ ++static int is_ram_backed(struct swap_info_struct *si) ++{ ++ if (!strncmp(si->bdev->bd_disk->disk_name, "ram", 3) || ++ !strncmp(si->bdev->bd_disk->disk_name, "zram", 4)) ++ return 1; ++ ++ return 0; ++} ++ ++/** ++ * enable_swapfile: Swapon the user specified swapfile prior to hibernating. ++ * ++ * Activate the given swapfile if it wasn't already enabled. Remember whether ++ * we really did swapon it for swapoffing later. ++ */ ++static void enable_swapfile(void) ++{ ++ int activateswapresult = -EINVAL; ++ ++ if (swapfilename[0]) { ++ /* Attempt to swap on with maximum priority */ ++ activateswapresult = sys_swapon(swapfilename, 0xFFFF); ++ if (activateswapresult && activateswapresult != -EBUSY) ++ printk(KERN_ERR "TuxOnIce: The swapfile/partition " ++ "specified by /sys/power/tuxonice/swap/swapfile" ++ " (%s) could not be turned on (error %d). " ++ "Attempting to continue.\n", ++ swapfilename, activateswapresult); ++ if (!activateswapresult) ++ toi_swapon_status = 1; ++ } ++} ++ ++/** ++ * disable_swapfile: Swapoff any file swaponed at the start of the cycle. ++ * ++ * If we did successfully swapon a file at the start of the cycle, swapoff ++ * it now (finishing up). 
++ */ ++static void disable_swapfile(void) ++{ ++ if (!toi_swapon_status) ++ return; ++ ++ sys_swapoff(swapfilename); ++ toi_swapon_status = 0; ++} ++ ++static int add_blocks_to_extent_chain(struct toi_bdev_info *chain, ++ unsigned long start, unsigned long end) ++{ ++ if (test_action_state(TOI_TEST_BIO)) ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Adding extent %lu-%lu to " ++ "chain %p.", start << chain->bmap_shift, ++ end << chain->bmap_shift, chain); ++ ++ return toi_add_to_extent_chain(&chain->blocks, start, end); ++} ++ ++ ++static int get_main_pool_phys_params(struct toi_bdev_info *chain) ++{ ++ struct hibernate_extent *extentpointer = NULL; ++ unsigned long address, extent_min = 0, extent_max = 0; ++ int empty = 1; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "get main pool phys params for " ++ "chain %d.", chain->allocator_index); ++ ++ if (!chain->allocations.first) ++ return 0; ++ ++ if (chain->blocks.first) ++ toi_put_extent_chain(&chain->blocks); ++ ++ toi_extent_for_each(&chain->allocations, extentpointer, address) { ++ swp_entry_t swap_address = (swp_entry_t) { address }; ++ struct block_device *bdev; ++ sector_t new_sector = map_swap_entry(swap_address, &bdev); ++ ++ if (empty) { ++ empty = 0; ++ extent_min = extent_max = new_sector; ++ continue; ++ } ++ ++ if (new_sector == extent_max + 1) { ++ extent_max++; ++ continue; ++ } ++ ++ if (add_blocks_to_extent_chain(chain, extent_min, extent_max)) { ++ printk(KERN_ERR "Out of memory while making block " ++ "chains.\n"); ++ return -ENOMEM; ++ } ++ ++ extent_min = new_sector; ++ extent_max = new_sector; ++ } ++ ++ if (!empty && ++ add_blocks_to_extent_chain(chain, extent_min, extent_max)) { ++ printk(KERN_ERR "Out of memory while making block chains.\n"); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Like si_swapinfo, except that we don't include ram backed swap (compcache!) ++ * and don't need to use the spinlocks (userspace is stopped when this ++ * function is called). ++ */ ++void si_swapinfo_no_compcache(void) ++{ ++ unsigned int i; ++ ++ si_swapinfo(&swapinfo); ++ swapinfo.freeswap = 0; ++ swapinfo.totalswap = 0; ++ ++ for (i = 0; i < MAX_SWAPFILES; i++) { ++ struct swap_info_struct *si = get_swap_info_struct(i); ++ if (si && (si->flags & SWP_WRITEOK) && !is_ram_backed(si)) { ++ swapinfo.totalswap += si->inuse_pages; ++ swapinfo.freeswap += si->pages - si->inuse_pages; ++ } ++ } ++} ++/* ++ * We can't just remember the value from allocation time, because other ++ * processes might have allocated swap in the mean time. 
++ */ ++static unsigned long toi_swap_storage_available(void) ++{ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "In toi_swap_storage_available."); ++ si_swapinfo_no_compcache(); ++ return swapinfo.freeswap + swap_allocated; ++} ++ ++static int toi_swap_initialise(int starting_cycle) ++{ ++ if (!starting_cycle) ++ return 0; ++ ++ enable_swapfile(); ++ return 0; ++} ++ ++static void toi_swap_cleanup(int ending_cycle) ++{ ++ if (!ending_cycle) ++ return; ++ ++ disable_swapfile(); ++} ++ ++static void toi_swap_free_storage(struct toi_bdev_info *chain) ++{ ++ /* Free swap entries */ ++ struct hibernate_extent *extentpointer; ++ unsigned long extentvalue; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Freeing storage for chain %p.", ++ chain); ++ ++ swap_allocated -= chain->allocations.size; ++ toi_extent_for_each(&chain->allocations, extentpointer, extentvalue) ++ swap_free((swp_entry_t) { extentvalue }); ++ ++ toi_put_extent_chain(&chain->allocations); ++} ++ ++static void free_swap_range(unsigned long min, unsigned long max) ++{ ++ int j; ++ ++ for (j = min; j <= max; j++) ++ swap_free((swp_entry_t) { j }); ++ swap_allocated -= (max - min + 1); ++} ++ ++/* ++ * Allocation of a single swap type. Swap priorities are handled at the higher ++ * level. ++ */ ++static int toi_swap_allocate_storage(struct toi_bdev_info *chain, ++ unsigned long request) ++{ ++ unsigned long gotten = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, " Swap allocate storage: Asked to" ++ " allocate %lu pages from device %d.", request, ++ chain->allocator_index); ++ ++ while (gotten < request) { ++ swp_entry_t start, end; ++ get_swap_range_of_type(chain->allocator_index, &start, &end, ++ request - gotten + 1); ++ if (start.val) { ++ int added = end.val - start.val + 1; ++ if (toi_add_to_extent_chain(&chain->allocations, ++ start.val, end.val)) { ++ printk(KERN_INFO "Failed to allocate extent for " ++ "%lu-%lu.\n", start.val, end.val); ++ free_swap_range(start.val, end.val); ++ break; ++ } ++ gotten += added; ++ swap_allocated += added; ++ } else ++ break; ++ } ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, " Allocated %lu pages.", gotten); ++ return gotten; ++} ++ ++static int toi_swap_register_storage(void) ++{ ++ int i, result = 0; ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "toi_swap_register_storage."); ++ for (i = 0; i < MAX_SWAPFILES; i++) { ++ struct swap_info_struct *si = get_swap_info_struct(i); ++ struct toi_bdev_info *devinfo; ++ unsigned char *p; ++ unsigned char buf[256]; ++ struct fs_info *fs_info; ++ ++ if (!si || !(si->flags & SWP_WRITEOK) || is_ram_backed(si)) ++ continue; ++ ++ devinfo = toi_kzalloc(39, sizeof(struct toi_bdev_info), ++ GFP_ATOMIC); ++ if (!devinfo) { ++ printk("Failed to allocate devinfo struct for swap " ++ "device %d.\n", i); ++ return -ENOMEM; ++ } ++ ++ devinfo->bdev = si->bdev; ++ devinfo->allocator = &toi_swapops; ++ devinfo->allocator_index = i; ++ ++ fs_info = fs_info_from_block_dev(si->bdev); ++ if (fs_info && !IS_ERR(fs_info)) { ++ memcpy(devinfo->uuid, &fs_info->uuid, 16); ++ free_fs_info(fs_info); ++ } else ++ result = (int) PTR_ERR(fs_info); ++ ++ if (!fs_info) ++ printk("fs_info from block dev returned %d.\n", result); ++ devinfo->dev_t = si->bdev->bd_dev; ++ devinfo->prio = si->prio; ++ devinfo->bmap_shift = 3; ++ devinfo->blocks_per_page = 1; ++ ++ p = d_path(&si->swap_file->f_path, buf, sizeof(buf)); ++ sprintf(devinfo->name, "swap on %s", p); ++ ++ toi_message(TOI_IO, TOI_VERBOSE, 0, "Registering swap storage:" ++ " Device %d (%lx), prio %d.", i, ++ (unsigned long) devinfo->dev_t, 
devinfo->prio); ++ toi_bio_ops.register_storage(devinfo); ++ } ++ ++ return 0; ++} ++ ++/* ++ * workspace_size ++ * ++ * Description: ++ * Returns the number of bytes of RAM needed for this ++ * code to do its work. (Used when calculating whether ++ * we have enough memory to be able to hibernate & resume). ++ * ++ */ ++static int toi_swap_memory_needed(void) ++{ ++ return 1; ++} ++ ++/* ++ * Print debug info ++ * ++ * Description: ++ */ ++static int toi_swap_print_debug_stats(char *buffer, int size) ++{ ++ int len = 0; ++ ++ len = scnprintf(buffer, size, "- Swap Allocator enabled.\n"); ++ if (swapfilename[0]) ++ len += scnprintf(buffer+len, size-len, ++ " Attempting to automatically swapon: %s.\n", ++ swapfilename); ++ ++ si_swapinfo_no_compcache(); ++ ++ len += scnprintf(buffer+len, size-len, ++ " Swap available for image: %lu pages.\n", ++ swapinfo.freeswap + swap_allocated); ++ ++ return len; ++} ++ ++static int header_locations_read_sysfs(const char *page, int count) ++{ ++ int i, printedpartitionsmessage = 0, len = 0, haveswap = 0; ++ struct inode *swapf = NULL; ++ int zone; ++ char *path_page = (char *) toi_get_free_page(10, GFP_KERNEL); ++ char *path, *output = (char *) page; ++ int path_len; ++ ++ if (!page) ++ return 0; ++ ++ for (i = 0; i < MAX_SWAPFILES; i++) { ++ struct swap_info_struct *si = get_swap_info_struct(i); ++ ++ if (!si || !(si->flags & SWP_WRITEOK)) ++ continue; ++ ++ if (S_ISBLK(si->swap_file->f_mapping->host->i_mode)) { ++ haveswap = 1; ++ if (!printedpartitionsmessage) { ++ len += sprintf(output + len, ++ "For swap partitions, simply use the " ++ "format: resume=swap:/dev/hda1.\n"); ++ printedpartitionsmessage = 1; ++ } ++ } else { ++ path_len = 0; ++ ++ path = d_path(&si->swap_file->f_path, path_page, ++ PAGE_SIZE); ++ path_len = snprintf(path_page, PAGE_SIZE, "%s", path); ++ ++ haveswap = 1; ++ swapf = si->swap_file->f_mapping->host; ++ zone = bmap(swapf, 0); ++ if (!zone) { ++ len += sprintf(output + len, ++ "Swapfile %s has been corrupted. 
Reuse" ++ " mkswap on it and try again.\n", ++ path_page); ++ } else { ++ char name_buffer[BDEVNAME_SIZE]; ++ len += sprintf(output + len, ++ "For swapfile `%s`," ++ " use resume=swap:/dev/%s:0x%x.\n", ++ path_page, ++ bdevname(si->bdev, name_buffer), ++ zone << (swapf->i_blkbits - 9)); ++ } ++ } ++ } ++ ++ if (!haveswap) ++ len = sprintf(output, "You need to turn on swap partitions " ++ "before examining this file.\n"); ++ ++ toi_free_page(10, (unsigned long) path_page); ++ return len; ++} ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_STRING("swapfilename", SYSFS_RW, swapfilename, 255, 0, NULL), ++ SYSFS_CUSTOM("headerlocations", SYSFS_READONLY, ++ header_locations_read_sysfs, NULL, 0, NULL), ++ SYSFS_INT("enabled", SYSFS_RW, &toi_swapops.enabled, 0, 1, 0, ++ attempt_to_parse_resume_device2), ++}; ++ ++static struct toi_bio_allocator_ops toi_bio_swapops = { ++ .register_storage = toi_swap_register_storage, ++ .storage_available = toi_swap_storage_available, ++ .allocate_storage = toi_swap_allocate_storage, ++ .bmap = get_main_pool_phys_params, ++ .free_storage = toi_swap_free_storage, ++}; ++ ++static struct toi_module_ops toi_swapops = { ++ .type = BIO_ALLOCATOR_MODULE, ++ .name = "swap storage", ++ .directory = "swap", ++ .module = THIS_MODULE, ++ .memory_needed = toi_swap_memory_needed, ++ .print_debug_info = toi_swap_print_debug_stats, ++ .initialise = toi_swap_initialise, ++ .cleanup = toi_swap_cleanup, ++ .bio_allocator_ops = &toi_bio_swapops, ++ ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++/* ---- Registration ---- */ ++static __init int toi_swap_load(void) ++{ ++ return toi_register_module(&toi_swapops); ++} ++ ++#ifdef MODULE ++static __exit void toi_swap_unload(void) ++{ ++ toi_unregister_module(&toi_swapops); ++} ++ ++module_init(toi_swap_load); ++module_exit(toi_swap_unload); ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce SwapAllocator"); ++#else ++late_initcall(toi_swap_load); ++#endif +diff --git a/kernel/power/tuxonice_sysfs.c b/kernel/power/tuxonice_sysfs.c +new file mode 100644 +index 0000000..0088409 +--- /dev/null ++++ b/kernel/power/tuxonice_sysfs.c +@@ -0,0 +1,335 @@ ++/* ++ * kernel/power/tuxonice_sysfs.c ++ * ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * This file contains support for sysfs entries for tuning TuxOnIce. ++ * ++ * We have a generic handler that deals with the most common cases, and ++ * hooks for special handlers to use. ++ */ ++ ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice.h" ++#include "tuxonice_storage.h" ++#include "tuxonice_alloc.h" ++ ++static int toi_sysfs_initialised; ++ ++static void toi_initialise_sysfs(void); ++ ++static struct toi_sysfs_data sysfs_params[]; ++ ++#define to_sysfs_data(_attr) container_of(_attr, struct toi_sysfs_data, attr) ++ ++static void toi_main_wrapper(void) ++{ ++ toi_try_hibernate(); ++} ++ ++static ssize_t toi_attr_show(struct kobject *kobj, struct attribute *attr, ++ char *page) ++{ ++ struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); ++ int len = 0; ++ int full_prep = sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ; ++ ++ if (full_prep && toi_start_anything(0)) ++ return -EBUSY; ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) ++ toi_prepare_usm(); ++ ++ switch (sysfs_data->type) { ++ case TOI_SYSFS_DATA_CUSTOM: ++ len = (sysfs_data->data.special.read_sysfs) ? 
++ (sysfs_data->data.special.read_sysfs)(page, PAGE_SIZE) ++ : 0; ++ break; ++ case TOI_SYSFS_DATA_BIT: ++ len = sprintf(page, "%d\n", ++ -test_bit(sysfs_data->data.bit.bit, ++ sysfs_data->data.bit.bit_vector)); ++ break; ++ case TOI_SYSFS_DATA_INTEGER: ++ len = sprintf(page, "%d\n", ++ *(sysfs_data->data.integer.variable)); ++ break; ++ case TOI_SYSFS_DATA_LONG: ++ len = sprintf(page, "%ld\n", ++ *(sysfs_data->data.a_long.variable)); ++ break; ++ case TOI_SYSFS_DATA_UL: ++ len = sprintf(page, "%lu\n", ++ *(sysfs_data->data.ul.variable)); ++ break; ++ case TOI_SYSFS_DATA_STRING: ++ len = sprintf(page, "%s\n", ++ sysfs_data->data.string.variable); ++ break; ++ } ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_READ) ++ toi_cleanup_usm(); ++ ++ if (full_prep) ++ toi_finish_anything(0); ++ ++ return len; ++} ++ ++#define BOUND(_variable, _type) do { \ ++ if (*_variable < sysfs_data->data._type.minimum) \ ++ *_variable = sysfs_data->data._type.minimum; \ ++ else if (*_variable > sysfs_data->data._type.maximum) \ ++ *_variable = sysfs_data->data._type.maximum; \ ++} while (0) ++ ++static ssize_t toi_attr_store(struct kobject *kobj, struct attribute *attr, ++ const char *my_buf, size_t count) ++{ ++ int assigned_temp_buffer = 0, result = count; ++ struct toi_sysfs_data *sysfs_data = to_sysfs_data(attr); ++ ++ if (toi_start_anything((sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME))) ++ return -EBUSY; ++ ++ ((char *) my_buf)[count] = 0; ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) ++ toi_prepare_usm(); ++ ++ switch (sysfs_data->type) { ++ case TOI_SYSFS_DATA_CUSTOM: ++ if (sysfs_data->data.special.write_sysfs) ++ result = (sysfs_data->data.special.write_sysfs)(my_buf, ++ count); ++ break; ++ case TOI_SYSFS_DATA_BIT: ++ { ++ unsigned long value; ++ result = strict_strtoul(my_buf, 0, &value); ++ if (result) ++ break; ++ if (value) ++ set_bit(sysfs_data->data.bit.bit, ++ (sysfs_data->data.bit.bit_vector)); ++ else ++ clear_bit(sysfs_data->data.bit.bit, ++ (sysfs_data->data.bit.bit_vector)); ++ } ++ break; ++ case TOI_SYSFS_DATA_INTEGER: ++ { ++ long temp; ++ result = strict_strtol(my_buf, 0, &temp); ++ if (result) ++ break; ++ *(sysfs_data->data.integer.variable) = (int) temp; ++ BOUND(sysfs_data->data.integer.variable, integer); ++ break; ++ } ++ case TOI_SYSFS_DATA_LONG: ++ { ++ long *variable = ++ sysfs_data->data.a_long.variable; ++ result = strict_strtol(my_buf, 0, variable); ++ if (result) ++ break; ++ BOUND(variable, a_long); ++ break; ++ } ++ case TOI_SYSFS_DATA_UL: ++ { ++ unsigned long *variable = ++ sysfs_data->data.ul.variable; ++ result = strict_strtoul(my_buf, 0, variable); ++ if (result) ++ break; ++ BOUND(variable, ul); ++ break; ++ } ++ break; ++ case TOI_SYSFS_DATA_STRING: ++ { ++ int copy_len = count; ++ char *variable = ++ sysfs_data->data.string.variable; ++ ++ if (sysfs_data->data.string.max_length && ++ (copy_len > sysfs_data->data.string.max_length)) ++ copy_len = sysfs_data->data.string.max_length; ++ ++ if (!variable) { ++ variable = (char *) toi_get_zeroed_page(31, ++ TOI_ATOMIC_GFP); ++ sysfs_data->data.string.variable = variable; ++ assigned_temp_buffer = 1; ++ } ++ strncpy(variable, my_buf, copy_len); ++ if (copy_len && my_buf[copy_len - 1] == '\n') ++ variable[count - 1] = 0; ++ variable[count] = 0; ++ } ++ break; ++ } ++ ++ if (!result) ++ result = count; ++ ++ /* Side effect routine? 
*/ ++ if (result == count && sysfs_data->write_side_effect) ++ sysfs_data->write_side_effect(); ++ ++ /* Free temporary buffers */ ++ if (assigned_temp_buffer) { ++ toi_free_page(31, ++ (unsigned long) sysfs_data->data.string.variable); ++ sysfs_data->data.string.variable = NULL; ++ } ++ ++ if (sysfs_data->flags & SYSFS_NEEDS_SM_FOR_WRITE) ++ toi_cleanup_usm(); ++ ++ toi_finish_anything(sysfs_data->flags & SYSFS_HIBERNATE_OR_RESUME); ++ ++ return result; ++} ++ ++static struct sysfs_ops toi_sysfs_ops = { ++ .show = &toi_attr_show, ++ .store = &toi_attr_store, ++}; ++ ++static struct kobj_type toi_ktype = { ++ .sysfs_ops = &toi_sysfs_ops, ++}; ++ ++struct kobject *tuxonice_kobj; ++ ++/* Non-module sysfs entries. ++ * ++ * This array contains entries that are automatically registered at ++ * boot. Modules and the console code register their own entries separately. ++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++ SYSFS_CUSTOM("do_hibernate", SYSFS_WRITEONLY, NULL, NULL, ++ SYSFS_HIBERNATING, toi_main_wrapper), ++ SYSFS_CUSTOM("do_resume", SYSFS_WRITEONLY, NULL, NULL, ++ SYSFS_RESUMING, toi_try_resume) ++}; ++ ++void remove_toi_sysdir(struct kobject *kobj) ++{ ++ if (!kobj) ++ return; ++ ++ kobject_put(kobj); ++} ++ ++struct kobject *make_toi_sysdir(char *name) ++{ ++ struct kobject *kobj = kobject_create_and_add(name, tuxonice_kobj); ++ ++ if (!kobj) { ++ printk(KERN_INFO "TuxOnIce: Can't allocate kobject for sysfs " ++ "dir!\n"); ++ return NULL; ++ } ++ ++ kobj->ktype = &toi_ktype; ++ ++ return kobj; ++} ++ ++/* toi_register_sysfs_file ++ * ++ * Helper for registering a new /sysfs/tuxonice entry. ++ */ ++ ++int toi_register_sysfs_file( ++ struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data) ++{ ++ int result; ++ ++ if (!toi_sysfs_initialised) ++ toi_initialise_sysfs(); ++ ++ result = sysfs_create_file(kobj, &toi_sysfs_data->attr); ++ if (result) ++ printk(KERN_INFO "TuxOnIce: sysfs_create_file for %s " ++ "returned %d.\n", ++ toi_sysfs_data->attr.name, result); ++ kobj->ktype = &toi_ktype; ++ ++ return result; ++} ++EXPORT_SYMBOL_GPL(toi_register_sysfs_file); ++ ++/* toi_unregister_sysfs_file ++ * ++ * Helper for removing unwanted /sys/power/tuxonice entries. ++ * ++ */ ++void toi_unregister_sysfs_file(struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data) ++{ ++ sysfs_remove_file(kobj, &toi_sysfs_data->attr); ++} ++EXPORT_SYMBOL_GPL(toi_unregister_sysfs_file); ++ ++void toi_cleanup_sysfs(void) ++{ ++ int i, ++ numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ if (!toi_sysfs_initialised) ++ return; ++ ++ for (i = 0; i < numfiles; i++) ++ toi_unregister_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++ ++ kobject_put(tuxonice_kobj); ++ toi_sysfs_initialised = 0; ++} ++ ++/* toi_initialise_sysfs ++ * ++ * Initialise the /sysfs/tuxonice directory. 
++ */ ++ ++static void toi_initialise_sysfs(void) ++{ ++ int i; ++ int numfiles = sizeof(sysfs_params) / sizeof(struct toi_sysfs_data); ++ ++ if (toi_sysfs_initialised) ++ return; ++ ++ /* Make our TuxOnIce directory a child of /sys/power */ ++ tuxonice_kobj = kobject_create_and_add("tuxonice", power_kobj); ++ if (!tuxonice_kobj) ++ return; ++ ++ toi_sysfs_initialised = 1; ++ ++ for (i = 0; i < numfiles; i++) ++ toi_register_sysfs_file(tuxonice_kobj, &sysfs_params[i]); ++} ++ ++int toi_sysfs_init(void) ++{ ++ toi_initialise_sysfs(); ++ return 0; ++} ++ ++void toi_sysfs_exit(void) ++{ ++ toi_cleanup_sysfs(); ++} +diff --git a/kernel/power/tuxonice_sysfs.h b/kernel/power/tuxonice_sysfs.h +new file mode 100644 +index 0000000..4185c6d +--- /dev/null ++++ b/kernel/power/tuxonice_sysfs.h +@@ -0,0 +1,137 @@ ++/* ++ * kernel/power/tuxonice_sysfs.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ */ ++ ++#include ++ ++struct toi_sysfs_data { ++ struct attribute attr; ++ int type; ++ int flags; ++ union { ++ struct { ++ unsigned long *bit_vector; ++ int bit; ++ } bit; ++ struct { ++ int *variable; ++ int minimum; ++ int maximum; ++ } integer; ++ struct { ++ long *variable; ++ long minimum; ++ long maximum; ++ } a_long; ++ struct { ++ unsigned long *variable; ++ unsigned long minimum; ++ unsigned long maximum; ++ } ul; ++ struct { ++ char *variable; ++ int max_length; ++ } string; ++ struct { ++ int (*read_sysfs) (const char *buffer, int count); ++ int (*write_sysfs) (const char *buffer, int count); ++ void *data; ++ } special; ++ } data; ++ ++ /* Side effects routine. Used, eg, for reparsing the ++ * resume= entry when it changes */ ++ void (*write_side_effect) (void); ++ struct list_head sysfs_data_list; ++}; ++ ++enum { ++ TOI_SYSFS_DATA_NONE = 1, ++ TOI_SYSFS_DATA_CUSTOM, ++ TOI_SYSFS_DATA_BIT, ++ TOI_SYSFS_DATA_INTEGER, ++ TOI_SYSFS_DATA_UL, ++ TOI_SYSFS_DATA_LONG, ++ TOI_SYSFS_DATA_STRING ++}; ++ ++#define SYSFS_WRITEONLY 0200 ++#define SYSFS_READONLY 0444 ++#define SYSFS_RW 0644 ++ ++#define SYSFS_BIT(_name, _mode, _ul, _bit, _flags) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_BIT, \ ++ .flags = _flags, \ ++ .data = { .bit = { .bit_vector = _ul, .bit = _bit } } } ++ ++#define SYSFS_INT(_name, _mode, _int, _min, _max, _flags, _wse) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_INTEGER, \ ++ .flags = _flags, \ ++ .data = { .integer = { .variable = _int, .minimum = _min, \ ++ .maximum = _max } }, \ ++ .write_side_effect = _wse } ++ ++#define SYSFS_UL(_name, _mode, _ul, _min, _max, _flags) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_UL, \ ++ .flags = _flags, \ ++ .data = { .ul = { .variable = _ul, .minimum = _min, \ ++ .maximum = _max } } } ++ ++#define SYSFS_LONG(_name, _mode, _long, _min, _max, _flags) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_LONG, \ ++ .flags = _flags, \ ++ .data = { .a_long = { .variable = _long, .minimum = _min, \ ++ .maximum = _max } } } ++ ++#define SYSFS_STRING(_name, _mode, _string, _max_len, _flags, _wse) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = TOI_SYSFS_DATA_STRING, \ ++ .flags = _flags, \ ++ .data = { .string = { .variable = _string, .max_length = _max_len } }, \ ++ .write_side_effect = _wse } ++ ++#define SYSFS_CUSTOM(_name, _mode, _read, _write, _flags, _wse) { \ ++ .attr = {.name = _name , .mode = _mode }, \ ++ .type = 
TOI_SYSFS_DATA_CUSTOM, \ ++ .flags = _flags, \ ++ .data = { .special = { .read_sysfs = _read, .write_sysfs = _write } }, \ ++ .write_side_effect = _wse } ++ ++#define SYSFS_NONE(_name, _wse) { \ ++ .attr = {.name = _name , .mode = SYSFS_WRITEONLY }, \ ++ .type = TOI_SYSFS_DATA_NONE, \ ++ .write_side_effect = _wse, \ ++} ++ ++/* Flags */ ++#define SYSFS_NEEDS_SM_FOR_READ 1 ++#define SYSFS_NEEDS_SM_FOR_WRITE 2 ++#define SYSFS_HIBERNATE 4 ++#define SYSFS_RESUME 8 ++#define SYSFS_HIBERNATE_OR_RESUME (SYSFS_HIBERNATE | SYSFS_RESUME) ++#define SYSFS_HIBERNATING (SYSFS_HIBERNATE | SYSFS_NEEDS_SM_FOR_WRITE) ++#define SYSFS_RESUMING (SYSFS_RESUME | SYSFS_NEEDS_SM_FOR_WRITE) ++#define SYSFS_NEEDS_SM_FOR_BOTH \ ++ (SYSFS_NEEDS_SM_FOR_READ | SYSFS_NEEDS_SM_FOR_WRITE) ++ ++int toi_register_sysfs_file(struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data); ++void toi_unregister_sysfs_file(struct kobject *kobj, ++ struct toi_sysfs_data *toi_sysfs_data); ++ ++extern struct kobject *tuxonice_kobj; ++ ++struct kobject *make_toi_sysdir(char *name); ++void remove_toi_sysdir(struct kobject *obj); ++extern void toi_cleanup_sysfs(void); ++ ++extern int toi_sysfs_init(void); ++extern void toi_sysfs_exit(void); +diff --git a/kernel/power/tuxonice_ui.c b/kernel/power/tuxonice_ui.c +new file mode 100644 +index 0000000..452b3db +--- /dev/null ++++ b/kernel/power/tuxonice_ui.c +@@ -0,0 +1,250 @@ ++/* ++ * kernel/power/tuxonice_ui.c ++ * ++ * Copyright (C) 1998-2001 Gabor Kuti ++ * Copyright (C) 1998,2001,2002 Pavel Machek ++ * Copyright (C) 2002-2003 Florent Chabaud ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for TuxOnIce's user interface. ++ * ++ * The user interface code talks to a userspace program via a ++ * netlink socket. ++ * ++ * The kernel side: ++ * - starts the userui program; ++ * - sends text messages and progress bar status; ++ * ++ * The user space side: ++ * - passes messages regarding user requests (abort, toggle reboot etc) ++ * ++ */ ++ ++#define __KERNEL_SYSCALLS__ ++ ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_power_off.h" ++#include "tuxonice_builtin.h" ++ ++static char local_printf_buf[1024]; /* Same as printk - should be safe */ ++struct ui_ops *toi_current_ui; ++EXPORT_SYMBOL_GPL(toi_current_ui); ++ ++/** ++ * toi_wait_for_keypress - Wait for keypress via userui or /dev/console. ++ * ++ * @timeout: Maximum time to wait. ++ * ++ * Wait for a keypress, either from userui or /dev/console if userui isn't ++ * available. The non-userui path is particularly for at boot-time, prior ++ * to userui being started, when we have an important warning to give to ++ * the user. ++ */ ++static char toi_wait_for_keypress(int timeout) ++{ ++ if (toi_current_ui && toi_current_ui->wait_for_key(timeout)) ++ return ' '; ++ ++ return toi_wait_for_keypress_dev_console(timeout); ++} ++ ++/* toi_early_boot_message() ++ * Description: Handle errors early in the process of booting. ++ * The user may press C to continue booting, perhaps ++ * invalidating the image, or space to reboot. ++ * This works from either the serial console or normally ++ * attached keyboard. ++ * ++ * Note that we come in here from init, while the kernel is ++ * locked. If we want to get events from the serial console, ++ * we need to temporarily unlock the kernel. 
++ * ++ * toi_early_boot_message may also be called post-boot. ++ * In this case, it simply printks the message and returns. ++ * ++ * Arguments: int Whether we are able to erase the image. ++ * int default_answer. What to do when we timeout. This ++ * will normally be continue, but the user might ++ * provide command line options (__setup) to override ++ * particular cases. ++ * Char *. Pointer to a string explaining why we're moaning. ++ */ ++ ++#define say(message, a...) printk(KERN_EMERG message, ##a) ++ ++void toi_early_boot_message(int message_detail, int default_answer, ++ char *warning_reason, ...) ++{ ++#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) ++ unsigned long orig_state = get_toi_state(), continue_req = 0; ++ unsigned long orig_loglevel = console_loglevel; ++ int can_ask = 1; ++#else ++ int can_ask = 0; ++#endif ++ ++ va_list args; ++ int printed_len; ++ ++ if (!toi_wait) { ++ set_toi_state(TOI_CONTINUE_REQ); ++ can_ask = 0; ++ } ++ ++ if (warning_reason) { ++ va_start(args, warning_reason); ++ printed_len = vsnprintf(local_printf_buf, ++ sizeof(local_printf_buf), ++ warning_reason, ++ args); ++ va_end(args); ++ } ++ ++ if (!test_toi_state(TOI_BOOT_TIME)) { ++ printk("TuxOnIce: %s\n", local_printf_buf); ++ return; ++ } ++ ++ if (!can_ask) { ++ continue_req = !!default_answer; ++ goto post_ask; ++ } ++ ++#if defined(CONFIG_VT) || defined(CONFIG_SERIAL_CONSOLE) ++ console_loglevel = 7; ++ ++ say("=== TuxOnIce ===\n\n"); ++ if (warning_reason) { ++ say("BIG FAT WARNING!! %s\n\n", local_printf_buf); ++ switch (message_detail) { ++ case 0: ++ say("If you continue booting, note that any image WILL" ++ "NOT BE REMOVED.\nTuxOnIce is unable to do so " ++ "because the appropriate modules aren't\n" ++ "loaded. You should manually remove the image " ++ "to avoid any\npossibility of corrupting your " ++ "filesystem(s) later.\n"); ++ break; ++ case 1: ++ say("If you want to use the current TuxOnIce image, " ++ "reboot and try\nagain with the same kernel " ++ "that you hibernated from. If you want\n" ++ "to forget that image, continue and the image " ++ "will be erased.\n"); ++ break; ++ } ++ say("Press SPACE to reboot or C to continue booting with " ++ "this kernel\n\n"); ++ if (toi_wait > 0) ++ say("Default action if you don't select one in %d " ++ "seconds is: %s.\n", ++ toi_wait, ++ default_answer == TOI_CONTINUE_REQ ? ++ "continue booting" : "reboot"); ++ } else { ++ say("BIG FAT WARNING!!\n\n" ++ "You have tried to resume from this image before.\n" ++ "If it failed once, it may well fail again.\n" ++ "Would you like to remove the image and boot " ++ "normally?\nThis will be equivalent to entering " ++ "noresume on the\nkernel command line.\n\n" ++ "Press SPACE to remove the image or C to continue " ++ "resuming.\n\n"); ++ if (toi_wait > 0) ++ say("Default action if you don't select one in %d " ++ "seconds is: %s.\n", toi_wait, ++ !!default_answer ? 
++ "continue resuming" : "remove the image"); ++ } ++ console_loglevel = orig_loglevel; ++ ++ set_toi_state(TOI_SANITY_CHECK_PROMPT); ++ clear_toi_state(TOI_CONTINUE_REQ); ++ ++ if (toi_wait_for_keypress(toi_wait) == 0) /* We timed out */ ++ continue_req = !!default_answer; ++ else ++ continue_req = test_toi_state(TOI_CONTINUE_REQ); ++ ++#endif /* CONFIG_VT or CONFIG_SERIAL_CONSOLE */ ++ ++post_ask: ++ if ((warning_reason) && (!continue_req)) ++ kernel_restart(NULL); ++ ++ restore_toi_state(orig_state); ++ if (continue_req) ++ set_toi_state(TOI_CONTINUE_REQ); ++} ++EXPORT_SYMBOL_GPL(toi_early_boot_message); ++#undef say ++ ++/* ++ * User interface specific /sys/power/tuxonice entries. ++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) ++ SYSFS_INT("default_console_level", SYSFS_RW, ++ &toi_bkd.toi_default_console_level, 0, 7, 0, NULL), ++ SYSFS_UL("debug_sections", SYSFS_RW, &toi_bkd.toi_debug_state, 0, ++ 1 << 30, 0), ++ SYSFS_BIT("log_everything", SYSFS_RW, &toi_bkd.toi_action, TOI_LOGALL, ++ 0) ++#endif ++}; ++ ++static struct toi_module_ops userui_ops = { ++ .type = MISC_HIDDEN_MODULE, ++ .name = "printk ui", ++ .directory = "user_interface", ++ .module = THIS_MODULE, ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++int toi_register_ui_ops(struct ui_ops *this_ui) ++{ ++ if (toi_current_ui) { ++ printk(KERN_INFO "Only one TuxOnIce user interface module can " ++ "be loaded at a time."); ++ return -EBUSY; ++ } ++ ++ toi_current_ui = this_ui; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(toi_register_ui_ops); ++ ++void toi_remove_ui_ops(struct ui_ops *this_ui) ++{ ++ if (toi_current_ui != this_ui) ++ return; ++ ++ toi_current_ui = NULL; ++} ++EXPORT_SYMBOL_GPL(toi_remove_ui_ops); ++ ++/* toi_console_sysfs_init ++ * Description: Boot time initialisation for user interface. 
++ */ ++ ++int toi_ui_init(void) ++{ ++ return toi_register_module(&userui_ops); ++} ++ ++void toi_ui_exit(void) ++{ ++ toi_unregister_module(&userui_ops); ++} +diff --git a/kernel/power/tuxonice_ui.h b/kernel/power/tuxonice_ui.h +new file mode 100644 +index 0000000..4ced165 +--- /dev/null ++++ b/kernel/power/tuxonice_ui.h +@@ -0,0 +1,97 @@ ++/* ++ * kernel/power/tuxonice_ui.h ++ * ++ * Copyright (C) 2004-2010 Nigel Cunningham (nigel at tuxonice net) ++ */ ++ ++enum { ++ DONT_CLEAR_BAR, ++ CLEAR_BAR ++}; ++ ++enum { ++ /* Userspace -> Kernel */ ++ USERUI_MSG_ABORT = 0x11, ++ USERUI_MSG_SET_STATE = 0x12, ++ USERUI_MSG_GET_STATE = 0x13, ++ USERUI_MSG_GET_DEBUG_STATE = 0x14, ++ USERUI_MSG_SET_DEBUG_STATE = 0x15, ++ USERUI_MSG_SPACE = 0x18, ++ USERUI_MSG_GET_POWERDOWN_METHOD = 0x1A, ++ USERUI_MSG_SET_POWERDOWN_METHOD = 0x1B, ++ USERUI_MSG_GET_LOGLEVEL = 0x1C, ++ USERUI_MSG_SET_LOGLEVEL = 0x1D, ++ USERUI_MSG_PRINTK = 0x1E, ++ ++ /* Kernel -> Userspace */ ++ USERUI_MSG_MESSAGE = 0x21, ++ USERUI_MSG_PROGRESS = 0x22, ++ USERUI_MSG_POST_ATOMIC_RESTORE = 0x25, ++ ++ USERUI_MSG_MAX, ++}; ++ ++struct userui_msg_params { ++ u32 a, b, c, d; ++ char text[255]; ++}; ++ ++struct ui_ops { ++ char (*wait_for_key) (int timeout); ++ u32 (*update_status) (u32 value, u32 maximum, const char *fmt, ...); ++ void (*prepare_status) (int clearbar, const char *fmt, ...); ++ void (*cond_pause) (int pause, char *message); ++ void (*abort)(int result_code, const char *fmt, ...); ++ void (*prepare)(void); ++ void (*cleanup)(void); ++ void (*message)(u32 section, u32 level, u32 normally_logged, ++ const char *fmt, ...); ++}; ++ ++extern struct ui_ops *toi_current_ui; ++ ++#define toi_update_status(val, max, fmt, args...) \ ++ (toi_current_ui ? (toi_current_ui->update_status) (val, max, fmt, ##args) : \ ++ max) ++ ++#define toi_prepare_console(void) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->prepare)(); \ ++ } while (0) ++ ++#define toi_cleanup_console(void) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->cleanup)(); \ ++ } while (0) ++ ++#define abort_hibernate(result, fmt, args...) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->abort)(result, fmt, ##args); \ ++ else { \ ++ set_abort_result(result); \ ++ } \ ++ } while (0) ++ ++#define toi_cond_pause(pause, message) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->cond_pause)(pause, message); \ ++ } while (0) ++ ++#define toi_prepare_status(clear, fmt, args...) \ ++ do { if (toi_current_ui) \ ++ (toi_current_ui->prepare_status)(clear, fmt, ##args); \ ++ else \ ++ printk(KERN_INFO fmt "%s", ##args, "\n"); \ ++ } while (0) ++ ++#define toi_message(sn, lev, log, fmt, a...) \ ++do { \ ++ if (toi_current_ui && (!sn || test_debug_state(sn))) \ ++ toi_current_ui->message(sn, lev, log, fmt, ##a); \ ++} while (0) ++ ++__exit void toi_ui_cleanup(void); ++extern int toi_ui_init(void); ++extern void toi_ui_exit(void); ++extern int toi_register_ui_ops(struct ui_ops *this_ui); ++extern void toi_remove_ui_ops(struct ui_ops *this_ui); +diff --git a/kernel/power/tuxonice_userui.c b/kernel/power/tuxonice_userui.c +new file mode 100644 +index 0000000..bc74672 +--- /dev/null ++++ b/kernel/power/tuxonice_userui.c +@@ -0,0 +1,667 @@ ++/* ++ * kernel/power/user_ui.c ++ * ++ * Copyright (C) 2005-2007 Bernard Blackham ++ * Copyright (C) 2002-2010 Nigel Cunningham (nigel at tuxonice net) ++ * ++ * This file is released under the GPLv2. ++ * ++ * Routines for TuxOnIce's user interface. ++ * ++ * The user interface code talks to a userspace program via a ++ * netlink socket. 
++ * ++ * The kernel side: ++ * - starts the userui program; ++ * - sends text messages and progress bar status; ++ * ++ * The user space side: ++ * - passes messages regarding user requests (abort, toggle reboot etc) ++ * ++ */ ++ ++#define __KERNEL_SYSCALLS__ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "tuxonice_sysfs.h" ++#include "tuxonice_modules.h" ++#include "tuxonice.h" ++#include "tuxonice_ui.h" ++#include "tuxonice_netlink.h" ++#include "tuxonice_power_off.h" ++ ++static char local_printf_buf[1024]; /* Same as printk - should be safe */ ++ ++static struct user_helper_data ui_helper_data; ++static struct toi_module_ops userui_ops; ++static int orig_kmsg; ++ ++static char lastheader[512]; ++static int lastheader_message_len; ++static int ui_helper_changed; /* Used at resume-time so don't overwrite value ++ set from initrd/ramfs. */ ++ ++/* Number of distinct progress amounts that userspace can display */ ++static int progress_granularity = 30; ++ ++static DECLARE_WAIT_QUEUE_HEAD(userui_wait_for_key); ++ ++/** ++ * ui_nl_set_state - Update toi_action based on a message from userui. ++ * ++ * @n: The bit (1 << bit) to set. ++ */ ++static void ui_nl_set_state(int n) ++{ ++ /* Only let them change certain settings */ ++ static const u32 toi_action_mask = ++ (1 << TOI_REBOOT) | (1 << TOI_PAUSE) | ++ (1 << TOI_LOGALL) | ++ (1 << TOI_SINGLESTEP) | ++ (1 << TOI_PAUSE_NEAR_PAGESET_END); ++ static unsigned long new_action; ++ ++ new_action = (toi_bkd.toi_action & (~toi_action_mask)) | ++ (n & toi_action_mask); ++ ++ printk(KERN_DEBUG "n is %x. Action flags being changed from %lx " ++ "to %lx.", n, toi_bkd.toi_action, new_action); ++ toi_bkd.toi_action = new_action; ++ ++ if (!test_action_state(TOI_PAUSE) && ++ !test_action_state(TOI_SINGLESTEP)) ++ wake_up_interruptible(&userui_wait_for_key); ++} ++ ++/** ++ * userui_post_atomic_restore - Tell userui that atomic restore just happened. ++ * ++ * Tell userui that atomic restore just occured, so that it can do things like ++ * redrawing the screen, re-getting settings and so on. ++ */ ++static void userui_post_atomic_restore(struct toi_boot_kernel_data *bkd) ++{ ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_POST_ATOMIC_RESTORE, NULL, 0); ++} ++ ++/** ++ * userui_storage_needed - Report how much memory in image header is needed. ++ */ ++static int userui_storage_needed(void) ++{ ++ return sizeof(ui_helper_data.program) + 1 + sizeof(int); ++} ++ ++/** ++ * userui_save_config_info - Fill buffer with config info for image header. ++ * ++ * @buf: Buffer into which to put the config info we want to save. ++ */ ++static int userui_save_config_info(char *buf) ++{ ++ *((int *) buf) = progress_granularity; ++ memcpy(buf + sizeof(int), ui_helper_data.program, ++ sizeof(ui_helper_data.program)); ++ return sizeof(ui_helper_data.program) + sizeof(int) + 1; ++} ++ ++/** ++ * userui_load_config_info - Restore config info from buffer. ++ * ++ * @buf: Buffer containing header info loaded. ++ * @size: Size of data loaded for this module. 
++ */ ++static void userui_load_config_info(char *buf, int size) ++{ ++ progress_granularity = *((int *) buf); ++ size -= sizeof(int); ++ ++ /* Don't load the saved path if one has already been set */ ++ if (ui_helper_changed) ++ return; ++ ++ if (size > sizeof(ui_helper_data.program)) ++ size = sizeof(ui_helper_data.program); ++ ++ memcpy(ui_helper_data.program, buf + sizeof(int), size); ++ ui_helper_data.program[sizeof(ui_helper_data.program)-1] = '\0'; ++} ++ ++/** ++ * set_ui_program_set: Record that userui program was changed. ++ * ++ * Side effect routine for when the userui program is set. In an initrd or ++ * ramfs, the user may set a location for the userui program. If this happens, ++ * we don't want to reload the value that was saved in the image header. This ++ * routine allows us to flag that we shouldn't restore the program name from ++ * the image header. ++ */ ++static void set_ui_program_set(void) ++{ ++ ui_helper_changed = 1; ++} ++ ++/** ++ * userui_memory_needed - Tell core how much memory to reserve for us. ++ */ ++static int userui_memory_needed(void) ++{ ++ /* ball park figure of 128 pages */ ++ return 128 * PAGE_SIZE; ++} ++ ++/** ++ * userui_update_status - Update the progress bar and (if on) in-bar message. ++ * ++ * @value: Current progress percentage numerator. ++ * @maximum: Current progress percentage denominator. ++ * @fmt: Message to be displayed in the middle of the progress bar. ++ * ++ * Note that a NULL message does not mean that any previous message is erased! ++ * For that, you need toi_prepare_status with clearbar on. ++ * ++ * Returns an unsigned long, being the next numerator (as determined by the ++ * maximum and progress granularity) where status needs to be updated. ++ * This is to reduce unnecessary calls to update_status. ++ */ ++static u32 userui_update_status(u32 value, u32 maximum, const char *fmt, ...) ++{ ++ static u32 last_step = 9999; ++ struct userui_msg_params msg; ++ u32 this_step, next_update; ++ int bitshift; ++ ++ if (ui_helper_data.pid == -1) ++ return 0; ++ ++ if ((!maximum) || (!progress_granularity)) ++ return maximum; ++ ++ if (value < 0) ++ value = 0; ++ ++ if (value > maximum) ++ value = maximum; ++ ++ /* Try to avoid math problems - we can't do 64 bit math here ++ * (and shouldn't need it - anyone got screen resolution ++ * of 65536 pixels or more?) */ ++ bitshift = fls(maximum) - 16; ++ if (bitshift > 0) { ++ u32 temp_maximum = maximum >> bitshift; ++ u32 temp_value = value >> bitshift; ++ this_step = (u32) ++ (temp_value * progress_granularity / temp_maximum); ++ next_update = (((this_step + 1) * temp_maximum / ++ progress_granularity) + 1) << bitshift; ++ } else { ++ this_step = (u32) (value * progress_granularity / maximum); ++ next_update = ((this_step + 1) * maximum / ++ progress_granularity) + 1; ++ } ++ ++ if (this_step == last_step) ++ return next_update; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ msg.a = this_step; ++ msg.b = progress_granularity; ++ ++ if (fmt) { ++ va_list args; ++ va_start(args, fmt); ++ vsnprintf(msg.text, sizeof(msg.text), fmt, args); ++ va_end(args); ++ msg.text[sizeof(msg.text)-1] = '\0'; ++ } ++ ++ toi_send_netlink_message(&ui_helper_data, USERUI_MSG_PROGRESS, ++ &msg, sizeof(msg)); ++ last_step = this_step; ++ ++ return next_update; ++} ++ ++/** ++ * userui_message - Display a message without necessarily logging it. ++ * ++ * @section: Type of message. Messages can be filtered by type. ++ * @level: Degree of importance of the message. Lower values = higher priority. 
++ * @normally_logged: Whether logged even if log_everything is off. ++ * @fmt: Message (and parameters). ++ * ++ * This function is intended to do the same job as printk, but without normally ++ * logging what is printed. The point is to be able to get debugging info on ++ * screen without filling the logs with "1/534. ^M 2/534^M. 3/534^M" ++ * ++ * It may be called from an interrupt context - can't sleep! ++ */ ++static void userui_message(u32 section, u32 level, u32 normally_logged, ++ const char *fmt, ...) ++{ ++ struct userui_msg_params msg; ++ ++ if ((level) && (level > console_loglevel)) ++ return; ++ ++ memset(&msg, 0, sizeof(msg)); ++ ++ msg.a = section; ++ msg.b = level; ++ msg.c = normally_logged; ++ ++ if (fmt) { ++ va_list args; ++ va_start(args, fmt); ++ vsnprintf(msg.text, sizeof(msg.text), fmt, args); ++ va_end(args); ++ msg.text[sizeof(msg.text)-1] = '\0'; ++ } ++ ++ if (test_action_state(TOI_LOGALL)) ++ printk(KERN_INFO "%s\n", msg.text); ++ ++ toi_send_netlink_message(&ui_helper_data, USERUI_MSG_MESSAGE, ++ &msg, sizeof(msg)); ++} ++ ++/** ++ * wait_for_key_via_userui - Wait for userui to receive a keypress. ++ */ ++static void wait_for_key_via_userui(void) ++{ ++ DECLARE_WAITQUEUE(wait, current); ++ ++ add_wait_queue(&userui_wait_for_key, &wait); ++ set_current_state(TASK_INTERRUPTIBLE); ++ ++ interruptible_sleep_on(&userui_wait_for_key); ++ ++ set_current_state(TASK_RUNNING); ++ remove_wait_queue(&userui_wait_for_key, &wait); ++} ++ ++/** ++ * userui_prepare_status - Display high level messages. ++ * ++ * @clearbar: Whether to clear the progress bar. ++ * @fmt...: New message for the title. ++ * ++ * Prepare the 'nice display', drawing the header and version, along with the ++ * current action and perhaps also resetting the progress bar. ++ */ ++static void userui_prepare_status(int clearbar, const char *fmt, ...) ++{ ++ va_list args; ++ ++ if (fmt) { ++ va_start(args, fmt); ++ lastheader_message_len = vsnprintf(lastheader, 512, fmt, args); ++ va_end(args); ++ } ++ ++ if (clearbar) ++ toi_update_status(0, 1, NULL); ++ ++ if (ui_helper_data.pid == -1) ++ printk(KERN_EMERG "%s\n", lastheader); ++ else ++ toi_message(0, TOI_STATUS, 1, lastheader, NULL); ++} ++ ++/** ++ * toi_wait_for_keypress - Wait for keypress via userui. ++ * ++ * @timeout: Maximum time to wait. ++ * ++ * Wait for a keypress from userui. ++ * ++ * FIXME: Implement timeout? ++ */ ++static char userui_wait_for_keypress(int timeout) ++{ ++ char key = '\0'; ++ ++ if (ui_helper_data.pid != -1) { ++ wait_for_key_via_userui(); ++ key = ' '; ++ } ++ ++ return key; ++} ++ ++/** ++ * userui_abort_hibernate - Abort a cycle & tell user if they didn't request it. ++ * ++ * @result_code: Reason why we're aborting (1 << bit). ++ * @fmt: Message to display if telling the user what's going on. ++ * ++ * Abort a cycle. If this wasn't at the user's request (and we're displaying ++ * output), tell the user why and wait for them to acknowledge the message. ++ */ ++static void userui_abort_hibernate(int result_code, const char *fmt, ...) 
++{ ++ va_list args; ++ int printed_len = 0; ++ ++ set_result_state(result_code); ++ ++ if (test_result_state(TOI_ABORTED)) ++ return; ++ ++ set_result_state(TOI_ABORTED); ++ ++ if (test_result_state(TOI_ABORT_REQUESTED)) ++ return; ++ ++ va_start(args, fmt); ++ printed_len = vsnprintf(local_printf_buf, sizeof(local_printf_buf), ++ fmt, args); ++ va_end(args); ++ if (ui_helper_data.pid != -1) ++ printed_len = sprintf(local_printf_buf + printed_len, ++ " (Press SPACE to continue)"); ++ ++ toi_prepare_status(CLEAR_BAR, "%s", local_printf_buf); ++ ++ if (ui_helper_data.pid != -1) ++ userui_wait_for_keypress(0); ++} ++ ++/** ++ * request_abort_hibernate - Abort hibernating or resuming at user request. ++ * ++ * Handle the user requesting the cancellation of a hibernation or resume by ++ * pressing escape. ++ */ ++static void request_abort_hibernate(void) ++{ ++ if (test_result_state(TOI_ABORT_REQUESTED) || ++ !test_action_state(TOI_CAN_CANCEL)) ++ return; ++ ++ if (test_toi_state(TOI_NOW_RESUMING)) { ++ toi_prepare_status(CLEAR_BAR, "Escape pressed. " ++ "Powering down again."); ++ set_toi_state(TOI_STOP_RESUME); ++ while (!test_toi_state(TOI_IO_STOPPED)) ++ schedule(); ++ if (toiActiveAllocator->mark_resume_attempted) ++ toiActiveAllocator->mark_resume_attempted(0); ++ toi_power_down(); ++ } ++ ++ toi_prepare_status(CLEAR_BAR, "--- ESCAPE PRESSED :" ++ " ABORTING HIBERNATION ---"); ++ set_abort_result(TOI_ABORT_REQUESTED); ++ wake_up_interruptible(&userui_wait_for_key); ++} ++ ++/** ++ * userui_user_rcv_msg - Receive a netlink message from userui. ++ * ++ * @skb: skb received. ++ * @nlh: Netlink header received. ++ */ ++static int userui_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) ++{ ++ int type; ++ int *data; ++ ++ type = nlh->nlmsg_type; ++ ++ /* A control message: ignore them */ ++ if (type < NETLINK_MSG_BASE) ++ return 0; ++ ++ /* Unknown message: reply with EINVAL */ ++ if (type >= USERUI_MSG_MAX) ++ return -EINVAL; ++ ++ /* All operations require privileges, even GET */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ /* Only allow one task to receive NOFREEZE privileges */ ++ if (type == NETLINK_MSG_NOFREEZE_ME && ui_helper_data.pid != -1) { ++ printk(KERN_INFO "Got NOFREEZE_ME request when " ++ "ui_helper_data.pid is %d.\n", ui_helper_data.pid); ++ return -EBUSY; ++ } ++ ++ data = (int *) NLMSG_DATA(nlh); ++ ++ switch (type) { ++ case USERUI_MSG_ABORT: ++ request_abort_hibernate(); ++ return 0; ++ case USERUI_MSG_GET_STATE: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_STATE, &toi_bkd.toi_action, ++ sizeof(toi_bkd.toi_action)); ++ return 0; ++ case USERUI_MSG_GET_DEBUG_STATE: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_DEBUG_STATE, ++ &toi_bkd.toi_debug_state, ++ sizeof(toi_bkd.toi_debug_state)); ++ return 0; ++ case USERUI_MSG_SET_STATE: ++ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) ++ return -EINVAL; ++ ui_nl_set_state(*data); ++ return 0; ++ case USERUI_MSG_SET_DEBUG_STATE: ++ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) ++ return -EINVAL; ++ toi_bkd.toi_debug_state = (*data); ++ return 0; ++ case USERUI_MSG_SPACE: ++ wake_up_interruptible(&userui_wait_for_key); ++ return 0; ++ case USERUI_MSG_GET_POWERDOWN_METHOD: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_POWERDOWN_METHOD, ++ &toi_poweroff_method, ++ sizeof(toi_poweroff_method)); ++ return 0; ++ case USERUI_MSG_SET_POWERDOWN_METHOD: ++ if (nlh->nlmsg_len != NLMSG_LENGTH(sizeof(char))) ++ return -EINVAL; ++ toi_poweroff_method = (unsigned 
long)(*data); ++ return 0; ++ case USERUI_MSG_GET_LOGLEVEL: ++ toi_send_netlink_message(&ui_helper_data, ++ USERUI_MSG_GET_LOGLEVEL, ++ &toi_bkd.toi_default_console_level, ++ sizeof(toi_bkd.toi_default_console_level)); ++ return 0; ++ case USERUI_MSG_SET_LOGLEVEL: ++ if (nlh->nlmsg_len < NLMSG_LENGTH(sizeof(int))) ++ return -EINVAL; ++ toi_bkd.toi_default_console_level = (*data); ++ return 0; ++ case USERUI_MSG_PRINTK: ++ printk(KERN_INFO "%s", (char *) data); ++ return 0; ++ } ++ ++ /* Unhandled here */ ++ return 1; ++} ++ ++/** ++ * userui_cond_pause - Possibly pause at user request. ++ * ++ * @pause: Whether to pause or just display the message. ++ * @message: Message to display at the start of pausing. ++ * ++ * Potentially pause and wait for the user to tell us to continue. We normally ++ * only pause when @pause is set. While paused, the user can do things like ++ * changing the loglevel, toggling the display of debugging sections and such ++ * like. ++ */ ++static void userui_cond_pause(int pause, char *message) ++{ ++ int displayed_message = 0, last_key = 0; ++ ++ while (last_key != 32 && ++ ui_helper_data.pid != -1 && ++ ((test_action_state(TOI_PAUSE) && pause) || ++ (test_action_state(TOI_SINGLESTEP)))) { ++ if (!displayed_message) { ++ toi_prepare_status(DONT_CLEAR_BAR, ++ "%s Press SPACE to continue.%s", ++ message ? message : "", ++ (test_action_state(TOI_SINGLESTEP)) ? ++ " Single step on." : ""); ++ displayed_message = 1; ++ } ++ last_key = userui_wait_for_keypress(0); ++ } ++ schedule(); ++} ++ ++/** ++ * userui_prepare_console - Prepare the console for use. ++ * ++ * Prepare a console for use, saving current kmsg settings and attempting to ++ * start userui. Console loglevel changes are handled by userui. ++ */ ++static void userui_prepare_console(void) ++{ ++ orig_kmsg = vt_kmsg_redirect(fg_console + 1); ++ ++ ui_helper_data.pid = -1; ++ ++ if (!userui_ops.enabled) { ++ printk(KERN_INFO "TuxOnIce: Userui disabled.\n"); ++ return; ++ } ++ ++ if (*ui_helper_data.program) ++ toi_netlink_setup(&ui_helper_data); ++ else ++ printk(KERN_INFO "TuxOnIce: Userui program not configured.\n"); ++} ++ ++/** ++ * userui_cleanup_console - Cleanup after a cycle. ++ * ++ * Tell userui to cleanup, and restore kmsg_redirect to its original value. ++ */ ++ ++static void userui_cleanup_console(void) ++{ ++ if (ui_helper_data.pid > -1) ++ toi_netlink_close(&ui_helper_data); ++ ++ vt_kmsg_redirect(orig_kmsg); ++} ++ ++/* ++ * User interface specific /sys/power/tuxonice entries. 
++ */ ++ ++static struct toi_sysfs_data sysfs_params[] = { ++#if defined(CONFIG_NET) && defined(CONFIG_SYSFS) ++ SYSFS_BIT("enable_escape", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_CAN_CANCEL, 0), ++ SYSFS_BIT("pause_between_steps", SYSFS_RW, &toi_bkd.toi_action, ++ TOI_PAUSE, 0), ++ SYSFS_INT("enabled", SYSFS_RW, &userui_ops.enabled, 0, 1, 0, NULL), ++ SYSFS_INT("progress_granularity", SYSFS_RW, &progress_granularity, 1, ++ 2048, 0, NULL), ++ SYSFS_STRING("program", SYSFS_RW, ui_helper_data.program, 255, 0, ++ set_ui_program_set), ++ SYSFS_INT("debug", SYSFS_RW, &ui_helper_data.debug, 0, 1, 0, NULL) ++#endif ++}; ++ ++static struct toi_module_ops userui_ops = { ++ .type = MISC_MODULE, ++ .name = "userui", ++ .shared_directory = "user_interface", ++ .module = THIS_MODULE, ++ .storage_needed = userui_storage_needed, ++ .save_config_info = userui_save_config_info, ++ .load_config_info = userui_load_config_info, ++ .memory_needed = userui_memory_needed, ++ .post_atomic_restore = userui_post_atomic_restore, ++ .sysfs_data = sysfs_params, ++ .num_sysfs_entries = sizeof(sysfs_params) / ++ sizeof(struct toi_sysfs_data), ++}; ++ ++static struct ui_ops my_ui_ops = { ++ .update_status = userui_update_status, ++ .message = userui_message, ++ .prepare_status = userui_prepare_status, ++ .abort = userui_abort_hibernate, ++ .cond_pause = userui_cond_pause, ++ .prepare = userui_prepare_console, ++ .cleanup = userui_cleanup_console, ++ .wait_for_key = userui_wait_for_keypress, ++}; ++ ++/** ++ * toi_user_ui_init - Boot time initialisation for user interface. ++ * ++ * Invoked from the core init routine. ++ */ ++static __init int toi_user_ui_init(void) ++{ ++ int result; ++ ++ ui_helper_data.nl = NULL; ++ strncpy(ui_helper_data.program, CONFIG_TOI_USERUI_DEFAULT_PATH, 255); ++ ui_helper_data.pid = -1; ++ ui_helper_data.skb_size = sizeof(struct userui_msg_params); ++ ui_helper_data.pool_limit = 6; ++ ui_helper_data.netlink_id = NETLINK_TOI_USERUI; ++ ui_helper_data.name = "userspace ui"; ++ ui_helper_data.rcv_msg = userui_user_rcv_msg; ++ ui_helper_data.interface_version = 8; ++ ui_helper_data.must_init = 0; ++ ui_helper_data.not_ready = userui_cleanup_console; ++ init_completion(&ui_helper_data.wait_for_process); ++ result = toi_register_module(&userui_ops); ++ if (!result) ++ result = toi_register_ui_ops(&my_ui_ops); ++ if (result) ++ toi_unregister_module(&userui_ops); ++ ++ return result; ++} ++ ++#ifdef MODULE ++/** ++ * toi_user_ui_ext - Cleanup code for if the core is unloaded. 
++ */ ++static __exit void toi_user_ui_exit(void) ++{ ++ toi_netlink_close_complete(&ui_helper_data); ++ toi_remove_ui_ops(&my_ui_ops); ++ toi_unregister_module(&userui_ops); ++} ++ ++module_init(toi_user_ui_init); ++module_exit(toi_user_ui_exit); ++MODULE_AUTHOR("Nigel Cunningham"); ++MODULE_DESCRIPTION("TuxOnIce Userui Support"); ++MODULE_LICENSE("GPL"); ++#else ++late_initcall(toi_user_ui_init); ++#endif +diff --git a/kernel/power/user.c b/kernel/power/user.c +index 4ed81e7..793144d 100644 +--- a/kernel/power/user.c ++++ b/kernel/power/user.c +@@ -12,6 +12,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -42,6 +43,7 @@ static struct snapshot_data { + } snapshot_state; + + atomic_t snapshot_device_available = ATOMIC_INIT(1); ++EXPORT_SYMBOL_GPL(snapshot_device_available); + + static int snapshot_open(struct inode *inode, struct file *filp) + { +diff --git a/kernel/printk.c b/kernel/printk.c +index f24633a..86fca67 100644 +--- a/kernel/printk.c ++++ b/kernel/printk.c +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -71,6 +72,7 @@ int console_printk[4] = { + MINIMUM_CONSOLE_LOGLEVEL, /* minimum_console_loglevel */ + DEFAULT_CONSOLE_LOGLEVEL, /* default_console_loglevel */ + }; ++EXPORT_SYMBOL_GPL(console_printk); + + /* + * Low level drivers may need that to know if they can schedule in +@@ -1867,6 +1869,7 @@ void suspend_console(void) + console_suspended = 1; + up(&console_sem); + } ++EXPORT_SYMBOL_GPL(suspend_console); + + void resume_console(void) + { +@@ -1876,6 +1879,7 @@ void resume_console(void) + console_suspended = 0; + console_unlock(); + } ++EXPORT_SYMBOL_GPL(resume_console); + + /** + * console_cpu_notify - print deferred console messages after CPU hotplug +diff --git a/mm/highmem.c b/mm/highmem.c +index b32b70c..db3d6ea 100644 +--- a/mm/highmem.c ++++ b/mm/highmem.c +@@ -66,6 +66,7 @@ unsigned int nr_free_highpages (void) + + return pages; + } ++EXPORT_SYMBOL_GPL(nr_free_highpages); + + static int pkmap_count[LAST_PKMAP]; + static unsigned int last_pkmap_nr; +diff --git a/mm/memory.c b/mm/memory.c +index bb1369f..6ac7584 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1617,6 +1617,7 @@ no_page_table: + return ERR_PTR(-EFAULT); + return page; + } ++EXPORT_SYMBOL_GPL(follow_page); + + static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr) + { +diff --git a/mm/mmzone.c b/mm/mmzone.c +index 4596d81..ddf6b28 100644 +--- a/mm/mmzone.c ++++ b/mm/mmzone.c +@@ -8,11 +8,13 @@ + #include + #include + #include ++#include + + struct pglist_data *first_online_pgdat(void) + { + return NODE_DATA(first_online_node); + } ++EXPORT_SYMBOL_GPL(first_online_pgdat); + + struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) + { +@@ -22,6 +24,7 @@ struct pglist_data *next_online_pgdat(struct pglist_data *pgdat) + return NULL; + return NODE_DATA(nid); + } ++EXPORT_SYMBOL_GPL(next_online_pgdat); + + /* + * next_zone - helper magic for for_each_zone() +@@ -41,6 +44,7 @@ struct zone *next_zone(struct zone *zone) + } + return zone; + } ++EXPORT_SYMBOL_GPL(next_zone); + + static inline int zref_in_nodemask(struct zoneref *zref, nodemask_t *nodes) + { +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 7300c9d..44988d2 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -108,6 +108,7 @@ unsigned int dirty_expire_interval = 30 * 100; /* centiseconds */ + * Flag that makes the machine dump writes/reads and block dirtyings. 
+ */ + int block_dump; ++EXPORT_SYMBOL_GPL(block_dump); + + /* + * Flag that puts the machine in "laptop mode". Doubles as a timeout in jiffies: +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index d1107ad..344404c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -132,6 +132,7 @@ void pm_restore_gfp_mask(void) + saved_gfp_mask = 0; + } + } ++EXPORT_SYMBOL_GPL(pm_restore_gfp_mask); + + void pm_restrict_gfp_mask(void) + { +@@ -140,6 +141,7 @@ void pm_restrict_gfp_mask(void) + saved_gfp_mask = gfp_allowed_mask; + gfp_allowed_mask &= ~GFP_IOFS; + } ++EXPORT_SYMBOL_GPL(pm_restrict_gfp_mask); + + bool pm_suspended_storage(void) + { +@@ -2810,6 +2812,26 @@ static unsigned int nr_free_zone_pages(int offset) + return sum; + } + ++static unsigned int nr_unallocated_zone_pages(int offset) ++{ ++ struct zoneref *z; ++ struct zone *zone; ++ ++ /* Just pick one node, since fallback list is circular */ ++ unsigned int sum = 0; ++ ++ struct zonelist *zonelist = node_zonelist(numa_node_id(), GFP_KERNEL); ++ ++ for_each_zone_zonelist(zone, z, zonelist, offset) { ++ unsigned long high = high_wmark_pages(zone); ++ unsigned long left = zone_page_state(zone, NR_FREE_PAGES); ++ if (left > high) ++ sum += left - high; ++ } ++ ++ return sum; ++} ++ + /* + * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL + */ +@@ -2820,6 +2842,15 @@ unsigned int nr_free_buffer_pages(void) + EXPORT_SYMBOL_GPL(nr_free_buffer_pages); + + /* ++ * Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL ++ */ ++unsigned int nr_unallocated_buffer_pages(void) ++{ ++ return nr_unallocated_zone_pages(gfp_zone(GFP_USER)); ++} ++EXPORT_SYMBOL_GPL(nr_unallocated_buffer_pages); ++ ++/* + * Amount of free RAM allocatable within all zones + */ + unsigned int nr_free_pagecache_pages(void) +diff --git a/mm/shmem.c b/mm/shmem.c +index 5dd56f6..72879f8 100644 +--- a/mm/shmem.c ++++ b/mm/shmem.c +@@ -1361,7 +1361,7 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) + } + + static struct inode *shmem_get_inode(struct super_block *sb, const struct inode *dir, +- umode_t mode, dev_t dev, unsigned long flags) ++ umode_t mode, dev_t dev, unsigned long flags, int atomic_copy) + { + struct inode *inode; + struct shmem_inode_info *info; +@@ -1382,6 +1382,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode + memset(info, 0, (char *)inode - (char *)info); + spin_lock_init(&info->lock); + info->flags = flags & VM_NORESERVE; ++ if (atomic_copy) ++ inode->i_flags |= S_ATOMIC_COPY; + INIT_LIST_HEAD(&info->swaplist); + simple_xattrs_init(&info->xattrs); + cache_no_acl(inode); +@@ -1936,7 +1938,7 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) + struct inode *inode; + int error = -ENOSPC; + +- inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); ++ inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE, 0); + if (inode) { + error = security_inode_init_security(inode, dir, + &dentry->d_name, +@@ -2075,7 +2077,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s + if (len > PAGE_CACHE_SIZE) + return -ENAMETOOLONG; + +- inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE); ++ inode = shmem_get_inode(dir->i_sb, dir, S_IFLNK|S_IRWXUGO, 0, VM_NORESERVE, 0); + if (!inode) + return -ENOSPC; + +@@ -2604,7 +2606,7 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) + sb->s_flags |= MS_POSIXACL; + #endif + +- inode = shmem_get_inode(sb, NULL, S_IFDIR | 
sbinfo->mode, 0, VM_NORESERVE); ++ inode = shmem_get_inode(sb, NULL, S_IFDIR | sbinfo->mode, 0, VM_NORESERVE, 0); + if (!inode) + goto failed; + inode->i_uid = sbinfo->uid; +@@ -2857,7 +2859,7 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); + + #define shmem_vm_ops generic_file_vm_ops + #define shmem_file_operations ramfs_file_operations +-#define shmem_get_inode(sb, dir, mode, dev, flags) ramfs_get_inode(sb, dir, mode, dev) ++#define shmem_get_inode(sb, dir, mode, dev, flags, atomic_copy) ramfs_get_inode(sb, dir, mode, dev) + #define shmem_acct_size(flags, size) 0 + #define shmem_unacct_size(flags, size) do {} while (0) + +@@ -2870,8 +2872,10 @@ EXPORT_SYMBOL_GPL(shmem_truncate_range); + * @name: name for dentry (to be seen in /proc//maps + * @size: size to be set for the file + * @flags: VM_NORESERVE suppresses pre-accounting of the entire object size ++ * @atomic_copy: Atomically copy the area when hibernating? + */ +-struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags) ++struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags, ++ int atomic_copy) + { + int error; + struct file *file; +@@ -2900,7 +2904,8 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags + path.mnt = mntget(shm_mnt); + + error = -ENOSPC; +- inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags); ++ inode = shmem_get_inode(root->d_sb, NULL, S_IFREG | S_IRWXUGO, 0, flags, ++ atomic_copy); + if (!inode) + goto put_dentry; + +@@ -2938,7 +2943,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) + struct file *file; + loff_t size = vma->vm_end - vma->vm_start; + +- file = shmem_file_setup("dev/zero", size, vma->vm_flags); ++ file = shmem_file_setup("dev/zero", size, vma->vm_flags, 0); + if (IS_ERR(file)) + return PTR_ERR(file); + +diff --git a/mm/swap_state.c b/mm/swap_state.c +index 0cb36fb..3e3a20f 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -7,6 +7,7 @@ + * Rewritten to use page cache, (C) 1998 Stephen Tweedie + */ + #include ++#include + #include + #include + #include +@@ -43,6 +44,7 @@ struct address_space swapper_space = { + .i_mmap_nonlinear = LIST_HEAD_INIT(swapper_space.i_mmap_nonlinear), + .backing_dev_info = &swap_backing_dev_info, + }; ++EXPORT_SYMBOL_GPL(swapper_space); + + #define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0) + +diff --git a/mm/swapfile.c b/mm/swapfile.c +index e97a0e5..f321484 100644 +--- a/mm/swapfile.c ++++ b/mm/swapfile.c +@@ -9,6 +9,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -43,7 +44,6 @@ + static bool swap_count_continued(struct swap_info_struct *, pgoff_t, + unsigned char); + static void free_swap_count_continuations(struct swap_info_struct *); +-static sector_t map_swap_entry(swp_entry_t, struct block_device**); + + DEFINE_SPINLOCK(swap_lock); + static unsigned int nr_swapfiles; +@@ -474,6 +474,56 @@ swp_entry_t get_swap_page_of_type(int type) + spin_unlock(&swap_lock); + return (swp_entry_t) {0}; + } ++EXPORT_SYMBOL_GPL(get_swap_page_of_type); ++ ++static unsigned int find_next_to_unuse(struct swap_info_struct *si, ++ unsigned int prev, bool frontswap); ++ ++void get_swap_range_of_type(int type, swp_entry_t *start, swp_entry_t *end, ++ unsigned int limit) ++{ ++ struct swap_info_struct *si; ++ pgoff_t start_at; ++ unsigned int i; ++ ++ *start = swp_entry(0, 0); ++ *end = swp_entry(0, 0); ++ spin_lock(&swap_lock); ++ si = swap_info[type]; ++ if (si && (si->flags & SWP_WRITEOK)) { ++ /* This is called for allocating swap 
entry, not cache */ ++ start_at = scan_swap_map(si, 1); ++ if (start_at) { ++ unsigned int stop_at = find_next_to_unuse(si, start_at, 0); ++ if (stop_at > start_at) ++ stop_at--; ++ else ++ stop_at = si->max - 1; ++ if (stop_at - start_at + 1 > limit) ++ stop_at = min_t(unsigned int, ++ start_at + limit - 1, ++ si->max - 1); ++ /* Mark them used */ ++ for (i = start_at; i <= stop_at; i++) ++ si->swap_map[i] = 1; ++ /* first page already done above */ ++ si->inuse_pages += stop_at - start_at; ++ ++ nr_swap_pages -= stop_at - start_at + 1; ++ if (start_at + 1 == si->lowest_bit) ++ si->lowest_bit = stop_at + 1; ++ if (si->inuse_pages == si->pages) { ++ si->lowest_bit = si->max; ++ si->highest_bit = 0; ++ } ++ si->cluster_next = stop_at + 1; ++ *start = swp_entry(type, start_at); ++ *end = swp_entry(type, stop_at); ++ } ++ } ++ spin_unlock(&swap_lock); ++} ++EXPORT_SYMBOL_GPL(get_swap_range_of_type); + + static struct swap_info_struct *swap_info_get(swp_entry_t entry) + { +@@ -601,6 +651,7 @@ void swapcache_free(swp_entry_t entry, struct page *page) + spin_unlock(&swap_lock); + } + } ++EXPORT_SYMBOL_GPL(swap_free); + + /* + * How many references to page are currently swapped out? +@@ -1279,7 +1330,7 @@ static void drain_mmlist(void) + * Note that the type of this function is sector_t, but it returns page offset + * into the bdev, not sector offset. + */ +-static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) ++sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) + { + struct swap_info_struct *sis; + struct swap_extent *start_se; +@@ -1306,6 +1357,7 @@ static sector_t map_swap_entry(swp_entry_t entry, struct block_device **bdev) + BUG_ON(se == start_se); /* It *must* be present */ + } + } ++EXPORT_SYMBOL_GPL(map_swap_entry); + + /* + * Returns the page offset into bdev for the specified page's swap entry. +@@ -1617,6 +1669,7 @@ out: + putname(pathname); + return err; + } ++EXPORT_SYMBOL_GPL(sys_swapoff); + + #ifdef CONFIG_PROC_FS + static unsigned swaps_poll(struct file *file, poll_table *wait) +@@ -2103,6 +2156,7 @@ out: + mutex_unlock(&inode->i_mutex); + return error; + } ++EXPORT_SYMBOL_GPL(sys_swapon); + + void si_swapinfo(struct sysinfo *val) + { +@@ -2120,6 +2174,7 @@ void si_swapinfo(struct sysinfo *val) + val->totalswap = total_swap_pages + nr_to_be_unused; + spin_unlock(&swap_lock); + } ++EXPORT_SYMBOL_GPL(si_swapinfo); + + /* + * Verify that a swap entry is valid and increment its swap map count. 
+@@ -2254,8 +2309,15 @@ pgoff_t __page_file_index(struct page *page) + VM_BUG_ON(!PageSwapCache(page)); + return swp_offset(swap); + } ++ + EXPORT_SYMBOL_GPL(__page_file_index); + ++struct swap_info_struct *get_swap_info_struct(unsigned type) ++{ ++ return swap_info[type]; ++} ++EXPORT_SYMBOL_GPL(get_swap_info_struct); ++ + /* + * add_swap_count_continuation - called when a swap count is duplicated + * beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 196709f..7897f18 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -1188,7 +1188,7 @@ static int too_many_isolated(struct zone *zone, int file, + { + unsigned long inactive, isolated; + +- if (current_is_kswapd()) ++ if (current_is_kswapd() || sc->hibernation_mode) + return 0; + + if (!global_reclaim(sc)) +@@ -1762,7 +1762,9 @@ out: + unsigned long scan; + + scan = get_lru_size(lruvec, lru); +- if (sc->priority || noswap || !vmscan_swappiness(sc)) { ++ if (sc->hibernation_mode) ++ scan = SWAP_CLUSTER_MAX; ++ else if (sc->priority || noswap || !vmscan_swappiness(sc)) { + scan >>= sc->priority; + if (!scan && force_scan) + scan = SWAP_CLUSTER_MAX; +@@ -1798,6 +1800,9 @@ static inline bool should_continue_reclaim(struct lruvec *lruvec, + unsigned long pages_for_compaction; + unsigned long inactive_lru_pages; + ++ if (nr_reclaimed && nr_scanned && sc->nr_to_reclaim >= sc->nr_reclaimed) ++ return true; ++ + /* If not in reclaim/compaction mode, stop */ + if (!in_reclaim_compaction(sc)) + return false; +@@ -1896,7 +1901,7 @@ restart: + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ +- if (inactive_anon_is_low(lruvec)) ++ if (sc->hibernation_mode || inactive_anon_is_low(lruvec)) + shrink_active_list(SWAP_CLUSTER_MAX, lruvec, + sc, LRU_ACTIVE_ANON); + +@@ -2028,7 +2033,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) + if (zone->all_unreclaimable && + sc->priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ +- if (IS_ENABLED(CONFIG_COMPACTION)) { ++ if (IS_ENABLED(CONFIG_COMPACTION) && !sc->hibernation_mode) { + /* + * If we already have plenty of memory free for + * compaction in this zone, don't free any more. 
+@@ -2116,6 +2121,11 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, + unsigned long writeback_threshold; + bool aborted_reclaim; + ++#ifdef CONFIG_FREEZER ++ if (unlikely(pm_freezing && !sc->hibernation_mode)) ++ return 0; ++#endif ++ + delayacct_freepages_start(); + + if (global_reclaim(sc)) +@@ -3023,6 +3033,11 @@ void wakeup_kswapd(struct zone *zone, int order, enum zone_type classzone_idx) + if (!populated_zone(zone)) + return; + ++#ifdef CONFIG_FREEZER ++ if (pm_freezing) ++ return; ++#endif ++ + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + return; + pgdat = zone->zone_pgdat; +@@ -3083,11 +3098,11 @@ unsigned long zone_reclaimable_pages(struct zone *zone) + * LRU order by reclaiming preferentially + * inactive > active > active referenced > active mapped + */ +-unsigned long shrink_all_memory(unsigned long nr_to_reclaim) ++unsigned long shrink_memory_mask(unsigned long nr_to_reclaim, gfp_t mask) + { + struct reclaim_state reclaim_state; + struct scan_control sc = { +- .gfp_mask = GFP_HIGHUSER_MOVABLE, ++ .gfp_mask = mask, + .may_swap = 1, + .may_unmap = 1, + .may_writepage = 1, +@@ -3116,6 +3131,13 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) + + return nr_reclaimed; + } ++EXPORT_SYMBOL_GPL(shrink_memory_mask); ++ ++unsigned long shrink_all_memory(unsigned long nr_to_reclaim) ++{ ++ return shrink_memory_mask(nr_to_reclaim, GFP_HIGHUSER_MOVABLE); ++} ++EXPORT_SYMBOL_GPL(shrink_all_memory); + #endif /* CONFIG_HIBERNATION */ + + /* It's optimal to keep kswapds on the same CPUs as their memory, but diff --git a/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/zzz-config-desktop-bfq-tuxonice.patch b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/zzz-config-desktop-bfq-tuxonice.patch new file mode 100644 index 000000000..f5b1daf22 --- /dev/null +++ b/profiles/templates/3.1/6_ac_install_patch/sys-kernel/calculate-sources/3.8/zzz-config-desktop-bfq-tuxonice.patch @@ -0,0 +1,63 @@ +# Calculate format=diff os_linux_system==desktop +--- .config.orig 2013-02-18 14:27:13.773480200 +0400 ++++ .config 2013-02-27 16:37:47.238816225 +0400 +@@ -288,8 +289,11 @@ + CONFIG_IOSCHED_NOOP=y + CONFIG_IOSCHED_DEADLINE=y + CONFIG_IOSCHED_CFQ=y ++CONFIG_IOSCHED_BFQ=y ++CONFIG_CGROUP_BFQIO=y + # CONFIG_DEFAULT_DEADLINE is not set + CONFIG_DEFAULT_CFQ=y ++# CONFIG_DEFAULT_BFQ is not set + # CONFIG_DEFAULT_NOOP is not set + CONFIG_DEFAULT_IOSCHED="cfq" + CONFIG_PREEMPT_NOTIFIERS=y +@@ -461,6 +465,29 @@ + CONFIG_HIBERNATE_CALLBACKS=y + CONFIG_HIBERNATION=y + CONFIG_PM_STD_PARTITION="" ++CONFIG_TOI_CORE=y ++ ++# ++# Image Storage (you need at least one allocator) ++# ++CONFIG_TOI_FILE=y ++CONFIG_TOI_SWAP=y ++ ++# ++# General Options ++# ++CONFIG_TOI_CRYPTO=y ++CONFIG_TOI_USERUI=y ++CONFIG_TOI_USERUI_DEFAULT_PATH="/sbin/tuxoniceui" ++CONFIG_TOI_DEFAULT_IMAGE_SIZE_LIMIT=-2 ++# CONFIG_TOI_KEEP_IMAGE is not set ++CONFIG_TOI_REPLACE_SWSUSP=y ++# CONFIG_TOI_IGNORE_LATE_INITCALL is not set ++CONFIG_TOI_DEFAULT_WAIT=25 ++CONFIG_TOI_DEFAULT_EXTRA_PAGES_ALLOWANCE=2000 ++CONFIG_TOI_CHECKSUM=y ++CONFIG_TOI=y ++CONFIG_TOI_ZRAM_SUPPORT=y + CONFIG_PM_SLEEP=y + CONFIG_PM_SLEEP_SMP=y + # CONFIG_PM_AUTOSLEEP is not set +@@ -4330,7 +4357,7 @@ + CONFIG_CRYPTO_AEAD2=y + CONFIG_CRYPTO_BLKCIPHER=m + CONFIG_CRYPTO_BLKCIPHER2=y +-CONFIG_CRYPTO_HASH=m ++CONFIG_CRYPTO_HASH=y + CONFIG_CRYPTO_HASH2=y + CONFIG_CRYPTO_RNG2=y + CONFIG_CRYPTO_PCOMP2=y +@@ -4379,7 +4406,7 @@ + CONFIG_CRYPTO_CRC32C=m + # CONFIG_CRYPTO_CRC32C_INTEL is not set + # 
CONFIG_CRYPTO_GHASH is not set +-CONFIG_CRYPTO_MD4=m ++CONFIG_CRYPTO_MD4=y + CONFIG_CRYPTO_MD5=m + CONFIG_CRYPTO_MICHAEL_MIC=m + # CONFIG_CRYPTO_RMD128 is not set